1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
3 /* This the fast UPGMA algorithm (O(N^2)) as implemented in Bob Edgar's
4 * Muscle (UPGMA2.cpp; version 3.7) ported to pure C.
5 *
6 * Muscle's code is public domain and so is this code here.
7 *
8 * From http://www.drive5.com/muscle/license.htm:
9 * """
10 * MUSCLE is public domain software
11 *
12 * The MUSCLE software, including object and source code and
13 * documentation, is hereby donated to the public domain.
14 *
15 * Disclaimer of warranty
16 * THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
17 * EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * """
20 *
21 */
22
23 /*
24 * RCS $Id: muscle_upgma.c 230 2011-04-09 15:37:50Z andreas $
25 *
26 *
27 * Notes:
28 * ------
29 * LINKAGE become linkage_t here
30 *
31 * Replaced the the following member functions for DistCalc DC:
32 * DC.GetId = sequence id as int
33 * DC.GetName = sequence name
34 * DC.GetCount = matrix dim
35 * DC.DistRange = vector / matrix row for object i with index j<i
36 *
37 * Log() has been replaced with Clustal's Info(), Quiet() with Log(&rLog, LOG_FATAL)
38 *
39 * Made TriangleSubscript() and g_ulTriangleSize ulong to prevent overflow for many sequences
40 */
41
42 #ifndef ulint
43 /* limit use of unsigned vars (see coding_style_guideline.txt) */
44 typedef unsigned long int ulong;
45 #endif
46
47
48
49 #include <stdlib.h>
50 #include <stdio.h>
51 #include <assert.h>
52
53 #include "util.h"
54 #include "log.h"
55 #include "symmatrix.h"
56
57 #include "muscle_tree.h"
58 #include "muscle_upgma.h"
59
60 /* from distcalc.h */
61 typedef float dist_t;
62 static const dist_t BIG_DIST = (dist_t) 1e29;
63 /* from muscle.h */
64 static const unsigned uInsane = 8888888;
65
66
67
68
69 /*static inline*/
70 ulong TriangleSubscript(uint uIndex1, uint uIndex2);
71
72
73
74
75 #define TRACE 0
76
77 #ifndef MIN
78 #define MIN(x, y) ((x) < (y) ? (x) : (y))
79 #endif
80 #ifndef MIN
81 #define MAX(x, y) ((x) > (y) ? (x) : (y))
82 #endif
83 #define AVG(x, y) (((x) + (y))/2)
84
85 static uint g_uLeafCount;
86 static ulong g_ulTriangleSize;
87 static uint g_uInternalNodeCount;
88 static uint g_uInternalNodeIndex;
89
90 /* Triangular distance matrix is g_Dist, which is allocated
91 * as a one-dimensional vector of length g_ulTriangleSize.
92 * TriangleSubscript(i,j) maps row,column=i,j to the subscript
93 * into this vector.
94 * Row / column coordinates are a bit messy.
95 * Initially they are leaf indexes 0..N-1.
96 * But each time we create a new node (=new cluster, new subtree),
97 * we re-use one of the two rows that become available (the children
98 * of the new node). This saves memory.
99 * We keep track of this through the g_uNodeIndex vector.
100 */
101 static dist_t *g_Dist;
102
103 /* Distance to nearest neighbor in row i of distance matrix.
104 * Subscript is distance matrix row.
105 */
106 static dist_t *g_MinDist;
107
108 /* Nearest neighbor to row i of distance matrix.
109 * Subscript is distance matrix row.
110 */
111 static uint *g_uNearestNeighbor;
112
113 /* Node index of row i in distance matrix.
114 * Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes.
115 * Subscript is distance matrix row.
116 */
117 static uint *g_uNodeIndex;
118
119 /* The following vectors are defined on internal nodes,
120 * subscripts are internal node index 0..N-2.
121 * For g_uLeft/Right, value is the node index 0 .. 2N-2
122 * because a child can be internal or leaf.
123 */
124 static uint *g_uLeft;
125 static uint *g_uRight;
126 static dist_t *g_Height;
127 static dist_t *g_LeftLength;
128 static dist_t *g_RightLength;
129
130
131 /*** CalcDistRange
132 *
133 * Imitation of DistCalc.DistRange
134 *
135 * Sets values of row (vector / matrix row) to distances for object i with index j<i
136 *
137 * row must be preallocated
138 */
CalcDistRange(symmatrix_t * distmat,uint i,dist_t * row)139 void CalcDistRange(symmatrix_t *distmat, uint i, dist_t *row)
140 {
141 uint j;
142 for (j = 0; j < i; ++j) {
143 row[j] = SymMatrixGetValue(distmat, i, j);
144 }
145 }
146 /* end of CalcDistRange */
147
148
149
150 /*static inline*/
151 ulong
TriangleSubscript(uint uIndex1,uint uIndex2)152 TriangleSubscript(uint uIndex1, uint uIndex2)
153 {
154 ulong v;
155 #ifndef NDEBUG
156 if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount)
157 Log(&rLog, LOG_FATAL, "TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount);
158 #endif
159 if (uIndex1 >= uIndex2)
160 v = uIndex2 + (uIndex1*(uIndex1 - 1))/2;
161 else
162 v = uIndex1 + (uIndex2*(uIndex2 - 1))/2;
163 assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2);
164 return v;
165 }
166
167 #ifdef UNUSED
ListState()168 static void ListState()
169 {
170 uint i, j;
171 Info("Dist matrix\n");
172 Info(" ");
173 for (i = 0; i < g_uLeafCount; ++i)
174 {
175 if (uInsane == g_uNodeIndex[i])
176 continue;
177 Info(" %5u", g_uNodeIndex[i]);
178 }
179 Info("\n");
180
181 for (i = 0; i < g_uLeafCount; ++i)
182 {
183 if (uInsane == g_uNodeIndex[i])
184 continue;
185 Info("%5u ", g_uNodeIndex[i]);
186 for (j = 0; j < g_uLeafCount; ++j)
187 {
188 if (uInsane == g_uNodeIndex[j])
189 continue;
190 if (i == j)
191 Info(" ");
192 else
193 {
194 ulong v = TriangleSubscript(i, j);
195 Info("%5.2g ", g_Dist[v]);
196 }
197 }
198 Info("\n");
199 }
200
201 Info("\n");
202 Info(" i Node NrNb Dist\n");
203 Info("----- ----- ----- --------\n");
204 for (i = 0; i < g_uLeafCount; ++i)
205 {
206 if (uInsane == g_uNodeIndex[i])
207 continue;
208 Info("%5u %5u %5u %8.3f\n",
209 i,
210 g_uNodeIndex[i],
211 g_uNearestNeighbor[i],
212 g_MinDist[i]);
213 }
214
215 Info("\n");
216 Info(" Node L R Height LLength RLength\n");
217 Info("----- ----- ----- ------ ------- -------\n");
218 for (i = 0; i <= g_uInternalNodeIndex; ++i)
219 Info("%5u %5u %5u %6.2g %6.2g %6.2g\n",
220 i,
221 g_uLeft[i],
222 g_uRight[i],
223 g_Height[i],
224 g_LeftLength[i],
225 g_RightLength[i]);
226 }
227 #endif
228 /* ifdef UNUSED */
229
230 /**
231 * @brief Creates a UPGMA in O(N^2) tree from given distmat
232 *
233 * @param[out] tree
234 * newly created rooted UPGMA tree
235 * @param[in] distmat
236 * distance matrix to be clustered
237 * @param[in] linkage
238 * linkage type
239 * @param[in] names
240 * leaf names, will be copied
241 *
242 * @note called UPGMA2() in Muscle3.7.
243 * caller has to free with FreeMuscleTree()
244 *
245 * @see FreeMuscleTree()
246 */
247 void
MuscleUpgma2(tree_t * tree,symmatrix_t * distmat,linkage_t linkage,char ** names)248 MuscleUpgma2(tree_t *tree, symmatrix_t *distmat, linkage_t linkage, char **names)
249 {
250 int i, j;
251 uint *Ids;
252
253 /* only works on full symmetric matrices */
254 assert (distmat->nrows==distmat->ncols);
255
256 g_uLeafCount = distmat->ncols;
257 g_ulTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2;
258 g_uInternalNodeCount = g_uLeafCount - 1;
259
260 g_Dist = (dist_t *) CKMALLOC(g_ulTriangleSize * sizeof(dist_t));
261
262 g_uNodeIndex = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
263 g_uNearestNeighbor = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
264 g_MinDist = (dist_t *) CKMALLOC(sizeof(dist_t) * g_uLeafCount);
265 Ids = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
266 /* NOTE: we replaced Names with argument names */
267
268 /**
269 * left and right node indices, as well as left and right
270 * branch-length and height for for internal nodes
271 */
272 g_uLeft = (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount);
273 g_uRight = (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount);
274 g_Height = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
275 g_LeftLength = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
276 g_RightLength = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
277
278 for (i = 0; i < g_uLeafCount; ++i) {
279 g_MinDist[i] = BIG_DIST;
280 g_uNodeIndex[i] = i;
281 g_uNearestNeighbor[i] = uInsane;
282 Ids[i] = i;
283 }
284
285 for (i = 0; i < g_uInternalNodeCount; ++i) {
286 g_uLeft[i] = uInsane;
287 g_uRight[i] = uInsane;
288 g_LeftLength[i] = BIG_DIST;
289 g_RightLength[i] = BIG_DIST;
290 g_Height[i] = BIG_DIST;
291 }
292
293 /* Compute initial NxN triangular distance matrix.
294 * Store minimum distance for each full (not triangular) row.
295 * Loop from 1, not 0, because "row" is 0, 1 ... i-1,
296 * so nothing to do when i=0.
297 */
298 for (i = 1; i < g_uLeafCount; ++i) {
299 dist_t *Row = g_Dist + TriangleSubscript(i, 0);
300 CalcDistRange(distmat, i, Row);
301 for (j = 0; j < i; ++j) {
302 const dist_t d = Row[j];
303 if (d < g_MinDist[i]) {
304 g_MinDist[i] = d;
305 g_uNearestNeighbor[i] = j;
306 }
307 if (d < g_MinDist[j]) {
308 g_MinDist[j] = d;
309 g_uNearestNeighbor[j] = i;
310 }
311 }
312 }
313
314 #if TRACE
315 Info("Initial state:\n");
316 ListState();
317 #endif
318
319 for (g_uInternalNodeIndex = 0;
320 g_uInternalNodeIndex < g_uLeafCount - 1;
321 ++g_uInternalNodeIndex) {
322
323 dist_t dtNewMinDist = BIG_DIST;
324 uint uNewNearestNeighbor = uInsane;
325
326 #if TRACE
327 Info("\n");
328 Info("Internal node index %5u\n", g_uInternalNodeIndex);
329 Info("-------------------------\n");
330 #endif
331
332 /* Find nearest neighbors */
333 uint Lmin = uInsane;
334 uint Rmin = uInsane;
335 dist_t dtMinDist = BIG_DIST;
336 for (j = 0; j < g_uLeafCount; ++j) {
337 dist_t d;
338 if (uInsane == g_uNodeIndex[j])
339 continue;
340
341 d = g_MinDist[j];
342 if (d < dtMinDist) {
343 dtMinDist = d;
344 Lmin = j;
345 Rmin = g_uNearestNeighbor[j];
346 assert(uInsane != Rmin);
347 assert(uInsane != g_uNodeIndex[Rmin]);
348 }
349 }
350
351 assert(Lmin != uInsane);
352 assert(Rmin != uInsane);
353 assert(dtMinDist != BIG_DIST);
354
355 #if TRACE
356 Info("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n",
357 Lmin,
358 g_uNodeIndex[Lmin],
359 Rmin,
360 g_uNodeIndex[Rmin],
361 dtMinDist);
362 #endif
363
364 /* Compute distances to new node
365 * New node overwrites row currently assigned to Lmin
366 */
367 for ( j = 0; j < g_uLeafCount; ++j) {
368 ulong vL, vR;
369 dist_t dL, dR;
370 dist_t dtNewDist;
371
372 if (j == Lmin || j == Rmin)
373 continue;
374 if (uInsane == g_uNodeIndex[j])
375 continue;
376
377 vL = TriangleSubscript(Lmin, j);
378 vR = TriangleSubscript(Rmin, j);
379 dL = g_Dist[vL];
380 dR = g_Dist[vR];
381 dtNewDist = 0.0;
382
383 switch (linkage) {
384 case LINKAGE_AVG:
385 dtNewDist = AVG(dL, dR);
386 break;
387
388 case LINKAGE_MIN:
389 dtNewDist = MIN(dL, dR);
390 break;
391
392 case LINKAGE_MAX:
393 dtNewDist = MAX(dL, dR);
394 break;
395 /* couldn't be arsed to figure out proper usage of g_dSUEFF */
396 #if 0
397 case LINKAGE_BIASED:
398 dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR);
399 break;
400 #endif
401 default:
402 Log(&rLog, LOG_FATAL, "UPGMA2: Invalid LINKAGE_%u", linkage);
403 }
404
405 /* Nasty special case.
406 * If nearest neighbor of j is Lmin or Rmin, then make the new
407 * node (which overwrites the row currently occupied by Lmin)
408 * the nearest neighbor. This situation can occur when there are
409 * equal distances in the matrix. If we don't make this fix,
410 * the nearest neighbor pointer for j would become invalid.
411 * (We don't need to test for == Lmin, because in that case
412 * the net change needed is zero due to the change in row
413 * numbering).
414 */
415 if (g_uNearestNeighbor[j] == Rmin)
416 g_uNearestNeighbor[j] = Lmin;
417
418 #if TRACE
419 Info("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n",
420 j, Lmin, dL, Rmin, dR, dtNewDist);
421 #endif
422 g_Dist[vL] = dtNewDist;
423 if (dtNewDist < dtNewMinDist) {
424 dtNewMinDist = dtNewDist;
425 uNewNearestNeighbor = j;
426 }
427 }
428
429 assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist);
430 assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor);
431
432 const ulong v = TriangleSubscript(Lmin, Rmin);
433 const dist_t dLR = g_Dist[v];
434 const dist_t dHeightNew = dLR/2;
435 const uint uLeft = g_uNodeIndex[Lmin];
436 const uint uRight = g_uNodeIndex[Rmin];
437 const dist_t HeightLeft =
438 uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount];
439 const dist_t HeightRight =
440 uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount];
441
442 g_uLeft[g_uInternalNodeIndex] = uLeft;
443 g_uRight[g_uInternalNodeIndex] = uRight;
444 g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft;
445 g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight;
446 g_Height[g_uInternalNodeIndex] = dHeightNew;
447
448 /* Row for left child overwritten by row for new node */
449 g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex;
450 g_uNearestNeighbor[Lmin] = uNewNearestNeighbor;
451 g_MinDist[Lmin] = dtNewMinDist;
452
453 /* Delete row for right child */
454 g_uNodeIndex[Rmin] = uInsane;
455
456 #if TRACE
457 Info("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n",
458 g_uInternalNodeIndex, Lmin, Rmin);
459 ListState();
460 #endif
461 }
462
463 uint uRoot = g_uLeafCount - 2;
464
465 #if TRACE
466 Log(&rLog, LOG_FORCED_DEBUG, "uRoot=%d g_uLeafCount=%d g_uInternalNodeCount=%d", uRoot, g_uLeafCount, g_uInternalNodeCount);
467 for (i=0; i<g_uInternalNodeCount; i++) {
468 Log(&rLog, LOG_FORCED_DEBUG, "internal node=%d: g_uLeft=%d g_uRight=%d g_LeftLength=%f g_RightLength=%f g_Height=%f",
469 i, g_uLeft[i], g_uRight[i],
470 g_LeftLength[i], g_RightLength[i],
471 g_Height[i]);
472 }
473 for (i=0; i<g_uLeafCount; i++) {
474 Log(&rLog, LOG_FORCED_DEBUG, "leaf node=%d: Ids=%d names=%s",
475 i, Ids[i], names[i]);
476 }
477 #endif
478
479 MuscleTreeCreate(tree, g_uLeafCount, uRoot,
480 g_uLeft, g_uRight,
481 g_LeftLength, g_RightLength,
482 Ids, names);
483 #if TRACE
484 tree.LogMe();
485 #endif
486
487 free(g_Dist);
488
489 free(g_uNodeIndex);
490 free(g_uNearestNeighbor);
491 free(g_MinDist);
492 free(g_Height);
493
494 free(g_uLeft);
495 free(g_uRight);
496 free(g_LeftLength);
497 free(g_RightLength);
498
499 /* NOTE: Muscle's "Names" variable is here the argument "names" */
500 free(Ids);
501 }
502 /*** end of UPGMA2 ***/
503