1 /* SQUID - A C function library for biological sequence analysis
2 * Copyright (C) 1992-1996 Sean R. Eddy
3 *
4 * This source code is distributed under terms of the
5 * GNU General Public License. See the files COPYING
6 * and GNULICENSE for further details.
7 *
8 */
9
10 /* cluster.c
11 * SRE, Sun Jul 18 09:49:47 1993
12 * moved to squid Thu Mar 3 08:42:57 1994
13 * almost identical to bord.c, from fd
14 * also now contains routines for constructing difference matrices
15 * from alignments
16 *
17 * "branch ordering": Input a symmetric or upper-right-diagonal
18 * NxN difference matrix (usually constructed by pairwise alignment
19 * and similarity calculations for N sequences). Use the simple
20 * cluster analysis part of the Fitch/Margoliash tree-building algorithm
21 * (as described by Fitch and Margoliash 1967 as well as Feng
22 * and Doolittle 1987) to calculate the topology of an "evolutionary
23 * tree" consistent with the difference matrix. Returns an array
24 * which represents the tree.
25 *
26 * The input difference matrix is just an NxN matrix of floats.
27 * A good match is a small difference score (the algorithm is going
28 * to search for minima among the difference scores). The original difference
29 * matrix remains unchanged by the calculations.
30 *
31 * The output requires some explanation. A phylogenetic
32 * tree is a binary tree, with N "leaves" and N-1 "nodes". The
33 * topology of the tree may be completely described by N-1 structures
34 * containing two pointers; each pointer points to either a leaf
35 * or another node. Here, this is implemented with integer indices
36 * rather than pointers. An array of N-1 pairs of ints is returned.
37 * If the index is in the range (0..N-1), it is a "leaf" -- the
38 * number of one of the sequences. If the index is in the range
39 * (N..2N-2), it is another "node" -- (index-N) is the index
40 * of the node in the returned array.
41 *
42 * If both indices of a member of the returned array point to
43 * nodes, the tree is "compound": composed of more than one
44 * cluster of related sequences.
45 *
46 * The higher-numbered elements of the returned array were the
47 * first constructed, and hence represent the distal tips
48 * of the tree -- the most similar sequences. The root
49 * is node 0.
50 ******************************************************************
51 *
52 * Algorithm
53 *
54 * INITIALIZATIONS:
55 * - copy the difference matrix (otherwise the caller's copy would
56 * get destroyed by the operations of this algorithm). If
57 * it's asymmetric, make it symmetric.
58 * - make a (0..N-1) array of ints to keep track of the indices in
59 * the difference matrix as they get swapped around. Initialize
60 * this matrix to 0..N-1.
61 * - make a (0..N-2) array of int[2] to store the results (the tree
62 * topology). Doesn't need to be initialized.
63 * - keep track of a "N'", the current size of the difference
64 * matrix being operated on.
65 *
66 * PROCESSING THE DIFFERENCE MATRIX:
67 * - for N' = N down to N' = 2 (N-1 steps):
68 * - in the half-diagonal N'xN' matrix, find the indices i,j at which
69 * there's the minimum difference score
70 *
71 * Store the results:
72 * - at position N'-2 of the result array, store coords[i] and
73 * coords[j].
74 *
75 * Move i,j rows, cols to the outside edges of the matrix:
76 * - swap row i and row N'-2
77 * - swap row j and row N'-1
78 * - swap column i and column N'-2
79 * - swap column j and column N'-1
80 * - swap indices i, N'-2 in the index array
81 * - swap indices j, N'-1 in the index array
82 *
83 * Build a average difference score for differences to i,j:
84 * - for all columns, find avg difference between rows i and j and store in row i:
85 * row[i][col] = (row[i][col] + row[j][col]) / 2.0
86 * - copy the contents of row i to column i (it's a symmetric
87 * matrix, no need to recalculate)
88 * - store an index N'+N-2 at position N'-2 of the index array: means
89 * that this row/column is now a node rather than a leaf, and
90 * contains minimum values
91 *
92 * Continue:
93 * - go to the next N'
94 *
95 * GARBAGE COLLECTION & RETURN.
96 *
97 **********************************************************************
98 *
99 * References:
100 *
101 * Feng D-F and R.F. Doolittle. "Progressive sequence alignment as a
102 * prerequisite to correct phylogenetic trees." J. Mol. Evol.
103 * 25:351-360, 1987.
104 *
105 * Fitch W.M. and Margoliash E. "Construction of phylogenetic trees."
106 * Science 155:279-284, 1967.
107 *
108 **********************************************************************
109 *
110 * SRE, 18 March 1992 (bord.c)
111 * SRE, Sun Jul 18 09:52:14 1993 (cluster.c)
112 * added to squid Thu Mar 3 09:13:56 1994
113 **********************************************************************
114 * Mon May 4 09:47:02 1992: keep track of difference scores at each node
115 */
116
117
118 #include <stdio.h>
119 #include <string.h>
120 #include <math.h>
121
122 #include "squid.h"
123 #include "sqfuncs.h"
124
125 #ifdef MEMDEBUG
126 #include "dbmalloc.h"
127 #endif
128
129 /* Function: Cluster()
130 *
131 * Purpose: Cluster analysis on a distance matrix. Constructs a
132 * phylogenetic tree which contains the topology
133 * and info for each node: branch lengths, how many
134 * sequences are included under the node, and which
135 * sequences are included under the node.
136 *
137 * Args: dmx - the NxN distance matrix ( >= 0.0, larger means more diverged)
138 * N - size of mx (number of sequences)
139 * mode - CLUSTER_MEAN, CLUSTER_MAX, or CLUSTER_MIN
140 * ret_tree- RETURN: the tree
141 *
142 * Return: 1 on success, 0 on failure.
143 * The caller is responsible for freeing the tree's memory,
144 * by calling FreePhylo(tree, N).
145 */
146 int
Cluster(float ** dmx,int N,enum clust_strategy mode,struct phylo_s ** ret_tree)147 Cluster(float **dmx, int N, enum clust_strategy mode, struct phylo_s **ret_tree)
148 {
149 struct phylo_s *tree; /* (0..N-2) phylogenetic tree */
150 float **mx; /* copy of difference matrix */
151 int *coord; /* (0..N-1), indices for matrix coords */
152 int i, j; /* coords of minimum difference */
153 int idx; /* counter over seqs */
154 int Np; /* N', a working copy of N */
155 int row, col; /* loop variables */
156 float min; /* best minimum score found */
157 float *trow; /* tmp pointer for swapping rows */
158 float tcol; /* tmp storage for swapping cols */
159 float *diff; /* (0..N-2) difference scores at nodes */
160 int swapfoo; /* for SWAP() macro */
161
162 /**************************
163 * Initializations.
164 **************************/
165 /* We destroy the matrix we work on, so make a copy of dmx.
166 */
167 if ((mx = (float **) malloc (sizeof(float *) * N)) == NULL)
168 Die("malloc failed");
169 for (i = 0; i < N; i++)
170 {
171 if ((mx[i] = (float *) malloc (sizeof(float) * N)) == NULL)
172 Die("malloc failed");
173 for (j = 0; j < N; j++)
174 mx[i][j] = dmx[i][j];
175 }
176 /* coord array alloc, (0..N-1) */
177 if ((coord = (int *) malloc (N * sizeof(int))) == NULL ||
178 (diff = (float *) malloc ((N-1) * sizeof(float))) == NULL)
179 Die("malloc failed");
180 /* init the coord array to 0..N-1 */
181 for (col = 0; col < N; col++) coord[col] = col;
182 for (i = 0; i < N-1; i++) diff[i] = 0.0;
183
184 /* tree array alloc, (0..N-2) */
185 if ((tree = AllocPhylo(N)) == NULL) Die("AllocPhylo() failed");
186
187 /*********************************
188 * Process the difference matrix
189 *********************************/
190
191 /* N-prime, for an NxN down to a 2x2 diffmx */
192 for (Np = N; Np >= 2; Np--)
193 {
194 /* find a minimum on the N'xN' matrix*/
195 min = 999999.;
196 for (row = 0; row < Np; row++)
197 for (col = row+1; col < Np; col++)
198 if (mx[row][col] < min)
199 {
200 min = mx[row][col];
201 i = row;
202 j = col;
203 }
204
205 /* We're clustering row i with col j. write necessary
206 * data into a node on the tree
207 */
208 /* topology info */
209 tree[Np-2].left = coord[i];
210 tree[Np-2].right = coord[j];
211 if (coord[i] >= N) tree[coord[i]-N].parent = N + Np - 2;
212 if (coord[j] >= N) tree[coord[j]-N].parent = N + Np - 2;
213
214 /* keep score info */
215 diff[Np-2] = tree[Np-2].diff = min;
216
217 /* way-simple branch length estimation */
218 tree[Np-2].lblen = tree[Np-2].rblen = min;
219 if (coord[i] >= N) tree[Np-2].lblen -= diff[coord[i]-N];
220 if (coord[j] >= N) tree[Np-2].rblen -= diff[coord[j]-N];
221
222 /* number seqs included at node */
223 if (coord[i] < N)
224 {
225 tree[Np-2].incnum ++;
226 tree[Np-2].is_in[coord[i]] = 1;
227 }
228 else
229 {
230 tree[Np-2].incnum += tree[coord[i]-N].incnum;
231 for (idx = 0; idx < N; idx++)
232 tree[Np-2].is_in[idx] |= tree[coord[i]-N].is_in[idx];
233 }
234
235 if (coord[j] < N)
236 {
237 tree[Np-2].incnum ++;
238 tree[Np-2].is_in[coord[j]] = 1;
239 }
240 else
241 {
242 tree[Np-2].incnum += tree[coord[j]-N].incnum;
243 for (idx = 0; idx < N; idx++)
244 tree[Np-2].is_in[idx] |= tree[coord[j]-N].is_in[idx];
245 }
246
247
248 /* Now build a new matrix, by merging row i with row j and
249 * column i with column j; see Fitch and Margoliash
250 */
251 /* Row and column swapping. */
252 /* watch out for swapping i, j away: */
253 if (i == Np-1 || j == Np-2)
254 SWAP(i,j);
255
256 if (i != Np-2)
257 {
258 /* swap row i, row N'-2 */
259 trow = mx[Np-2]; mx[Np-2] = mx[i]; mx[i] = trow;
260 /* swap col i, col N'-2 */
261 for (row = 0; row < Np; row++)
262 {
263 tcol = mx[row][Np-2];
264 mx[row][Np-2] = mx[row][i];
265 mx[row][i] = tcol;
266 }
267 /* swap coord i, coord N'-2 */
268 SWAP(coord[i], coord[Np-2]);
269 }
270
271 if (j != Np-1)
272 {
273 /* swap row j, row N'-1 */
274 trow = mx[Np-1]; mx[Np-1] = mx[j]; mx[j] = trow;
275 /* swap col j, col N'-1 */
276 for (row = 0; row < Np; row++)
277 {
278 tcol = mx[row][Np-1];
279 mx[row][Np-1] = mx[row][j];
280 mx[row][j] = tcol;
281 }
282 /* swap coord j, coord N'-1 */
283 SWAP(coord[j], coord[Np-1]);
284 }
285
286 /* average i and j together; they're now
287 at Np-2 and Np-1 though */
288 i = Np-2;
289 j = Np-1;
290 /* merge by saving avg of cols of row i and row j */
291 for (col = 0; col < Np; col++)
292 {
293 switch (mode) {
294 case CLUSTER_MEAN: mx[i][col] =(mx[i][col]+ mx[j][col]) / 2.0; break;
295 case CLUSTER_MIN: mx[i][col] = MIN(mx[i][col], mx[j][col]); break;
296 case CLUSTER_MAX: mx[i][col] = MAX(mx[i][col], mx[j][col]); break;
297 default: mx[i][col] =(mx[i][col]+ mx[j][col]) / 2.0; break;
298 }
299 }
300 /* copy those rows to columns */
301 for (col = 0; col < Np; col++)
302 mx[col][i] = mx[i][col];
303 /* store the node index in coords */
304 coord[Np-2] = Np+N-2;
305 }
306
307 /**************************
308 * Garbage collection and return
309 **************************/
310 Free2DArray(mx, N);
311 free(coord);
312 free(diff);
313 *ret_tree = tree;
314 return 1;
315 }
316
317 /* Function: AllocPhylo()
318 *
319 * Purpose: Allocate space for a phylo_s array. N-1 structures
320 * are allocated, one for each node; in each node, a 0..N
321 * is_in flag array is also allocated and initialized to
322 * all zeros.
323 *
324 * Args: N - size; number of sequences being clustered
325 *
326 * Return: pointer to the allocated array
327 *
328 */
329 struct phylo_s *
AllocPhylo(int N)330 AllocPhylo(int N)
331 {
332 struct phylo_s *tree;
333 int i;
334
335 if ((tree = (struct phylo_s *) malloc ((N-1) * sizeof(struct phylo_s))) == NULL)
336 return NULL;
337
338 for (i = 0; i < N-1; i++)
339 {
340 tree[i].diff = 0.0;
341 tree[i].lblen = tree[i].rblen = 0.0;
342 tree[i].left = tree[i].right = tree[i].parent = -1;
343 tree[i].incnum = 0;
344 if ((tree[i].is_in = (char *) calloc (N, sizeof(char))) == NULL)
345 return NULL;
346 }
347 return tree;
348 }
349
350
351 /* Function: FreePhylo()
352 *
353 * Purpose: Free a clustree array that was built to cluster N sequences.
354 *
355 * Args: tree - phylogenetic tree to free
356 * N - size of clustree; number of sequences it clustered
357 *
358 * Return: (void)
359 */
360 void
FreePhylo(struct phylo_s * tree,int N)361 FreePhylo(struct phylo_s *tree, int N)
362 {
363 int idx;
364
365 for (idx = 0; idx < N-1; idx++)
366 free(tree[idx].is_in);
367 free(tree);
368 }
369
370
371 /* Function: MakeDiffMx()
372 *
373 * Purpose: Given a set of aligned sequences, construct
374 * an NxN fractional difference matrix. (i.e. 1.0 is
375 * completely different, 0.0 is exactly identical).
376 *
377 * Args: aseqs - flushed, aligned sequences
378 * num - number of aseqs
379 * ret_dmx - RETURN: difference matrix
380 *
381 * Return: 1 on success, 0 on failure.
382 * Caller must free diff matrix with FMX2Free(dmx)
383 */
384 void
MakeDiffMx(char ** aseqs,int num,float *** ret_dmx)385 MakeDiffMx(char **aseqs, int num, float ***ret_dmx)
386 {
387 float **dmx; /* RETURN: distance matrix */
388 int i,j; /* counters over sequences */
389
390 /* Allocate 2D float matrix
391 */
392 dmx = FMX2Alloc(num, num);
393
394 /* Calculate distances; symmetric matrix
395 * record difference, not identity (1 - identity)
396 */
397 for (i = 0; i < num; i++)
398 for (j = i; j < num; j++)
399 dmx[i][j] = dmx[j][i] = 1.0 - PairwiseIdentity(aseqs[i], aseqs[j]);
400
401 *ret_dmx = dmx;
402 return;
403 }
404
405 /* Function: MakeIdentityMx()
406 *
407 * Purpose: Given a set of aligned sequences, construct
408 * an NxN fractional identity matrix. (i.e. 1.0 is
409 * completely identical, 0.0 is completely different).
410 * Virtually identical to MakeDiffMx(). It's
411 * less confusing to have two distinct functions, I find.
412 *
413 * Args: aseqs - flushed, aligned sequences
414 * num - number of aseqs
415 * ret_imx - RETURN: identity matrix (caller must free)
416 *
417 * Return: 1 on success, 0 on failure.
418 * Caller must free imx using FMX2Free(imx)
419 */
420 void
MakeIdentityMx(char ** aseqs,int num,float *** ret_imx)421 MakeIdentityMx(char **aseqs, int num, float ***ret_imx)
422 {
423 float **imx; /* RETURN: identity matrix */
424 int i,j; /* counters over sequences */
425
426 /* Allocate 2D float matrix
427 */
428 imx = FMX2Alloc(num, num);
429
430 /* Calculate distances, symmetric matrix
431 */
432 for (i = 0; i < num; i++)
433 for (j = i; j < num; j++)
434 imx[i][j] = imx[j][i] = PairwiseIdentity(aseqs[i], aseqs[j]);
435
436 *ret_imx = imx;
437 return;
438 }
439
440
441
442 /* Function: PrintNewHampshireTree()
443 *
444 * Purpose: Print out a tree in the "New Hampshire" standard
445 * format. See PHYLIP's draw.doc for a definition of
446 * the New Hampshire format.
447 *
448 * Like a CFG, we generate the format string left to
449 * right by a preorder tree traversal.
450 *
451 * Args: fp - file to print to
452 * ainfo- alignment info, including sequence names
453 * tree - tree to print
454 * N - number of leaves
455 *
456 */
457 void
PrintNewHampshireTree(FILE * fp,AINFO * ainfo,struct phylo_s * tree,int N)458 PrintNewHampshireTree(FILE *fp, AINFO *ainfo, struct phylo_s *tree, int N)
459 {
460 struct intstack_s *stack;
461 int code;
462 float *blen;
463 int docomma;
464
465 blen = (float *) MallocOrDie (sizeof(float) * (2*N-1));
466 stack = InitIntStack();
467 PushIntStack(stack, N); /* push root on stack */
468 docomma = FALSE;
469
470 /* node index code:
471 * 0..N-1 = leaves; indexes of sequences.
472 * N..2N-2 = interior nodes; node-N = index of node in tree structure.
473 * code N is the root.
474 * 2N..3N-2 = special flags for closing interior nodes; node-2N = index in tree
475 */
476 while (PopIntStack(stack, &code))
477 {
478 if (code < N) /* we're a leaf. */
479 {
480 /* 1) print name:branchlength */
481 if (docomma) fputs(",", fp);
482 fprintf(fp, "%s:%.5f", ainfo->sqinfo[code].name, blen[code]);
483 docomma = TRUE;
484 }
485
486 else if (code < 2*N) /* we're an interior node */
487 {
488 /* 1) print a '(' */
489 if (docomma) fputs(",\n", fp);
490 fputs("(", fp);
491 /* 2) push on stack: ), rchild, lchild */
492 PushIntStack(stack, code+N);
493 PushIntStack(stack, tree[code-N].right);
494 PushIntStack(stack, tree[code-N].left);
495 /* 3) record branch lengths */
496 blen[tree[code-N].right] = tree[code-N].rblen;
497 blen[tree[code-N].left] = tree[code-N].lblen;
498 docomma = FALSE;
499 }
500
501 else /* we're closing an interior node */
502 {
503 /* print a ):branchlength */
504 if (code == 2*N) fprintf(fp, ");\n");
505 else fprintf(fp, "):%.5f", blen[code-N]);
506 docomma = TRUE;
507 }
508 }
509
510 FreeIntStack(stack);
511 free(blen);
512 return;
513 }
514
515
516 /* Function: PrintPhylo()
517 *
518 * Purpose: Debugging output of a phylogenetic tree structure.
519 */
520 void
PrintPhylo(FILE * fp,AINFO * ainfo,struct phylo_s * tree,int N)521 PrintPhylo(FILE *fp, AINFO *ainfo, struct phylo_s *tree, int N)
522 {
523 int idx;
524
525 for (idx = 0; idx < N-1; idx++)
526 {
527 fprintf(fp, "Interior node %d (code %d)\n", idx, idx+N);
528 fprintf(fp, "\tParent: %d (code %d)\n", tree[idx].parent-N, tree[idx].parent);
529 fprintf(fp, "\tLeft: %d (%s) %f\n",
530 tree[idx].left < N ? tree[idx].left-N : tree[idx].left,
531 tree[idx].left < N ? ainfo->sqinfo[tree[idx].left].name : "interior",
532 tree[idx].lblen);
533 fprintf(fp, "\tRight: %d (%s) %f\n",
534 tree[idx].right < N ? tree[idx].right-N : tree[idx].right,
535 tree[idx].right < N ? ainfo->sqinfo[tree[idx].right].name : "interior",
536 tree[idx].rblen);
537 fprintf(fp, "\tHeight: %f\n", tree[idx].diff);
538 fprintf(fp, "\tIncludes:%d seqs\n", tree[idx].incnum);
539 }
540 }
541
542
543
544