1 /* SQUID - A C function library for biological sequence analysis
2  * Copyright (C) 1992-1996 Sean R. Eddy
3  *
4  *    This source code is distributed under terms of the
5  *    GNU General Public License. See the files COPYING
6  *    and GNULICENSE for further details.
7  *
8  */
9 
10 /* cluster.c
11  * SRE, Sun Jul 18 09:49:47 1993
12  * moved to squid Thu Mar  3 08:42:57 1994
13  * almost identical to bord.c, from fd
14  * also now contains routines for constructing difference matrices
15  * from alignments
16  *
17  * "branch ordering": Input a symmetric or upper-right-diagonal
18  * NxN difference matrix (usually constructed by pairwise alignment
19  * and similarity calculations for N sequences). Use the simple
20  * cluster analysis part of the Fitch/Margoliash tree-building algorithm
21  * (as described by Fitch and Margoliash 1967 as well as Feng
22  * and Doolittle 1987) to calculate the topology of an "evolutionary
23  * tree" consistent with the difference matrix. Returns an array
24  * which represents the tree.
25  *
26  * The input difference matrix is just an NxN matrix of floats.
27  * A good match is a small difference score (the algorithm is going
28  * to search for minima among the difference scores). The original difference
29  * matrix remains unchanged by the calculations.
30  *
31  * The output requires some explanation. A phylogenetic
32  * tree is a binary tree, with N "leaves" and N-1 "nodes". The
33  * topology of the tree may be completely described by N-1 structures
34  * containing two pointers; each pointer points to either a leaf
35  * or another node. Here, this is implemented with integer indices
36  * rather than pointers. An array of N-1 pairs of ints is returned.
37  * If the index is in the range (0..N-1), it is a "leaf" -- the
38  * number of one of the sequences. If the index is in the range
39  * (N..2N-2), it is another "node" -- (index-N) is the index
40  * of the node in the returned array.
41  *
42  * If both indices of a member of the returned array point to
43  * nodes, the tree is "compound": composed of more than one
44  * cluster of related sequences.
45  *
46  * The higher-numbered elements of the returned array were the
47  * first constructed, and hence represent the distal tips
48  * of the tree -- the most similar sequences. The root
49  * is node 0.
50  ******************************************************************
51  *
52  * Algorithm
53  *
54  * INITIALIZATIONS:
55  *  - copy the difference matrix (otherwise the caller's copy would
56  *       get destroyed by the operations of this algorithm). If
57  *       it's asymmetric, make it symmetric.
58  *  - make a (0..N-1) array of ints to keep track of the indices in
59  *       the difference matrix as they get swapped around. Initialize
60  *       this matrix to 0..N-1.
61  *  - make a (0..N-2) array of int[2] to store the results (the tree
62  *       topology). Doesn't need to be initialized.
63  *  - keep track of a "N'", the current size of the difference
64  *       matrix being operated on.
65  *
66  * PROCESSING THE DIFFERENCE MATRIX:
67  *  - for N' = N down to N' = 2  (N-1 steps):
68  *    - in the half-diagonal N'xN' matrix, find the indices i,j at which
69  *      there's the minimum difference score
70  *
71  *     Store the results:
72  *    - at position N'-2 of the result array, store coords[i] and
73  *         coords[j].
74  *
75  *     Move i,j rows, cols to the outside edges of the matrix:
76  *    - swap row i and row N'-2
77  *    - swap row j and row N'-1
78  *    - swap column i and column N'-2
79  *    - swap column j and column N'-1
80  *    - swap indices i, N'-2 in the index array
81  *    - swap indices j, N'-1 in the index array
82  *
83  *     Build a average difference score for differences to i,j:
84  *    - for all columns, find avg difference between rows i and j and store in row i:
85  *       row[i][col] = (row[i][col] + row[j][col]) / 2.0
86  *    - copy the contents of row i to column i (it's a symmetric
87  *       matrix, no need to recalculate)
88  *    - store an index N'+N-2 at position N'-2 of the index array: means
89  *       that this row/column is now a node rather than a leaf, and
90  *       contains minimum values
91  *
92  *     Continue:
93  *    - go to the next N'
94  *
95  * GARBAGE COLLECTION & RETURN.
96  *
97  **********************************************************************
98  *
99  * References:
100  *
101  * Feng D-F and R.F. Doolittle. "Progressive sequence alignment as a
102  *    prerequisite to correct phylogenetic trees." J. Mol. Evol.
103  *    25:351-360, 1987.
104  *
105  * Fitch W.M. and Margoliash E. "Construction of phylogenetic trees."
106  *    Science 155:279-284, 1967.
107  *
108  **********************************************************************
109  *
110  * SRE, 18 March 1992 (bord.c)
111  * SRE, Sun Jul 18 09:52:14 1993 (cluster.c)
112  * added to squid Thu Mar  3 09:13:56 1994
113  **********************************************************************
114  * Mon May  4 09:47:02 1992: keep track of difference scores at each node
115  */
116 
117 
118 #include <stdio.h>
119 #include <string.h>
120 #include <math.h>
121 
122 #include "squid.h"
123 #include "sqfuncs.h"
124 
125 #ifdef MEMDEBUG
126 #include "dbmalloc.h"
127 #endif
128 
129 /* Function: Cluster()
130  *
131  * Purpose:  Cluster analysis on a distance matrix. Constructs a
132  *           phylogenetic tree which contains the topology
133  *           and info for each node: branch lengths, how many
134  *           sequences are included under the node, and which
135  *           sequences are included under the node.
136  *
137  * Args:     dmx     - the NxN distance matrix ( >= 0.0, larger means more diverged)
138  *           N       - size of mx (number of sequences)
139  *           mode    - CLUSTER_MEAN, CLUSTER_MAX, or CLUSTER_MIN
140  *           ret_tree- RETURN: the tree
141  *
142  * Return:   1 on success, 0 on failure.
143  *           The caller is responsible for freeing the tree's memory,
144  *           by calling FreePhylo(tree, N).
145  */
146 int
Cluster(float ** dmx,int N,enum clust_strategy mode,struct phylo_s ** ret_tree)147 Cluster(float **dmx, int N, enum clust_strategy mode, struct phylo_s **ret_tree)
148 {
149   struct phylo_s *tree;         /* (0..N-2) phylogenetic tree          */
150   float    **mx;                /* copy of difference matrix           */
151   int       *coord;             /* (0..N-1), indices for matrix coords */
152   int        i, j;		/* coords of minimum difference        */
153   int        idx;		/* counter over seqs                   */
154   int        Np;                /* N', a working copy of N             */
155   int        row, col;          /* loop variables                      */
156   float      min;		/* best minimum score found            */
157   float     *trow;              /* tmp pointer for swapping rows       */
158   float      tcol;              /* tmp storage for swapping cols       */
159   float     *diff;		/* (0..N-2) difference scores at nodes */
160   int        swapfoo;		/* for SWAP() macro                    */
161 
162   /**************************
163    * Initializations.
164    **************************/
165   /* We destroy the matrix we work on, so make a copy of dmx.
166    */
167   if ((mx = (float **) malloc (sizeof(float *) * N)) == NULL)
168     Die("malloc failed");
169   for (i = 0; i < N; i++)
170     {
171       if ((mx[i] = (float *) malloc (sizeof(float) * N)) == NULL)
172 	Die("malloc failed");
173       for (j = 0; j < N; j++)
174 	mx[i][j] = dmx[i][j];
175     }
176 				/* coord array alloc, (0..N-1) */
177   if ((coord = (int *)    malloc  (N *    sizeof(int)))    == NULL ||
178       (diff  = (float *) malloc ((N-1) * sizeof(float))) == NULL)
179     Die("malloc failed");
180 				/* init the coord array to 0..N-1 */
181   for (col = 0; col < N; col++)  coord[col] = col;
182   for (i = 0; i < N-1; i++)      diff[i] = 0.0;
183 
184 				/* tree array alloc, (0..N-2) */
185   if ((tree = AllocPhylo(N)) == NULL)  Die("AllocPhylo() failed");
186 
187   /*********************************
188    * Process the difference matrix
189    *********************************/
190 
191 				/* N-prime, for an NxN down to a 2x2 diffmx */
192   for (Np = N; Np >= 2; Np--)
193     {
194 				/* find a minimum on the N'xN' matrix*/
195       min = 999999.;
196       for (row = 0; row < Np; row++)
197 	for (col = row+1; col < Np; col++)
198 	  if (mx[row][col] < min)
199 	    {
200 	      min = mx[row][col];
201 	      i   = row;
202 	      j   = col;
203 	    }
204 
205       /* We're clustering row i with col j. write necessary
206        * data into a node on the tree
207        */
208 				/* topology info */
209       tree[Np-2].left  = coord[i];
210       tree[Np-2].right = coord[j];
211       if (coord[i] >= N) tree[coord[i]-N].parent = N + Np - 2;
212       if (coord[j] >= N) tree[coord[j]-N].parent = N + Np - 2;
213 
214 				/* keep score info */
215       diff[Np-2] = tree[Np-2].diff = min;
216 
217 				/* way-simple branch length estimation */
218       tree[Np-2].lblen = tree[Np-2].rblen = min;
219       if (coord[i] >= N) tree[Np-2].lblen -= diff[coord[i]-N];
220       if (coord[j] >= N) tree[Np-2].rblen -= diff[coord[j]-N];
221 
222 				/* number seqs included at node */
223       if (coord[i] < N)
224 	{
225 	  tree[Np-2].incnum ++;
226 	  tree[Np-2].is_in[coord[i]] = 1;
227 	}
228       else
229 	{
230 	  tree[Np-2].incnum += tree[coord[i]-N].incnum;
231 	  for (idx = 0; idx < N; idx++)
232 	    tree[Np-2].is_in[idx] |= tree[coord[i]-N].is_in[idx];
233 	}
234 
235       if (coord[j] < N)
236 	{
237 	  tree[Np-2].incnum ++;
238 	  tree[Np-2].is_in[coord[j]] = 1;
239 	}
240       else
241 	{
242 	  tree[Np-2].incnum += tree[coord[j]-N].incnum;
243 	  for (idx = 0; idx < N; idx++)
244 	    tree[Np-2].is_in[idx] |= tree[coord[j]-N].is_in[idx];
245 	}
246 
247 
248       /* Now build a new matrix, by merging row i with row j and
249        * column i with column j; see Fitch and Margoliash
250        */
251 				/* Row and column swapping. */
252 				/* watch out for swapping i, j away: */
253       if (i == Np-1 || j == Np-2)
254 	SWAP(i,j);
255 
256       if (i != Np-2)
257 	{
258 				/* swap row i, row N'-2 */
259 	  trow = mx[Np-2]; mx[Np-2] = mx[i]; mx[i] = trow;
260 				/* swap col i, col N'-2 */
261 	  for (row = 0; row < Np; row++)
262 	    {
263 	      tcol = mx[row][Np-2];
264 	      mx[row][Np-2] = mx[row][i];
265 	      mx[row][i] = tcol;
266 	    }
267 				/* swap coord i, coord N'-2 */
268 	  SWAP(coord[i], coord[Np-2]);
269 	}
270 
271       if (j != Np-1)
272 	{
273 				/* swap row j, row N'-1 */
274 	  trow = mx[Np-1]; mx[Np-1] = mx[j]; mx[j] = trow;
275 				/* swap col j, col N'-1 */
276 	  for (row = 0; row < Np; row++)
277 	    {
278 	      tcol = mx[row][Np-1];
279 	      mx[row][Np-1] = mx[row][j];
280 	      mx[row][j] = tcol;
281 	    }
282 				/* swap coord j, coord N'-1 */
283 	  SWAP(coord[j], coord[Np-1]);
284 	}
285 
286 				/* average i and j together; they're now
287 				   at Np-2 and Np-1 though */
288       i = Np-2;
289       j = Np-1;
290 				/* merge by saving avg of cols of row i and row j */
291       for (col = 0; col < Np; col++)
292 	{
293 	  switch (mode) {
294 	  case CLUSTER_MEAN:  mx[i][col] =(mx[i][col]+ mx[j][col]) / 2.0; break;
295 	  case CLUSTER_MIN:   mx[i][col] = MIN(mx[i][col], mx[j][col]);   break;
296 	  case CLUSTER_MAX:   mx[i][col] = MAX(mx[i][col], mx[j][col]);   break;
297 	  default:            mx[i][col] =(mx[i][col]+ mx[j][col]) / 2.0; break;
298 	  }
299 	}
300 				/* copy those rows to columns */
301       for (col = 0; col < Np; col++)
302 	mx[col][i] = mx[i][col];
303 				/* store the node index in coords */
304       coord[Np-2] = Np+N-2;
305     }
306 
307   /**************************
308    * Garbage collection and return
309    **************************/
310   Free2DArray(mx, N);
311   free(coord);
312   free(diff);
313   *ret_tree = tree;
314   return 1;
315 }
316 
317 /* Function: AllocPhylo()
318  *
319  * Purpose:  Allocate space for a phylo_s array. N-1 structures
320  *           are allocated, one for each node; in each node, a 0..N
321  *           is_in flag array is also allocated and initialized to
322  *           all zeros.
323  *
324  * Args:     N  - size; number of sequences being clustered
325  *
326  * Return:   pointer to the allocated array
327  *
328  */
329 struct phylo_s *
AllocPhylo(int N)330 AllocPhylo(int N)
331 {
332   struct phylo_s *tree;
333   int             i;
334 
335   if ((tree = (struct phylo_s *) malloc ((N-1) * sizeof(struct phylo_s))) == NULL)
336     return NULL;
337 
338   for (i = 0; i < N-1; i++)
339     {
340       tree[i].diff   = 0.0;
341       tree[i].lblen  = tree[i].rblen = 0.0;
342       tree[i].left   = tree[i].right = tree[i].parent = -1;
343       tree[i].incnum = 0;
344       if ((tree[i].is_in = (char *) calloc (N, sizeof(char))) == NULL)
345 	return NULL;
346     }
347   return tree;
348 }
349 
350 
351 /* Function: FreePhylo()
352  *
353  * Purpose:  Free a clustree array that was built to cluster N sequences.
354  *
355  * Args:     tree - phylogenetic tree to free
356  *           N    - size of clustree; number of sequences it clustered
357  *
358  * Return:   (void)
359  */
360 void
FreePhylo(struct phylo_s * tree,int N)361 FreePhylo(struct phylo_s *tree, int N)
362 {
363   int idx;
364 
365   for (idx = 0; idx < N-1; idx++)
366     free(tree[idx].is_in);
367   free(tree);
368 }
369 
370 
371 /* Function: MakeDiffMx()
372  *
373  * Purpose:  Given a set of aligned sequences, construct
374  *           an NxN fractional difference matrix. (i.e. 1.0 is
375  *           completely different, 0.0 is exactly identical).
376  *
377  * Args:     aseqs        - flushed, aligned sequences
378  *           num          - number of aseqs
379  *           ret_dmx      - RETURN: difference matrix
380  *
381  * Return:   1 on success, 0 on failure.
382  *           Caller must free diff matrix with FMX2Free(dmx)
383  */
384 void
MakeDiffMx(char ** aseqs,int num,float *** ret_dmx)385 MakeDiffMx(char **aseqs, int num, float ***ret_dmx)
386 {
387   float **dmx;                  /* RETURN: distance matrix           */
388   int      i,j;			/* counters over sequences           */
389 
390   /* Allocate 2D float matrix
391    */
392   dmx = FMX2Alloc(num, num);
393 
394   /* Calculate distances; symmetric matrix
395    * record difference, not identity (1 - identity)
396    */
397   for (i = 0; i < num; i++)
398     for (j = i; j < num; j++)
399       dmx[i][j] = dmx[j][i] = 1.0 - PairwiseIdentity(aseqs[i], aseqs[j]);
400 
401   *ret_dmx = dmx;
402   return;
403 }
404 
405 /* Function: MakeIdentityMx()
406  *
407  * Purpose:  Given a set of aligned sequences, construct
408  *           an NxN fractional identity matrix. (i.e. 1.0 is
409  *           completely identical, 0.0 is completely different).
410  *           Virtually identical to MakeDiffMx(). It's
411  *           less confusing to have two distinct functions, I find.
412  *
413  * Args:     aseqs        - flushed, aligned sequences
414  *           num          - number of aseqs
415  *           ret_imx      - RETURN: identity matrix (caller must free)
416  *
417  * Return:   1 on success, 0 on failure.
418  *           Caller must free imx using FMX2Free(imx)
419  */
420 void
MakeIdentityMx(char ** aseqs,int num,float *** ret_imx)421 MakeIdentityMx(char **aseqs, int num, float ***ret_imx)
422 {
423   float **imx;          /* RETURN: identity matrix           */
424   int     i,j;		/* counters over sequences           */
425 
426   /* Allocate 2D float matrix
427    */
428   imx = FMX2Alloc(num, num);
429 
430   /* Calculate distances, symmetric matrix
431    */
432   for (i = 0; i < num; i++)
433     for (j = i; j < num; j++)
434       imx[i][j] = imx[j][i] = PairwiseIdentity(aseqs[i], aseqs[j]);
435 
436   *ret_imx = imx;
437   return;
438 }
439 
440 
441 
442 /* Function: PrintNewHampshireTree()
443  *
444  * Purpose:  Print out a tree in the "New Hampshire" standard
445  *           format. See PHYLIP's draw.doc for a definition of
446  *           the New Hampshire format.
447  *
448  *           Like a CFG, we generate the format string left to
449  *           right by a preorder tree traversal.
450  *
451  * Args:     fp   - file to print to
452  *           ainfo- alignment info, including sequence names
453  *           tree - tree to print
454  *           N    - number of leaves
455  *
456  */
457 void
PrintNewHampshireTree(FILE * fp,AINFO * ainfo,struct phylo_s * tree,int N)458 PrintNewHampshireTree(FILE *fp, AINFO *ainfo, struct phylo_s *tree, int N)
459 {
460   struct intstack_s *stack;
461   int    code;
462   float *blen;
463   int    docomma;
464 
465   blen  = (float *) MallocOrDie (sizeof(float) * (2*N-1));
466   stack = InitIntStack();
467   PushIntStack(stack, N);	/* push root on stack */
468   docomma = FALSE;
469 
470   /* node index code:
471    *     0..N-1   = leaves; indexes of sequences.
472    *     N..2N-2  = interior nodes; node-N = index of node in tree structure.
473    *                code N is the root.
474    *     2N..3N-2 = special flags for closing interior nodes; node-2N = index in tree
475    */
476   while (PopIntStack(stack, &code))
477     {
478       if (code < N)		/* we're a leaf. */
479 	{
480 				/* 1) print name:branchlength */
481 	  if (docomma) fputs(",", fp);
482 	  fprintf(fp, "%s:%.5f", ainfo->sqinfo[code].name, blen[code]);
483 	  docomma = TRUE;
484 	}
485 
486       else if (code < 2*N)      /* we're an interior node */
487 	{
488 				/* 1) print a '(' */
489 	  if (docomma) fputs(",\n", fp);
490 	  fputs("(", fp);
491 				/* 2) push on stack: ), rchild, lchild */
492 	  PushIntStack(stack, code+N);
493 	  PushIntStack(stack, tree[code-N].right);
494 	  PushIntStack(stack, tree[code-N].left);
495 				/* 3) record branch lengths */
496 	  blen[tree[code-N].right] = tree[code-N].rblen;
497 	  blen[tree[code-N].left]  = tree[code-N].lblen;
498 	  docomma = FALSE;
499 	}
500 
501       else			/* we're closing an interior node */
502 	{
503 				/* print a ):branchlength */
504 	  if (code == 2*N) fprintf(fp, ");\n");
505 	  else             fprintf(fp, "):%.5f", blen[code-N]);
506 	  docomma = TRUE;
507 	}
508     }
509 
510   FreeIntStack(stack);
511   free(blen);
512   return;
513 }
514 
515 
516 /* Function: PrintPhylo()
517  *
518  * Purpose:  Debugging output of a phylogenetic tree structure.
519  */
520 void
PrintPhylo(FILE * fp,AINFO * ainfo,struct phylo_s * tree,int N)521 PrintPhylo(FILE *fp, AINFO *ainfo, struct phylo_s *tree, int N)
522 {
523   int idx;
524 
525   for (idx = 0; idx < N-1; idx++)
526     {
527       fprintf(fp, "Interior node %d (code %d)\n", idx, idx+N);
528       fprintf(fp, "\tParent: %d (code %d)\n", tree[idx].parent-N, tree[idx].parent);
529       fprintf(fp, "\tLeft:   %d (%s) %f\n",
530 	      tree[idx].left < N ? tree[idx].left-N : tree[idx].left,
531 	      tree[idx].left < N ? ainfo->sqinfo[tree[idx].left].name : "interior",
532 	      tree[idx].lblen);
533       fprintf(fp, "\tRight:   %d (%s) %f\n",
534 	      tree[idx].right < N ? tree[idx].right-N : tree[idx].right,
535 	      tree[idx].right < N ? ainfo->sqinfo[tree[idx].right].name : "interior",
536 	      tree[idx].rblen);
537       fprintf(fp, "\tHeight:  %f\n", tree[idx].diff);
538       fprintf(fp, "\tIncludes:%d seqs\n", tree[idx].incnum);
539     }
540 }
541 
542 
543 
544