1 /* Phylogenetic trees.
2 *
3 * Contents:
4 * 1. The ESL_TREE object.
5 * 2. Newick format i/o
6 * 3. Tree comparison algorithms.
7 * 4. Clustering algorithms for distance-based tree construction.
8 * 5. Generating simulated trees.
9 * 6. Unit tests.
10 * 7. Test driver.
11 * 8. Examples.
12 */
13 #include "esl_config.h"
14
15 #include <math.h>
16 #include <string.h>
17 #include <ctype.h>
18 #include <assert.h>
19
20 #include "easel.h"
21 #include "esl_arr2.h"
22 #include "esl_dmatrix.h"
23 #include "esl_random.h"
24 #include "esl_stack.h"
25 #include "esl_vectorops.h"
26
27 #include "esl_tree.h"
28
29 /*****************************************************************
30 *# 1. The ESL_TREE object.
31 *****************************************************************/
32
33 /* Function: esl_tree_Create()
34 *
35 * Purpose: Allocate an empty tree structure for <ntaxa> taxa
36 * and return a pointer to it. <ntaxa> must be $\geq 2$.
37 *
38 * Args: <ntaxa> - number of taxa
39 *
40 * Returns: pointer to the new <ESL_TREE> object; caller frees
41 * this with <esl_tree_Destroy()>.
42 *
43 * Throws: <NULL> if allocation fails.
44 */
45 ESL_TREE *
esl_tree_Create(int ntaxa)46 esl_tree_Create(int ntaxa)
47 {
48 ESL_TREE *T = NULL;
49 int i;
50 int status;
51
52 /* Contract verification */
53 ESL_DASSERT1((ntaxa >= 2));
54
55 /* 1st allocation round */
56 ESL_ALLOC(T, sizeof(ESL_TREE));
57 T->parent = NULL;
58 T->left = NULL;
59 T->right = NULL;
60 T->ld = NULL;
61 T->rd = NULL;
62
63 /* Optional info starts NULL
64 */
65 T->taxaparent = NULL;
66 T->cladesize = NULL;
67 T->taxonlabel = NULL;
68 T->nodelabel = NULL;
69
70 /* Additive trees are assumed by default, as opposed to linkage trees */
71 T->is_linkage_tree = FALSE;
72
73 /* Tree output options default to PHYLIP style
74 */
75 T->show_unrooted = FALSE;
76 T->show_node_labels = TRUE;
77 T->show_root_branchlength = FALSE;
78 T->show_branchlengths = TRUE;
79 T->show_quoted_labels = FALSE;
80 T->show_numeric_taxonlabels = TRUE;
81
82 /* 2nd allocation round */
83 T->N = ntaxa;
84 ESL_ALLOC(T->parent, sizeof(int) * (ntaxa-1));
85 ESL_ALLOC(T->left, sizeof(int) * (ntaxa-1));
86 ESL_ALLOC(T->right, sizeof(int) * (ntaxa-1));
87 ESL_ALLOC(T->ld, sizeof(double) * (ntaxa-1));
88 ESL_ALLOC(T->rd, sizeof(double) * (ntaxa-1));
89
90 for (i = 0; i < ntaxa-1; i++)
91 {
92 T->parent[i] = 0;
93 T->left[i ] = 0;
94 T->right[i] = 0;
95 T->ld[i] = 0.;
96 T->rd[i] = 0.;
97 }
98 T->nalloc = ntaxa;
99 return T;
100
101 ERROR:
102 esl_tree_Destroy(T);
103 return NULL;
104 }
105
106 /* Function: esl_tree_CreateGrowable()
107 *
108 * Purpose: Allocate a growable tree structure for an initial
109 * allocation of <nalloc> taxa, and return a pointer to it.
110 * <nalloc> must be $\geq 2$.
111 *
112 * Args: <nalloc> - initial allocation size for number of taxa
113 *
114 * Returns: pointer to a new growable <ESL_TREE> object; caller frees
115 * this with <esl_tree_Destroy()>.
116 *
117 * Throws: <NULL> if allocation fails.
118 */
119 ESL_TREE *
esl_tree_CreateGrowable(int nalloc)120 esl_tree_CreateGrowable(int nalloc)
121 {
122 ESL_TREE *T = esl_tree_Create(nalloc);
123 if (T == NULL) return NULL;
124
125 T->N = 0;
126 return T;
127 }
128
129
130 /* Function: esl_tree_CreateFromString()
131 *
132 * Purpose: A convenience for making small test cases in the test
133 * suites: given the contents of a Newick file as a
134 * single string <s>, convert it to an <ESL_TREE>.
135 *
136 * Returns: a pointer to the new <ESL_TREE> on success.
137 *
138 * Throws: <NULL> if it fails to obtain, open, or read the
139 * temporary file that it puts the string <s> in.
140 */
141 ESL_TREE *
esl_tree_CreateFromString(char * s)142 esl_tree_CreateFromString(char *s)
143 {
144 char tmpfile[16] = "esltmpXXXXXX";
145 FILE *fp = NULL;
146 ESL_TREE *T = NULL;
147
148 if (esl_tmpfile(tmpfile, &fp) != eslOK) goto ERROR;
149 fputs(s, fp);
150 rewind(fp);
151 if (esl_tree_ReadNewick(fp, NULL, &T) != eslOK) goto ERROR;
152 fclose(fp);
153 return T;
154
155 ERROR:
156 if (fp != NULL) fclose(fp);
157 if (T != NULL) esl_tree_Destroy(T);
158 return NULL;
159 }
160
161
162
163 /* Function: esl_tree_Grow()
164 *
165 * Purpose: Given a tree <T>, make sure it can hold one more taxon;
166 * reallocate internally if necessary by doubling the
167 * number of taxa it is currently allocated to hold.
168 *
169 * Returns: <eslOK> on success.
170 *
171 * Throws: <eslEMEM> on allocation failure. In this case,
172 * the data in the tree are unaffected.
173 */
174 int
esl_tree_Grow(ESL_TREE * T)175 esl_tree_Grow(ESL_TREE *T)
176 {
177 void *tmp;
178 int nnew;
179 int status;
180 int i;
181
182 if (T->N < T->nalloc) return eslOK; /* do we have room for next taxon? */
183
184 nnew = T->nalloc * 2;
185
186 /* There are N-1 interior nodes, so arrays of info for
187 * interior nodes are allocated for (nnew-1), whereas
188 * arrays of info for the N taxa are allocated (nnew).
189 */
190 ESL_RALLOC(T->parent, tmp, sizeof(int) * (nnew-1));
191 ESL_RALLOC(T->left, tmp, sizeof(int) * (nnew-1));
192 ESL_RALLOC(T->right, tmp, sizeof(int) * (nnew-1));
193 ESL_RALLOC(T->ld, tmp, sizeof(double) * (nnew-1));
194 ESL_RALLOC(T->rd, tmp, sizeof(double) * (nnew-1));
195
196 /* 0..N-2 were already initialized or used.
197 * Initialize newly alloced space N-1..nnew-2.
198 */
199 for (i = T->nalloc-1; i < nnew-1; i++)
200 {
201 T->parent[i] = 0;
202 T->left[i ] = 0;
203 T->right[i] = 0;
204 T->ld[i] = 0.;
205 T->rd[i] = 0.;
206 }
207
208 if (T->taxaparent != NULL)
209 {
210 ESL_RALLOC(T->taxaparent, tmp, sizeof(int) * nnew);
211 for (i = T->nalloc; i < nnew; i++) T->taxaparent[i] = 0;
212 }
213
214 if (T->cladesize != NULL)
215 {
216 ESL_RALLOC(T->cladesize, tmp, sizeof(int) * nnew);
217 for (i = T->nalloc; i < nnew; i++) T->cladesize[i] = 0;
218 }
219
220 if (T->taxonlabel != NULL)
221 {
222 ESL_RALLOC(T->taxonlabel, tmp, sizeof(char *) * nnew);
223 for (i = T->nalloc; i < nnew; i++) T->taxonlabel[i] = NULL;
224 }
225
226 if (T->nodelabel != NULL)
227 {
228 ESL_RALLOC(T->nodelabel, tmp, sizeof(char *) * (nnew-1));
229 for (i = T->nalloc-1; i < nnew-1; i++) T->nodelabel[i] = NULL;
230 }
231
232 T->nalloc = nnew;
233 return eslOK;
234
235 ERROR:
236 return eslEMEM;
237 }
238
239
240 /* Function: esl_tree_SetTaxaParents()
241 *
242 * Purpose: Constructs the <T->taxaparent[]> array in the tree
243 * structure <T>, by an O(N) traversal of the tree.
244 * Upon return, <T->taxaparent[i]> is the index
245 * of the internal node that taxon <i> is a child of.
246 *
247 * Args: T - the tree structure to map
248 *
249 * Returns: <eslOK> on success.
250 *
251 * Throws: <eslEMEM> on internal allocation error. In this case, the tree is
252 * returned unchanged.
253 *
254 * Xref: STL11/63
255 */
256 int
esl_tree_SetTaxaParents(ESL_TREE * T)257 esl_tree_SetTaxaParents(ESL_TREE *T)
258 {
259 int i;
260 int status;
261
262 if (T->taxaparent != NULL) return eslOK; // map already exists.
263 ESL_ALLOC(T->taxaparent, sizeof(int) * T->N);
264 for (i = 0; i < T->N; i++) T->taxaparent[i] = 0; // solely to calm static analyzers.
265
266 for (i = 0; i < T->N-1; i++) /* traversal order doesn't matter */
267 {
268 if (T->left[i] <= 0) T->taxaparent[-(T->left[i])] = i;
269 if (T->right[i] <= 0) T->taxaparent[-(T->right[i])] = i;
270 }
271 return eslOK;
272
273 ERROR:
274 if (T->taxaparent != NULL) { free(T->taxaparent); T->taxaparent = NULL; }
275 return status;
276 }
277
278
279 /* Function: esl_tree_SetCladesizes()
280 *
281 * Purpose: Constructs the <T->cladesize[]> array in tree structure
282 * <T>. Upon successful return, <T->cladesize[i]> is the
283 * number of taxa contained by the clade rooted at node <i>
284 * in the tree. For example, <T->cladesize[0]> is $N$ by
285 * definition, because 0 is the root of the tree.
286 *
287 * Returns: <eslOK> on success.
288 *
289 * Throws: <eslEMEM> on allocation error; in this case, the
290 * original <T> is unmodified.
291 */
292 int
esl_tree_SetCladesizes(ESL_TREE * T)293 esl_tree_SetCladesizes(ESL_TREE *T)
294 {
295 int i;
296 int status;
297
298 if (T->cladesize != NULL) return eslOK; /* already set. */
299 ESL_ALLOC(T->cladesize, sizeof(int) * (T->N-1));
300 esl_vec_ISet(T->cladesize, T->N-1, 0);
301
302 for (i = T->N-2; i >= 0; i--)
303 { /* taxon: ...else... internal node: */
304 if (T->left[i] <= 0) T->cladesize[i]++; else T->cladesize[i] += T->cladesize[T->left[i]];
305 if (T->right[i] <= 0) T->cladesize[i]++; else T->cladesize[i] += T->cladesize[T->right[i]];
306 }
307 return eslOK;
308
309 ERROR:
310 if (T->cladesize != NULL) { free(T->cladesize); T->cladesize = NULL; }
311 return status;
312 }
313
314
315 /* Function: esl_tree_SetTaxonlabels()
316 *
317 * Purpose: Given an array of taxon names <names[0..N-1]> with the
318 * same order and number as the taxa in tree <T>, make a
319 * copy of those names in <T>. For example, <names> might
320 * be the sequence names in a multiple alignment,
321 * <msa->sqname>.
322 *
323 * If the tree already had taxon names assigned to it, they
324 * are replaced.
325 *
326 * As a special case, if the <names> argument is passed as
327 * <NULL>, then the taxon labels are set to a string
328 * corresponding to their internal index; that is, taxon 0
329 * is labeled "0".
330 *
331 * Returns: <eslOK> on success, and internal state of <T>
332 * (specifically, <T->taxonlabel[]>) now contains a copy
333 * of the taxa names.
334 *
335 * Throws: <eslEMEM> on allocation failure. <T->taxonlabel[]> will be
336 * <NULL> (even if it was already set).
337 */
338 int
esl_tree_SetTaxonlabels(ESL_TREE * T,char ** names)339 esl_tree_SetTaxonlabels(ESL_TREE *T, char **names)
340 {
341 int i;
342 int status;
343
344 if (T->taxonlabel != NULL) esl_arr2_Destroy((void **) T->taxonlabel, T->N);
345 ESL_ALLOC(T->taxonlabel, sizeof(char *) * T->nalloc);
346 for (i = 0; i < T->nalloc; i++) T->taxonlabel[i] = NULL;
347
348 if (names != NULL)
349 {
350 for (i = 0; i < T->N; i++)
351 if ((status = esl_strdup(names[i], -1, &(T->taxonlabel[i]))) != eslOK) goto ERROR;
352 }
353 else
354 {
355 for (i = 0; i < T->N; i++)
356 {
357 ESL_ALLOC(T->taxonlabel[i], sizeof(char) * 32); /* enough width for any conceivable int */
358 snprintf(T->taxonlabel[i], 32, "%d", i);
359 }
360 }
361 return eslOK;
362
363 ERROR:
364 if (T->taxonlabel != NULL) esl_arr2_Destroy((void **) T->taxonlabel, T->nalloc);
365 return status;
366 }
367
368 /* Function: esl_tree_RenumberNodes()
369 * Synopsis: Assure nodes are numbered in preorder.
370 *
371 * Purpose: Given a tree <T> whose internal nodes might be numbered in
372 * any order, with the sole requirement that node 0 is the
373 * root; renumber the internal nodes (if necessary) to be in Easel's
374 * convention of preorder traversal. No other aspect of <T> is
375 * altered (including its allocation size).
376 *
377 * Returns: <eslOK> on success.
378 *
379 * Throws: <eslEMEM> on allocation failure.
380 *
381 * Xref: STL11/77
382 */
383 int
esl_tree_RenumberNodes(ESL_TREE * T)384 esl_tree_RenumberNodes(ESL_TREE *T)
385 {
386 ESL_TREE *T2 = NULL;
387 ESL_STACK *vs = NULL;
388 int *map = NULL;
389 int v,new;
390 int status;
391 int needs_rearranging = FALSE;
392
393 /* Pass 1. Preorder traverse of T by its children links;
394 * construct map[old] -> new.
395 */
396 ESL_ALLOC(map, sizeof(int) * (T->N-1));
397 if (( vs = esl_stack_ICreate()) == NULL) { status = eslEMEM; goto ERROR; };
398 if (esl_stack_IPush(vs, 0) != eslOK) { status = eslEMEM; goto ERROR; };
399 new = 0;
400 while (esl_stack_IPop(vs, &v) == eslOK)
401 {
402 if (v != new) needs_rearranging = TRUE;
403 map[v] = new++;
404 if (T->right[v] > 0 && esl_stack_IPush(vs, T->right[v]) != eslOK) { status = eslEMEM; goto ERROR; };
405 if (T->left[v] > 0 && esl_stack_IPush(vs, T->left[v]) != eslOK) { status = eslEMEM; goto ERROR; };
406 }
407 if (! needs_rearranging) { status = eslOK; goto ERROR; } /* not an error; just cleaning up & returning eslOK. */
408
409 /* Pass 2. Construct the guts of correctly numbered new T2.
410 * (traversal order doesn't matter here)
411 */
412 if (( T2 = esl_tree_Create(T->nalloc)) == NULL) { status = eslEMEM; goto ERROR; };
413 if (T->nodelabel != NULL) {
414 ESL_ALLOC(T2->nodelabel, sizeof(char *) * (T2->nalloc-1));
415 for (v = 0; v < T2->nalloc-1; v++) T2->nodelabel[v] = NULL;
416 }
417 if (T->taxaparent != NULL) {
418 ESL_ALLOC(T2->taxaparent, sizeof(int) * (T2->nalloc));
419 for (v = 0; v < T2->nalloc; v++) T2->taxaparent[v] = 0;
420 }
421
422 for (v = 0; v < T->N-1; v++)
423 {
424 T2->parent[map[v]] = map[T->parent[v]];
425 if (T->left[v] > 0) T2->left[map[v]] = map[T->left[v]]; /* internal nodes renumbered... */
426 else T2->left[map[v]] = T->left[v]; /* ...taxon indices unchanged */
427 if (T->right[v] > 0) T2->right[map[v]] = map[T->right[v]];
428 else T2->right[map[v]] = T->right[v];
429 T2->ld[map[v]] = T->ld[v];
430 T2->rd[map[v]] = T->rd[v];
431
432 if (T->taxaparent != NULL) {
433 if (T->left[v] <= 0) T2->taxaparent[-(T->left[v])] = map[v];
434 if (T->right[v] <= 0) T2->taxaparent[-(T->right[v])] = map[v];
435 }
436
437 if (T->nodelabel != NULL)
438 esl_strdup(T->nodelabel[v], -1, &(T2->nodelabel[map[v]]));
439 }
440
441 /* Finally, swap the new guts of T2 with the old guts of T;
442 * destroy T2 and return. T is now renumbered.
443 */
444 ESL_SWAP(T->parent, T2->parent, int *);
445 ESL_SWAP(T->left, T2->left, int *);
446 ESL_SWAP(T->right, T2->right, int *);
447 ESL_SWAP(T->ld, T2->ld, double *);
448 ESL_SWAP(T->rd, T2->rd, double *);
449 ESL_SWAP(T->taxaparent, T2->taxaparent, int *);
450 ESL_SWAP(T->nodelabel, T2->nodelabel, char **);
451
452 free(map);
453 esl_stack_Destroy(vs);
454 esl_tree_Destroy(T2);
455 return eslOK;
456
457 ERROR:
458 if (map != NULL) free(map);
459 if (vs != NULL) esl_stack_Destroy(vs);
460 if (T2 != NULL) esl_tree_Destroy(T2);
461 return status;
462
463 }
464
465 /* Function: esl_tree_VerifyUltrametric()
466 *
467 * Purpose: Verify that tree <T> is ultrametric.
468 *
469 * Returns: <eslOK> if so; <eslFAIL> if not.
470 *
471 * Throws: <eslEMEM> on an allocation failure.
472 */
473 int
esl_tree_VerifyUltrametric(ESL_TREE * T)474 esl_tree_VerifyUltrametric(ESL_TREE *T)
475 {
476 double *d = NULL; /* Distance from root for each OTU */
477 int status;
478 int i, child, parent;
479
480 /* First, calculate distance from root to each taxon.
481 * (This chunk of code might be useful to put on its own someday.)
482 */
483 ESL_ALLOC(d, sizeof(double) * T->N);
484 if ((status = esl_tree_SetTaxaParents(T)) != eslOK) goto ERROR;
485 for (i = 0; i < T->N; i++)
486 {
487 d[i] = 0.0;
488 parent = T->taxaparent[i];
489 if (T->left[parent] == -i) d[i] += T->ld[parent];
490 else if (T->right[parent] == -i) d[i] += T->rd[parent];
491 else ESL_XEXCEPTION(eslEINCONCEIVABLE, "oops");
492
493 while (parent != 0) /* upwards to the root */
494 {
495 child = parent;
496 parent = T->parent[child];
497 if (T->left[parent] == child) d[i] += T->ld[parent];
498 else if (T->right[parent] == child) d[i] += T->rd[parent];
499 else ESL_XEXCEPTION(eslEINCONCEIVABLE, "oops");
500 }
501 }
502
503 /* In an ultrametric tree, all those distances must be equal.
504 */
505 for (i = 1; i < T->N; i++)
506 if ((status = esl_DCompare(d[0], d[i], 0.0001)) != eslOK) break;
507
508 free(d);
509 return status;
510
511 ERROR:
512 if (d != NULL) free(d);
513 return status;
514 }
515
516
517 /* Function: esl_tree_Validate()
518 *
519 * Purpose: Validates the integrity of the data structure in <T>.
520 * Returns <eslOK> if the internal data in <T> are
521 * consistent and valid. Returns <eslFAIL> if not,
522 * and if a non-<NULL> message buffer <errbuf> has been
523 * provided by the caller, an informative message is
524 * left in <errbuf> describing the reason for the
525 * failure.
526 *
527 * Args: T - tree structure to validate
528 * errbuf - NULL, or a buffer of at least p7_ERRBUFSIZE
529 * chars to contain an error message upon
530 * any validation failure.
531 */
532 int
esl_tree_Validate(ESL_TREE * T,char * errbuf)533 esl_tree_Validate(ESL_TREE *T, char *errbuf)
534 {
535 int N;
536 int i, c;
537 int shouldbe;
538 int status;
539
540 if (errbuf != NULL) *errbuf = 0;
541
542 N = T->N; /* just to save writing T->N so many times below */
543 if (N < 2) ESL_XFAIL(eslFAIL, errbuf, "number of taxa is less than 2");
544 if (T->parent[0] != 0) ESL_XFAIL(eslFAIL, errbuf, "parent of root 0 should be set to 0");
545 if (T->nalloc < N) ESL_XFAIL(eslFAIL, errbuf, "number of taxa N is less than allocation");
546
547 /* Verify preorder tree numbering.
548 */
549 for (i = 0; i < N-1; i++)
550 {
551 if (T->left[i] > 0 && T->left[i] < i)
552 ESL_XFAIL(eslFAIL, errbuf, "l child of node %d not in preorder", i);
553 if (T->right[i] > 0 && T->right[i] < i)
554 ESL_XFAIL(eslFAIL, errbuf, "r child of node %d not in preorder", i);
555 }
556
557 /* Range checks on values. */
558 for (i = 0; i < N-1; i++)
559 {
560 if (T->parent[i] < 0 || T->parent[i] > N-2)
561 ESL_XFAIL(eslFAIL, errbuf, "parent idx of node %d invalid", i);
562 if (T->left[i] < -(N-1) || T->left[i] > N-2)
563 ESL_XFAIL(eslFAIL, errbuf, "left child idx of node %d invalid", i);
564 if (T->right[i] < -(N-1) || T->right[i] > N-2)
565 ESL_XFAIL(eslFAIL, errbuf, "right child idx of node %d invalid", i);
566 if (T->ld[i] < 0.)
567 ESL_XFAIL(eslFAIL, errbuf, "negative l branch length at node %d", i);
568 if (T->rd[i] < 0.)
569 ESL_XFAIL(eslFAIL, errbuf, "negative r branch length at node %d", i);
570 if (T->cladesize != NULL && (T->cladesize[i] < 0 || T->cladesize[i] > N))
571 ESL_XFAIL(eslFAIL, errbuf, "invalid cladesize at node %d", i);
572 }
573 for (c = 0; c < N; c++)
574 if (T->taxaparent != NULL && (T->taxaparent[c] < 0 || T->taxaparent[c] > N-2))
575 ESL_XFAIL(eslFAIL, errbuf, "invalid taxaparent at node %d", c);
576
577 /* more sophisticated integrity checks on parent-child relations in
578 nodes ...*/
579 for (i = 1; i < T->N-1; i++)
580 if (T->left[T->parent[i]] != i && T->right[T->parent[i]] != i)
581 ESL_XFAIL(eslFAIL, errbuf, "parent/child link discrepancy at internal node %d\n", i);
582
583 /* ...and between terminal nodes and taxa.
584 */
585 if (T->taxaparent != NULL)
586 for (c = 0; c < T->N; c++)
587 if (T->left[T->taxaparent[c]] != -c && T->right[T->taxaparent[c]] != -c)
588 ESL_XFAIL(eslFAIL, errbuf, "parent/child link discrepancy at taxon %d\n", c);
589
590 /* check on cladesizes */
591 if (T->cladesize != NULL)
592 for (i = 0; i < T->N-1; i++)
593 {
594 shouldbe = 0;
595 if (T->left[i] > 0) shouldbe += T->cladesize[T->left[i]]; else shouldbe++;
596 if (T->right[i] > 0) shouldbe += T->cladesize[T->right[i]]; else shouldbe++;
597 if (shouldbe != T->cladesize[i])
598 ESL_XFAIL(eslFAIL, errbuf, "incorrect cladesize at node %d", i);
599 }
600
601 return eslOK;
602
603 ERROR:
604 return status;
605 }
606
607
608
609 /* Function: esl_tree_Destroy()
610 *
611 * Purpose: Frees an <ESL_TREE> object.
612 */
613 void
esl_tree_Destroy(ESL_TREE * T)614 esl_tree_Destroy(ESL_TREE *T)
615 {
616 if (T == NULL) return;
617
618 esl_free(T->parent);
619 esl_free(T->left);
620 esl_free(T->right);
621 esl_free(T->ld);
622 esl_free(T->rd);
623 esl_free(T->taxaparent);
624 esl_free(T->cladesize);
625
626 esl_arr2_Destroy((void **) T->taxonlabel, T->nalloc);
627 esl_arr2_Destroy((void **) T->nodelabel, T->nalloc-1);
628 free(T);
629 return;
630 }
631
632
633
634 /*----------------- end, ESL_TREE object -----------------------*/
635
636
637 /*****************************************************************
638 *# 2. Newick format i/o
639 *****************************************************************/
640
641 /* newick_validate_unquoted():
642 * Returns <eslOK> if we can represent <label> as an unquoted label
643 * in Newick format. (Spaces are ok, but will be converted to
644 * _ on output.)
645 */
646 static int
newick_validate_unquoted(char * label)647 newick_validate_unquoted(char *label)
648 {
649 char *sptr;
650
651 for (sptr = label; *sptr != '\0'; sptr++)
652 {
653 if (! isprint(*sptr)) return eslFAIL;
654 if (strchr("()[]':;,", *sptr) != NULL) return eslFAIL;
655 }
656 return eslOK;
657 }
658
659 /* newick_validate_quoted():
660 * Returns <eslOK> if we can represent <label> as a
661 * quoted label in Newick format. (Single quotes will
662 * be converted to '' on output.)
663 */
664 static int
newick_validate_quoted(char * label)665 newick_validate_quoted(char *label)
666 {
667 char *sptr;
668 for (sptr = label; *sptr != '\0'; sptr++)
669 {
670 if (! isprint(*sptr)) return eslFAIL;
671 }
672 return eslOK;
673 }
674
675 /* newick_write_unquoted():
676 * Prints <label> to <fp> as an unquoted Newick label.
677 */
678 static int
newick_write_unquoted(FILE * fp,char * label)679 newick_write_unquoted(FILE *fp, char *label)
680 {
681 char *sptr;
682
683 for (sptr = label; *sptr != '\0'; sptr++)
684 {
685 if (*sptr == ' ') { if (fputc('_', fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
686 else { if (fputc(*sptr, fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
687 }
688 return eslOK;
689 }
690
691 /* newick_write_quoted():
692 * Prints <label> to <fp> as a quoted Newick label.
693 */
694 static int
newick_write_quoted(FILE * fp,char * label)695 newick_write_quoted(FILE *fp, char *label)
696 {
697 char *sptr;
698
699 if (fputc('\'', fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
700 for (sptr = label; *sptr != '\0'; sptr++)
701 {
702 if (*sptr == '\'') { if (fprintf(fp, "''") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
703 else { if (fputc(*sptr, fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
704 }
705 if (fputc('\'', fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
706 return eslOK;
707 }
708
709 /* newick_write_taxonlabel():
710 * Print the label for taxon <v> to stream <fp>.
711 * Tries to print label as an unquoted label, then
712 * as a quoted label, (then fails).
713 * If label isn't available, does nothing.
714 * If label contains invalid characters, throws <eslECORRUPT>.
715 */
716 static int
newick_write_taxonlabel(FILE * fp,ESL_TREE * T,int v)717 newick_write_taxonlabel(FILE *fp, ESL_TREE *T, int v)
718 {
719 int status;
720
721 if (T->taxonlabel == NULL || T->taxonlabel[v] == NULL)
722 {
723 if (T->show_numeric_taxonlabels && fprintf(fp, "%d", v) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
724 return eslOK;
725 }
726
727 if (! T->show_quoted_labels && newick_validate_unquoted(T->taxonlabel[v]) == eslOK)
728 status = newick_write_unquoted(fp, T->taxonlabel[v]);
729 else if (newick_validate_quoted(T->taxonlabel[v]) == eslOK)
730 status = newick_write_quoted(fp, T->taxonlabel[v]);
731 else
732 ESL_EXCEPTION(eslECORRUPT, "bad taxon label");
733
734 return status;
735 }
736
737 /* newick_write_nodelabel():
738 * Print the label for internal node <v> to stream <fp>.
739 * Tries to print label as an unquoted label, then
740 * as a quoted label.
741 * If label isn't available, does nothing.
742 * If tree's options say not to print node labels, does nothing.
743 * If label contains invalid characters, throws <eslECORRUPT>.
744 */
745 static int
newick_write_nodelabel(FILE * fp,ESL_TREE * T,int v)746 newick_write_nodelabel(FILE *fp, ESL_TREE *T, int v)
747 {
748 int status;
749
750 if (T->nodelabel == NULL) return eslOK;
751 if (T->nodelabel[v] == NULL) return eslOK;
752 if (T->show_node_labels != TRUE) return eslOK;
753
754 if (! T->show_quoted_labels && newick_validate_unquoted(T->nodelabel[v]) == eslOK)
755 status = newick_write_unquoted(fp, T->nodelabel[v]);
756 else if (newick_validate_quoted(T->nodelabel[v]) == eslOK)
757 status = newick_write_quoted(fp, T->nodelabel[v]);
758 else
759 ESL_EXCEPTION(eslECORRUPT, "bad node label\n");
760
761 return status;
762 }
763
764 /* newick_write_branchlength()
765 * Writes the branch length *to* <v>.
766 * If <v> is negative, it's a leaf; if <v> is positive, it's an internal node.
767 * You can't pass the root node 0 to this. 0 always means taxon 0.
768 * There is no branch to the root node.
769 */
770 static int
newick_write_branchlength(FILE * fp,ESL_TREE * T,int v)771 newick_write_branchlength(FILE *fp, ESL_TREE *T, int v)
772 {
773 double branchlength;
774
775 if (! T->show_branchlengths) return eslOK;
776 if (T->taxaparent == NULL) ESL_EXCEPTION(eslEINVAL, "T must have taxaparent");
777
778 if (v <= 0) /* leaf */
779 {
780 if (T->left [T->taxaparent[-v]] == v) branchlength = T->ld[T->taxaparent[-v]];
781 else if (T->right[T->taxaparent[-v]] == v) branchlength = T->rd[T->taxaparent[-v]];
782 else ESL_EXCEPTION(eslECORRUPT, "Can't find branch length");
783 }
784 else /* internal node */
785 {
786 if (T->left [T->parent[v]] == v) branchlength = T->ld[T->parent[v]];
787 else if (T->right[T->parent[v]] == v) branchlength = T->rd[T->parent[v]];
788 else ESL_EXCEPTION(eslECORRUPT, "Can't find branch length");
789 }
790
791 if (fprintf(fp, ":%f", branchlength) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
792 return eslOK;
793 }
794
795 /* Function: esl_tree_WriteNewick()
796 *
797 * Purpose: Writes tree <T> to stream <fp> in Newick format.
798 *
799 * Certain options are set in <T> to control output style,
800 * as follows.
801 *
802 * If <T->show_unrooted> is <TRUE>, <T> is printed as an
803 * unrooted tree starting with a trifurcation, a la PHYLIP
804 * format (default=<FALSE>).
805 *
806 * If <T->show_node_labels> is <TRUE>, then labels are
807 * shown for internal nodes, if any are available
808 * (default=<TRUE>).
809 *
810 * If <T->show_branchlengths> is <TRUE>, then branch
811 * lengths are shown, as opposed to just printing a labeled
812 * topology (default=<TRUE>).
813 *
814 * If <T->show_root_branchlength> is also <TRUE>, then a
815 * 0.0 branchlength is shown to the root node, a la Hein's
816 * TreeAlign Newick format (default=<FALSE>).
817 *
818 * If <T->show_quoted_labels> is <TRUE>, then all labels
819 * are shown in Newick's quoted format, as opposed to only
820 * using quoted labels where necessary (default=<FALSE>).
821 *
822 * Returns: <eslOK> on success.
823 *
824 * Throws: <eslEMEM> on allocation error.
825 * <eslEWRITE> on any system write error.
826 * <eslEINCONCEIVABLE> on internal error.
827 *
828 * Xref: SRE:STL11/74
829 */
830 int
esl_tree_WriteNewick(FILE * fp,ESL_TREE * T)831 esl_tree_WriteNewick(FILE *fp, ESL_TREE *T)
832 {
833 ESL_STACK *vs = NULL;
834 ESL_STACK *cs = NULL;
835 int v;
836 char c;
837 int status;
838
839 if ((vs = esl_stack_ICreate()) == NULL) { status = eslEMEM; goto ERROR; }
840 if ((cs = esl_stack_CCreate()) == NULL) { status = eslEMEM; goto ERROR; }
841
842 if ((status = esl_tree_SetTaxaParents(T)) != eslOK) goto ERROR;
843
844 /* Initialization.
845 * Push a trifurcation (swallowing the right internal node) if unrooted;
846 * else push the first bifurcation.
847 *
848 * When we push a trifurcation, the branch lengths will come out fine
849 * on output, if the tree followed the correct convention of having
850 * a T->rd[0] = 0.0.
851 */
852 if (fputc('(', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
853 if (T->show_unrooted && T->right[0] > 0)
854 {
855 v = T->right[0];
856 if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
857 if ((status = esl_stack_IPush(vs, T->right[v])) != eslOK) goto ERROR;
858 if ((status = esl_stack_CPush(cs, ',')) != eslOK) goto ERROR;
859 if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
860 if ((status = esl_stack_IPush(vs, T->left[v])) != eslOK) goto ERROR;
861 }
862 else
863 {
864 if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
865 if ((status = esl_stack_IPush(vs, T->right[0])) != eslOK) goto ERROR;
866 }
867 if ((status = esl_stack_CPush(cs, ',')) != eslOK) goto ERROR;
868 if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
869 if ((status = esl_stack_IPush(vs, T->left[0])) != eslOK) goto ERROR;
870
871
872 /* Main iteration. Pop off stacks 'til they're empty.
873 */
874 while ((status = esl_stack_CPop(cs, &c)) == eslOK)
875 {
876 if (c == ',') { /* comma doesn't have a v stacked with it */
877 if (fputc(',', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
878 continue;
879 }
880
881 if ((status = esl_stack_IPop(vs, &v)) != eslOK) goto ERROR;
882
883 switch (c) {
884 case 'x': /* a subtree, which could be a node or a taxon: */
885 if (v > 0) /* internal node 1..N-2*/
886 {
887 if (fputc('(', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
888 if ((status = esl_stack_CPush(cs, ')')) != eslOK) goto ERROR;
889 if ((status = esl_stack_IPush(vs, v)) != eslOK) goto ERROR;
890 if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
891 if ((status = esl_stack_IPush(vs, T->right[v])) != eslOK) goto ERROR;
892 if ((status = esl_stack_CPush(cs, ',')) != eslOK) goto ERROR;
893 if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
894 if ((status = esl_stack_IPush(vs, T->left[v])) != eslOK) goto ERROR;
895 }
896 else /* taxon -(N-1)..0 */
897 { /* -v below to convert taxon code to 0..N-1 */
898 if ((status = newick_write_taxonlabel (fp, T, -v)) != eslOK) goto ERROR;
899 if ((status = newick_write_branchlength(fp, T, v)) != eslOK) goto ERROR;
900 }
901 break;
902
903 case ')': /* closing an internal node. v > 0 is a node code. */
904 if (fputc(')', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
905 if ((status = newick_write_nodelabel (fp, T, v)) != eslOK) goto ERROR;
906 if ((status = newick_write_branchlength(fp, T, v)) != eslOK) goto ERROR;
907 break;
908
909 default:
910 ESL_XEXCEPTION(eslEINCONCEIVABLE, "bad state code");
911 break;
912 }
913 }
914
915 /* Termination
916 */
917 if (fputc(')', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
918 if ((status = newick_write_nodelabel(fp, T, 0)) != eslOK) goto ERROR;
919 if (T->show_branchlengths && T->show_root_branchlength)
920 { if (fprintf(fp, ":0.0") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
921 if (fputc(';', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
922 if (fputc('\n', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
923
924 esl_stack_Destroy(vs);
925 esl_stack_Destroy(cs);
926 return eslOK;
927
928 ERROR:
929 if (vs != NULL) esl_stack_Destroy(vs);
930 if (cs != NULL) esl_stack_Destroy(cs);
931 return status;
932
933 }
934
935
936 /* newick_advance_buffer()
937 *
938 * Advance the read buffer by one character; reload it
939 * if we reach the end. <eslOK> on success, and <eslEOF>
940 * if the read fails.
941 */
942 static int
newick_advance_buffer(FILE * fp,char * buf,int * pos,int * nc)943 newick_advance_buffer(FILE *fp, char *buf, int *pos, int *nc)
944 {
945 (*pos)++;
946 if (*pos == *nc)
947 {
948 *nc = fread(buf, sizeof(char), 4096, fp);
949 if (*nc == 0) return eslEOF;
950 *pos = 0;
951 }
952 return eslOK;
953 }
954
955 /* newick_skip_whitespace()
956 *
957 * Given the 4k input buffer <buf>, which currently contains
958 * <*nc> total characters and is positioned at position <*pos>,
959 * move <*pos> to be at the first nonwhitespace character,
960 * skipping any Newick comments ([...]) encountered. Read
961 * new data from the stream <fp> into <buf> as needed.
962 *
963 * Return <eslOK> on success. <*pos> is reset to point at a
964 * non-whitespace input character. <*nc> may be reset and the contents
965 * of <buf> altered if a new block was read from <fp> into <buf>.
966 *
967 * Returns <eslEOF> if end-of-file is reached in <fp> before a data
968 * character is found, or if a read error occurs.
969 */
970 static int
newick_skip_whitespace(FILE * fp,char * buf,int * pos,int * nc)971 newick_skip_whitespace(FILE *fp, char *buf, int *pos, int *nc)
972 {
973 int commentlevel = 0;
974
975 while (commentlevel > 0 || isspace(buf[*pos]) || buf[*pos] == '[')
976 {
977 if (buf[*pos] == '[') commentlevel++;
978 if (buf[*pos] == ']') commentlevel--;
979 if (newick_advance_buffer(fp, buf, pos, nc) == eslEOF) return eslEOF;
980 }
981 return eslOK;
982 }
983
984
985 /* newick_parse_quoted_label()
986 *
987 * On entry, buf[pos] == '\'': the opening single quote.
988 * On exit, buf[pos] is positioned at the next data character following the closing
989 * single quote; possibly the ':' for a branch length;
990 * and <ret_label> points to a newly allocated, NUL-terminated string
991 * containing the label that was read (possibly the empty string).
992 * Returns eslOK on success.
993 *
994 * Returns eslEFORMAT on parse error, eslEOF if it runs out of data.
995 */
996 static int
newick_parse_quoted_label(FILE * fp,char * buf,int * pos,int * nc,char ** ret_label)997 newick_parse_quoted_label(FILE *fp, char *buf, int *pos, int *nc, char **ret_label)
998 {
999 char *label = NULL;
1000 void *tmp;
1001 int n = 0;
1002 int nalloc = 0;
1003 int status;
1004
1005 nalloc = 32;
1006 ESL_ALLOC(label, sizeof(char) * nalloc);
1007
1008 /* advance past the opening ' */
1009 if (buf[*pos] != '\'') { status = eslEFORMAT; goto ERROR; }
1010 if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
1011
1012 /* skip leading whitespace (\n and comments forbidden in quoted label) */
1013 while (buf[*pos] == '\t' || buf[*pos] == ' ')
1014 if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
1015
1016 /* Read the label */
1017 while (1) {
1018 if (buf[*pos] == '\'') { /* watch out for escaped single quotes, '' */
1019 if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
1020 if (buf[*pos] != '\'') break; /* we've just moved past the last ' */
1021 }
1022 label[n++] = buf[*pos];
1023 if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
1024 if (n == (nalloc-1)) { /* reallocate label if it fills, leave room for NUL */
1025 ESL_RALLOC(label, tmp, sizeof(char) * (nalloc * 2));
1026 nalloc *= 2;
1027 }
1028 }
1029
1030 /* backtrack over any trailing whitespace and nul-terminate. */
1031 while (n>0 && isspace(label[n-1])) n--;
1032 label[n] = '\0';
1033 *ret_label = label;
1034 return eslOK;
1035
1036 ERROR:
1037 if (label != NULL) { free(label); *ret_label = NULL; }
1038 return status;
1039
1040 }
1041
1042 /* newick_parse_unquoted_label
1043 *
1044 * On entry, buf[pos] == first character in the label.
1045 * On exit, buf[pos] is positioned at the next data character following the end
1046 * of the label -- one of "),\t\n;[:" --
1047 * and <ret_label> points to a newly allocated, NUL-terminated string
1048 * containing the label that was read (possibly the empty string).
1049 * Returns eslOK on success.
1050 *
1051 * Returns eslEFORMAT on parse error, eslEOF if it runs out of data.
1052 */
1053 static int
newick_parse_unquoted_label(FILE * fp,char * buf,int * pos,int * nc,char ** ret_label)1054 newick_parse_unquoted_label(FILE *fp, char *buf, int *pos, int *nc, char **ret_label)
1055 {
1056 char *label = NULL;
1057 char *tmp = NULL;
1058 int n = 0;
1059 int nalloc = 0;
1060 int status;
1061
1062 nalloc = 32;
1063 ESL_ALLOC(label, sizeof(char) * nalloc);
1064
1065 while (1) {
1066 if (strchr("(]", buf[*pos]) != NULL) { status = eslEFORMAT; goto ERROR; }
1067 if (strchr(" \t\n)[':;,", buf[*pos]) != NULL) { break; }
1068 label[n++] = buf[*pos];
1069 if (newick_advance_buffer(fp, buf, pos, nc) == eslEOF) { status = eslEOF; goto ERROR; }
1070
1071 if (n == (nalloc-1)) { /* reallocate label if it fills, leave room for NUL */
1072 ESL_RALLOC(label, tmp, sizeof(char) * (nalloc * 2));
1073 nalloc *= 2;
1074 }
1075 }
1076 label[n] = '\0';
1077 *ret_label = label;
1078 return eslOK;
1079
1080 ERROR:
1081 if (label != NULL) { free(label); *ret_label = NULL; }
1082 return status;
1083 }
1084
1085 /* newick_parse_branchlength
1086 *
1087 * On entry, buf[pos] == ':'
1088 * On exit, buf[pos] is positioned at the next data character following the end
1089 * of the branchlength -- one of "),\t\n;[:"
1090 * and <ret_d> is the branch length that was read.
1091 *
1092 * Returns eslOK on success;
1093 *
1094 * Returns eslEFORMAT on parse error (including nonexistent branch lengths),
1095 * eslEOF if it runs out of data in the file.
1096 */
1097 static int
newick_parse_branchlength(FILE * fp,char * buf,int * pos,int * nc,double * ret_d)1098 newick_parse_branchlength(FILE *fp, char *buf, int *pos, int *nc, double *ret_d)
1099 {
1100 char *branch = NULL;
1101 char *tmp = NULL;
1102 int n = 0;
1103 int nalloc = 0;
1104 int status;
1105
1106 nalloc = 32;
1107 ESL_ALLOC(branch, sizeof(char) * nalloc);
1108
1109 if (buf[*pos] != ':') { status = eslEFORMAT; goto ERROR; }
1110 if (( status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
1111
1112 while (1) {
1113 if (strchr("(]", buf[*pos]) != NULL) { status = eslEFORMAT; goto ERROR; }
1114 if (strchr(" \t\n)[':;,", buf[*pos]) != NULL) break;
1115 branch[n++] = buf[*pos];
1116 if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
1117
1118 if (n == (nalloc-1)) { /* reallocate label if it fills, leave room for NUL */
1119 ESL_RALLOC(branch, tmp, sizeof(char) * (nalloc * 2));
1120 nalloc *= 2;
1121 }
1122 }
1123
1124 branch[n] = '\0';
1125 *ret_d = strtod(branch, &tmp);
1126 if (n == 0 || tmp != branch+n) { status = eslEFORMAT; goto ERROR; }
1127 free(branch);
1128 return eslOK;
1129
1130 ERROR:
1131 if (branch != NULL) free(branch);
1132 *ret_d = 0.;
1133 return status;
1134 }
1135
1136
1137
1138
1139 /* Function: esl_tree_ReadNewick()
1140 * Synopsis: Input a Newick format tree.
1141 *
1142 * Purpose: Read a Newick format tree from an open input stream <fp>.
1143 * Return the new tree in <ret_T>.
1144 *
1145 * The new tree <T> will have the optional <T->taxonlabel> and
1146 * <T->nodelabel> arrays allocated, containing names of all the
1147 * taxa and nodes. Whenever no label appeared in the Newick file
1148 * for a node or taxon, the label is set to the empty string.
1149 *
1150 * Caller may optionally provide an <errbuf> of at least
1151 * <eslERRBUFSIZE> chars, to retrieve diagnostic information
1152 * in case of a parsing problem; or <errbuf> may be passed as
1153 * <NULL>.
1154 *
1155 * Args: fp - open input stream
1156 * errbuf - NULL, or allocated space for >= eslERRBUFSIZE chars
1157 * ret_T - RETURN: the new tree.
1158 *
1159 * Returns: Returns <eslOK> on success, and <ret_T> points
1160 * to the new tree.
1161 *
1162 * Returns <eslEFORMAT> on parse errors, such as premature EOF
1163 * or bad syntax in the Newick file. In this case, <ret_T> is
1164 * returned NULL, and the <errbuf> (if provided> contains an
1165 * informative error message.
1166 *
1167 * Throws: <eslEMEM> on memory allocation errors.
1168 * <eslEINCONCEIVABLE> may also arise in case of internal bugs.
1169 *
1170 * Xref: STL11/75
1171 */
1172 int
esl_tree_ReadNewick(FILE * fp,char * errbuf,ESL_TREE ** ret_T)1173 esl_tree_ReadNewick(FILE *fp, char *errbuf, ESL_TREE **ret_T)
1174 {
1175 ESL_TREE *T = NULL; /* the new, growing tree */
1176 ESL_STACK *cs = NULL; /* state stack: possible states are LRX);, */
1177 ESL_STACK *vs = NULL; /* node index stack: LRX) states are associated with node #'s */
1178 int status;
1179 char buf[4096]; /* 4K input buffer */
1180 int pos,nc; /* position in buf, and number of chars in buf */
1181 char c; /* current state */
1182 int v; /* current node idx */
1183 int currnode;
1184 int currtaxon;
1185 char *label; /* a parsed label */
1186 double d; /* a parsed branch length */
1187
1188 if (errbuf != NULL) *errbuf = '\0';
1189 if ((vs = esl_stack_ICreate()) == NULL) { status = eslEMEM; goto ERROR; };
1190 if ((cs = esl_stack_CCreate()) == NULL) { status = eslEMEM; goto ERROR; };
1191
1192 /* Create the tree, initially allocated for 32 taxa.
1193 * Allocate for taxon and node labels, too.
1194 */
1195 if ((T = esl_tree_CreateGrowable(32)) == NULL) { status = eslEMEM; goto ERROR; };
1196 ESL_ALLOC(T->taxonlabel, sizeof(char *) * 32);
1197 ESL_ALLOC(T->nodelabel, sizeof(char *) * 31);
1198 for (currtaxon = 0; currtaxon < 32; currtaxon++) T->taxonlabel[currtaxon] = NULL;
1199 for (currnode = 0; currnode < 31; currnode++) T->nodelabel[currnode] = NULL;
1200
1201 /* Load the input buffer
1202 */
1203 if ((nc = fread(buf, sizeof(char), 4096, fp)) == 0)
1204 ESL_XFAIL(eslEFORMAT, errbuf, "file is empty.");
1205 pos = 0;
1206
1207 /* Initialization:
1208 * create the root node in the tree;
1209 * push L,R...); onto the stacks;
1210 * swallow the first ( in the file.
1211 *
1212 * A note on memory management in the growing tree:
1213 * we are going to keep T->N set to the number of taxa
1214 * that the tree *will* contain, given the number of nodes
1215 * it currently *does* contain. Before we try to add
1216 * any new node, we call the Grow() routine, which will
1217 * check T->N against T->nalloc. This strategy works
1218 * because nodes always get added before their children
1219 * taxa, so the # of taxa is always <= nodes-1: that is,
1220 * our need for reallocation is driven by new nodes,
1221 * never by new taxa; that is, if we have enough room
1222 * for nodes, we automatically have enough room for the
1223 * taxa.
1224 */
1225 T->parent[0] = 0;
1226 currnode = 1;
1227 currtaxon = 0;
1228 T->N = 2; /* c.f. note above: T->N is the # of taxa we *would* hold, given currnode=1*/
1229 if (esl_stack_CPush(cs, ';') != eslOK) { status = eslEMEM; goto ERROR; };
1230 if (esl_stack_CPush(cs, ')') != eslOK) { status = eslEMEM; goto ERROR; };
1231 if (esl_stack_IPush(vs, 0) != eslOK) { status = eslEMEM; goto ERROR; };
1232 if (esl_stack_CPush(cs, 'X') != eslOK) { status = eslEMEM; goto ERROR; };
1233 if (esl_stack_IPush(vs, 0) != eslOK) { status = eslEMEM; goto ERROR; };
1234 if (esl_stack_CPush(cs, 'R') != eslOK) { status = eslEMEM; goto ERROR; };
1235 if (esl_stack_IPush(vs, 0) != eslOK) { status = eslEMEM; goto ERROR; };
1236 if (esl_stack_CPush(cs, ',') != eslOK) { status = eslEMEM; goto ERROR; };
1237 if (esl_stack_CPush(cs, 'L') != eslOK) { status = eslEMEM; goto ERROR; };
1238 if (esl_stack_IPush(vs, 0) != eslOK) { status = eslEMEM; goto ERROR; };
1239
1240 if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
1241 ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1242 if (buf[pos] != '(')
1243 ESL_XFAIL(eslEFORMAT, errbuf, "file is not in Newick format.");
1244 if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
1245 ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1246
1247 /* Iteration.
1248 */
1249 while ((status = esl_stack_CPop(cs, &c)) == eslOK)
1250 {
1251
1252 if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
1253 ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1254
1255 if (c == ',')
1256 {
1257 if (buf[pos] != ',')
1258 ESL_XFAIL(eslEFORMAT, errbuf, "expected a comma, saw %c.", buf[pos]);
1259 if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
1260 ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1261 continue;
1262 }
1263
1264 else if (c == ';')
1265 {
1266 if (buf[pos] != ';')
1267 ESL_XFAIL(eslEFORMAT, errbuf, "expected a semicolon, saw %c.", buf[pos]);
1268 break; /* end of the Newick file */
1269 }
1270
1271 else if (c == 'L' || c == 'R') /* c says, we expect to add a subtree next */
1272 {
1273 if (esl_stack_IPop(vs, &v) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* v = parent of currnode */
1274
1275 if (buf[pos] == '(') /* a new interior node attaches to v */
1276 {
1277 if (esl_tree_Grow(T) != eslOK) { status = eslEMEM; goto ERROR; }; /* c.f. memory management note: check that we can add new node */
1278
1279 T->parent[currnode] = v;
1280 if (c == 'L') T->left[v] = currnode;
1281 else T->right[v] = currnode;
1282
1283 if (esl_stack_CPush(cs, ')') != eslOK) { status = eslEMEM; goto ERROR; };
1284 if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
1285 if (esl_stack_CPush(cs, 'X') != eslOK) { status = eslEMEM; goto ERROR; };
1286 if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
1287 if (esl_stack_CPush(cs, 'R') != eslOK) { status = eslEMEM; goto ERROR; };
1288 if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
1289 if (esl_stack_CPush(cs, ',') != eslOK) { status = eslEMEM; goto ERROR; };
1290 if (esl_stack_CPush(cs, 'L') != eslOK) { status = eslEMEM; goto ERROR; };
1291 if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
1292
1293 if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
1294 ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1295 currnode++; /* T->N == # of internal nodes/idx of next internal node */
1296 }
1297 else /* a taxon attaches to v */
1298 {
1299 if (buf[pos] == '\'') { /* a quoted label, for a new taxon attached to v*/
1300 if ((status = newick_parse_quoted_label(fp, buf, &pos, &nc, &label)) != eslOK)
1301 ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a quoted taxon label");
1302 } else { /* an unquoted label, for a new taxon attached to v */
1303 if ((status = newick_parse_unquoted_label(fp, buf, &pos, &nc, &label)) != eslOK)
1304 ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse an unquoted taxon label");
1305 }
1306
1307 if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
1308 ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely");
1309
1310 d = 0.;
1311 if (buf[pos] == ':') {
1312 if ((status = newick_parse_branchlength(fp, buf, &pos, &nc, &d)) != eslOK)
1313 ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a branch length");
1314 }
1315
1316 if (c == 'L') { T->left[v] = -currtaxon; T->ld[v] = d; }
1317 else { T->right[v] = -currtaxon; T->rd[v] = d; }
1318
1319 T->taxonlabel[currtaxon] = label;
1320 currtaxon++;
1321 }
1322 }
1323
1324 else if (c == ')') /* c says, expect to close an interior node next */
1325 {
1326 /* get v = the interior node we're closing, naming, and setting a branch length to */
1327 if (( status = esl_stack_IPop(vs, &v)) != eslOK) goto ERROR;
1328 if (buf[pos] != ')') ESL_XFAIL(eslEFORMAT, errbuf, "Parse error: expected ) to close node #%d\n", v);
1329
1330 if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
1331 ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1332
1333 if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
1334 ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1335
1336 if (buf[pos] == '\'') {
1337 if ((status = newick_parse_quoted_label(fp, buf, &pos, &nc, &label)) != eslOK)
1338 ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a quoted node label");
1339 } else { /* an unquoted label, for a new taxon attached to v */
1340 if ((status = newick_parse_unquoted_label(fp, buf, &pos, &nc, &label)) != eslOK)
1341 ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse an unquoted node label");
1342 }
1343
1344 if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
1345 ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1346
1347 d = 0.;
1348 if (buf[pos] == ':') {
1349 if ((status = newick_parse_branchlength(fp, buf, &pos, &nc, &d)) != eslOK)
1350 ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a branch length");
1351 }
1352
1353 if (v > 0) { /* branch length to root node is meaningless, ignore it */
1354 if (T->left [T->parent[v]] == v) T->ld[T->parent[v]] = d;
1355 else if (T->right[T->parent[v]] == v) T->rd[T->parent[v]] = d;
1356 }
1357
1358 T->nodelabel[v] = label;
1359 }
1360
1361 else if (c == 'X') /* optionally, multifurcations: if we see a comma, we have another node to deal with */
1362 {
1363 if ((status = esl_stack_IPop(vs, &v)) != eslOK) goto ERROR;
1364 if (buf[pos] != ',') continue;
1365 if (esl_tree_Grow(T) != eslOK) { status = eslEMEM; goto ERROR; }; /* c.f. memory management note: check that we can add new node */
1366
1367 /* v = the interior node that is multifurcated.
1368 * What we're going to do is to create a new node y; move the existing right child of v
1369 * to the left child of y; and connect y as the new right child of v with a branch
1370 * length of zero. The right child of y is now open. Then, we push a X->,RX production, so the next subtree will
1371 * be parsed as the right child of y. We can do this ad infinitum, resulting in
1372 * a representation of a multifurcation as, for example, a (A,(B,(C,(D,E)))) binary
1373 * subtree with zero length interior branches for a five-way multifurcation.
1374 *
1375 * This swapping destroys the order of the nodes: they will not be in preorder traversal.
1376 * This is temporarily ok. We renumber later.
1377 */
1378 T->left[currnode] = T->right[v];
1379 T->ld[currnode] = T->rd[v];
1380 T->parent[currnode] = v;
1381 if (T->right[v] > 0)
1382 T->parent[T->right[v]] = currnode;
1383 T->right[v] = currnode;
1384 T->rd[v] = 0.;
1385
1386 if (esl_stack_CPush(cs, 'X') != eslOK) { status = eslEMEM; goto ERROR; };
1387 if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
1388 if (esl_stack_CPush(cs, 'R') != eslOK) { status = eslEMEM; goto ERROR; };
1389 if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
1390 if (esl_stack_CPush(cs, ',') != eslOK) { status = eslEMEM; goto ERROR; };
1391 currnode++;
1392 }
1393
1394 T->N = currnode + 1; /* c.f. memory management note: keep T->N = # of taxa the tree *would* hold, given currnode */
1395 }
1396
1397 esl_tree_RenumberNodes(T);
1398
1399 esl_stack_Destroy(cs);
1400 esl_stack_Destroy(vs);
1401 *ret_T = T;
1402 return eslOK;
1403
1404 ERROR:
1405 if (T != NULL) esl_tree_Destroy(T);
1406 if (cs != NULL) esl_stack_Destroy(cs);
1407 if (vs != NULL) esl_stack_Destroy(vs);
1408 *ret_T = NULL;
1409 return status;
1410 }
1411
1412
1413
1414 /*-------------------- end, Newick i/o --------------------------*/
1415
1416
1417 /*****************************************************************
1418 * 3. Tree comparison algorithms
1419 *****************************************************************/
1420
1421 /* Function: esl_tree_Compare()
1422 *
1423 * Purpose: Given two trees <T1> and <T2> for the same
1424 * set of <N> taxa, compare the topologies of the
1425 * two trees.
1426 *
1427 * The routine must be able to determine which taxa are
1428 * equivalent in <T1> and <T2>. If <T1> and <T2> both have
1429 * taxon labels set, then the routine compares labels.
1430 * This is the usual case. (Therefore, the <N> labels must
1431 * all be different, or the routine will be unable to do
1432 * this mapping uniquely.) As a special case, if neither
1433 * <T1> nor <T2> has taxon labels, then the indexing of
1434 * taxa <0..N-1> is assumed to be exactly the same in the
1435 * two trees. (And if one tree has labels and the other
1436 * does not, an <eslEINVAL> exception is thrown.)
1437 *
1438 * For comparing unrooted topologies, be sure that <T1> and
1439 * <T2> both obey the unrooted tree convention that the
1440 * "root" is placed on the branch to taxon 0. (That is,
1441 * <T->left[0] = 0>.)
1442 *
1443 * Returns: <eslOK> if tree topologies are identical. <eslFAIL>
1444 * if they aren't.
1445 *
1446 * Throws: <eslEMEM> on allocation error. <eslEINVAL> if the taxa in
1447 * the trees can't be mapped uniquely and completely to
1448 * each other (because one tree doesn't have labels and
1449 * one does, or because the labels aren't unique, or the
1450 * two trees have different taxa).
1451 */
1452 int
esl_tree_Compare(ESL_TREE * T1,ESL_TREE * T2)1453 esl_tree_Compare(ESL_TREE *T1, ESL_TREE *T2)
1454 {
1455 int *Mg = NULL; /* the M(g) tree-mapping function for internal nodes [0..N-2] */
1456 int *Mgt = NULL; /* the M(g) tree-mapping function for leaves (taxa), [0..N-1] */
1457 int g, child; /* node indices for parent, children */
1458 int a,b;
1459 int status;
1460
1461 if (T1->N != T2->N) ESL_EXCEPTION(eslEINVAL, "trees don't have the same # of taxa");
1462
1463 /* We need taxon parent map in tree 2, but not tree 1.
1464 */
1465 if ((status = esl_tree_SetTaxaParents(T2)) != eslOK) goto ERROR;
1466
1467 /* We're going to use the tree mapping function M(g) [Goodman79].
1468 * In the implementation here, we split it into two, Mg[] for internal
1469 * nodes 0..N-2 and Mgt[] for taxa 0..N-1.
1470 *
1471 * Mg[g] for node g in T1 is the index of the lowest node in T2
1472 * that contains the same children taxa as the subtree
1473 * under g in T1.
1474 *
1475 * For the taxa, Mgt[g] for taxon g in T1 is the index of the
1476 * corresponding taxon in T2. If neither tree has taxon labels
1477 * Mgt[g] = g for all g. Otherwise we have to compare labels. Right
1478 * now, we do this by brute force, which is O(N^2); if this ever
1479 * becomes rate limiting, replace it with a keyhash to make it O(N)
1480 * (and in fact, the keyhash of taxon names could even become part
1481 * of the ESL_TREE).
1482 */
1483 ESL_ALLOC(Mg, sizeof(int) * (T1->N-1));
1484 ESL_ALLOC(Mgt, sizeof(int) * (T1->N));
1485 if (T1->taxonlabel != NULL && T2->taxonlabel != NULL)
1486 {
1487 esl_vec_ISet(Mgt, T1->N, -1); /* flags for "unset" */
1488 for (a = 0; a < T1->N; a++)
1489 {
1490 for (b = 0; b < T1->N; b++)
1491 if (strcmp(T1->taxonlabel[a], T2->taxonlabel[b]) == 0)
1492 { Mgt[a] = b; break; }
1493 }
1494 for (a = 0; a < T1->N; a++)
1495 if (Mgt[a] == -1) ESL_XEXCEPTION(eslEINVAL, "couldn't map taxa");
1496 }
1497 else if (T1->taxonlabel == NULL && T2->taxonlabel == NULL)
1498 {
1499 for (a = 0; a < T1->N; a++)
1500 Mgt[a] = a;
1501 }
1502 else
1503 ESL_XEXCEPTION(eslEINVAL, "either both trees must have taxon labels, or neither");
1504
1505 /* Finally, we use the SDI algorithm [ZmasekEddy01] to construct
1506 * M(g) for internal nodes, by postorder traversal of T1.
1507 */
1508 for (g = T1->N-2; g >= 0; g--)
1509 {
1510 child = T1->left[g];
1511 if (child <= 0) a = T2->taxaparent[Mgt[-child]];
1512 else a = T2->parent[Mg[child]];
1513
1514 child = T1->right[g];
1515 if (child <= 0) b = T2->taxaparent[Mgt[-child]];
1516 else b = T2->parent[Mg[child]];
1517
1518 /* a shortcut in SDI: special case for exact tree comparison: */
1519 if (a != b) { free(Mg); free(Mgt); return eslFAIL; }
1520 Mg[g] = a;
1521 }
1522
1523 free(Mg);
1524 free(Mgt);
1525 return eslOK;
1526
1527 ERROR:
1528 if (Mg != NULL) free(Mg);
1529 if (Mgt != NULL) free(Mgt);
1530 return status;
1531 }
1532
1533 /*----------------- end, tree comparison -----------------------*/
1534
1535
1536
1537
1538
1539
1540 /*****************************************************************
1541 * 4. Clustering algorithms for tree construction.
1542 *****************************************************************/
1543
1544 /* cluster_engine()
1545 *
1546 * Implements four clustering algorithms for tree construction:
1547 * UPGMA, WPGMA, single-linkage, and maximum-linkage. These differ
1548 * only by the rule used to construct new distances after joining
1549 * two clusters i,j.
1550 *
1551 * Input <D_original> is a symmetric distance matrix, for <D->n> taxa.
1552 * The diagonal is all 0's, and off-diagonals are $\geq 0$. <D->n>
1553 * must be at least two.
1554 *
1555 * <mode> is one of <eslUPGMA>, <eslWPGMA>, <eslSINGLE_LINKAGE>, or
1556 * <eslCOMPLETE_LINKAGE>: a flag specifying which algorithm to use.
1557 *
1558 * The output is a tree structure, returned in <ret_T>.
1559 *
1560 * Returns <eslOK> on success.
1561 *
1562 * Throws <eslEMEM> on allocation failure.
1563 *
1564 * Complexity: O(N^2) in memory, O(N^3) in time.
1565 *
1566 * This function can be optimized. Memory usage is at least
1567 * 4x more than necessary. First, we don't need to make a copy of D
1568 * if the caller doesn't mind it being consumed. Second, D only
1569 * needs to be lower- or upper-triangular, because it's symmetric,
1570 * but that requires changing dmatrix module. In time,
1571 * O(N^2 log N) if not O(N^2) should be possible, by being more
1572 * sophisticated about identifying the minimum element;
1573 * see Gronau and Moran (2006).
1574 *
1575 */
1576 static int
cluster_engine(ESL_DMATRIX * D_original,int mode,ESL_TREE ** ret_T)1577 cluster_engine(ESL_DMATRIX *D_original, int mode, ESL_TREE **ret_T)
1578 {
1579 ESL_DMATRIX *D = NULL;
1580 ESL_TREE *T = NULL;
1581 double *height = NULL; /* height of internal nodes [0..N-2] */
1582 int *idx = NULL; /* taxa or node index of row/col in D [0..N-1] */
1583 int *nin = NULL; /* # of taxa in clade in row/col in D [0..N-1] */
1584 int N;
1585 int i = 0, j = 0;
1586 int row,col;
1587 double minD;
1588 int status;
1589
1590 /* Contract checks.
1591 */
1592 ESL_DASSERT1((D_original != NULL)); /* matrix exists */
1593 ESL_DASSERT1((D_original->n == D_original->m)); /* D is NxN square */
1594 ESL_DASSERT1((D_original->n >= 2)); /* >= 2 taxa */
1595
1596 #if (eslDEBUGLEVEL >=1)
1597 for (i = 0; i < D_original->n; i++) {
1598 assert(D_original->mx[i][i] == 0.); /* self-self d = 0 */
1599 for (j = i+1; j < D_original->n; j++) /* D symmetric */
1600 assert(D_original->mx[i][j] == D_original->mx[j][i]);
1601 }
1602 #endif
1603
1604 /* Allocations.
1605 * NxN copy of the distance matrix, which we'll iteratively whittle down to 2x2;
1606 * tree for N taxa;
1607 */
1608 if ((D = esl_dmatrix_Clone(D_original)) == NULL) return eslEMEM;
1609 if ((T = esl_tree_Create(D->n)) == NULL) return eslEMEM;
1610 ESL_ALLOC(idx, sizeof(int) * D->n);
1611 ESL_ALLOC(nin, sizeof(int) * D->n);
1612 ESL_ALLOC(height, sizeof(double) * (D->n-1));
1613 for (i = 0; i < D->n; i++) idx[i] = -i; /* assign taxa indices to row/col coords */
1614 for (i = 0; i < D->n; i++) nin[i ] = 1; /* each cluster starts as 1 */
1615 for (i = 0; i < D->n-1; i++) height[i] = 0.;
1616
1617 /* If we're doing either single linkage or complete linkage clustering,
1618 * we will construct a "linkage tree", where ld[v], rd[v] "branch lengths"
1619 * below node v are the linkage value for clustering node v; thus
1620 * ld[v] == rd[v] in a linkage tree.
1621 * For UPGMA or WPGMA, we're building an additive tree, where ld[v] and
1622 * rd[v] are branch lengths.
1623 */
1624 if (mode == eslSINGLE_LINKAGE || mode == eslCOMPLETE_LINKAGE)
1625 T->is_linkage_tree = TRUE;
1626
1627 for (N = D->n; N >= 2; N--)
1628 {
1629 /* Find minimum in our current N x N matrix.
1630 * (Don't init minD to -infinity; linkage trees use sparse distance matrices
1631 * with -infinity representing unlinked.)
1632 */
1633 minD = D->mx[0][1]; i = 0; j = 1; /* init with: if nothing else, try to link 0-1 */
1634 for (row = 0; row < N; row++)
1635 for (col = row+1; col < N; col++)
1636 if (D->mx[row][col] < minD)
1637 {
1638 minD = D->mx[row][col];
1639 i = row;
1640 j = col;
1641 }
1642
1643 /* We're joining node at row/col i with node at row/col j.
1644 * Add node (index = N-2) to the tree at height minD/2.
1645 */
1646 T->left[N-2] = idx[i];
1647 T->right[N-2] = idx[j];
1648 if (T->is_linkage_tree) height[N-2] = minD;
1649 else height[N-2] = minD / 2.;
1650
1651 /* Set the branch lengths (additive trees) or heights (linkage trees)
1652 */
1653 T->ld[N-2] = T->rd[N-2] = height[N-2];
1654 if (! T->is_linkage_tree) {
1655 if (idx[i] > 0) T->ld[N-2] = ESL_MAX(0., T->ld[N-2] - height[idx[i]]); // max to 0, to avoid fp roundoff giving us negative length
1656 if (idx[j] > 0) T->rd[N-2] = ESL_MAX(0., T->rd[N-2] - height[idx[j]]);
1657 }
1658
1659 /* If either node was an internal node, record parent in it.
1660 */
1661 if (idx[i] > 0) T->parent[idx[i]] = N-2;
1662 if (idx[j] > 0) T->parent[idx[j]] = N-2;
1663
1664 /* Now, build a new matrix by merging row i+j and col i+j.
1665 * 1. move j to N-1 (unless it's already there)
1666 * 2. move i to N-2 (unless it's already there)
1667 */
1668 if (j != N-1)
1669 {
1670 for (row = 0; row < N; row++)
1671 ESL_SWAP(D->mx[row][N-1], D->mx[row][j], double);
1672 for (col = 0; col < N; col++)
1673 ESL_SWAP(D->mx[N-1][col], D->mx[j][col], double);
1674 ESL_SWAP(idx[j], idx[N-1], int);
1675 ESL_SWAP(nin[j], nin[N-1], int);
1676 }
1677 if (i != N-2)
1678 {
1679 for (row = 0; row < N; row++)
1680 ESL_SWAP(D->mx[row][N-2], D->mx[row][i], double);
1681 for (col = 0; col < N; col++)
1682 ESL_SWAP(D->mx[N-2][col], D->mx[i][col], double);
1683 ESL_SWAP(idx[i], idx[N-2], int);
1684 ESL_SWAP(nin[i], nin[N-2], int);
1685 }
1686 i = N-2;
1687 j = N-1;
1688
1689 /* 3. merge i (now at N-2) with j (now at N-1)
1690 * according to the desired clustering rule.
1691 */
1692 for (col = 0; col < N; col++)
1693 {
1694 switch (mode) {
1695 case eslUPGMA:
1696 D->mx[i][col] = (nin[i] * D->mx[i][col] + nin[j] * D->mx[j][col]) / (double) (nin[i] + nin[j]);
1697 break;
1698 case eslWPGMA: D->mx[i][col] = (D->mx[i][col] + D->mx[j][col]) / 2.; break;
1699 case eslSINGLE_LINKAGE: D->mx[i][col] = ESL_MIN(D->mx[i][col], D->mx[j][col]); break;
1700 case eslCOMPLETE_LINKAGE: D->mx[i][col] = ESL_MAX(D->mx[i][col], D->mx[j][col]); break;
1701 default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "no such strategy");
1702 }
1703 D->mx[col][i] = D->mx[i][col];
1704 }
1705
1706 /* row/col i is now the new cluster, and it corresponds to node N-2
1707 * in the tree (remember, N is decrementing at each iteration).
1708 * row/col j (N-1) falls away when we go back to the start of the loop
1709 * and decrement N.
1710 */
1711 nin[i] += nin[j];
1712 idx[i] = N-2;
1713 }
1714
1715 esl_dmatrix_Destroy(D);
1716 free(height);
1717 free(idx);
1718 free(nin);
1719 if (ret_T != NULL) *ret_T = T;
1720 return eslOK;
1721
1722 ERROR:
1723 if (D != NULL) esl_dmatrix_Destroy(D);
1724 if (T != NULL) esl_tree_Destroy(T);
1725 if (height != NULL) free(height);
1726 if (idx != NULL) free(idx);
1727 if (nin != NULL) free(nin);
1728 if (ret_T != NULL) *ret_T = NULL;
1729 return status;
1730 }
1731
1732
1733 /* Function: esl_tree_UPGMA()
1734 *
1735 * Purpose: Given distance matrix <D>, use the UPGMA algorithm
1736 * to construct a tree <T>.
1737 *
1738 * Returns: <eslOK> on success; the tree is returned in <ret_T>,
1739 * and must be freed by the caller with <esl_tree_Destroy()>.
1740 *
1741 * Throws: <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
1742 */
1743 int
esl_tree_UPGMA(ESL_DMATRIX * D,ESL_TREE ** ret_T)1744 esl_tree_UPGMA(ESL_DMATRIX *D, ESL_TREE **ret_T)
1745 {
1746 return cluster_engine(D, eslUPGMA, ret_T);
1747 }
1748
1749 /* Function: esl_tree_WPGMA()
1750 *
1751 * Purpose: Given distance matrix <D>, use the WPGMA algorithm
1752 * to construct a tree <T>.
1753 *
1754 * Returns: <eslOK> on success; the tree is returned in <ret_T>,
1755 * and must be freed by the caller with <esl_tree_Destroy()>.
1756 *
1757 * Throws: <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
1758 */
1759 int
esl_tree_WPGMA(ESL_DMATRIX * D,ESL_TREE ** ret_T)1760 esl_tree_WPGMA(ESL_DMATRIX *D, ESL_TREE **ret_T)
1761 {
1762 return cluster_engine(D, eslWPGMA, ret_T);
1763 }
1764
1765 /* Function: esl_tree_SingleLinkage()
1766 *
1767 * Purpose: Given distance matrix <D>, construct a single-linkage
1768 * (minimum distances) clustering tree <T>.
1769 *
1770 * Returns: <eslOK> on success; the tree is returned in <ret_T>,
1771 * and must be freed by the caller with <esl_tree_Destroy()>.
1772 *
1773 * Throws: <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
1774 */
1775 int
esl_tree_SingleLinkage(ESL_DMATRIX * D,ESL_TREE ** ret_T)1776 esl_tree_SingleLinkage(ESL_DMATRIX *D, ESL_TREE **ret_T)
1777 {
1778 return cluster_engine(D, eslSINGLE_LINKAGE, ret_T);
1779 }
1780
1781 /* Function: esl_tree_CompleteLinkage()
1782 *
1783 * Purpose: Given distance matrix <D>, construct a complete-linkage
1784 * (maximum distances) clustering tree <T>.
1785 *
1786 * Returns: <eslOK> on success; the tree is returned in <ret_T>,
1787 * and must be freed by the caller with <esl_tree_Destroy()>.
1788 *
1789 * Throws: <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
1790 */
1791 int
esl_tree_CompleteLinkage(ESL_DMATRIX * D,ESL_TREE ** ret_T)1792 esl_tree_CompleteLinkage(ESL_DMATRIX *D, ESL_TREE **ret_T)
1793 {
1794 return cluster_engine(D, eslCOMPLETE_LINKAGE, ret_T);
1795 }
1796 /*----------------- end, clustering algorithms ----------------*/
1797
1798
1799
1800 /*****************************************************************
1801 * 5. Generating simulated trees
1802 *****************************************************************/
1803
1804 /* Function: esl_tree_Simulate()
1805 * Synopsis: Generate a random rooted ultrametric tree.
1806 *
1807 * Purpose: Generate a random rooted ultrametric tree of <N> taxa,
1808 * using the algorithm of Kuhner and Felsenstein (1994).
1809 *
1810 * The branch lengths are generated by choosing <N-1>
1811 * exponentially distributed split times, with decreasing
1812 * expectations of $\frac{1}{2},\frac{1}{3}..\frac{1}{N}$
1813 * as the simulation proceeds from the root. Thus the
1814 * total expected branch length on the tree is
1815 * $\sum_{k=2}^{N} \frac{1}{k}$.
1816 *
1817 * Args: r - random number source
1818 * N - number of taxa (>= 2)
1819 * ret_T - RETURN: sampled tree
1820 *
1821 * Returns: <eslOK> on success, and the new tree is allocated
1822 * here and returned via <ret_tree>; caller is
1823 * responsible for free'ing it.
1824 *
1825 * Throws: <eslEMEM> on allocation failure, in which case
1826 * the <ret_T> is returned <NULL>.
1827 *
1828 * Xref: STL11/65.
1829 */
1830 int
esl_tree_Simulate(ESL_RANDOMNESS * r,int N,ESL_TREE ** ret_T)1831 esl_tree_Simulate(ESL_RANDOMNESS *r, int N, ESL_TREE **ret_T)
1832 {
1833 ESL_TREE *T = NULL;
1834 int *branchpapa = NULL;
1835 int *branchside = NULL;
1836 int nactive;
1837 double d;
1838 int node;
1839 int bidx; /* index of an active branch */
1840 int status;
1841
1842 ESL_DASSERT1( (r != NULL) );
1843 ESL_DASSERT1( (N >= 2) );
1844
1845 /* Kuhner/Felsenstein uses a list of active branches,
1846 * which we implement by tracking the index of the parent
1847 * node (in <branchpapa>) and a 0/1 flag (in <branchside>)
1848 * for the branch to the left vs. right child.
1849 */
1850 if ((T = esl_tree_Create(N)) == NULL) { status = eslEMEM; goto ERROR; }
1851 ESL_ALLOC(branchpapa, sizeof(int) * N);
1852 ESL_ALLOC(branchside, sizeof(int) * N);
1853
1854 /* Initialize: add two branches from the root
1855 * onto the active list, and set internal node
1856 * counter to start at 1.
1857 */
1858 branchpapa[0] = 0; branchside[0] = 0;
1859 branchpapa[1] = 0; branchside[1] = 1;
1860 nactive = 2;
1861 node = 1;
1862
1863 /* Algorithm proceeds by iterating:
1864 * 1. choose random time <d> from exponential(1/nactive)
1865 * 2. choose random active branch, <bidx>
1866 * 3. add new <node> to active branch at length d
1867 * 4. add d to all other active branches
1868 * 5. delete the old parent branch from the active list,
1869 * add the two new child branches to the active list
1870 */
1871 while (nactive < N)
1872 {
1873 d = (double) nactive * -log(esl_rnd_UniformPositive(r));
1874 bidx = esl_rnd_Roll(r, nactive);
1875 T->parent[node] = branchpapa[bidx];
1876
1877 if (branchside[bidx] == 0) {
1878 T->left[branchpapa[bidx]] = node;
1879 T->ld [branchpapa[bidx]] += d;
1880 } else {
1881 T->right[branchpapa[bidx]] = node;
1882 T->rd [branchpapa[bidx]] += d;
1883 }
1884
1885 ESL_SWAP(branchpapa[bidx], branchpapa[nactive-1], int);
1886 ESL_SWAP(branchside[bidx], branchside[nactive-1], int);
1887 for (bidx = 0; bidx < nactive-1; bidx++) {
1888 if (branchside[bidx] == 0) T->ld[branchpapa[bidx]] += d;
1889 else T->rd[branchpapa[bidx]] += d;
1890 }
1891
1892 /* delete the branch at nactive-1 that we just added to;
1893 * replace it with two new branches
1894 */
1895 branchpapa[nactive-1] = node; branchside[nactive-1] = 0;
1896 branchpapa[nactive] = node; branchside[nactive] = 1;
1897 node++;
1898 nactive++;
1899 }
1900
1901 /* Terminate by adding the N taxa to the N active branches.
1902 */
1903 d = (double) N * -log(esl_rnd_UniformPositive(r));
1904 for (bidx = 0; bidx < N; bidx++)
1905 {
1906 if (branchside[bidx] == 0) {
1907 T->left[branchpapa[bidx]] = -bidx; /* taxa indices stored as neg #'s */
1908 T->ld [branchpapa[bidx]] += d;
1909 } else {
1910 T->right[branchpapa[bidx]] = -bidx;
1911 T->rd [branchpapa[bidx]] += d;
1912 }
1913 }
1914
1915 *ret_T = T;
1916 free(branchpapa);
1917 free(branchside);
1918 return eslOK;
1919
1920 ERROR:
1921 if (T != NULL) esl_tree_Destroy(T);
1922 if (branchpapa != NULL) free(branchpapa);
1923 if (branchside != NULL) free(branchside);
1924 *ret_T = NULL;
1925 return status;
1926 }
1927
1928
1929 /* Function: esl_tree_ToDistanceMatrix()
1930 * Synopsis: Obtain a pairwise distance matrix from a tree.
1931 *
1932 * Purpose: Given tree <T>, calculate a pairwise distance matrix
1933 * and return it in <ret_D>.
1934 *
1935 * Note: Algorithm here is O(N^3). It can probably be improved.
1936 * There ought to be a more efficient recursion that
1937 * saves recalculating node-node distances inside the tree.
1938 * All we do here is a brute force, upwards O(N) LCA
1939 * search for each of the N^2 taxon pairs.
1940 *
1941 * Args: T - input tree
1942 * ret_D - RETURN: the new distance matrix
1943 *
1944 * Returns: <eslOK> on success, and <ret_D> points to the distance
1945 * matrix, which caller is responsible for free'ing with
1946 * <esl_dmatrix_Destroy()>.
1947 *
1948 * Throws: <eslEMEM> on allocation failure, in which case
1949 * <ret_D> is returned <NULL>.
1950 *
1951 * Xref: STL11/66.
1952 */
1953 int
esl_tree_ToDistanceMatrix(ESL_TREE * T,ESL_DMATRIX ** ret_D)1954 esl_tree_ToDistanceMatrix(ESL_TREE *T, ESL_DMATRIX **ret_D)
1955 {
1956 ESL_DMATRIX *D = NULL;
1957 int i,j; /* a pair of taxa {0..N-1} */
1958 int a,b; /* a pair of internal nodes {0..N-2} */
1959 int p; /* a tmp parent index */
1960 double d; /* ij distance */
1961 int status;
1962
1963 D = esl_dmatrix_Create(T->N, T->N); /* creates a NxN square symmetric matrix; really only need triangular */
1964 if (D == NULL) { status = eslEMEM; goto ERROR; }
1965
1966 if ((status = esl_tree_SetTaxaParents(T)) != eslOK) goto ERROR;
1967
1968 for (i = 0; i < T->N; i++)
1969 {
1970 D->mx[i][i] = 0.; /* by definition */
1971 for (j = i+1; j < T->N; j++)
1972 {
1973 a = T->taxaparent[i];
1974 b = T->taxaparent[j];
1975 d = (T->left[a] == -i) ? T->ld[a] : T->rd[a];
1976 d += (T->left[b] == -j) ? T->ld[b] : T->rd[b];
1977 while (a != b) /* a brute force LCA algorithm */
1978 {
1979 if (a < b) ESL_SWAP(a, b, int);
1980 p = T->parent[a];
1981 d += (T->left[p] == a) ? T->ld[p] : T->rd[p];
1982 a = p;
1983 }
1984
1985 D->mx[i][j] = D->mx[j][i] = d;
1986 }
1987 }
1988
1989 *ret_D = D;
1990 return eslOK;
1991
1992 ERROR:
1993 if (D != NULL) esl_dmatrix_Destroy(D);
1994 *ret_D = NULL;
1995 return status;
1996 }
1997
1998
1999
2000 /*****************************************************************
2001 * 6. Unit tests
2002 *****************************************************************/
2003 #ifdef eslTREE_TESTDRIVE
2004
2005 static void
utest_OptionalInformation(ESL_RANDOMNESS * r,int ntaxa)2006 utest_OptionalInformation(ESL_RANDOMNESS *r, int ntaxa)
2007 {
2008 char *msg = "optional information fields unit test failed";
2009 ESL_TREE *T;
2010
2011 if (esl_tree_Simulate(r, ntaxa, &T) != eslOK) esl_fatal(msg);
2012 if (esl_tree_SetTaxaParents(T) != eslOK) esl_fatal(msg);
2013 if (esl_tree_SetCladesizes(T) != eslOK) esl_fatal(msg);
2014 if (esl_tree_Validate(T, NULL) != eslOK) esl_fatal(msg);
2015
2016 esl_tree_Destroy(T);
2017 return;
2018 }
2019
2020
2021 static void
utest_WriteNewick(ESL_RANDOMNESS * r,int ntaxa)2022 utest_WriteNewick(ESL_RANDOMNESS *r, int ntaxa)
2023 {
2024 char *msg = "esl_tree_WriteNewick unit test failed";
2025 char tmpfile[32] = "esltmpXXXXXX";
2026 FILE *fp = NULL;
2027 ESL_TREE *T1 = NULL;
2028 ESL_TREE *T2 = NULL;
2029 char errbuf[eslERRBUFSIZE];
2030
2031 if (esl_tmpfile(tmpfile, &fp) != eslOK) esl_fatal(msg);
2032 if (esl_tree_Simulate(r, ntaxa, &T1) != eslOK) esl_fatal(msg);
2033 if (esl_tree_SetTaxonlabels(T1, NULL) != eslOK) esl_fatal(msg);
2034 if (esl_tree_Validate(T1, NULL) != eslOK) esl_fatal(msg);
2035 if (esl_tree_WriteNewick(fp, T1) != eslOK) esl_fatal(msg);
2036 rewind(fp);
2037 if (esl_tree_ReadNewick(fp, errbuf, &T2) != eslOK) esl_fatal(msg);
2038 if (esl_tree_Validate(T2, NULL) != eslOK) esl_fatal(msg);
2039 if (esl_tree_Compare(T1, T2) != eslOK) esl_fatal(msg);
2040 fclose(fp);
2041
2042 esl_tree_Destroy(T1);
2043 esl_tree_Destroy(T2);
2044 return;
2045 }
2046
2047
2048 static void
utest_UPGMA(ESL_RANDOMNESS * r,int ntaxa)2049 utest_UPGMA(ESL_RANDOMNESS *r, int ntaxa)
2050 {
2051 char *msg = "esl_tree_UPGMA unit test failed";
2052 ESL_TREE *T1 = NULL;
2053 ESL_TREE *T2 = NULL;
2054 ESL_DMATRIX *D1 = NULL;
2055 ESL_DMATRIX *D2 = NULL;
2056
2057 if (esl_tree_Simulate(r, ntaxa, &T1) != eslOK) esl_fatal(msg);
2058 if (esl_tree_ToDistanceMatrix(T1, &D1) != eslOK) esl_fatal(msg);
2059 if (esl_tree_UPGMA(D1, &T2) != eslOK) esl_fatal(msg);
2060
2061 if (esl_tree_Validate(T1, NULL) != eslOK) esl_fatal(msg);
2062 if (esl_tree_Validate(T2, NULL) != eslOK) esl_fatal(msg);
2063 if (esl_tree_VerifyUltrametric(T1) != eslOK) esl_fatal(msg);
2064 if (esl_tree_VerifyUltrametric(T2) != eslOK) esl_fatal(msg);
2065 if (esl_tree_Compare(T1, T2) != eslOK) esl_fatal(msg);
2066
2067 if (esl_tree_ToDistanceMatrix(T1, &D2) != eslOK) esl_fatal(msg);
2068 if (esl_dmatrix_Compare(D1, D2, 0.001) != eslOK) esl_fatal(msg);
2069
2070 esl_tree_Destroy(T1);
2071 esl_tree_Destroy(T2);
2072 esl_dmatrix_Destroy(D1);
2073 esl_dmatrix_Destroy(D2);
2074 return;
2075 }
2076
2077 #endif /*eslTREE_TESTDRIVE*/
2078 /*-------------------- end, unit tests -------------------------*/
2079
2080
2081 /*****************************************************************
2082 * 7. Test driver
2083 *****************************************************************/
2084 #ifdef eslTREE_TESTDRIVE
2085
2086 /*
2087 * gcc -g -Wall -o test -L. -I. -DeslTREE_TESTDRIVE esl_tree.c -leasel -lm
2088 * gcc -g -Wall -o test -L. -I. -DeslTEST_THROWING -DeslTREE_TESTDRIVE esl_msa.c -leasel -lm
2089 * ./test
2090 */
2091 #include "easel.h"
2092 #include "esl_tree.h"
2093 #include "esl_random.h"
2094
2095 int
main(int argc,char ** argv)2096 main(int argc, char **argv)
2097 {
2098 ESL_RANDOMNESS *r = NULL;
2099 int ntaxa;
2100
2101 r = esl_randomness_Create(0);
2102 ntaxa = 20;
2103
2104 utest_OptionalInformation(r, ntaxa); /* SetTaxaparents(), SetCladesizes() */
2105 utest_WriteNewick(r, ntaxa);
2106 utest_UPGMA(r, ntaxa);
2107
2108 esl_randomness_Destroy(r);
2109 return eslOK;
2110 }
2111
2112 #endif /*eslTREE_TESTDRIVE*/
2113 /*-------------------- end, test driver -------------------------*/
2114
2115
2116
2117
2118 /*****************************************************************
2119 * 8. Examples.
2120 *****************************************************************/
2121
2122 /* The first example is an example of inferring a tree by the
2123 * UPGMA algorithm, starting from a multiple sequence alignment.
2124 */
2125 #ifdef eslTREE_EXAMPLE
2126 /*::cexcerpt::tree_example::begin::*/
2127 /* To compile: gcc -g -Wall -o example -I. -DeslTREE_EXAMPLE esl_tree.c esl_dmatrix.c esl_msa.c easel.c -lm
2128 * or: gcc -g -Wall -o example -I. -L. -DeslTREE_EXAMPLE esl_tree.c -leasel -lm
2129 * To run: ./example <MSA file>
2130 */
2131 #include "easel.h"
2132 #include "esl_msa.h"
2133 #include "esl_msafile.h"
2134 #include "esl_distance.h"
2135 #include "esl_tree.h"
2136
main(int argc,char ** argv)2137 int main(int argc, char **argv)
2138 {
2139 ESL_TREE *tree = NULL;
2140 ESL_MSAFILE *afp = NULL;
2141 ESL_MSA *msa = NULL;
2142 ESL_DMATRIX *D = NULL;
2143
2144 esl_msafile_Open(NULL, argv[1], NULL, eslMSAFILE_UNKNOWN, NULL, &afp);
2145 esl_msafile_Read(afp, &msa);
2146 esl_msafile_Close(afp);
2147
2148 esl_dst_CDiffMx(msa->aseq, msa->nseq, &D);
2149 esl_tree_UPGMA(D, &tree);
2150
2151 esl_tree_Destroy(tree);
2152 esl_msa_Destroy(msa);
2153 esl_dmatrix_Destroy(D);
2154 return eslOK;
2155 }
2156 /*::cexcerpt::tree_example::end::*/
2157 #endif /*eslTREE_EXAMPLE*/
2158
2159
2160 /* The second example is an example of reading in a Newick format tree.
2161 */
2162 #ifdef eslTREE_EXAMPLE2
2163 /*::cexcerpt::tree_example2::begin::*/
2164 /* To compile: gcc -g -Wall -o example -I. -DeslTREE_EXAMPLE2 esl_tree.c esl_dmatrix.c esl_msa.c easel.c -lm
2165 * or: gcc -g -Wall -o example -I. -L. -DeslTREE_EXAMPLE2 esl_tree.c -leasel -lm
2166 * To run: ./example <Newick file>
2167 */
2168 #include "easel.h"
2169 #include "esl_msa.h"
2170 #include "esl_distance.h"
2171 #include "esl_tree.h"
2172
main(int argc,char ** argv)2173 int main(int argc, char **argv)
2174 {
2175 ESL_TREE *T;
2176 char errbuf[eslERRBUFSIZE];
2177 FILE *fp;
2178
2179 if ((fp = fopen(argv[1], "r")) == NULL) esl_fatal("Failed to open %s", argv[1]);
2180 if (esl_tree_ReadNewick(fp, errbuf, &T) != eslOK) esl_fatal("Failed to read tree: %s", errbuf);
2181 esl_tree_WriteNewick(stdout, T);
2182
2183 esl_tree_Destroy(T);
2184 fclose(fp);
2185 return eslOK;
2186 }
2187 /*::cexcerpt::tree_example2::end::*/
2188 #endif /*eslTREE_EXAMPLE*/
2189