1 /* Phylogenetic trees.
2  *
3  * Contents:
4  *   1. The ESL_TREE object.
5  *   2. Newick format i/o
6  *   3. Tree comparison algorithms.
7  *   4. Clustering algorithms for distance-based tree construction.
8  *   5. Generating simulated trees.
9  *   6. Unit tests.
10  *   7. Test driver.
11  *   8. Examples.
12  */
13 #include "esl_config.h"
14 
15 #include <math.h>
16 #include <string.h>
17 #include <ctype.h>
18 #include <assert.h>
19 
20 #include "easel.h"
21 #include "esl_arr2.h"
22 #include "esl_dmatrix.h"
23 #include "esl_random.h"
24 #include "esl_stack.h"
25 #include "esl_vectorops.h"
26 
27 #include "esl_tree.h"
28 
29 /*****************************************************************
30  *# 1. The ESL_TREE object.
31  *****************************************************************/
32 
33 /* Function:  esl_tree_Create()
34  *
35  * Purpose:   Allocate an empty tree structure for <ntaxa> taxa
36  *            and return a pointer to it. <ntaxa> must be $\geq 2$.
37  *
38  * Args:      <ntaxa>   - number of taxa
39  *
40  * Returns:   pointer to the new <ESL_TREE> object; caller frees
41  *            this with <esl_tree_Destroy()>.
42  *
43  * Throws:    <NULL> if allocation fails.
44  */
45 ESL_TREE *
esl_tree_Create(int ntaxa)46 esl_tree_Create(int ntaxa)
47 {
48   ESL_TREE *T = NULL;
49   int       i;
50   int       status;
51 
52   /* Contract verification  */
53   ESL_DASSERT1((ntaxa >= 2));
54 
55   /* 1st allocation round  */
56   ESL_ALLOC(T, sizeof(ESL_TREE));
57   T->parent = NULL;
58   T->left   = NULL;
59   T->right  = NULL;
60   T->ld     = NULL;
61   T->rd     = NULL;
62 
63   /* Optional info starts NULL
64    */
65   T->taxaparent  = NULL;
66   T->cladesize   = NULL;
67   T->taxonlabel  = NULL;
68   T->nodelabel   = NULL;
69 
70   /* Additive trees are assumed by default, as opposed to linkage trees  */
71   T->is_linkage_tree = FALSE;
72 
73   /* Tree output options default to PHYLIP style
74    */
75   T->show_unrooted            = FALSE;
76   T->show_node_labels         = TRUE;
77   T->show_root_branchlength   = FALSE;
78   T->show_branchlengths       = TRUE;
79   T->show_quoted_labels       = FALSE;
80   T->show_numeric_taxonlabels = TRUE;
81 
82   /* 2nd allocation round */
83   T->N    = ntaxa;
84   ESL_ALLOC(T->parent, sizeof(int)    * (ntaxa-1));
85   ESL_ALLOC(T->left,   sizeof(int)    * (ntaxa-1));
86   ESL_ALLOC(T->right,  sizeof(int)    * (ntaxa-1));
87   ESL_ALLOC(T->ld,     sizeof(double) * (ntaxa-1));
88   ESL_ALLOC(T->rd,     sizeof(double) * (ntaxa-1));
89 
90   for (i = 0; i < ntaxa-1; i++)
91     {
92       T->parent[i] = 0;
93       T->left[i  ] = 0;
94       T->right[i]  = 0;
95       T->ld[i]   = 0.;
96       T->rd[i]   = 0.;
97     }
98   T->nalloc = ntaxa;
99   return T;
100 
101  ERROR:
102   esl_tree_Destroy(T);
103   return NULL;
104 }
105 
106 /* Function:  esl_tree_CreateGrowable()
107  *
108  * Purpose:   Allocate a growable tree structure for an initial
109  *            allocation of <nalloc> taxa, and return a pointer to it.
110  *            <nalloc> must be $\geq 2$.
111  *
112  * Args:      <nalloc>  - initial allocation size for number of taxa
113  *
114  * Returns:   pointer to a new growable <ESL_TREE> object; caller frees
115  *            this with <esl_tree_Destroy()>.
116  *
117  * Throws:    <NULL> if allocation fails.
118  */
119 ESL_TREE *
esl_tree_CreateGrowable(int nalloc)120 esl_tree_CreateGrowable(int nalloc)
121 {
122   ESL_TREE *T = esl_tree_Create(nalloc);
123   if (T == NULL) return NULL;
124 
125   T->N = 0;
126   return T;
127 }
128 
129 
130 /* Function:  esl_tree_CreateFromString()
131  *
132  * Purpose:   A convenience for making small test cases in the test
133  *            suites: given the contents of a Newick file as a
134  *            single string <s>, convert it to an <ESL_TREE>.
135  *
136  * Returns:   a pointer to the new <ESL_TREE> on success.
137  *
138  * Throws:    <NULL> if it fails to obtain, open, or read the
139  *            temporary file that it puts the string <s> in.
140  */
141 ESL_TREE *
esl_tree_CreateFromString(char * s)142 esl_tree_CreateFromString(char *s)
143 {
144   char      tmpfile[16] = "esltmpXXXXXX";
145   FILE     *fp          = NULL;
146   ESL_TREE *T           = NULL;
147 
148   if (esl_tmpfile(tmpfile, &fp)         != eslOK) goto ERROR;
149   fputs(s, fp);
150   rewind(fp);
151   if (esl_tree_ReadNewick(fp, NULL, &T) != eslOK) goto ERROR;
152   fclose(fp);
153   return T;
154 
155  ERROR:
156   if (fp  != NULL) fclose(fp);
157   if (T   != NULL) esl_tree_Destroy(T);
158   return NULL;
159 }
160 
161 
162 
163 /* Function:  esl_tree_Grow()
164  *
165  * Purpose:   Given a tree <T>, make sure it can hold one more taxon;
166  *            reallocate internally if necessary by doubling the
167  *            number of taxa it is currently allocated to hold.
168  *
169  * Returns:   <eslOK> on success.
170  *
171  * Throws:    <eslEMEM> on allocation failure. In this case,
172  *            the data in the tree are unaffected.
173  */
174 int
esl_tree_Grow(ESL_TREE * T)175 esl_tree_Grow(ESL_TREE *T)
176 {
177   void *tmp;
178   int   nnew;
179   int   status;
180   int   i;
181 
182   if (T->N < T->nalloc) return eslOK; /* do we have room for next taxon? */
183 
184   nnew = T->nalloc * 2;
185 
186   /* There are N-1 interior nodes, so arrays of info for
187    * interior nodes are allocated for (nnew-1), whereas
188    * arrays of info for the N taxa are allocated (nnew).
189    */
190   ESL_RALLOC(T->parent, tmp, sizeof(int)    * (nnew-1));
191   ESL_RALLOC(T->left,   tmp, sizeof(int)    * (nnew-1));
192   ESL_RALLOC(T->right,  tmp, sizeof(int)    * (nnew-1));
193   ESL_RALLOC(T->ld,     tmp, sizeof(double) * (nnew-1));
194   ESL_RALLOC(T->rd,     tmp, sizeof(double) * (nnew-1));
195 
196   /* 0..N-2 were already initialized or used.
197    * Initialize newly alloced space N-1..nnew-2.
198    */
199   for (i = T->nalloc-1; i < nnew-1; i++)
200     {
201       T->parent[i] = 0;
202       T->left[i  ] = 0;
203       T->right[i]  = 0;
204       T->ld[i]   = 0.;
205       T->rd[i]   = 0.;
206     }
207 
208   if (T->taxaparent != NULL)
209     {
210       ESL_RALLOC(T->taxaparent, tmp, sizeof(int)    * nnew);
211       for (i = T->nalloc; i < nnew; i++) T->taxaparent[i] = 0;
212     }
213 
214   if (T->cladesize != NULL)
215     {
216       ESL_RALLOC(T->cladesize, tmp, sizeof(int)    * nnew);
217       for (i = T->nalloc; i < nnew; i++) T->cladesize[i] = 0;
218     }
219 
220   if (T->taxonlabel    != NULL)
221     {
222       ESL_RALLOC(T->taxonlabel,    tmp, sizeof(char *) * nnew);
223       for (i = T->nalloc; i < nnew; i++) T->taxonlabel[i] = NULL;
224     }
225 
226   if (T->nodelabel     != NULL)
227     {
228       ESL_RALLOC(T->nodelabel,     tmp, sizeof(char *) * (nnew-1));
229       for (i = T->nalloc-1; i < nnew-1; i++) T->nodelabel[i] = NULL;
230     }
231 
232   T->nalloc = nnew;
233   return eslOK;
234 
235  ERROR:
236   return eslEMEM;
237 }
238 
239 
240 /* Function:  esl_tree_SetTaxaParents()
241  *
242  * Purpose:   Constructs the <T->taxaparent[]> array in the tree
243  *            structure <T>, by an O(N) traversal of the tree.
244  *            Upon return, <T->taxaparent[i]> is the index
245  *            of the internal node that taxon <i> is a child of.
246  *
247  * Args:      T   - the tree structure to map
248  *
249  * Returns:   <eslOK> on success.
250  *
251  * Throws:    <eslEMEM> on internal allocation error. In this case, the tree is
252  *            returned unchanged.
253  *
254  * Xref:      STL11/63
255  */
256 int
esl_tree_SetTaxaParents(ESL_TREE * T)257 esl_tree_SetTaxaParents(ESL_TREE *T)
258 {
259   int i;
260   int status;
261 
262   if (T->taxaparent != NULL) return eslOK;         // map already exists.
263   ESL_ALLOC(T->taxaparent, sizeof(int) * T->N);
264   for (i = 0; i < T->N; i++) T->taxaparent[i] = 0; // solely to calm static analyzers.
265 
266   for (i = 0; i < T->N-1; i++)	/* traversal order doesn't matter */
267     {
268       if (T->left[i]  <= 0) T->taxaparent[-(T->left[i])]  = i;
269       if (T->right[i] <= 0) T->taxaparent[-(T->right[i])] = i;
270     }
271   return eslOK;
272 
273  ERROR:
274   if (T->taxaparent != NULL) { free(T->taxaparent); T->taxaparent = NULL; }
275   return status;
276 }
277 
278 
279 /* Function:  esl_tree_SetCladesizes()
280  *
281  * Purpose:   Constructs the <T->cladesize[]> array in tree structure
282  *            <T>. Upon successful return, <T->cladesize[i]> is the
283  *            number of taxa contained by the clade rooted at node <i>
284  *            in the tree. For example, <T->cladesize[0]> is $N$ by
285  *            definition, because 0 is the root of the tree.
286  *
287  * Returns:   <eslOK> on success.
288  *
289  * Throws:    <eslEMEM> on allocation error; in this case, the
290  *            original <T> is unmodified.
291  */
292 int
esl_tree_SetCladesizes(ESL_TREE * T)293 esl_tree_SetCladesizes(ESL_TREE *T)
294 {
295   int i;
296   int status;
297 
298   if (T->cladesize != NULL) return eslOK; /* already set. */
299   ESL_ALLOC(T->cladesize, sizeof(int) * (T->N-1));
300   esl_vec_ISet(T->cladesize, T->N-1, 0);
301 
302   for (i = T->N-2; i >= 0; i--)
303     {                        /* taxon:   ...else...   internal node:  */
304       if (T->left[i]  <= 0) T->cladesize[i]++; else T->cladesize[i] += T->cladesize[T->left[i]];
305       if (T->right[i] <= 0) T->cladesize[i]++; else T->cladesize[i] += T->cladesize[T->right[i]];
306     }
307   return eslOK;
308 
309  ERROR:
310   if (T->cladesize != NULL) { free(T->cladesize); T->cladesize = NULL; }
311   return status;
312 }
313 
314 
315 /* Function:  esl_tree_SetTaxonlabels()
316  *
317  * Purpose:   Given an array of taxon names <names[0..N-1]> with the
318  *            same order and number as the taxa in tree <T>, make a
319  *            copy of those names in <T>. For example, <names> might
320  *            be the sequence names in a multiple alignment,
321  *            <msa->sqname>.
322  *
323  *            If the tree already had taxon names assigned to it, they
324  *            are replaced.
325  *
326  *            As a special case, if the <names> argument is passed as
327  *            <NULL>, then the taxon labels are set to a string
328  *            corresponding to their internal index; that is, taxon 0
329  *            is labeled "0".
330  *
331  * Returns:   <eslOK> on success, and internal state of <T>
332  *            (specifically, <T->taxonlabel[]>) now contains a copy
333  *            of the taxa names.
334  *
335  * Throws:    <eslEMEM> on allocation failure. <T->taxonlabel[]> will be
336  *            <NULL> (even if it was already set).
337  */
338 int
esl_tree_SetTaxonlabels(ESL_TREE * T,char ** names)339 esl_tree_SetTaxonlabels(ESL_TREE *T, char **names)
340 {
341   int i;
342   int status;
343 
344   if (T->taxonlabel != NULL) esl_arr2_Destroy((void **) T->taxonlabel, T->N);
345   ESL_ALLOC(T->taxonlabel, sizeof(char *) * T->nalloc);
346   for (i = 0; i < T->nalloc; i++) T->taxonlabel[i] = NULL;
347 
348   if (names != NULL)
349     {
350       for (i = 0; i < T->N; i++)
351 	if ((status = esl_strdup(names[i], -1, &(T->taxonlabel[i]))) != eslOK) goto ERROR;
352     }
353   else
354     {
355       for (i = 0; i < T->N; i++)
356 	{
357 	  ESL_ALLOC(T->taxonlabel[i], sizeof(char) * 32); /* enough width for any conceivable int */
358 	  snprintf(T->taxonlabel[i], 32, "%d", i);
359 	}
360     }
361   return eslOK;
362 
363  ERROR:
364   if (T->taxonlabel != NULL) esl_arr2_Destroy((void **) T->taxonlabel, T->nalloc);
365   return status;
366 }
367 
368 /* Function:  esl_tree_RenumberNodes()
369  * Synopsis:  Assure nodes are numbered in preorder.
370  *
371  * Purpose:   Given a tree <T> whose internal nodes might be numbered in
372  *            any order, with the sole requirement that node 0 is the
373  *            root; renumber the internal nodes (if necessary) to be in Easel's
374  *            convention of preorder traversal. No other aspect of <T> is
375  *            altered (including its allocation size).
376  *
377  * Returns:   <eslOK> on success.
378  *
379  * Throws:    <eslEMEM> on allocation failure.
380  *
381  * Xref:      STL11/77
382  */
383 int
esl_tree_RenumberNodes(ESL_TREE * T)384 esl_tree_RenumberNodes(ESL_TREE *T)
385 {
386   ESL_TREE  *T2  = NULL;
387   ESL_STACK *vs  = NULL;
388   int       *map = NULL;
389   int        v,new;
390   int        status;
391   int        needs_rearranging = FALSE;
392 
393   /* Pass 1. Preorder traverse of T by its children links;
394    *         construct map[old] -> new.
395    */
396   ESL_ALLOC(map, sizeof(int) * (T->N-1));
397   if (( vs = esl_stack_ICreate()) == NULL) { status = eslEMEM; goto ERROR; };
398   if (esl_stack_IPush(vs, 0) != eslOK)     { status = eslEMEM; goto ERROR; };
399   new = 0;
400   while (esl_stack_IPop(vs, &v) == eslOK)
401     {
402       if (v != new) needs_rearranging = TRUE;
403       map[v] = new++;
404       if (T->right[v] > 0 && esl_stack_IPush(vs, T->right[v]) != eslOK) { status = eslEMEM; goto ERROR; };
405       if (T->left[v]  > 0 && esl_stack_IPush(vs, T->left[v])  != eslOK) { status = eslEMEM; goto ERROR; };
406     }
407   if (! needs_rearranging) { status = eslOK; goto ERROR; } /* not an error; just cleaning up & returning eslOK. */
408 
409   /* Pass 2. Construct the guts of correctly numbered new T2.
410    *         (traversal order doesn't matter here)
411    */
412   if (( T2 = esl_tree_Create(T->nalloc)) == NULL) { status = eslEMEM; goto ERROR; };
413   if (T->nodelabel   != NULL) {
414     ESL_ALLOC(T2->nodelabel,  sizeof(char *) * (T2->nalloc-1));
415     for (v = 0; v < T2->nalloc-1; v++) T2->nodelabel[v] = NULL;
416   }
417   if (T->taxaparent != NULL)  {
418     ESL_ALLOC(T2->taxaparent, sizeof(int)    * (T2->nalloc));
419     for (v = 0; v < T2->nalloc; v++)   T2->taxaparent[v] = 0;
420   }
421 
422   for (v = 0; v < T->N-1; v++)
423     {
424       T2->parent[map[v]] = map[T->parent[v]];
425       if (T->left[v]  > 0) T2->left[map[v]]   = map[T->left[v]];  /* internal nodes renumbered... */
426       else                 T2->left[map[v]]   = T->left[v];       /* ...taxon indices unchanged */
427       if (T->right[v] > 0) T2->right[map[v]]  = map[T->right[v]];
428       else                 T2->right[map[v]]  = T->right[v];
429       T2->ld[map[v]]     = T->ld[v];
430       T2->rd[map[v]]     = T->rd[v];
431 
432       if (T->taxaparent != NULL) {
433 	if (T->left[v]  <= 0) T2->taxaparent[-(T->left[v])]  = map[v];
434 	if (T->right[v] <= 0) T2->taxaparent[-(T->right[v])] = map[v];
435       }
436 
437       if (T->nodelabel != NULL)
438 	esl_strdup(T->nodelabel[v], -1, &(T2->nodelabel[map[v]]));
439     }
440 
441  /* Finally, swap the new guts of T2 with the old guts of T;
442    * destroy T2 and return. T is now renumbered.
443    */
444   ESL_SWAP(T->parent,     T2->parent,      int *);
445   ESL_SWAP(T->left,       T2->left,        int *);
446   ESL_SWAP(T->right,      T2->right,       int *);
447   ESL_SWAP(T->ld,         T2->ld,          double *);
448   ESL_SWAP(T->rd,         T2->rd,          double *);
449   ESL_SWAP(T->taxaparent, T2->taxaparent,  int *);
450   ESL_SWAP(T->nodelabel,  T2->nodelabel,   char **);
451 
452   free(map);
453   esl_stack_Destroy(vs);
454   esl_tree_Destroy(T2);
455   return eslOK;
456 
457  ERROR:
458   if (map != NULL) free(map);
459   if (vs  != NULL) esl_stack_Destroy(vs);
460   if (T2  != NULL) esl_tree_Destroy(T2);
461   return status;
462 
463 }
464 
465 /* Function:  esl_tree_VerifyUltrametric()
466  *
467  * Purpose:   Verify that tree <T> is ultrametric.
468  *
469  * Returns:   <eslOK> if so; <eslFAIL> if not.
470  *
471  * Throws:    <eslEMEM> on an allocation failure.
472  */
473 int
esl_tree_VerifyUltrametric(ESL_TREE * T)474 esl_tree_VerifyUltrametric(ESL_TREE *T)
475 {
476   double *d = NULL;		/* Distance from root for each OTU */
477   int status;
478   int i, child, parent;
479 
480   /* First, calculate distance from root to each taxon.
481    * (This chunk of code might be useful to put on its own someday.)
482    */
483   ESL_ALLOC(d, sizeof(double) * T->N);
484   if ((status = esl_tree_SetTaxaParents(T)) != eslOK) goto ERROR;
485   for (i = 0; i < T->N; i++)
486     {
487       d[i]   = 0.0;
488       parent = T->taxaparent[i];
489       if       (T->left[parent]  == -i) d[i] += T->ld[parent];
490       else if  (T->right[parent] == -i) d[i] += T->rd[parent];
491       else     ESL_XEXCEPTION(eslEINCONCEIVABLE, "oops");
492 
493       while (parent != 0)	/* upwards to the root */
494 	{
495 	  child  = parent;
496 	  parent = T->parent[child];
497 	  if      (T->left[parent]  == child) d[i] += T->ld[parent];
498 	  else if (T->right[parent] == child) d[i] += T->rd[parent];
499 	  else    ESL_XEXCEPTION(eslEINCONCEIVABLE, "oops");
500 	}
501     }
502 
503   /* In an ultrametric tree, all those distances must be equal.
504    */
505   for (i = 1; i < T->N; i++)
506     if ((status = esl_DCompare(d[0], d[i], 0.0001)) != eslOK) break;
507 
508   free(d);
509   return status;
510 
511  ERROR:
512   if (d != NULL) free(d);
513   return status;
514 }
515 
516 
517 /* Function:  esl_tree_Validate()
518  *
519  * Purpose:   Validates the integrity of the data structure in <T>.
520  *            Returns <eslOK> if the internal data in <T> are
521  *            consistent and valid. Returns <eslFAIL> if not,
522  *            and if a non-<NULL> message buffer <errbuf> has been
523  *            provided by the caller, an informative message is
524  *            left in <errbuf> describing the reason for the
525  *            failure.
526  *
527  * Args:      T      - tree structure to validate
528  *            errbuf - NULL, or a buffer of at least p7_ERRBUFSIZE
529  *                     chars to contain an error message upon
530  *                     any validation failure.
531  */
532 int
esl_tree_Validate(ESL_TREE * T,char * errbuf)533 esl_tree_Validate(ESL_TREE *T, char *errbuf)
534 {
535   int N;
536   int i, c;
537   int shouldbe;
538   int status;
539 
540   if (errbuf != NULL) *errbuf = 0;
541 
542   N = T->N; /* just to save writing T->N so many times below  */
543   if (N < 2)             ESL_XFAIL(eslFAIL, errbuf, "number of taxa is less than 2");
544   if (T->parent[0] != 0) ESL_XFAIL(eslFAIL, errbuf, "parent of root 0 should be set to 0");
545   if (T->nalloc < N)     ESL_XFAIL(eslFAIL, errbuf, "number of taxa N is less than allocation");
546 
547   /* Verify preorder tree numbering.
548    */
549   for (i = 0; i < N-1; i++)
550     {
551       if (T->left[i]  > 0 && T->left[i]  < i)
552 	ESL_XFAIL(eslFAIL, errbuf, "l child of node %d not in preorder", i);
553       if (T->right[i] > 0 && T->right[i] < i)
554 	ESL_XFAIL(eslFAIL, errbuf, "r child of node %d not in preorder", i);
555     }
556 
557   /* Range checks on values. */
558   for (i = 0; i < N-1; i++)
559     {
560       if (T->parent[i] < 0      || T->parent[i]     > N-2)
561 	ESL_XFAIL(eslFAIL, errbuf, "parent idx of node %d invalid", i);
562       if (T->left[i]   < -(N-1) || T->left[i]       > N-2)
563 	ESL_XFAIL(eslFAIL, errbuf, "left child idx of node %d invalid", i);
564       if (T->right[i]  < -(N-1) || T->right[i]      > N-2)
565 	ESL_XFAIL(eslFAIL, errbuf, "right child idx of node %d invalid", i);
566       if (T->ld[i] < 0.)
567 	ESL_XFAIL(eslFAIL, errbuf, "negative l branch length at node %d", i);
568       if (T->rd[i] < 0.)
569 	ESL_XFAIL(eslFAIL, errbuf, "negative r branch length at node %d", i);
570       if (T->cladesize  != NULL && (T->cladesize[i] < 0  || T->cladesize[i]  > N))
571 	ESL_XFAIL(eslFAIL, errbuf, "invalid cladesize at node %d", i);
572     }
573   for (c = 0; c < N; c++)
574     if (T->taxaparent != NULL && (T->taxaparent[c] < 0 || T->taxaparent[c] > N-2))
575       ESL_XFAIL(eslFAIL, errbuf, "invalid taxaparent at node %d", c);
576 
577   /* more sophisticated integrity checks on parent-child relations in
578      nodes ...*/
579   for (i = 1; i < T->N-1; i++)
580     if (T->left[T->parent[i]] != i && T->right[T->parent[i]] != i)
581       ESL_XFAIL(eslFAIL, errbuf, "parent/child link discrepancy at internal node %d\n", i);
582 
583   /* ...and between terminal nodes and taxa.
584    */
585   if (T->taxaparent != NULL)
586     for (c = 0; c < T->N; c++)
587       if (T->left[T->taxaparent[c]] != -c && T->right[T->taxaparent[c]] != -c)
588 	ESL_XFAIL(eslFAIL, errbuf, "parent/child link discrepancy at taxon %d\n", c);
589 
590   /* check on cladesizes */
591   if (T->cladesize != NULL)
592     for (i = 0; i < T->N-1; i++)
593       {
594 	shouldbe = 0;
595 	if (T->left[i]  > 0) shouldbe += T->cladesize[T->left[i]];  else shouldbe++;
596 	if (T->right[i] > 0) shouldbe += T->cladesize[T->right[i]]; else shouldbe++;
597 	if (shouldbe != T->cladesize[i])
598 	  ESL_XFAIL(eslFAIL, errbuf, "incorrect cladesize at node %d", i);
599       }
600 
601   return eslOK;
602 
603  ERROR:
604   return status;
605 }
606 
607 
608 
609 /* Function:  esl_tree_Destroy()
610  *
611  * Purpose:   Frees an <ESL_TREE> object.
612  */
613 void
esl_tree_Destroy(ESL_TREE * T)614 esl_tree_Destroy(ESL_TREE *T)
615 {
616   if (T == NULL) return;
617 
618   esl_free(T->parent);
619   esl_free(T->left);
620   esl_free(T->right);
621   esl_free(T->ld);
622   esl_free(T->rd);
623   esl_free(T->taxaparent);
624   esl_free(T->cladesize);
625 
626   esl_arr2_Destroy((void **) T->taxonlabel, T->nalloc);
627   esl_arr2_Destroy((void **) T->nodelabel,  T->nalloc-1);
628   free(T);
629   return;
630 }
631 
632 
633 
634 /*----------------- end, ESL_TREE object -----------------------*/
635 
636 
637 /*****************************************************************
638  *# 2. Newick format i/o
639  *****************************************************************/
640 
641 /* newick_validate_unquoted():
642  *   Returns <eslOK> if we can represent <label> as an unquoted label
643  *   in Newick format. (Spaces are ok, but will be converted to
644  *   _ on output.)
645  */
646 static int
newick_validate_unquoted(char * label)647 newick_validate_unquoted(char *label)
648 {
649   char *sptr;
650 
651   for (sptr = label; *sptr != '\0'; sptr++)
652     {
653       if (! isprint(*sptr))                  return eslFAIL;
654       if (strchr("()[]':;,", *sptr) != NULL) return eslFAIL;
655     }
656   return eslOK;
657 }
658 
659 /* newick_validate_quoted():
660  *   Returns <eslOK> if we can represent <label> as a
661  *   quoted label in Newick format. (Single quotes will
662  *   be converted to '' on output.)
663  */
664 static int
newick_validate_quoted(char * label)665 newick_validate_quoted(char *label)
666 {
667   char *sptr;
668   for (sptr = label; *sptr != '\0'; sptr++)
669     {
670       if (! isprint(*sptr))                  return eslFAIL;
671     }
672   return eslOK;
673 }
674 
675 /* newick_write_unquoted():
676  *   Prints <label> to <fp> as an unquoted Newick label.
677  */
678 static int
newick_write_unquoted(FILE * fp,char * label)679 newick_write_unquoted(FILE *fp, char *label)
680 {
681   char *sptr;
682 
683   for (sptr = label; *sptr != '\0'; sptr++)
684     {
685       if (*sptr == ' ') { if (fputc('_',   fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
686       else              { if (fputc(*sptr, fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
687     }
688   return eslOK;
689 }
690 
691 /* newick_write_quoted():
692  *   Prints <label> to <fp> as a quoted Newick label.
693  */
694 static int
newick_write_quoted(FILE * fp,char * label)695 newick_write_quoted(FILE *fp, char *label)
696 {
697   char *sptr;
698 
699   if (fputc('\'', fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
700   for (sptr = label; *sptr != '\0'; sptr++)
701     {
702       if (*sptr == '\'') { if (fprintf(fp, "''") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
703       else               { if (fputc(*sptr, fp)  < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
704     }
705   if (fputc('\'', fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
706   return eslOK;
707 }
708 
709 /* newick_write_taxonlabel():
710  *    Print the label for taxon <v> to stream <fp>.
711  *    Tries to print label as an unquoted label, then
712  *    as a quoted label, (then fails).
713  *    If label isn't available, does nothing.
714  *    If label contains invalid characters, throws <eslECORRUPT>.
715  */
716 static int
newick_write_taxonlabel(FILE * fp,ESL_TREE * T,int v)717 newick_write_taxonlabel(FILE *fp, ESL_TREE *T, int v)
718 {
719   int status;
720 
721   if (T->taxonlabel == NULL || T->taxonlabel[v] == NULL)
722     {
723       if (T->show_numeric_taxonlabels && fprintf(fp, "%d", v) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
724       return eslOK;
725     }
726 
727   if (! T->show_quoted_labels && newick_validate_unquoted(T->taxonlabel[v]) == eslOK)
728     status = newick_write_unquoted(fp, T->taxonlabel[v]);
729   else if (newick_validate_quoted(T->taxonlabel[v]) == eslOK)
730     status = newick_write_quoted(fp, T->taxonlabel[v]);
731   else
732     ESL_EXCEPTION(eslECORRUPT, "bad taxon label");
733 
734   return status;
735 }
736 
737 /* newick_write_nodelabel():
738  *    Print the label for internal node <v> to stream <fp>.
739  *    Tries to print label as an unquoted label, then
740  *    as a quoted label.
741  *    If label isn't available, does nothing.
742  *    If tree's options say not to print node labels, does nothing.
743  *    If label contains invalid characters, throws <eslECORRUPT>.
744  */
745 static int
newick_write_nodelabel(FILE * fp,ESL_TREE * T,int v)746 newick_write_nodelabel(FILE *fp, ESL_TREE *T, int v)
747 {
748   int status;
749 
750   if (T->nodelabel    == NULL)      return eslOK;
751   if (T->nodelabel[v] == NULL)      return eslOK;
752   if (T->show_node_labels != TRUE)  return eslOK;
753 
754   if (! T->show_quoted_labels && newick_validate_unquoted(T->nodelabel[v]) == eslOK)
755     status = newick_write_unquoted(fp, T->nodelabel[v]);
756   else if (newick_validate_quoted(T->nodelabel[v]) == eslOK)
757     status = newick_write_quoted(fp, T->nodelabel[v]);
758   else
759     ESL_EXCEPTION(eslECORRUPT, "bad node label\n");
760 
761   return status;
762 }
763 
764 /* newick_write_branchlength()
765  *    Writes the branch length *to* <v>.
766  *    If <v> is negative, it's a leaf; if <v> is positive, it's an internal node.
767  *    You can't pass the root node 0 to this. 0 always means taxon 0.
768  *    There is no branch to the root node.
769  */
770 static int
newick_write_branchlength(FILE * fp,ESL_TREE * T,int v)771 newick_write_branchlength(FILE *fp, ESL_TREE *T, int v)
772 {
773   double branchlength;
774 
775   if (! T->show_branchlengths) return eslOK;
776   if (T->taxaparent == NULL)   ESL_EXCEPTION(eslEINVAL, "T must have taxaparent");
777 
778   if (v <= 0)			/* leaf */
779     {
780       if      (T->left [T->taxaparent[-v]] == v) branchlength = T->ld[T->taxaparent[-v]];
781       else if (T->right[T->taxaparent[-v]] == v) branchlength = T->rd[T->taxaparent[-v]];
782       else    ESL_EXCEPTION(eslECORRUPT, "Can't find branch length");
783     }
784   else				/* internal node */
785     {
786       if      (T->left [T->parent[v]] == v) branchlength = T->ld[T->parent[v]];
787       else if (T->right[T->parent[v]] == v) branchlength = T->rd[T->parent[v]];
788       else    ESL_EXCEPTION(eslECORRUPT, "Can't find branch length");
789     }
790 
791   if (fprintf(fp, ":%f", branchlength) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
792   return eslOK;
793 }
794 
795 /* Function:  esl_tree_WriteNewick()
796  *
797  * Purpose:   Writes tree <T> to stream <fp> in Newick format.
798  *
799  *            Certain options are set in <T> to control output style,
800  *            as follows.
801  *
802  *            If <T->show_unrooted> is <TRUE>, <T> is printed as an
803  *            unrooted tree starting with a trifurcation, a la PHYLIP
804  *            format (default=<FALSE>).
805  *
806  *            If <T->show_node_labels> is <TRUE>, then labels are
807  *            shown for internal nodes, if any are available
808  *            (default=<TRUE>).
809  *
810  *            If <T->show_branchlengths> is <TRUE>, then branch
811  *            lengths are shown, as opposed to just printing a labeled
812  *            topology (default=<TRUE>).
813  *
814  *            If <T->show_root_branchlength> is also <TRUE>, then a
815  *            0.0 branchlength is shown to the root node, a la Hein's
816  *            TreeAlign Newick format (default=<FALSE>).
817  *
818  *            If <T->show_quoted_labels> is <TRUE>, then all labels
819  *            are shown in Newick's quoted format, as opposed to only
820  *            using quoted labels where necessary (default=<FALSE>).
821  *
822  * Returns:   <eslOK> on success.
823  *
824  * Throws:    <eslEMEM> on allocation error.
825  *            <eslEWRITE> on any system write error.
826  *            <eslEINCONCEIVABLE> on internal error.
827  *
828  * Xref:      SRE:STL11/74
829  */
830 int
esl_tree_WriteNewick(FILE * fp,ESL_TREE * T)831 esl_tree_WriteNewick(FILE *fp, ESL_TREE *T)
832 {
833   ESL_STACK *vs = NULL;
834   ESL_STACK *cs = NULL;
835   int  v;
836   char c;
837   int  status;
838 
839   if ((vs = esl_stack_ICreate()) == NULL) { status = eslEMEM; goto ERROR; }
840   if ((cs = esl_stack_CCreate()) == NULL) { status = eslEMEM; goto ERROR; }
841 
842   if ((status = esl_tree_SetTaxaParents(T)) != eslOK) goto ERROR;
843 
844   /* Initialization.
845    * Push a trifurcation (swallowing the right internal node) if unrooted;
846    * else push the first bifurcation.
847    *
848    * When we push a trifurcation, the branch lengths will come out fine
849    * on output, if the tree followed the correct convention of having
850    * a T->rd[0] = 0.0.
851    */
852   if (fputc('(', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
853   if (T->show_unrooted && T->right[0] > 0)
854     {
855       v = T->right[0];
856       if ((status = esl_stack_CPush(cs, 'x'))         != eslOK) goto ERROR;
857       if ((status = esl_stack_IPush(vs, T->right[v])) != eslOK) goto ERROR;
858       if ((status = esl_stack_CPush(cs, ','))         != eslOK) goto ERROR;
859       if ((status = esl_stack_CPush(cs, 'x'))         != eslOK) goto ERROR;
860       if ((status = esl_stack_IPush(vs, T->left[v]))  != eslOK) goto ERROR;
861     }
862   else
863     {
864       if ((status = esl_stack_CPush(cs, 'x'))         != eslOK) goto ERROR;
865       if ((status = esl_stack_IPush(vs, T->right[0])) != eslOK) goto ERROR;
866     }
867   if ((status = esl_stack_CPush(cs, ','))             != eslOK) goto ERROR;
868   if ((status = esl_stack_CPush(cs, 'x'))             != eslOK) goto ERROR;
869   if ((status = esl_stack_IPush(vs, T->left[0]))      != eslOK) goto ERROR;
870 
871 
872   /* Main iteration. Pop off stacks 'til they're empty.
873    */
874   while ((status = esl_stack_CPop(cs, &c)) == eslOK)
875     {
876       if (c == ',') {  /* comma doesn't have a v stacked with it */
877 	if (fputc(',', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
878 	continue;
879       }
880 
881       if ((status = esl_stack_IPop(vs, &v)) != eslOK) goto ERROR;
882 
883       switch (c) {
884       case 'x':			/* a subtree, which could be a node or a taxon: */
885 	if (v > 0)		/* internal node 1..N-2*/
886 	  {
887 	    if (fputc('(', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
888 	    if ((status = esl_stack_CPush(cs, ')'))         != eslOK) goto ERROR;
889 	    if ((status = esl_stack_IPush(vs, v))           != eslOK) goto ERROR;
890 	    if ((status = esl_stack_CPush(cs, 'x'))         != eslOK) goto ERROR;
891 	    if ((status = esl_stack_IPush(vs, T->right[v])) != eslOK) goto ERROR;
892 	    if ((status = esl_stack_CPush(cs, ','))         != eslOK) goto ERROR;
893 	    if ((status = esl_stack_CPush(cs, 'x'))         != eslOK) goto ERROR;
894 	    if ((status = esl_stack_IPush(vs, T->left[v]))  != eslOK) goto ERROR;
895 	  }
896 	else			/* taxon -(N-1)..0 */
897 	  { 	    /* -v below to convert taxon code to 0..N-1 */
898 	    if ((status = newick_write_taxonlabel  (fp, T, -v)) != eslOK) goto ERROR;
899 	    if ((status = newick_write_branchlength(fp, T,  v)) != eslOK) goto ERROR;
900 	  }
901 	break;
902 
903       case ')':			/* closing an internal node. v > 0 is a node code. */
904 	if (fputc(')', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
905 	if ((status = newick_write_nodelabel   (fp, T, v)) != eslOK) goto ERROR;
906 	if ((status = newick_write_branchlength(fp, T, v)) != eslOK) goto ERROR;
907 	break;
908 
909       default:
910 	ESL_XEXCEPTION(eslEINCONCEIVABLE, "bad state code");
911 	break;
912       }
913     }
914 
915   /* Termination
916    */
917   if (fputc(')', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
918   if ((status = newick_write_nodelabel(fp, T, 0)) != eslOK) goto ERROR;
919   if (T->show_branchlengths && T->show_root_branchlength)
920     { if (fprintf(fp, ":0.0") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
921   if (fputc(';', fp)  < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
922   if (fputc('\n', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
923 
924   esl_stack_Destroy(vs);
925   esl_stack_Destroy(cs);
926   return eslOK;
927 
928  ERROR:
929   if (vs != NULL) esl_stack_Destroy(vs);
930   if (cs != NULL) esl_stack_Destroy(cs);
931   return status;
932 
933 }
934 
935 
936 /* newick_advance_buffer()
937  *
938  * Advance the read buffer by one character; reload it
939  * if we reach the end. <eslOK> on success, and <eslEOF>
940  * if the read fails.
941  */
942 static int
newick_advance_buffer(FILE * fp,char * buf,int * pos,int * nc)943 newick_advance_buffer(FILE *fp, char *buf, int *pos, int *nc)
944 {
945   (*pos)++;
946   if (*pos == *nc)
947     {
948       *nc = fread(buf, sizeof(char), 4096, fp);
949       if (*nc == 0) return eslEOF;
950       *pos = 0;
951     }
952   return eslOK;
953 }
954 
955 /* newick_skip_whitespace()
956  *
957  * Given the 4k input buffer <buf>, which currently contains
958  * <*nc> total characters and is positioned at position <*pos>,
959  * move <*pos> to be at the first nonwhitespace character,
960  * skipping any Newick comments ([...]) encountered. Read
961  * new data from the stream <fp> into <buf> as needed.
962  *
963  * Return <eslOK> on success. <*pos> is reset to point at a
964  * non-whitespace input character. <*nc> may be reset and the contents
965  * of <buf> altered if a new block was read from <fp> into <buf>.
966  *
967  * Returns <eslEOF> if end-of-file is reached in <fp> before a data
968  * character is found, or if a read error occurs.
969  */
970 static int
newick_skip_whitespace(FILE * fp,char * buf,int * pos,int * nc)971 newick_skip_whitespace(FILE *fp, char *buf, int *pos, int *nc)
972 {
973   int commentlevel = 0;
974 
975   while (commentlevel > 0 || isspace(buf[*pos]) || buf[*pos] == '[')
976     {
977       if (buf[*pos] == '[') commentlevel++;
978       if (buf[*pos] == ']') commentlevel--;
979       if (newick_advance_buffer(fp, buf, pos, nc) == eslEOF) return eslEOF;
980     }
981   return eslOK;
982 }
983 
984 
985 /* newick_parse_quoted_label()
986  *
987  * On entry, buf[pos] == '\'': the opening single quote.
988  * On exit,  buf[pos] is positioned at the next data character following the closing
989  *           single quote; possibly the ':' for a branch length;
990  *           and <ret_label> points to a newly allocated, NUL-terminated string
991  *          containing the label that was read (possibly the empty string).
992  * Returns eslOK on success.
993  *
994  * Returns eslEFORMAT on parse error, eslEOF if it runs out of data.
995  */
996 static int
newick_parse_quoted_label(FILE * fp,char * buf,int * pos,int * nc,char ** ret_label)997 newick_parse_quoted_label(FILE *fp, char *buf, int *pos, int *nc, char **ret_label)
998 {
999   char *label  = NULL;
1000   void *tmp;
1001   int   n      = 0;
1002   int   nalloc = 0;
1003   int   status;
1004 
1005   nalloc = 32;
1006   ESL_ALLOC(label, sizeof(char) * nalloc);
1007 
1008   /* advance past the opening ' */
1009   if (buf[*pos] != '\'') { status = eslEFORMAT; goto ERROR; }
1010   if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
1011 
1012   /* skip leading whitespace (\n and comments forbidden in quoted label) */
1013   while (buf[*pos] == '\t' || buf[*pos] == ' ')
1014     if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
1015 
1016   /* Read the label */
1017   while (1) {
1018     if (buf[*pos] == '\'') {	/* watch out for escaped single quotes, '' */
1019       if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
1020       if (buf[*pos] != '\'') break; /* we've just moved past the last ' */
1021     }
1022     label[n++] = buf[*pos];
1023     if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
1024     if (n == (nalloc-1)) {  /* reallocate label if it fills, leave room for NUL */
1025       ESL_RALLOC(label, tmp, sizeof(char) * (nalloc * 2));
1026       nalloc *= 2;
1027     }
1028   }
1029 
1030   /* backtrack over any trailing whitespace and nul-terminate. */
1031   while (n>0 && isspace(label[n-1])) n--;
1032   label[n] = '\0';
1033   *ret_label = label;
1034   return eslOK;
1035 
1036  ERROR:
1037   if (label != NULL) { free(label); *ret_label = NULL; }
1038   return status;
1039 
1040 }
1041 
1042 /* newick_parse_unquoted_label
1043  *
1044  * On entry, buf[pos] == first character in the label.
1045  * On exit,  buf[pos] is positioned at the next data character following the end
1046  *           of the label --  one of "),\t\n;[:"  --
1047  *           and <ret_label> points to a newly allocated, NUL-terminated string
1048  *           containing the label that was read (possibly the empty string).
1049  * Returns eslOK on success.
1050  *
1051  * Returns eslEFORMAT on parse error, eslEOF if it runs out of data.
1052  */
1053 static int
newick_parse_unquoted_label(FILE * fp,char * buf,int * pos,int * nc,char ** ret_label)1054 newick_parse_unquoted_label(FILE *fp, char *buf, int *pos, int *nc, char **ret_label)
1055 {
1056   char *label  = NULL;
1057   char *tmp    = NULL;
1058   int   n      = 0;
1059   int   nalloc = 0;
1060   int   status;
1061 
1062   nalloc = 32;
1063   ESL_ALLOC(label, sizeof(char) * nalloc);
1064 
1065   while (1) {
1066     if (strchr("(]",          buf[*pos]) != NULL) { status = eslEFORMAT; goto ERROR; }
1067     if (strchr(" \t\n)[':;,", buf[*pos]) != NULL) { break; }
1068     label[n++] = buf[*pos];
1069     if (newick_advance_buffer(fp, buf, pos, nc) == eslEOF) { status = eslEOF; goto ERROR; }
1070 
1071     if (n == (nalloc-1)) {  /* reallocate label if it fills, leave room for NUL */
1072       ESL_RALLOC(label, tmp, sizeof(char) * (nalloc * 2));
1073       nalloc *= 2;
1074     }
1075   }
1076   label[n]   = '\0';
1077   *ret_label = label;
1078   return eslOK;
1079 
1080  ERROR:
1081   if (label != NULL) { free(label); *ret_label = NULL; }
1082   return status;
1083 }
1084 
1085 /* newick_parse_branchlength
1086  *
1087  * On entry, buf[pos] == ':'
1088  * On exit,  buf[pos] is positioned at the next data character following the end
1089  *           of the branchlength --  one of "),\t\n;[:"
1090  *           and <ret_d> is the branch length that was read.
1091  *
1092  * Returns eslOK  on success;
1093  *
1094  * Returns eslEFORMAT on parse error (including nonexistent branch lengths),
1095  *         eslEOF if it runs out of data in the file.
1096  */
1097 static int
newick_parse_branchlength(FILE * fp,char * buf,int * pos,int * nc,double * ret_d)1098 newick_parse_branchlength(FILE *fp, char *buf, int *pos, int *nc, double *ret_d)
1099 {
1100   char *branch = NULL;
1101   char *tmp    = NULL;
1102   int   n      = 0;
1103   int   nalloc = 0;
1104   int   status;
1105 
1106   nalloc = 32;
1107   ESL_ALLOC(branch, sizeof(char) * nalloc);
1108 
1109   if (buf[*pos] != ':') { status = eslEFORMAT; goto ERROR; }
1110   if (( status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK)  goto ERROR;
1111 
1112   while (1) {
1113     if (strchr("(]",          buf[*pos]) != NULL) { status = eslEFORMAT; goto ERROR; }
1114     if (strchr(" \t\n)[':;,", buf[*pos]) != NULL) break;
1115     branch[n++] = buf[*pos];
1116     if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
1117 
1118     if (n == (nalloc-1)) {  /* reallocate label if it fills, leave room for NUL */
1119       ESL_RALLOC(branch, tmp, sizeof(char) * (nalloc * 2));
1120       nalloc *= 2;
1121     }
1122   }
1123 
1124   branch[n]   = '\0';
1125   *ret_d = strtod(branch, &tmp);
1126   if (n == 0 || tmp != branch+n) { status = eslEFORMAT; goto ERROR; }
1127   free(branch);
1128   return eslOK;
1129 
1130  ERROR:
1131   if (branch != NULL) free(branch);
1132   *ret_d = 0.;
1133   return status;
1134 }
1135 
1136 
1137 
1138 
1139 /* Function:  esl_tree_ReadNewick()
1140  * Synopsis:  Input a Newick format tree.
1141  *
1142  * Purpose:   Read a Newick format tree from an open input stream <fp>.
1143  *            Return the new tree in <ret_T>.
1144  *
1145  *            The new tree <T> will have the optional <T->taxonlabel> and
1146  *            <T->nodelabel> arrays allocated, containing names of all the
1147  *            taxa and nodes. Whenever no label appeared in the Newick file
1148  *            for a node or taxon, the label is set to the empty string.
1149  *
1150  *            Caller may optionally provide an <errbuf> of at least
1151  *            <eslERRBUFSIZE> chars, to retrieve diagnostic information
1152  *            in case of a parsing problem; or <errbuf> may be passed as
1153  *            <NULL>.
1154  *
1155  * Args:      fp      - open input stream
1156  *            errbuf  - NULL, or allocated space for >= eslERRBUFSIZE chars
1157  *            ret_T   - RETURN: the new tree.
1158  *
1159  * Returns:   Returns <eslOK> on success, and <ret_T> points
1160  *            to the new tree.
1161  *
1162  *            Returns <eslEFORMAT> on parse errors, such as premature EOF
1163  *            or bad syntax in the Newick file. In this case, <ret_T> is
1164  *            returned NULL, and the <errbuf> (if provided> contains an
1165  *            informative error message.
1166  *
1167  * Throws:    <eslEMEM> on memory allocation errors.
1168  *            <eslEINCONCEIVABLE> may also arise in case of internal bugs.
1169  *
1170  * Xref:      STL11/75
1171  */
1172 int
esl_tree_ReadNewick(FILE * fp,char * errbuf,ESL_TREE ** ret_T)1173 esl_tree_ReadNewick(FILE *fp, char *errbuf, ESL_TREE **ret_T)
1174 {
1175   ESL_TREE  *T   = NULL;	/* the new, growing tree */
1176   ESL_STACK *cs  = NULL;	/* state stack: possible states are LRX);,  */
1177   ESL_STACK *vs  = NULL;	/* node index stack: LRX) states are associated with node #'s */
1178   int        status;
1179   char       buf[4096];		/* 4K input buffer */
1180   int        pos,nc;		/* position in buf, and number of chars in buf */
1181   char       c;			/* current state */
1182   int        v;		        /* current node idx */
1183   int        currnode;
1184   int        currtaxon;
1185   char      *label;		/* a parsed label */
1186   double     d;			/* a parsed branch length */
1187 
1188   if (errbuf != NULL) *errbuf = '\0';
1189   if ((vs = esl_stack_ICreate()) == NULL) { status = eslEMEM; goto ERROR; };
1190   if ((cs = esl_stack_CCreate()) == NULL) { status = eslEMEM; goto ERROR; };
1191 
1192   /* Create the tree, initially allocated for 32 taxa.
1193    * Allocate for taxon and node labels, too.
1194    */
1195   if ((T  = esl_tree_CreateGrowable(32)) == NULL) { status = eslEMEM; goto ERROR; };
1196   ESL_ALLOC(T->taxonlabel, sizeof(char *) * 32);
1197   ESL_ALLOC(T->nodelabel,  sizeof(char *) * 31);
1198   for (currtaxon = 0; currtaxon < 32; currtaxon++) T->taxonlabel[currtaxon] = NULL;
1199   for (currnode  = 0; currnode  < 31; currnode++)  T->nodelabel[currnode]   = NULL;
1200 
1201   /* Load the input buffer
1202    */
1203   if ((nc = fread(buf, sizeof(char), 4096, fp)) == 0)
1204     ESL_XFAIL(eslEFORMAT, errbuf, "file is empty.");
1205   pos = 0;
1206 
1207   /* Initialization:
1208    *    create the root node in the tree;
1209    *    push L,R...); onto the stacks;
1210    *    swallow the first ( in the file.
1211    *
1212    * A note on memory management in the growing tree:
1213    *  we are going to keep T->N set to the number of taxa
1214    *  that the tree *will* contain, given the number of nodes
1215    *  it currently *does* contain. Before we try to add
1216    *  any new node, we call the Grow() routine, which will
1217    *  check T->N against T->nalloc. This strategy works
1218    *  because nodes always get added before their children
1219    *  taxa, so the # of taxa is always <= nodes-1: that is,
1220    *  our need for reallocation is driven by new nodes,
1221    *  never by new taxa; that is, if we have enough room
1222    *  for nodes, we automatically have enough room for the
1223    *  taxa.
1224    */
1225   T->parent[0] = 0;
1226   currnode     = 1;
1227   currtaxon    = 0;
1228   T->N         = 2;   /* c.f. note above: T->N is the # of taxa we *would* hold, given currnode=1*/
1229   if (esl_stack_CPush(cs, ';') != eslOK)  { status = eslEMEM; goto ERROR; };
1230   if (esl_stack_CPush(cs, ')') != eslOK)  { status = eslEMEM; goto ERROR; };
1231   if (esl_stack_IPush(vs, 0)   != eslOK)  { status = eslEMEM; goto ERROR; };
1232   if (esl_stack_CPush(cs, 'X') != eslOK)  { status = eslEMEM; goto ERROR; };
1233   if (esl_stack_IPush(vs, 0)   != eslOK)  { status = eslEMEM; goto ERROR; };
1234   if (esl_stack_CPush(cs, 'R') != eslOK)  { status = eslEMEM; goto ERROR; };
1235   if (esl_stack_IPush(vs, 0)   != eslOK)  { status = eslEMEM; goto ERROR; };
1236   if (esl_stack_CPush(cs, ',') != eslOK)  { status = eslEMEM; goto ERROR; };
1237   if (esl_stack_CPush(cs, 'L') != eslOK)  { status = eslEMEM; goto ERROR; };
1238   if (esl_stack_IPush(vs, 0)   != eslOK)  { status = eslEMEM; goto ERROR; };
1239 
1240   if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
1241     ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1242   if (buf[pos] != '(')
1243     ESL_XFAIL(eslEFORMAT, errbuf, "file is not in Newick format.");
1244   if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
1245     ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1246 
1247   /* Iteration.
1248    */
1249   while ((status = esl_stack_CPop(cs, &c)) == eslOK)
1250     {
1251 
1252       if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
1253 	ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1254 
1255       if (c == ',')
1256 	{
1257 	  if (buf[pos] != ',')
1258 	    ESL_XFAIL(eslEFORMAT, errbuf, "expected a comma, saw %c.", buf[pos]);
1259 	  if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
1260 	    ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1261 	  continue;
1262 	}
1263 
1264       else if (c == ';')
1265 	{
1266 	  if (buf[pos] != ';')
1267 	    ESL_XFAIL(eslEFORMAT, errbuf, "expected a semicolon, saw %c.", buf[pos]);
1268 	  break;		/* end of the Newick file */
1269 	}
1270 
1271       else if (c == 'L' || c == 'R') /* c says, we expect to add a subtree next */
1272 	{
1273 	  if (esl_stack_IPop(vs, &v) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* v = parent of currnode */
1274 
1275 	  if (buf[pos] == '(')	/* a new interior node attaches to v */
1276 	    {
1277 	      if (esl_tree_Grow(T) != eslOK) { status = eslEMEM; goto ERROR; };	/* c.f. memory management note: check that we can add new node */
1278 
1279 	      T->parent[currnode] = v;
1280 	      if (c == 'L') T->left[v]  = currnode;
1281 	      else          T->right[v] = currnode;
1282 
1283 	      if (esl_stack_CPush(cs, ')')      != eslOK)  { status = eslEMEM; goto ERROR; };
1284 	      if (esl_stack_IPush(vs, currnode) != eslOK)  { status = eslEMEM; goto ERROR; };
1285 	      if (esl_stack_CPush(cs, 'X')      != eslOK)  { status = eslEMEM; goto ERROR; };
1286 	      if (esl_stack_IPush(vs, currnode) != eslOK)  { status = eslEMEM; goto ERROR; };
1287 	      if (esl_stack_CPush(cs, 'R')      != eslOK)  { status = eslEMEM; goto ERROR; };
1288 	      if (esl_stack_IPush(vs, currnode) != eslOK)  { status = eslEMEM; goto ERROR; };
1289 	      if (esl_stack_CPush(cs, ',')      != eslOK)  { status = eslEMEM; goto ERROR; };
1290 	      if (esl_stack_CPush(cs, 'L')      != eslOK)  { status = eslEMEM; goto ERROR; };
1291 	      if (esl_stack_IPush(vs, currnode) != eslOK)  { status = eslEMEM; goto ERROR; };
1292 
1293 	      if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
1294 		ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1295 	      currnode++;		/* T->N == # of internal nodes/idx of next internal node */
1296 	    }
1297 	  else /* a taxon attaches to v */
1298 	    {
1299 	      if (buf[pos] == '\'') { /* a quoted label, for a new taxon attached to v*/
1300 		if ((status = newick_parse_quoted_label(fp, buf, &pos, &nc,   &label)) != eslOK)
1301 		  ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a quoted taxon label");
1302 	      } else {               /* an unquoted label, for a new taxon attached to v */
1303 		if ((status = newick_parse_unquoted_label(fp, buf, &pos, &nc, &label)) != eslOK)
1304 		  ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse an unquoted taxon label");
1305 	      }
1306 
1307 	      if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
1308 		ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely");
1309 
1310 	      d = 0.;
1311 	      if (buf[pos] == ':') {
1312 		if ((status = newick_parse_branchlength(fp, buf, &pos, &nc, &d)) != eslOK)
1313 		  ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a branch length");
1314 	      }
1315 
1316 	      if (c == 'L') { T->left[v]  = -currtaxon;  T->ld[v] = d; }
1317 	      else          { T->right[v] = -currtaxon;  T->rd[v] = d; }
1318 
1319 	      T->taxonlabel[currtaxon]  = label;
1320 	      currtaxon++;
1321 	    }
1322 	}
1323 
1324       else if (c == ')')	/* c says, expect to close an interior node next */
1325 	{
1326 	  /* get v = the interior node we're closing, naming, and setting a branch length to */
1327 	  if (( status = esl_stack_IPop(vs, &v))  != eslOK)  goto ERROR;
1328 	  if (buf[pos] != ')') ESL_XFAIL(eslEFORMAT, errbuf, "Parse error: expected ) to close node #%d\n", v);
1329 
1330 	  if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
1331 	    ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1332 
1333 	  if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
1334 	    ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1335 
1336 	  if (buf[pos] == '\'') {
1337 	    if ((status = newick_parse_quoted_label(fp, buf, &pos, &nc, &label)) != eslOK)
1338 	      ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a quoted node label");
1339 	  } else {               /* an unquoted label, for a new taxon attached to v */
1340 	    if ((status = newick_parse_unquoted_label(fp, buf, &pos, &nc, &label)) != eslOK)
1341 	      ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse an unquoted node label");
1342 	  }
1343 
1344 	  if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
1345 	    ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
1346 
1347 	  d = 0.;
1348 	  if (buf[pos] == ':') {
1349 	    if ((status = newick_parse_branchlength(fp, buf, &pos, &nc, &d)) != eslOK)
1350 	      ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a branch length");
1351 	  }
1352 
1353 	  if (v > 0) { /* branch length to root node is meaningless, ignore it */
1354 	    if      (T->left [T->parent[v]] == v) T->ld[T->parent[v]] = d;
1355 	    else if (T->right[T->parent[v]] == v) T->rd[T->parent[v]] = d;
1356 	  }
1357 
1358 	  T->nodelabel[v] = label;
1359 	}
1360 
1361       else if (c == 'X')	/* optionally, multifurcations: if we see a comma, we have another node to deal with */
1362 	{
1363 	  if ((status = esl_stack_IPop(vs, &v)) != eslOK) goto ERROR;
1364 	  if (buf[pos] != ',') continue;
1365 	  if (esl_tree_Grow(T) != eslOK) { status = eslEMEM; goto ERROR; };	/* c.f. memory management note: check that we can add new node */
1366 
1367 	  /* v = the interior node that is multifurcated.
1368            * What we're going to do is to create a new node y; move the existing right child of v
1369            * to the left child of y; and connect y as the new right child of v with a branch
1370            * length of zero. The right child of y is now open. Then, we push a X->,RX production, so the next subtree will
1371            * be parsed as the right child of y. We can do this ad infinitum, resulting in
1372            * a representation of a multifurcation as, for example, a (A,(B,(C,(D,E)))) binary
1373            * subtree with zero length interior branches for a five-way multifurcation.
1374            *
1375            * This swapping destroys the order of the nodes: they will not be in preorder traversal.
1376            * This is temporarily ok. We renumber later.
1377 	   */
1378 	  T->left[currnode]   = T->right[v];
1379 	  T->ld[currnode]     = T->rd[v];
1380 	  T->parent[currnode] = v;
1381 	  if (T->right[v] > 0)
1382 	    T->parent[T->right[v]] = currnode;
1383 	  T->right[v]         = currnode;
1384 	  T->rd[v]            = 0.;
1385 
1386 	  if (esl_stack_CPush(cs, 'X')       != eslOK)  { status = eslEMEM; goto ERROR; };
1387 	  if (esl_stack_IPush(vs, currnode)  != eslOK)  { status = eslEMEM; goto ERROR; };
1388 	  if (esl_stack_CPush(cs, 'R')       != eslOK)  { status = eslEMEM; goto ERROR; };
1389 	  if (esl_stack_IPush(vs, currnode)  != eslOK)  { status = eslEMEM; goto ERROR; };
1390 	  if (esl_stack_CPush(cs, ',')       != eslOK)  { status = eslEMEM; goto ERROR; };
1391 	  currnode++;
1392 	}
1393 
1394       T->N = currnode + 1; /* c.f. memory management note: keep T->N = # of taxa the tree *would* hold, given currnode */
1395     }
1396 
1397   esl_tree_RenumberNodes(T);
1398 
1399   esl_stack_Destroy(cs);
1400   esl_stack_Destroy(vs);
1401   *ret_T = T;
1402   return eslOK;
1403 
1404  ERROR:
1405   if (T  != NULL) esl_tree_Destroy(T);
1406   if (cs != NULL) esl_stack_Destroy(cs);
1407   if (vs != NULL) esl_stack_Destroy(vs);
1408   *ret_T = NULL;
1409   return status;
1410 }
1411 
1412 
1413 
1414 /*-------------------- end, Newick i/o --------------------------*/
1415 
1416 
1417 /*****************************************************************
1418  * 3. Tree comparison algorithms
1419  *****************************************************************/
1420 
1421 /* Function:  esl_tree_Compare()
1422  *
1423  * Purpose:   Given two trees <T1> and <T2> for the same
1424  *            set of <N> taxa, compare the topologies of the
1425  *            two trees.
1426  *
1427  *            The routine must be able to determine which taxa are
1428  *            equivalent in <T1> and <T2>. If <T1> and <T2> both have
1429  *            taxon labels set, then the routine compares labels.
1430  *            This is the usual case. (Therefore, the <N> labels must
1431  *            all be different, or the routine will be unable to do
1432  *            this mapping uniquely.) As a special case, if neither
1433  *            <T1> nor <T2> has taxon labels, then the indexing of
1434  *            taxa <0..N-1> is assumed to be exactly the same in the
1435  *            two trees. (And if one tree has labels and the other
1436  *            does not, an <eslEINVAL> exception is thrown.)
1437  *
1438  *            For comparing unrooted topologies, be sure that <T1> and
1439  *            <T2> both obey the unrooted tree convention that the
1440  *            "root" is placed on the branch to taxon 0. (That is,
1441  *            <T->left[0] = 0>.)
1442  *
1443  * Returns:   <eslOK> if tree topologies are identical. <eslFAIL>
1444  *            if they aren't.
1445  *
1446  * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if the taxa in
1447  *            the trees can't be mapped uniquely and completely to
1448  *            each other (because one tree doesn't have labels and
1449  *            one does, or because the labels aren't unique, or the
1450  *            two trees have different taxa).
1451  */
1452 int
esl_tree_Compare(ESL_TREE * T1,ESL_TREE * T2)1453 esl_tree_Compare(ESL_TREE *T1, ESL_TREE *T2)
1454 {
1455   int *Mg   = NULL;		/* the M(g) tree-mapping function for internal nodes [0..N-2] */
1456   int *Mgt  = NULL;		/* the M(g) tree-mapping function for leaves (taxa), [0..N-1] */
1457   int  g, child;		/* node indices for parent, children */
1458   int  a,b;
1459   int  status;
1460 
1461   if (T1->N != T2->N) ESL_EXCEPTION(eslEINVAL, "trees don't have the same # of taxa");
1462 
1463   /* We need taxon parent map in tree 2, but not tree 1.
1464    */
1465   if ((status = esl_tree_SetTaxaParents(T2)) != eslOK) goto ERROR;
1466 
1467   /* We're going to use the tree mapping function M(g) [Goodman79].
1468    * In the implementation here, we split it into two, Mg[] for internal
1469    * nodes 0..N-2 and Mgt[] for taxa 0..N-1.
1470    *
1471    * Mg[g] for node g in T1 is the index of the lowest node in T2
1472    * that contains the same children taxa as the subtree
1473    * under g in T1.
1474    *
1475    * For the taxa, Mgt[g] for taxon g in T1 is the index of the
1476    * corresponding taxon in T2. If neither tree has taxon labels
1477    * Mgt[g] = g for all g. Otherwise we have to compare labels. Right
1478    * now, we do this by brute force, which is O(N^2); if this ever
1479    * becomes rate limiting, replace it with a keyhash to make it O(N)
1480    * (and in fact, the keyhash of taxon names could even become part
1481    * of the ESL_TREE).
1482    */
1483   ESL_ALLOC(Mg,  sizeof(int) * (T1->N-1));
1484   ESL_ALLOC(Mgt, sizeof(int) * (T1->N));
1485   if (T1->taxonlabel != NULL && T2->taxonlabel != NULL)
1486     {
1487       esl_vec_ISet(Mgt, T1->N, -1);	/* flags for "unset" */
1488       for (a = 0; a < T1->N; a++)
1489 	{
1490 	  for (b = 0; b < T1->N; b++)
1491 	    if (strcmp(T1->taxonlabel[a], T2->taxonlabel[b]) == 0)
1492 	      { Mgt[a] = b; break; }
1493 	}
1494       for (a = 0; a < T1->N; a++)
1495 	if (Mgt[a] == -1) ESL_XEXCEPTION(eslEINVAL, "couldn't map taxa");
1496     }
1497   else if (T1->taxonlabel == NULL && T2->taxonlabel == NULL)
1498     {
1499       for (a = 0; a < T1->N; a++)
1500 	Mgt[a] = a;
1501     }
1502   else
1503     ESL_XEXCEPTION(eslEINVAL, "either both trees must have taxon labels, or neither");
1504 
1505   /* Finally, we use the SDI algorithm [ZmasekEddy01] to construct
1506    * M(g) for internal nodes, by postorder traversal of T1.
1507    */
1508   for (g = T1->N-2; g >= 0; g--)
1509     {
1510       child = T1->left[g];
1511       if (child <= 0)  a = T2->taxaparent[Mgt[-child]];
1512       else             a = T2->parent[Mg[child]];
1513 
1514       child = T1->right[g];
1515       if (child <= 0)  b = T2->taxaparent[Mgt[-child]];
1516       else             b = T2->parent[Mg[child]];
1517 
1518       /* a shortcut in SDI: special case for exact tree comparison: */
1519       if (a != b) { free(Mg); free(Mgt); return eslFAIL; }
1520       Mg[g] = a;
1521     }
1522 
1523   free(Mg);
1524   free(Mgt);
1525   return eslOK;
1526 
1527  ERROR:
1528   if (Mg  != NULL) free(Mg);
1529   if (Mgt != NULL) free(Mgt);
1530   return status;
1531 }
1532 
1533 /*----------------- end, tree comparison  -----------------------*/
1534 
1535 
1536 
1537 
1538 
1539 
1540 /*****************************************************************
1541  * 4. Clustering algorithms for tree construction.
1542  *****************************************************************/
1543 
1544 /* cluster_engine()
1545  *
1546  * Implements four clustering algorithms for tree construction:
1547  * UPGMA, WPGMA, single-linkage, and maximum-linkage. These differ
1548  * only by the rule used to construct new distances after joining
1549  * two clusters i,j.
1550  *
1551  * Input <D_original> is a symmetric distance matrix, for <D->n> taxa.
1552  * The diagonal is all 0's, and off-diagonals are $\geq 0$. <D->n>
1553  * must be at least two.
1554  *
1555  * <mode> is one of <eslUPGMA>, <eslWPGMA>, <eslSINGLE_LINKAGE>, or
1556  * <eslCOMPLETE_LINKAGE>: a flag specifying which algorithm to use.
1557  *
1558  * The output is a tree structure, returned in <ret_T>.
1559  *
1560  * Returns <eslOK> on success.
1561  *
1562  * Throws <eslEMEM> on allocation failure.
1563  *
1564  * Complexity: O(N^2) in memory, O(N^3) in time.
1565  *
1566  * This function can be optimized. Memory usage is at least
1567  * 4x more than necessary. First, we don't need to make a copy of D
1568  * if the caller doesn't mind it being consumed. Second, D only
1569  * needs to be lower- or upper-triangular, because it's symmetric,
1570  * but that requires changing dmatrix module. In time,
1571  * O(N^2 log N) if not O(N^2) should be possible, by being more
1572  * sophisticated about identifying the minimum element;
1573  * see Gronau and Moran (2006).
1574  *
1575  */
1576 static int
cluster_engine(ESL_DMATRIX * D_original,int mode,ESL_TREE ** ret_T)1577 cluster_engine(ESL_DMATRIX *D_original, int mode, ESL_TREE **ret_T)
1578 {
1579   ESL_DMATRIX *D = NULL;
1580   ESL_TREE    *T = NULL;
1581   double      *height = NULL;	/* height of internal nodes  [0..N-2]          */
1582   int         *idx    = NULL;	/* taxa or node index of row/col in D [0..N-1] */
1583   int         *nin    = NULL;	/* # of taxa in clade in row/col in D [0..N-1] */
1584   int          N;
1585   int          i = 0, j = 0;
1586   int          row,col;
1587   double       minD;
1588   int          status;
1589 
1590   /* Contract checks.
1591    */
1592   ESL_DASSERT1((D_original != NULL));               /* matrix exists      */
1593   ESL_DASSERT1((D_original->n == D_original->m));   /* D is NxN square    */
1594   ESL_DASSERT1((D_original->n >= 2));               /* >= 2 taxa          */
1595 
1596 #if (eslDEBUGLEVEL >=1)
1597   for (i = 0; i < D_original->n; i++) {
1598     assert(D_original->mx[i][i] == 0.);	           /* self-self d = 0    */
1599     for (j = i+1; j < D_original->n; j++)	   /* D symmetric        */
1600       assert(D_original->mx[i][j] == D_original->mx[j][i]);
1601   }
1602 #endif
1603 
1604   /* Allocations.
1605    * NxN copy of the distance matrix, which we'll iteratively whittle down to 2x2;
1606    * tree for N taxa;
1607    */
1608   if ((D = esl_dmatrix_Clone(D_original)) == NULL) return eslEMEM;
1609   if ((T = esl_tree_Create(D->n))         == NULL) return eslEMEM;
1610   ESL_ALLOC(idx,    sizeof(int)    *  D->n);
1611   ESL_ALLOC(nin,    sizeof(int)    *  D->n);
1612   ESL_ALLOC(height, sizeof(double) * (D->n-1));
1613   for (i = 0; i < D->n;   i++) idx[i]    = -i; /* assign taxa indices to row/col coords */
1614   for (i = 0; i < D->n;   i++) nin[i ]   = 1;  /* each cluster starts as 1  */
1615   for (i = 0; i < D->n-1; i++) height[i] = 0.;
1616 
1617   /* If we're doing either single linkage or complete linkage clustering,
1618    * we will construct a "linkage tree", where ld[v], rd[v] "branch lengths"
1619    * below node v are the linkage value for clustering node v; thus
1620    * ld[v] == rd[v] in a linkage tree.
1621    * For UPGMA or WPGMA, we're building an additive tree, where ld[v] and
1622    * rd[v] are branch lengths.
1623    */
1624   if (mode == eslSINGLE_LINKAGE || mode == eslCOMPLETE_LINKAGE)
1625     T->is_linkage_tree = TRUE;
1626 
1627   for (N = D->n; N >= 2; N--)
1628     {
1629       /* Find minimum in our current N x N matrix.
1630        * (Don't init minD to -infinity; linkage trees use sparse distance matrices
1631        * with -infinity representing unlinked.)
1632        */
1633       minD = D->mx[0][1]; i = 0; j = 1;	/* init with: if nothing else, try to link 0-1 */
1634       for (row = 0; row < N; row++)
1635 	for (col = row+1; col < N; col++)
1636 	  if (D->mx[row][col] < minD)
1637 	    {
1638 	      minD = D->mx[row][col];
1639 	      i    = row;
1640 	      j    = col;
1641 	    }
1642 
1643       /* We're joining node at row/col i with node at row/col j.
1644        * Add node (index = N-2) to the tree at height minD/2.
1645        */
1646       T->left[N-2]  = idx[i];
1647       T->right[N-2] = idx[j];
1648       if (T->is_linkage_tree)        height[N-2]   = minD;
1649       else                           height[N-2]   = minD / 2.;
1650 
1651       /* Set the branch lengths (additive trees) or heights (linkage trees)
1652        */
1653       T->ld[N-2] = T->rd[N-2] = height[N-2];
1654       if (! T->is_linkage_tree) {
1655 	if (idx[i] > 0) T->ld[N-2] = ESL_MAX(0., T->ld[N-2] - height[idx[i]]);  // max to 0, to avoid fp roundoff giving us negative length
1656 	if (idx[j] > 0) T->rd[N-2] = ESL_MAX(0., T->rd[N-2] - height[idx[j]]);
1657       }
1658 
1659       /* If either node was an internal node, record parent in it.
1660        */
1661       if (idx[i] > 0)  T->parent[idx[i]] = N-2;
1662       if (idx[j] > 0)  T->parent[idx[j]] = N-2;
1663 
1664       /* Now, build a new matrix by merging row i+j and col i+j.
1665        *  1. move j to N-1 (unless it's already there)
1666        *  2. move i to N-2 (unless it's already there)
1667        */
1668       if (j != N-1)
1669 	{
1670 	  for (row = 0; row < N; row++)
1671 	    ESL_SWAP(D->mx[row][N-1], D->mx[row][j], double);
1672 	  for (col = 0; col < N; col++)
1673 	    ESL_SWAP(D->mx[N-1][col], D->mx[j][col], double);
1674 	  ESL_SWAP(idx[j],  idx[N-1],  int);
1675 	  ESL_SWAP(nin[j], nin[N-1], int);
1676 	}
1677       if (i != N-2)
1678 	{
1679 	  for (row = 0; row < N; row++)
1680 	    ESL_SWAP(D->mx[row][N-2], D->mx[row][i], double);
1681 	  for (col = 0; col < N; col++)
1682 	    ESL_SWAP(D->mx[N-2][col], D->mx[i][col], double);
1683 	  ESL_SWAP(idx[i], idx[N-2], int);
1684 	  ESL_SWAP(nin[i], nin[N-2], int);
1685 	}
1686       i = N-2;
1687       j = N-1;
1688 
1689       /* 3. merge i (now at N-2) with j (now at N-1)
1690        *    according to the desired clustering rule.
1691        */
1692       for (col = 0; col < N; col++)
1693 	{
1694 	  switch (mode) {
1695 	  case eslUPGMA:
1696 	    D->mx[i][col] = (nin[i] * D->mx[i][col] + nin[j] * D->mx[j][col]) / (double) (nin[i] + nin[j]);
1697 	    break;
1698 	  case eslWPGMA:            D->mx[i][col] = (D->mx[i][col] + D->mx[j][col]) / 2.;    break;
1699 	  case eslSINGLE_LINKAGE:   D->mx[i][col] = ESL_MIN(D->mx[i][col], D->mx[j][col]);   break;
1700 	  case eslCOMPLETE_LINKAGE: D->mx[i][col] = ESL_MAX(D->mx[i][col], D->mx[j][col]);   break;
1701 	  default:                  ESL_XEXCEPTION(eslEINCONCEIVABLE, "no such strategy");
1702 	  }
1703 	  D->mx[col][i] = D->mx[i][col];
1704 	}
1705 
1706       /* row/col i is now the new cluster, and it corresponds to node N-2
1707        * in the tree (remember, N is decrementing at each iteration).
1708        * row/col j (N-1) falls away when we go back to the start of the loop
1709        * and decrement N.
1710        */
1711       nin[i] += nin[j];
1712       idx[i]  = N-2;
1713     }
1714 
1715   esl_dmatrix_Destroy(D);
1716   free(height);
1717   free(idx);
1718   free(nin);
1719   if (ret_T != NULL) *ret_T = T;
1720   return eslOK;
1721 
1722  ERROR:
1723   if (D      != NULL) esl_dmatrix_Destroy(D);
1724   if (T      != NULL) esl_tree_Destroy(T);
1725   if (height != NULL) free(height);
1726   if (idx    != NULL) free(idx);
1727   if (nin    != NULL) free(nin);
1728   if (ret_T != NULL) *ret_T = NULL;
1729   return status;
1730 }
1731 
1732 
1733 /* Function:  esl_tree_UPGMA()
1734  *
1735  * Purpose:   Given distance matrix <D>, use the UPGMA algorithm
1736  *            to construct a tree <T>.
1737  *
1738  * Returns:   <eslOK> on success; the tree is returned in <ret_T>,
1739  *            and must be freed by the caller with <esl_tree_Destroy()>.
1740  *
1741  * Throws:    <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
1742  */
1743 int
esl_tree_UPGMA(ESL_DMATRIX * D,ESL_TREE ** ret_T)1744 esl_tree_UPGMA(ESL_DMATRIX *D, ESL_TREE **ret_T)
1745 {
1746   return cluster_engine(D, eslUPGMA, ret_T);
1747 }
1748 
1749 /* Function:  esl_tree_WPGMA()
1750  *
1751  * Purpose:   Given distance matrix <D>, use the WPGMA algorithm
1752  *            to construct a tree <T>.
1753  *
1754  * Returns:   <eslOK> on success; the tree is returned in <ret_T>,
1755  *            and must be freed by the caller with <esl_tree_Destroy()>.
1756  *
1757  * Throws:    <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
1758  */
1759 int
esl_tree_WPGMA(ESL_DMATRIX * D,ESL_TREE ** ret_T)1760 esl_tree_WPGMA(ESL_DMATRIX *D, ESL_TREE **ret_T)
1761 {
1762   return cluster_engine(D, eslWPGMA, ret_T);
1763 }
1764 
1765 /* Function:  esl_tree_SingleLinkage()
1766  *
1767  * Purpose:   Given distance matrix <D>, construct a single-linkage
1768  *            (minimum distances) clustering tree <T>.
1769  *
1770  * Returns:   <eslOK> on success; the tree is returned in <ret_T>,
1771  *            and must be freed by the caller with <esl_tree_Destroy()>.
1772  *
1773  * Throws:    <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
1774  */
1775 int
esl_tree_SingleLinkage(ESL_DMATRIX * D,ESL_TREE ** ret_T)1776 esl_tree_SingleLinkage(ESL_DMATRIX *D, ESL_TREE **ret_T)
1777 {
1778   return cluster_engine(D, eslSINGLE_LINKAGE, ret_T);
1779 }
1780 
1781 /* Function:  esl_tree_CompleteLinkage()
1782  *
1783  * Purpose:   Given distance matrix <D>, construct a complete-linkage
1784  *            (maximum distances) clustering tree <T>.
1785  *
1786  * Returns:   <eslOK> on success; the tree is returned in <ret_T>,
1787  *            and must be freed by the caller with <esl_tree_Destroy()>.
1788  *
1789  * Throws:    <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
1790  */
1791 int
esl_tree_CompleteLinkage(ESL_DMATRIX * D,ESL_TREE ** ret_T)1792 esl_tree_CompleteLinkage(ESL_DMATRIX *D, ESL_TREE **ret_T)
1793 {
1794   return cluster_engine(D, eslCOMPLETE_LINKAGE, ret_T);
1795 }
1796 /*----------------- end, clustering algorithms  ----------------*/
1797 
1798 
1799 
1800 /*****************************************************************
1801  * 5. Generating simulated trees
1802  *****************************************************************/
1803 
1804 /* Function:  esl_tree_Simulate()
1805  * Synopsis:  Generate a random rooted ultrametric tree.
1806  *
1807  * Purpose:   Generate a random rooted ultrametric tree of <N> taxa,
1808  *            using the algorithm of Kuhner and Felsenstein (1994).
1809  *
1810  *            The branch lengths are generated by choosing <N-1>
1811  *            exponentially distributed split times, with decreasing
1812  *            expectations of $\frac{1}{2},\frac{1}{3}..\frac{1}{N}$
1813  *            as the simulation proceeds from the root. Thus the
1814  *            total expected branch length on the tree is
1815  *            $\sum_{k=2}^{N} \frac{1}{k}$.
1816  *
1817  * Args:      r     - random number source
1818  *            N     - number of taxa (>= 2)
1819  *            ret_T - RETURN: sampled tree
1820  *
1821  * Returns:   <eslOK> on success, and the new tree is allocated
1822  *            here and returned via <ret_tree>; caller is
1823  *            responsible for free'ing it.
1824  *
1825  * Throws:    <eslEMEM> on allocation failure, in which case
1826  *            the <ret_T> is returned <NULL>.
1827  *
1828  * Xref:      STL11/65.
1829  */
1830 int
esl_tree_Simulate(ESL_RANDOMNESS * r,int N,ESL_TREE ** ret_T)1831 esl_tree_Simulate(ESL_RANDOMNESS *r, int N, ESL_TREE **ret_T)
1832 {
1833   ESL_TREE       *T          = NULL;
1834   int            *branchpapa = NULL;
1835   int            *branchside = NULL;
1836   int       nactive;
1837   double    d;
1838   int       node;
1839   int       bidx;	        	/* index of an active branch */
1840   int       status;
1841 
1842   ESL_DASSERT1( (r != NULL) );
1843   ESL_DASSERT1( (N >= 2) );
1844 
1845   /* Kuhner/Felsenstein uses a list of active branches,
1846    * which we implement by tracking the index of the parent
1847    * node (in <branchpapa>) and a 0/1 flag (in <branchside>)
1848    * for the branch to the left vs. right child.
1849    */
1850   if ((T = esl_tree_Create(N)) == NULL)  { status = eslEMEM; goto ERROR; }
1851   ESL_ALLOC(branchpapa, sizeof(int) * N);
1852   ESL_ALLOC(branchside, sizeof(int) * N);
1853 
1854   /* Initialize: add two branches from the root
1855    * onto the active list, and set internal node
1856    * counter to start at 1.
1857    */
1858   branchpapa[0] = 0;   branchside[0] = 0;
1859   branchpapa[1] = 0;   branchside[1] = 1;
1860   nactive = 2;
1861   node    = 1;
1862 
1863   /* Algorithm proceeds by iterating:
1864    *    1. choose random time <d> from exponential(1/nactive)
1865    *    2. choose random active branch, <bidx>
1866    *    3. add new <node> to active branch at length d
1867    *    4. add d to all other active branches
1868    *    5. delete the old parent branch from the active list,
1869    *       add the two new child branches to the active list
1870    */
1871   while (nactive < N)
1872     {
1873       d               = (double) nactive * -log(esl_rnd_UniformPositive(r));
1874       bidx            = esl_rnd_Roll(r, nactive);
1875       T->parent[node] = branchpapa[bidx];
1876 
1877       if (branchside[bidx] == 0) {
1878 	T->left[branchpapa[bidx]]   = node;
1879 	T->ld  [branchpapa[bidx]]  += d;
1880       } else {
1881 	T->right[branchpapa[bidx]]  = node;
1882 	T->rd   [branchpapa[bidx]] += d;
1883       }
1884 
1885       ESL_SWAP(branchpapa[bidx], branchpapa[nactive-1], int);
1886       ESL_SWAP(branchside[bidx], branchside[nactive-1], int);
1887       for (bidx = 0; bidx < nactive-1; bidx++) {
1888 	if (branchside[bidx] == 0) T->ld[branchpapa[bidx]] += d;
1889 	else                       T->rd[branchpapa[bidx]] += d;
1890       }
1891 
1892       /* delete the branch at nactive-1 that we just added to;
1893        * replace it with two new branches
1894        */
1895       branchpapa[nactive-1]  = node;  branchside[nactive-1] = 0;
1896       branchpapa[nactive]    = node;  branchside[nactive]   = 1;
1897       node++;
1898       nactive++;
1899     }
1900 
1901   /* Terminate by adding the N taxa to the N active branches.
1902    */
1903   d = (double) N * -log(esl_rnd_UniformPositive(r));
1904   for (bidx = 0; bidx < N; bidx++)
1905     {
1906       if (branchside[bidx] == 0) {
1907 	T->left[branchpapa[bidx]]  =  -bidx; /* taxa indices stored as neg #'s */
1908 	T->ld  [branchpapa[bidx]]  += d;
1909       } else {
1910 	T->right[branchpapa[bidx]] =  -bidx;
1911 	T->rd  [branchpapa[bidx]]  += d;
1912       }
1913     }
1914 
1915   *ret_T = T;
1916   free(branchpapa);
1917   free(branchside);
1918   return eslOK;
1919 
1920  ERROR:
1921   if (T          != NULL) esl_tree_Destroy(T);
1922   if (branchpapa != NULL) free(branchpapa);
1923   if (branchside != NULL) free(branchside);
1924   *ret_T = NULL;
1925   return status;
1926 }
1927 
1928 
1929 /* Function:  esl_tree_ToDistanceMatrix()
1930  * Synopsis:  Obtain a pairwise distance matrix from a tree.
1931  *
1932  * Purpose:   Given tree <T>, calculate a pairwise distance matrix
1933  *            and return it in <ret_D>.
1934  *
1935  * Note:      Algorithm here is O(N^3). It can probably be improved.
1936  *            There ought to be a more efficient recursion that
1937  *            saves recalculating node-node distances inside the tree.
1938  *            All we do here is a brute force, upwards O(N) LCA
1939  *            search for each of the N^2 taxon pairs.
1940  *
1941  * Args:      T     - input tree
1942  *            ret_D - RETURN: the new distance matrix
1943  *
1944  * Returns:   <eslOK> on success, and <ret_D> points to the distance
1945  *            matrix, which caller is responsible for free'ing with
1946  *            <esl_dmatrix_Destroy()>.
1947  *
1948  * Throws:    <eslEMEM> on allocation failure, in which case
1949  *            <ret_D> is returned <NULL>.
1950  *
1951  * Xref:      STL11/66.
1952  */
1953 int
esl_tree_ToDistanceMatrix(ESL_TREE * T,ESL_DMATRIX ** ret_D)1954 esl_tree_ToDistanceMatrix(ESL_TREE *T, ESL_DMATRIX **ret_D)
1955 {
1956   ESL_DMATRIX *D = NULL;
1957   int i,j;			/* a pair of taxa {0..N-1}           */
1958   int a,b;			/* a pair of internal nodes {0..N-2} */
1959   int p;			/* a tmp parent index */
1960   double d;			/* ij distance */
1961   int status;
1962 
1963   D = esl_dmatrix_Create(T->N, T->N); /* creates a NxN square symmetric matrix; really only need triangular */
1964   if (D == NULL) { status = eslEMEM; goto ERROR; }
1965 
1966   if ((status = esl_tree_SetTaxaParents(T)) != eslOK) goto ERROR;
1967 
1968   for (i = 0; i < T->N; i++)
1969     {
1970       D->mx[i][i] = 0.;		/* by definition */
1971       for (j = i+1; j < T->N; j++)
1972 	{
1973 	  a  = T->taxaparent[i];
1974 	  b  = T->taxaparent[j];
1975 	  d  = (T->left[a] == -i) ? T->ld[a] : T->rd[a];
1976 	  d += (T->left[b] == -j) ? T->ld[b] : T->rd[b];
1977 	  while (a != b)	/* a brute force LCA algorithm */
1978 	    {
1979 	      if (a < b) ESL_SWAP(a, b, int);
1980 	      p  = T->parent[a];
1981 	      d += (T->left[p] == a) ? T->ld[p] : T->rd[p];
1982 	      a  = p;
1983 	    }
1984 
1985 	  D->mx[i][j] = D->mx[j][i] = d;
1986 	}
1987     }
1988 
1989   *ret_D = D;
1990   return eslOK;
1991 
1992  ERROR:
1993   if (D != NULL) esl_dmatrix_Destroy(D);
1994   *ret_D = NULL;
1995   return status;
1996 }
1997 
1998 
1999 
2000 /*****************************************************************
2001  * 6. Unit tests
2002  *****************************************************************/
2003 #ifdef eslTREE_TESTDRIVE
2004 
2005 static void
utest_OptionalInformation(ESL_RANDOMNESS * r,int ntaxa)2006 utest_OptionalInformation(ESL_RANDOMNESS *r, int ntaxa)
2007 {
2008   char *msg = "optional information fields unit test failed";
2009   ESL_TREE *T;
2010 
2011   if (esl_tree_Simulate(r, ntaxa, &T) != eslOK) esl_fatal(msg);
2012   if (esl_tree_SetTaxaParents(T)      != eslOK) esl_fatal(msg);
2013   if (esl_tree_SetCladesizes(T)       != eslOK) esl_fatal(msg);
2014   if (esl_tree_Validate(T, NULL)      != eslOK) esl_fatal(msg);
2015 
2016   esl_tree_Destroy(T);
2017   return;
2018 }
2019 
2020 
2021 static void
utest_WriteNewick(ESL_RANDOMNESS * r,int ntaxa)2022 utest_WriteNewick(ESL_RANDOMNESS *r, int ntaxa)
2023 {
2024   char     *msg      = "esl_tree_WriteNewick unit test failed";
2025   char   tmpfile[32] = "esltmpXXXXXX";
2026   FILE     *fp       = NULL;
2027   ESL_TREE *T1       = NULL;
2028   ESL_TREE *T2       = NULL;
2029   char  errbuf[eslERRBUFSIZE];
2030 
2031   if (esl_tmpfile(tmpfile, &fp)            != eslOK) esl_fatal(msg);
2032   if (esl_tree_Simulate(r, ntaxa, &T1)     != eslOK) esl_fatal(msg);
2033   if (esl_tree_SetTaxonlabels(T1, NULL)    != eslOK) esl_fatal(msg);
2034   if (esl_tree_Validate(T1, NULL)          != eslOK) esl_fatal(msg);
2035   if (esl_tree_WriteNewick(fp, T1)         != eslOK) esl_fatal(msg);
2036   rewind(fp);
2037   if (esl_tree_ReadNewick(fp, errbuf, &T2) != eslOK) esl_fatal(msg);
2038   if (esl_tree_Validate(T2, NULL)          != eslOK) esl_fatal(msg);
2039   if (esl_tree_Compare(T1, T2)             != eslOK) esl_fatal(msg);
2040   fclose(fp);
2041 
2042   esl_tree_Destroy(T1);
2043   esl_tree_Destroy(T2);
2044   return;
2045 }
2046 
2047 
2048 static void
utest_UPGMA(ESL_RANDOMNESS * r,int ntaxa)2049 utest_UPGMA(ESL_RANDOMNESS *r, int ntaxa)
2050 {
2051   char        *msg = "esl_tree_UPGMA unit test failed";
2052   ESL_TREE    *T1 = NULL;
2053   ESL_TREE    *T2 = NULL;
2054   ESL_DMATRIX *D1 = NULL;
2055   ESL_DMATRIX *D2 = NULL;
2056 
2057   if (esl_tree_Simulate(r, ntaxa, &T1)   != eslOK) esl_fatal(msg);
2058   if (esl_tree_ToDistanceMatrix(T1, &D1) != eslOK) esl_fatal(msg);
2059   if (esl_tree_UPGMA(D1, &T2)            != eslOK) esl_fatal(msg);
2060 
2061   if (esl_tree_Validate(T1, NULL)        != eslOK) esl_fatal(msg);
2062   if (esl_tree_Validate(T2, NULL)        != eslOK) esl_fatal(msg);
2063   if (esl_tree_VerifyUltrametric(T1)     != eslOK) esl_fatal(msg);
2064   if (esl_tree_VerifyUltrametric(T2)     != eslOK) esl_fatal(msg);
2065   if (esl_tree_Compare(T1, T2)           != eslOK) esl_fatal(msg);
2066 
2067   if (esl_tree_ToDistanceMatrix(T1, &D2) != eslOK) esl_fatal(msg);
2068   if (esl_dmatrix_Compare(D1, D2, 0.001) != eslOK) esl_fatal(msg);
2069 
2070   esl_tree_Destroy(T1);
2071   esl_tree_Destroy(T2);
2072   esl_dmatrix_Destroy(D1);
2073   esl_dmatrix_Destroy(D2);
2074   return;
2075 }
2076 
2077 #endif /*eslTREE_TESTDRIVE*/
2078 /*-------------------- end, unit tests  -------------------------*/
2079 
2080 
2081 /*****************************************************************
2082  * 7. Test driver
2083  *****************************************************************/
2084 #ifdef eslTREE_TESTDRIVE
2085 
2086 /*
2087  * gcc -g -Wall -o test -L. -I. -DeslTREE_TESTDRIVE esl_tree.c -leasel -lm
2088  * gcc -g -Wall -o test -L. -I. -DeslTEST_THROWING -DeslTREE_TESTDRIVE esl_msa.c -leasel -lm
2089  * ./test
2090  */
2091 #include "easel.h"
2092 #include "esl_tree.h"
2093 #include "esl_random.h"
2094 
2095 int
main(int argc,char ** argv)2096 main(int argc, char **argv)
2097 {
2098   ESL_RANDOMNESS *r = NULL;
2099   int ntaxa;
2100 
2101   r     = esl_randomness_Create(0);
2102   ntaxa = 20;
2103 
2104   utest_OptionalInformation(r, ntaxa); /* SetTaxaparents(), SetCladesizes() */
2105   utest_WriteNewick(r, ntaxa);
2106   utest_UPGMA(r, ntaxa);
2107 
2108   esl_randomness_Destroy(r);
2109   return eslOK;
2110 }
2111 
2112 #endif /*eslTREE_TESTDRIVE*/
2113 /*-------------------- end, test driver  -------------------------*/
2114 
2115 
2116 
2117 
2118 /*****************************************************************
2119  * 8. Examples.
2120  *****************************************************************/
2121 
2122 /* The first example is an example of inferring a tree by the
2123  * UPGMA algorithm, starting from a multiple sequence alignment.
2124  */
2125 #ifdef eslTREE_EXAMPLE
2126 /*::cexcerpt::tree_example::begin::*/
2127 /* To compile: gcc -g -Wall -o example -I. -DeslTREE_EXAMPLE esl_tree.c esl_dmatrix.c esl_msa.c easel.c -lm
2128  *         or: gcc -g -Wall -o example -I. -L. -DeslTREE_EXAMPLE esl_tree.c -leasel -lm
2129  *     To run: ./example <MSA file>
2130  */
2131 #include "easel.h"
2132 #include "esl_msa.h"
2133 #include "esl_msafile.h"
2134 #include "esl_distance.h"
2135 #include "esl_tree.h"
2136 
main(int argc,char ** argv)2137 int main(int argc, char **argv)
2138 {
2139   ESL_TREE     *tree = NULL;
2140   ESL_MSAFILE  *afp  = NULL;
2141   ESL_MSA      *msa  = NULL;
2142   ESL_DMATRIX  *D    = NULL;
2143 
2144   esl_msafile_Open(NULL, argv[1], NULL, eslMSAFILE_UNKNOWN, NULL, &afp);
2145   esl_msafile_Read(afp, &msa);
2146   esl_msafile_Close(afp);
2147 
2148   esl_dst_CDiffMx(msa->aseq, msa->nseq, &D);
2149   esl_tree_UPGMA(D, &tree);
2150 
2151   esl_tree_Destroy(tree);
2152   esl_msa_Destroy(msa);
2153   esl_dmatrix_Destroy(D);
2154   return eslOK;
2155 }
2156 /*::cexcerpt::tree_example::end::*/
2157 #endif /*eslTREE_EXAMPLE*/
2158 
2159 
2160 /* The second example is an example of reading in a Newick format tree.
2161  */
2162 #ifdef eslTREE_EXAMPLE2
2163 /*::cexcerpt::tree_example2::begin::*/
2164 /* To compile: gcc -g -Wall -o example -I. -DeslTREE_EXAMPLE2 esl_tree.c esl_dmatrix.c esl_msa.c easel.c -lm
2165  *         or: gcc -g -Wall -o example -I. -L. -DeslTREE_EXAMPLE2 esl_tree.c -leasel -lm
2166  *     To run: ./example <Newick file>
2167  */
2168 #include "easel.h"
2169 #include "esl_msa.h"
2170 #include "esl_distance.h"
2171 #include "esl_tree.h"
2172 
main(int argc,char ** argv)2173 int main(int argc, char **argv)
2174 {
2175   ESL_TREE    *T;
2176   char         errbuf[eslERRBUFSIZE];
2177   FILE        *fp;
2178 
2179   if ((fp = fopen(argv[1], "r"))           == NULL) esl_fatal("Failed to open %s", argv[1]);
2180   if (esl_tree_ReadNewick(fp, errbuf, &T) != eslOK) esl_fatal("Failed to read tree: %s", errbuf);
2181   esl_tree_WriteNewick(stdout, T);
2182 
2183   esl_tree_Destroy(T);
2184   fclose(fp);
2185   return eslOK;
2186 }
2187 /*::cexcerpt::tree_example2::end::*/
2188 #endif /*eslTREE_EXAMPLE*/
2189