1 /**
2  * PLL (version 1.0.0) a software library for phylogenetic inference
3  * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
4  *
5  * Derived from
6  * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
7  * trees by Alexandros Stamatakis
8  *
9  * This program is free software: you can redistribute it and/or modify it
10  * under the terms of the GNU General Public License as published by the Free
11  * Software Foundation, either version 3 of the License, or (at your option)
12  * any later version.
13  *
14  * This program is distributed in the hope that it will be useful, but WITHOUT
15  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
17  * more details.
18  *
19  * You should have received a copy of the GNU General Public License along with
20  * this program.  If not, see <http://www.gnu.org/licenses/>.
21  *
22  * For any other enquiries send an Email to Tomas Flouri
23  * Tomas.Flouri@h-its.org
24  *
25  * When publishing work that uses PLL please cite PLL
26  *
27  * @file newick.c
28  */
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <assert.h>
33 #include <math.h>
34 
35 #include "pll.h"
36 #include "pllInternal.h"
37 
38 
39 /** @file  newick.c
40 
41     @brief Collection of routines for reading and parsing newick trees
42 
43     Auxiliary functions for reading and parsing newick tree formats
44 */
45 
46 
47 /** @defgroup newickParseGroup Reading and parsing newick trees
48 
49     This set of functions handles the reading and parsing of newick tree formats
50 */
51 
52 static int
parse_newick(pllStack ** stack,int * inp)53 parse_newick (pllStack ** stack, int * inp)
54 {
55   pllNewickNodeInfo * item = NULL;
56   int item_active = 0;
57   pllLexToken token;
58   int input;
59   pllLexToken prev_token;
60   int nop = 0;          /* number of open parentheses */
61   int depth = 0;
62 
63   prev_token.tokenType = PLL_TOKEN_UNKNOWN;
64 
65   input = *inp;
66 
67   NEXT_TOKEN
68 
69   while (token.tokenType != PLL_TOKEN_EOF && token.tokenType != PLL_TOKEN_UNKNOWN)
70   {
71     switch (token.tokenType)
72      {
73        case PLL_TOKEN_OPAREN:
74 #ifdef PLLDEBUG
75        printf ("PLL_TOKEN_OPAREN\n");
76 #endif
77         ++nop;
78         memcpy (&prev_token, &token, sizeof (pllLexToken));
79         ++depth;
80         break;
81 
82        case PLL_TOKEN_CPAREN:
83 #ifdef PLLDEBUG
84        printf ("PLL_TOKEN_CPAREN\n");
85 #endif
86         if (prev_token.tokenType != PLL_TOKEN_CPAREN  &&
87             prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
88             prev_token.tokenType != PLL_TOKEN_STRING  &&
89             prev_token.tokenType != PLL_TOKEN_NUMBER  &&
90             prev_token.tokenType != PLL_TOKEN_FLOAT) return (0);
91 
92         if (!nop) return (0);
93         --nop;
94         memcpy (&prev_token, &token, sizeof (pllLexToken));
95 
96         /* push to the stack */
97         if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); // possibly not nec
98         //if (item->name   == NULL) item->name   = strdup ("INTERNAL_NODE");
99         if (item->name == NULL)
100          {
101            item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char));
102            strcpy (item->name, "INTERNAL_NODE");
103          }
104 
105         //if (item->branch == NULL) item->branch = strdup ("0.000000");
106         if (item->branch == NULL)
107          {
108            item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
109            strcpy (item->branch, "0.000000");
110          }
111         item->depth = depth;
112         pllStackPush (stack, item);
113         item_active  = 1;       /* active = 1 */
114         item = NULL;
115         --depth;
116         break;
117 
118        case PLL_TOKEN_STRING:
119 #ifdef PLLDEBUG
120        printf ("PLL_TOKEN_STRING      %.*s\n", token.len, token.lexeme);
121 #endif
122         if (prev_token.tokenType != PLL_TOKEN_OPAREN &&
123             prev_token.tokenType != PLL_TOKEN_CPAREN &&
124             prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
125             prev_token.tokenType != PLL_TOKEN_COMMA) return (0);
126         if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
127         item->name = my_strndup (token.lexeme, token.len);
128 
129         item_active = 1;
130         item->depth = depth;
131         if (prev_token.tokenType == PLL_TOKEN_COMMA  ||
132             prev_token.tokenType == PLL_TOKEN_OPAREN ||
133             prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1;
134         memcpy (&prev_token, &token, sizeof (pllLexToken));
135         break;
136 
137        case PLL_TOKEN_FLOAT:
138        case PLL_TOKEN_NUMBER:
139 #ifdef PLLDEBUG
140        if (token.tokenType == PLL_TOKEN_FLOAT) printf ("PLL_TOKEN_FLOAT\n"); else printf ("PLL_TOKEN_NUMBER\n");
141 #endif
142          if  (prev_token.tokenType != PLL_TOKEN_OPAREN &&
143               prev_token.tokenType != PLL_TOKEN_CPAREN &&
144               prev_token.tokenType != PLL_TOKEN_COLON  &&
145               prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
146               prev_token.tokenType != PLL_TOKEN_COMMA) return (0);
147         if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
148         if (prev_token.tokenType == PLL_TOKEN_COLON)
149          {
150            item->branch = my_strndup (token.lexeme, token.len);
151          }
152         else
153          {
154            if (prev_token.tokenType == PLL_TOKEN_COMMA  ||
155                prev_token.tokenType == PLL_TOKEN_OPAREN ||
156                prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1;
157            //if (prev_token.tokenType != PLL_TOKEN_UNKNOWN) ++ indent;
158            item->name = my_strndup (token.lexeme, token.len);
159          }
160         item_active = 1;
161         item->depth = depth;
162         memcpy (&prev_token, &token, sizeof (pllLexToken));
163         break;
164 
165        case PLL_TOKEN_COLON:
166 #ifdef PLLDEBUG
167        printf ("PLL_TOKEN_COLON\n");
168 #endif
169         if (prev_token.tokenType != PLL_TOKEN_CPAREN &&
170             prev_token.tokenType != PLL_TOKEN_STRING &&
171             prev_token.tokenType != PLL_TOKEN_FLOAT  &&
172             prev_token.tokenType != PLL_TOKEN_NUMBER) return (0);
173         memcpy (&prev_token, &token, sizeof (pllLexToken));
174         break;
175 
176        case PLL_TOKEN_COMMA:
177 #ifdef PLLDEBUG
178        printf ("PLL_TOKEN_COMMA\n");
179 #endif
180         if (prev_token.tokenType != PLL_TOKEN_CPAREN &&
181              prev_token.tokenType != PLL_TOKEN_STRING &&
182              prev_token.tokenType != PLL_TOKEN_FLOAT &&
183              prev_token.tokenType != PLL_TOKEN_NUMBER) return (0);
184         memcpy (&prev_token, &token, sizeof (pllLexToken));
185 
186         /* push to the stack */
187         if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); // possibly not nece
188         //if (item->name   == NULL) item->name   = strdup ("INTERNAL_NODE");
189         if (item->name == NULL)
190          {
191            item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char));
192            strcpy (item->name, "INTERNAL_NODE");
193          }
194         //if (item->branch == NULL) item->branch = strdup ("0.000000");
195         if (item->branch == NULL)
196          {
197            item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
198            strcpy (item->branch, "0.000000");
199          }
200         item->depth = depth;
201         pllStackPush (stack, item);
202         item_active  = 0;
203         item = NULL;
204         break;
205 
206        case PLL_TOKEN_SEMICOLON:
207 #ifdef PLLDEBUG
208         printf ("PLL_TOKEN_SEMICOLON\n");
209 #endif
210         /* push to the stack */
211         if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
212         //if (item->name   == NULL) item->name   = strdup ("ROOT_NODE");
213         if (item->name == NULL)
214          {
215            item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char));
216            strcpy (item->name, "ROOT_NODE");
217          }
218         //if (item->branch == NULL) item->branch = strdup ("0.000000");
219         if (item->branch == NULL)
220          {
221            item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
222            strcpy (item->branch, "0.000000");
223          }
224         pllStackPush (stack, item);
225         item_active  = 0;
226         item = NULL;
227         break;
228        default:
229 #ifdef __DEBUGGING_MODE
230          printf ("Unknown token: %d\n", token.tokenType);
231 #endif
232        // TODO: Finish this part and add error codes
233         break;
234      }
235     NEXT_TOKEN
236     CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE);
237   }
238   if (item_active)
239    {
240      if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
241      //if (item->name   == NULL) item->name   = strdup ("ROOT_NODE");
242      if (item->name == NULL)
243       {
244         item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char));
245         strcpy (item->name, "ROOT_NODE");
246       }
247      //if (item->branch == NULL) item->branch = strdup ("0.000000");
248      if (item->branch == NULL)
249       {
250         item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
251         strcpy (item->branch, "0.000000");
252       }
253      pllStackPush (stack, item);
254      item_active  = 0;
255    }
256 
257   if (nop || token.tokenType == PLL_TOKEN_UNKNOWN)
258    {
259      return (0);
260    }
261 
262   return (1);
263 }
264 
265 #ifdef __DEBUGGING_MODE
stack_dump(pllStack ** stack)266 void stack_dump(pllStack ** stack)
267 {
268   pllNewickNodeInfo * item;
269   pllStack * head;
270   int i;
271 
272   head = *stack;
273   while (head)
274    {
275      item = (pllNewickNodeInfo *) head->item;
276 
277      for (i = 0; i < item->depth; ++ i) printf ("\t");
278 
279      printf ("%s:%s\n", item->name, item->branch);
280 
281      head = head->next;
282    }
283 }
284 #endif
285 
286 static void
assign_ranks(pllStack * stack,int * nodes,int * leaves)287 assign_ranks (pllStack * stack, int * nodes, int * leaves)
288 {
289   pllStack * head;
290   pllNewickNodeInfo * item, * tmp;
291   pllStack * preorder = NULL;
292   int children;
293   int depth;
294 
295   *nodes = *leaves = 0;
296 
297 
298   head = stack;
299   while (head)
300   {
301     assert (head->item);
302     item = (pllNewickNodeInfo *) head->item;
303 
304     if (item->leaf)  ++ (*leaves);
305 
306     if (preorder)
307      {
308        tmp = (pllNewickNodeInfo *) preorder->item;
309        children = 0;
310        while (item->depth < tmp->depth)
311         {
312           children = 1;
313           depth = tmp->depth;
314           pllStackPop (&preorder);
315           tmp = preorder->item;
316           while (tmp->depth == depth)
317            {
318              ++ children;
319              pllStackPop (&preorder);
320              tmp = (pllNewickNodeInfo *)preorder->item;
321            }
322           tmp->rank += children;
323         }
324      }
325 
326     ++ (*nodes);
327     head = head->next;
328 
329     if (item->leaf)
330      {
331        if (!preorder) return;
332 
333        children = 1;
334        tmp = preorder->item;
335        while (tmp->depth == item->depth)
336         {
337           ++ children;
338           pllStackPop (&preorder);
339           assert (preorder);
340           tmp = (pllNewickNodeInfo *)preorder->item;
341         }
342        tmp->rank += children;
343      }
344     else
345      {
346        pllStackPush (&preorder, item);
347      }
348   }
349 
350   while (preorder->item != stack->item)
351   {
352     item = (pllNewickNodeInfo *)pllStackPop (&preorder);
353     tmp  = (pllNewickNodeInfo *) preorder->item;
354     children = 1;
355 
356     while (tmp->depth == item->depth)
357      {
358        ++ children;
359        item = (pllNewickNodeInfo *) pllStackPop (&preorder);
360        tmp  = (pllNewickNodeInfo *) preorder->item;
361      }
362     tmp->rank += children;
363     children = 0;
364   }
365  assert (preorder->item == stack->item);
366 
367  pllStackClear (&preorder);
368 }
369 
370 /** @ingroup newickParseGroup
371     @brief Validate if a newick tree is a valid phylogenetic tree
372 
373     A valid tree is one where the root node is binary or ternary
374     and all other internal nodes are binary. In case the root
375     is ternary then the tree must contain at least another internal
376     node and the total number of nodes must be equal to
377     \f$ 2l - 2\f$, where \f$l\f$ is the number of leaves. If the
378     root is binary, then the total number of nodes must be equal
379     to \f$2l - 1\f$.
380 
381     @param tree
382       Newick tree wrapper structure which contains the stack representation of the parsed newick tree
383 
384     @return
385       Returns \b 1 in case of success, otherwise \b 0
386 */
387 int
pllValidateNewick(pllNewickTree * t)388 pllValidateNewick (pllNewickTree * t)
389 {
390   pllStack * head;
391   pllNewickNodeInfo * item;
392   int correct = 0;
393 
394   item = t->tree->item;
395   if (item->rank != 2 && item->rank != 3) return (0);
396   head = t->tree->next;
397   while (head)
398   {
399     item = head->item;
400     if (item->rank != 2 && item->rank != 0)
401      {
402        return (0);
403      }
404     head = head->next;
405   }
406 
407   item = t->tree->item;
408 
409   if (item->rank == 2)
410    {
411      correct = (t->nodes == 2 * t->tips -1);
412      if (correct)
413       {
414         errno = PLL_NEWICK_ROOTED_TREE;
415       }
416      else
417       {
418         errno = PLL_NEWICK_BAD_STRUCTURE;
419       }
420      return (PLL_FALSE);
421    }
422 
423 
424   correct = ((t->nodes == 2 * t->tips - 2) && t->nodes != 4);
425   if (correct) return (PLL_TRUE);
426 
427   errno = PLL_NEWICK_BAD_STRUCTURE;
428 
429   return (1);
430 }
431 
432 
433 /** @ingroup newickParseGroup
434     @brief Convert a binary rooted trree to a binary unrooted tree
435 
436     Changes the root of the node to have 3 descendants instead of two, deletes its last immediate descendant internal node
437     and takes the two children (of the deleted internal node) as its children.
438 
439     @param
440       Newick tree
441 
442     @return
443       \b PLL_TRUE in case of success, otherwise \b PLL_FALSE and \a errno is set
444 */
445 int
pllNewickUnroot(pllNewickTree * t)446 pllNewickUnroot (pllNewickTree * t)
447 {
448   pllStack * tmp;
449   pllNewickNodeInfo * item;
450 
451   item = t->tree->item;
452   if (item->rank == 2)
453    {
454      item->rank = 3;
455      t->nodes--;
456      item = t->tree->next->item;
457      if (item->rank == 0)
458       {
459         tmp = t->tree->next->next;
460         t->tree->next->next = t->tree->next->next->next;
461       }
462      else
463       {
464         tmp = t->tree->next;
465         t->tree->next = t->tree->next->next;
466       }
467      item = tmp->item;
468      rax_free (item->name);
469      rax_free (tmp->item);
470      rax_free (tmp);
471    }
472 
473   return (pllValidateNewick (t));
474 }
475 
476 
477 /** @ingroup newickParseGroup
478     @brief Parse a newick tree string
479 
480     Parse a newick string and create a stack structure which represents the tree
481     in a preorder traversal form. Each element of the stack represents one node
482     and consists of its name, branch length, number of children and depth. The
483     stack structure is finally wrapped in a \a pllNewickTree structure which
484     also contains the number of nodes and leaves.
485 
486     @param newick
487       String containing the newick tree
488 
489     @return
490       Returns a pointer to the created \a pllNewickTree structure in case of success, otherwise \b NULL
491 */
492 pllNewickTree *
pllNewickParseString(const char * newick)493 pllNewickParseString (const char * newick)
494 {
495   int n, input, rc;
496   pllNewickTree * t;
497   int nodes, leaves;
498 
499   t = (pllNewickTree *) rax_calloc (1, sizeof (pllNewickTree));
500 
501   n = strlen (newick);
502 
503   init_lexan (newick, n);
504   input = get_next_symbol();
505 
506   rc = parse_newick (&(t->tree), &input);
507   if (!rc)
508    {
509      /* TODO: properly clean t->tree */
510      rax_free (t);
511      t = NULL;
512    }
513   else
514    {
515      assign_ranks (t->tree, &nodes, &leaves);
516      t->nodes = nodes;
517      t->tips  = leaves;
518    }
519 
520   return (t);
521 }
522 
523 /** @ingroup newickParseGroup
524     @brief Deallocate newick parser stack structure
525 
526     Deallocates the newick parser stack structure that represents the parsed tree. It
527     also frees all memory allocated by elements of the stack structure.
528 
529     @param tree
530       The tree stack structure
531 */
pllNewickParseDestroy(pllNewickTree ** t)532 void pllNewickParseDestroy (pllNewickTree ** t)
533 {
534   pllNewickNodeInfo *  item;
535 
536   while ((item = (pllNewickNodeInfo *)pllStackPop (&((*t)->tree))))
537    {
538      rax_free (item->name);
539      rax_free (item->branch);
540      rax_free (item);
541    }
542   rax_free (*t);
543   (*t) = NULL;
544 }
545 
546 /** @ingroup newickParseGroup
547     @brief Parse a newick tree file
548 
549     Parse a newick file and create a stack structure which represents the tree
550     in a preorder traversal form. Each element of the stack represents one node
551     and consists of its name, branch length, number of children (rank) and depth. The
552     stack structure is finally wrapped in a \a pllNewickTree structure which
553     also contains the number of nodes and leaves.
554 
555     @param filename
556       Filename containing the newick tree
557 
558     @return
559       Returns a pointer to the created \a pllNewickTree structure in case of success, otherwise \b NULL
560 */
561 pllNewickTree *
pllNewickParseFile(const char * filename)562 pllNewickParseFile (const char * filename)
563 {
564   long n;
565   char * rawdata;
566   pllNewickTree * t;
567 
568   rawdata = pllReadFile (filename, &n);
569   if (!rawdata)
570    {
571      fprintf (stderr, "Error while opening/reading file %s\n", filename);
572      return (0);
573    }
574 
575   //printf ("%s\n\n", rawdata);
576 
577   t = pllNewickParseString (rawdata);
578 
579   rax_free (rawdata);
580 
581   return (t);
582 }
583 
584