1 /**
2 * PLL (version 1.0.0) a software library for phylogenetic inference
3 * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
4 *
5 * Derived from
6 * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
7 * trees by Alexandros Stamatakis
8 *
9 * This program is free software: you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License as published by the Free
11 * Software Foundation, either version 3 of the License, or (at your option)
12 * any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
17 * more details.
18 *
19 * You should have received a copy of the GNU General Public License along with
20 * this program. If not, see <http://www.gnu.org/licenses/>.
21 *
22 * For any other enquiries send an Email to Tomas Flouri
23 * Tomas.Flouri@h-its.org
24 *
25 * When publishing work that uses PLL please cite PLL
26 *
27 * @file newick.c
28 */
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <assert.h>
33 #include <math.h>
34
35 #include "pll.h"
36 #include "pllInternal.h"
37
38
39 /** @file newick.c
40
41 @brief Collection of routines for reading and parsing newick trees
42
43 Auxiliary functions for reading and parsing newick tree formats
44 */
45
46
47 /** @defgroup newickParseGroup Reading and parsing newick trees
48
49 This set of functions handles the reading and parsing of newick tree formats
50 */
51
52 static int
parse_newick(pllStack ** stack,int * inp)53 parse_newick (pllStack ** stack, int * inp)
54 {
55 pllNewickNodeInfo * item = NULL;
56 int item_active = 0;
57 pllLexToken token;
58 int input;
59 pllLexToken prev_token;
60 int nop = 0; /* number of open parentheses */
61 int depth = 0;
62
63 prev_token.tokenType = PLL_TOKEN_UNKNOWN;
64
65 input = *inp;
66
67 NEXT_TOKEN
68
69 while (token.tokenType != PLL_TOKEN_EOF && token.tokenType != PLL_TOKEN_UNKNOWN)
70 {
71 switch (token.tokenType)
72 {
73 case PLL_TOKEN_OPAREN:
74 #ifdef PLLDEBUG
75 printf ("PLL_TOKEN_OPAREN\n");
76 #endif
77 ++nop;
78 memcpy (&prev_token, &token, sizeof (pllLexToken));
79 ++depth;
80 break;
81
82 case PLL_TOKEN_CPAREN:
83 #ifdef PLLDEBUG
84 printf ("PLL_TOKEN_CPAREN\n");
85 #endif
86 if (prev_token.tokenType != PLL_TOKEN_CPAREN &&
87 prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
88 prev_token.tokenType != PLL_TOKEN_STRING &&
89 prev_token.tokenType != PLL_TOKEN_NUMBER &&
90 prev_token.tokenType != PLL_TOKEN_FLOAT) return (0);
91
92 if (!nop) return (0);
93 --nop;
94 memcpy (&prev_token, &token, sizeof (pllLexToken));
95
96 /* push to the stack */
97 if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); // possibly not nec
98 //if (item->name == NULL) item->name = strdup ("INTERNAL_NODE");
99 if (item->name == NULL)
100 {
101 item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char));
102 strcpy (item->name, "INTERNAL_NODE");
103 }
104
105 //if (item->branch == NULL) item->branch = strdup ("0.000000");
106 if (item->branch == NULL)
107 {
108 item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
109 strcpy (item->branch, "0.000000");
110 }
111 item->depth = depth;
112 pllStackPush (stack, item);
113 item_active = 1; /* active = 1 */
114 item = NULL;
115 --depth;
116 break;
117
118 case PLL_TOKEN_STRING:
119 #ifdef PLLDEBUG
120 printf ("PLL_TOKEN_STRING %.*s\n", token.len, token.lexeme);
121 #endif
122 if (prev_token.tokenType != PLL_TOKEN_OPAREN &&
123 prev_token.tokenType != PLL_TOKEN_CPAREN &&
124 prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
125 prev_token.tokenType != PLL_TOKEN_COMMA) return (0);
126 if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
127 item->name = my_strndup (token.lexeme, token.len);
128
129 item_active = 1;
130 item->depth = depth;
131 if (prev_token.tokenType == PLL_TOKEN_COMMA ||
132 prev_token.tokenType == PLL_TOKEN_OPAREN ||
133 prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1;
134 memcpy (&prev_token, &token, sizeof (pllLexToken));
135 break;
136
137 case PLL_TOKEN_FLOAT:
138 case PLL_TOKEN_NUMBER:
139 #ifdef PLLDEBUG
140 if (token.tokenType == PLL_TOKEN_FLOAT) printf ("PLL_TOKEN_FLOAT\n"); else printf ("PLL_TOKEN_NUMBER\n");
141 #endif
142 if (prev_token.tokenType != PLL_TOKEN_OPAREN &&
143 prev_token.tokenType != PLL_TOKEN_CPAREN &&
144 prev_token.tokenType != PLL_TOKEN_COLON &&
145 prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
146 prev_token.tokenType != PLL_TOKEN_COMMA) return (0);
147 if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
148 if (prev_token.tokenType == PLL_TOKEN_COLON)
149 {
150 item->branch = my_strndup (token.lexeme, token.len);
151 }
152 else
153 {
154 if (prev_token.tokenType == PLL_TOKEN_COMMA ||
155 prev_token.tokenType == PLL_TOKEN_OPAREN ||
156 prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1;
157 //if (prev_token.tokenType != PLL_TOKEN_UNKNOWN) ++ indent;
158 item->name = my_strndup (token.lexeme, token.len);
159 }
160 item_active = 1;
161 item->depth = depth;
162 memcpy (&prev_token, &token, sizeof (pllLexToken));
163 break;
164
165 case PLL_TOKEN_COLON:
166 #ifdef PLLDEBUG
167 printf ("PLL_TOKEN_COLON\n");
168 #endif
169 if (prev_token.tokenType != PLL_TOKEN_CPAREN &&
170 prev_token.tokenType != PLL_TOKEN_STRING &&
171 prev_token.tokenType != PLL_TOKEN_FLOAT &&
172 prev_token.tokenType != PLL_TOKEN_NUMBER) return (0);
173 memcpy (&prev_token, &token, sizeof (pllLexToken));
174 break;
175
176 case PLL_TOKEN_COMMA:
177 #ifdef PLLDEBUG
178 printf ("PLL_TOKEN_COMMA\n");
179 #endif
180 if (prev_token.tokenType != PLL_TOKEN_CPAREN &&
181 prev_token.tokenType != PLL_TOKEN_STRING &&
182 prev_token.tokenType != PLL_TOKEN_FLOAT &&
183 prev_token.tokenType != PLL_TOKEN_NUMBER) return (0);
184 memcpy (&prev_token, &token, sizeof (pllLexToken));
185
186 /* push to the stack */
187 if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); // possibly not nece
188 //if (item->name == NULL) item->name = strdup ("INTERNAL_NODE");
189 if (item->name == NULL)
190 {
191 item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char));
192 strcpy (item->name, "INTERNAL_NODE");
193 }
194 //if (item->branch == NULL) item->branch = strdup ("0.000000");
195 if (item->branch == NULL)
196 {
197 item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
198 strcpy (item->branch, "0.000000");
199 }
200 item->depth = depth;
201 pllStackPush (stack, item);
202 item_active = 0;
203 item = NULL;
204 break;
205
206 case PLL_TOKEN_SEMICOLON:
207 #ifdef PLLDEBUG
208 printf ("PLL_TOKEN_SEMICOLON\n");
209 #endif
210 /* push to the stack */
211 if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
212 //if (item->name == NULL) item->name = strdup ("ROOT_NODE");
213 if (item->name == NULL)
214 {
215 item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char));
216 strcpy (item->name, "ROOT_NODE");
217 }
218 //if (item->branch == NULL) item->branch = strdup ("0.000000");
219 if (item->branch == NULL)
220 {
221 item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
222 strcpy (item->branch, "0.000000");
223 }
224 pllStackPush (stack, item);
225 item_active = 0;
226 item = NULL;
227 break;
228 default:
229 #ifdef __DEBUGGING_MODE
230 printf ("Unknown token: %d\n", token.tokenType);
231 #endif
232 // TODO: Finish this part and add error codes
233 break;
234 }
235 NEXT_TOKEN
236 CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE);
237 }
238 if (item_active)
239 {
240 if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
241 //if (item->name == NULL) item->name = strdup ("ROOT_NODE");
242 if (item->name == NULL)
243 {
244 item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char));
245 strcpy (item->name, "ROOT_NODE");
246 }
247 //if (item->branch == NULL) item->branch = strdup ("0.000000");
248 if (item->branch == NULL)
249 {
250 item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
251 strcpy (item->branch, "0.000000");
252 }
253 pllStackPush (stack, item);
254 item_active = 0;
255 }
256
257 if (nop || token.tokenType == PLL_TOKEN_UNKNOWN)
258 {
259 return (0);
260 }
261
262 return (1);
263 }
264
265 #ifdef __DEBUGGING_MODE
stack_dump(pllStack ** stack)266 void stack_dump(pllStack ** stack)
267 {
268 pllNewickNodeInfo * item;
269 pllStack * head;
270 int i;
271
272 head = *stack;
273 while (head)
274 {
275 item = (pllNewickNodeInfo *) head->item;
276
277 for (i = 0; i < item->depth; ++ i) printf ("\t");
278
279 printf ("%s:%s\n", item->name, item->branch);
280
281 head = head->next;
282 }
283 }
284 #endif
285
286 static void
assign_ranks(pllStack * stack,int * nodes,int * leaves)287 assign_ranks (pllStack * stack, int * nodes, int * leaves)
288 {
289 pllStack * head;
290 pllNewickNodeInfo * item, * tmp;
291 pllStack * preorder = NULL;
292 int children;
293 int depth;
294
295 *nodes = *leaves = 0;
296
297
298 head = stack;
299 while (head)
300 {
301 assert (head->item);
302 item = (pllNewickNodeInfo *) head->item;
303
304 if (item->leaf) ++ (*leaves);
305
306 if (preorder)
307 {
308 tmp = (pllNewickNodeInfo *) preorder->item;
309 children = 0;
310 while (item->depth < tmp->depth)
311 {
312 children = 1;
313 depth = tmp->depth;
314 pllStackPop (&preorder);
315 tmp = preorder->item;
316 while (tmp->depth == depth)
317 {
318 ++ children;
319 pllStackPop (&preorder);
320 tmp = (pllNewickNodeInfo *)preorder->item;
321 }
322 tmp->rank += children;
323 }
324 }
325
326 ++ (*nodes);
327 head = head->next;
328
329 if (item->leaf)
330 {
331 if (!preorder) return;
332
333 children = 1;
334 tmp = preorder->item;
335 while (tmp->depth == item->depth)
336 {
337 ++ children;
338 pllStackPop (&preorder);
339 assert (preorder);
340 tmp = (pllNewickNodeInfo *)preorder->item;
341 }
342 tmp->rank += children;
343 }
344 else
345 {
346 pllStackPush (&preorder, item);
347 }
348 }
349
350 while (preorder->item != stack->item)
351 {
352 item = (pllNewickNodeInfo *)pllStackPop (&preorder);
353 tmp = (pllNewickNodeInfo *) preorder->item;
354 children = 1;
355
356 while (tmp->depth == item->depth)
357 {
358 ++ children;
359 item = (pllNewickNodeInfo *) pllStackPop (&preorder);
360 tmp = (pllNewickNodeInfo *) preorder->item;
361 }
362 tmp->rank += children;
363 children = 0;
364 }
365 assert (preorder->item == stack->item);
366
367 pllStackClear (&preorder);
368 }
369
370 /** @ingroup newickParseGroup
371 @brief Validate if a newick tree is a valid phylogenetic tree
372
373 A valid tree is one where the root node is binary or ternary
374 and all other internal nodes are binary. In case the root
375 is ternary then the tree must contain at least another internal
376 node and the total number of nodes must be equal to
377 \f$ 2l - 2\f$, where \f$l\f$ is the number of leaves. If the
378 root is binary, then the total number of nodes must be equal
379 to \f$2l - 1\f$.
380
381 @param tree
382 Newick tree wrapper structure which contains the stack representation of the parsed newick tree
383
384 @return
385 Returns \b 1 in case of success, otherwise \b 0
386 */
387 int
pllValidateNewick(pllNewickTree * t)388 pllValidateNewick (pllNewickTree * t)
389 {
390 pllStack * head;
391 pllNewickNodeInfo * item;
392 int correct = 0;
393
394 item = t->tree->item;
395 if (item->rank != 2 && item->rank != 3) return (0);
396 head = t->tree->next;
397 while (head)
398 {
399 item = head->item;
400 if (item->rank != 2 && item->rank != 0)
401 {
402 return (0);
403 }
404 head = head->next;
405 }
406
407 item = t->tree->item;
408
409 if (item->rank == 2)
410 {
411 correct = (t->nodes == 2 * t->tips -1);
412 if (correct)
413 {
414 errno = PLL_NEWICK_ROOTED_TREE;
415 }
416 else
417 {
418 errno = PLL_NEWICK_BAD_STRUCTURE;
419 }
420 return (PLL_FALSE);
421 }
422
423
424 correct = ((t->nodes == 2 * t->tips - 2) && t->nodes != 4);
425 if (correct) return (PLL_TRUE);
426
427 errno = PLL_NEWICK_BAD_STRUCTURE;
428
429 return (1);
430 }
431
432
433 /** @ingroup newickParseGroup
434 @brief Convert a binary rooted trree to a binary unrooted tree
435
436 Changes the root of the node to have 3 descendants instead of two, deletes its last immediate descendant internal node
437 and takes the two children (of the deleted internal node) as its children.
438
439 @param
440 Newick tree
441
442 @return
443 \b PLL_TRUE in case of success, otherwise \b PLL_FALSE and \a errno is set
444 */
445 int
pllNewickUnroot(pllNewickTree * t)446 pllNewickUnroot (pllNewickTree * t)
447 {
448 pllStack * tmp;
449 pllNewickNodeInfo * item;
450
451 item = t->tree->item;
452 if (item->rank == 2)
453 {
454 item->rank = 3;
455 t->nodes--;
456 item = t->tree->next->item;
457 if (item->rank == 0)
458 {
459 tmp = t->tree->next->next;
460 t->tree->next->next = t->tree->next->next->next;
461 }
462 else
463 {
464 tmp = t->tree->next;
465 t->tree->next = t->tree->next->next;
466 }
467 item = tmp->item;
468 rax_free (item->name);
469 rax_free (tmp->item);
470 rax_free (tmp);
471 }
472
473 return (pllValidateNewick (t));
474 }
475
476
477 /** @ingroup newickParseGroup
478 @brief Parse a newick tree string
479
480 Parse a newick string and create a stack structure which represents the tree
481 in a preorder traversal form. Each element of the stack represents one node
482 and consists of its name, branch length, number of children and depth. The
483 stack structure is finally wrapped in a \a pllNewickTree structure which
484 also contains the number of nodes and leaves.
485
486 @param newick
487 String containing the newick tree
488
489 @return
490 Returns a pointer to the created \a pllNewickTree structure in case of success, otherwise \b NULL
491 */
492 pllNewickTree *
pllNewickParseString(const char * newick)493 pllNewickParseString (const char * newick)
494 {
495 int n, input, rc;
496 pllNewickTree * t;
497 int nodes, leaves;
498
499 t = (pllNewickTree *) rax_calloc (1, sizeof (pllNewickTree));
500
501 n = strlen (newick);
502
503 init_lexan (newick, n);
504 input = get_next_symbol();
505
506 rc = parse_newick (&(t->tree), &input);
507 if (!rc)
508 {
509 /* TODO: properly clean t->tree */
510 rax_free (t);
511 t = NULL;
512 }
513 else
514 {
515 assign_ranks (t->tree, &nodes, &leaves);
516 t->nodes = nodes;
517 t->tips = leaves;
518 }
519
520 return (t);
521 }
522
523 /** @ingroup newickParseGroup
524 @brief Deallocate newick parser stack structure
525
526 Deallocates the newick parser stack structure that represents the parsed tree. It
527 also frees all memory allocated by elements of the stack structure.
528
529 @param tree
530 The tree stack structure
531 */
pllNewickParseDestroy(pllNewickTree ** t)532 void pllNewickParseDestroy (pllNewickTree ** t)
533 {
534 pllNewickNodeInfo * item;
535
536 while ((item = (pllNewickNodeInfo *)pllStackPop (&((*t)->tree))))
537 {
538 rax_free (item->name);
539 rax_free (item->branch);
540 rax_free (item);
541 }
542 rax_free (*t);
543 (*t) = NULL;
544 }
545
546 /** @ingroup newickParseGroup
547 @brief Parse a newick tree file
548
549 Parse a newick file and create a stack structure which represents the tree
550 in a preorder traversal form. Each element of the stack represents one node
551 and consists of its name, branch length, number of children (rank) and depth. The
552 stack structure is finally wrapped in a \a pllNewickTree structure which
553 also contains the number of nodes and leaves.
554
555 @param filename
556 Filename containing the newick tree
557
558 @return
559 Returns a pointer to the created \a pllNewickTree structure in case of success, otherwise \b NULL
560 */
561 pllNewickTree *
pllNewickParseFile(const char * filename)562 pllNewickParseFile (const char * filename)
563 {
564 long n;
565 char * rawdata;
566 pllNewickTree * t;
567
568 rawdata = pllReadFile (filename, &n);
569 if (!rawdata)
570 {
571 fprintf (stderr, "Error while opening/reading file %s\n", filename);
572 return (0);
573 }
574
575 //printf ("%s\n\n", rawdata);
576
577 t = pllNewickParseString (rawdata);
578
579 rax_free (rawdata);
580
581 return (t);
582 }
583
584