/** * PLL (version 1.0.0) a software library for phylogenetic inference * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis * * Derived from * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic * trees by Alexandros Stamatakis * * This program is free software: you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation, either version 3 of the License, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . * * For any other enquiries send an Email to Tomas Flouri * Tomas.Flouri@h-its.org * * When publishing work that uses PLL please cite PLL * * @file newick.c */ #include #include #include #include #include #include "pll.h" #include "pllInternal.h" /** @file newick.c @brief Collection of routines for reading and parsing newick trees Auxiliary functions for reading and parsing newick tree formats */ /** @defgroup newickParseGroup Reading and parsing newick trees This set of functions handles the reading and parsing of newick tree formats */ static int parse_newick (pllStack ** stack, int * inp) { pllNewickNodeInfo * item = NULL; int item_active = 0; pllLexToken token; int input; pllLexToken prev_token; int nop = 0; /* number of open parentheses */ int depth = 0; prev_token.tokenType = PLL_TOKEN_UNKNOWN; input = *inp; NEXT_TOKEN while (token.tokenType != PLL_TOKEN_EOF && token.tokenType != PLL_TOKEN_UNKNOWN) { switch (token.tokenType) { case PLL_TOKEN_OPAREN: #ifdef PLLDEBUG printf ("PLL_TOKEN_OPAREN\n"); #endif ++nop; memcpy (&prev_token, &token, sizeof (pllLexToken)); ++depth; break; case PLL_TOKEN_CPAREN: #ifdef PLLDEBUG printf ("PLL_TOKEN_CPAREN\n"); #endif if (prev_token.tokenType != PLL_TOKEN_CPAREN && prev_token.tokenType != PLL_TOKEN_UNKNOWN && prev_token.tokenType != PLL_TOKEN_STRING && prev_token.tokenType != PLL_TOKEN_NUMBER && prev_token.tokenType != PLL_TOKEN_FLOAT) return (0); if (!nop) return (0); --nop; memcpy (&prev_token, &token, sizeof (pllLexToken)); /* push to the stack */ if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); // possibly not nec //if (item->name == NULL) item->name = strdup ("INTERNAL_NODE"); if (item->name == NULL) { item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char)); strcpy (item->name, "INTERNAL_NODE"); } //if (item->branch == NULL) item->branch = strdup ("0.000000"); if (item->branch == NULL) { item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char)); strcpy (item->branch, "0.000000"); } item->depth = depth; pllStackPush (stack, item); item_active = 1; /* active = 1 */ item = NULL; --depth; break; case PLL_TOKEN_STRING: #ifdef PLLDEBUG printf ("PLL_TOKEN_STRING %.*s\n", token.len, token.lexeme); #endif if (prev_token.tokenType != PLL_TOKEN_OPAREN && prev_token.tokenType != PLL_TOKEN_CPAREN && prev_token.tokenType != PLL_TOKEN_UNKNOWN && prev_token.tokenType != PLL_TOKEN_COMMA) return (0); if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); item->name = my_strndup (token.lexeme, token.len); item_active = 1; item->depth = depth; if (prev_token.tokenType == PLL_TOKEN_COMMA || prev_token.tokenType == PLL_TOKEN_OPAREN || prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1; memcpy (&prev_token, &token, sizeof (pllLexToken)); break; case PLL_TOKEN_FLOAT: case PLL_TOKEN_NUMBER: #ifdef PLLDEBUG if (token.tokenType == PLL_TOKEN_FLOAT) printf ("PLL_TOKEN_FLOAT\n"); else printf ("PLL_TOKEN_NUMBER\n"); #endif if (prev_token.tokenType != PLL_TOKEN_OPAREN && prev_token.tokenType != PLL_TOKEN_CPAREN && prev_token.tokenType != PLL_TOKEN_COLON && prev_token.tokenType != PLL_TOKEN_UNKNOWN && prev_token.tokenType != PLL_TOKEN_COMMA) return (0); if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); if (prev_token.tokenType == PLL_TOKEN_COLON) { item->branch = my_strndup (token.lexeme, token.len); } else { if (prev_token.tokenType == PLL_TOKEN_COMMA || prev_token.tokenType == PLL_TOKEN_OPAREN || prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1; //if (prev_token.tokenType != PLL_TOKEN_UNKNOWN) ++ indent; item->name = my_strndup (token.lexeme, token.len); } item_active = 1; item->depth = depth; memcpy (&prev_token, &token, sizeof (pllLexToken)); break; case PLL_TOKEN_COLON: #ifdef PLLDEBUG printf ("PLL_TOKEN_COLON\n"); #endif if (prev_token.tokenType != PLL_TOKEN_CPAREN && prev_token.tokenType != PLL_TOKEN_STRING && prev_token.tokenType != PLL_TOKEN_FLOAT && prev_token.tokenType != PLL_TOKEN_NUMBER) return (0); memcpy (&prev_token, &token, sizeof (pllLexToken)); break; case PLL_TOKEN_COMMA: #ifdef PLLDEBUG printf ("PLL_TOKEN_COMMA\n"); #endif if (prev_token.tokenType != PLL_TOKEN_CPAREN && prev_token.tokenType != PLL_TOKEN_STRING && prev_token.tokenType != PLL_TOKEN_FLOAT && prev_token.tokenType != PLL_TOKEN_NUMBER) return (0); memcpy (&prev_token, &token, sizeof (pllLexToken)); /* push to the stack */ if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); // possibly not nece //if (item->name == NULL) item->name = strdup ("INTERNAL_NODE"); if (item->name == NULL) { item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char)); strcpy (item->name, "INTERNAL_NODE"); } //if (item->branch == NULL) item->branch = strdup ("0.000000"); if (item->branch == NULL) { item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char)); strcpy (item->branch, "0.000000"); } item->depth = depth; pllStackPush (stack, item); item_active = 0; item = NULL; break; case PLL_TOKEN_SEMICOLON: #ifdef PLLDEBUG printf ("PLL_TOKEN_SEMICOLON\n"); #endif /* push to the stack */ if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); //if (item->name == NULL) item->name = strdup ("ROOT_NODE"); if (item->name == NULL) { item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char)); strcpy (item->name, "ROOT_NODE"); } //if (item->branch == NULL) item->branch = strdup ("0.000000"); if (item->branch == NULL) { item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char)); strcpy (item->branch, "0.000000"); } pllStackPush (stack, item); item_active = 0; item = NULL; break; default: #ifdef __DEBUGGING_MODE printf ("Unknown token: %d\n", token.tokenType); #endif // TODO: Finish this part and add error codes break; } NEXT_TOKEN CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE); } if (item_active) { if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); //if (item->name == NULL) item->name = strdup ("ROOT_NODE"); if (item->name == NULL) { item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char)); strcpy (item->name, "ROOT_NODE"); } //if (item->branch == NULL) item->branch = strdup ("0.000000"); if (item->branch == NULL) { item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char)); strcpy (item->branch, "0.000000"); } pllStackPush (stack, item); item_active = 0; } if (nop || token.tokenType == PLL_TOKEN_UNKNOWN) { return (0); } return (1); } #ifdef __DEBUGGING_MODE void stack_dump(pllStack ** stack) { pllNewickNodeInfo * item; pllStack * head; int i; head = *stack; while (head) { item = (pllNewickNodeInfo *) head->item; for (i = 0; i < item->depth; ++ i) printf ("\t"); printf ("%s:%s\n", item->name, item->branch); head = head->next; } } #endif static void assign_ranks (pllStack * stack, int * nodes, int * leaves) { pllStack * head; pllNewickNodeInfo * item, * tmp; pllStack * preorder = NULL; int children; int depth; *nodes = *leaves = 0; head = stack; while (head) { assert (head->item); item = (pllNewickNodeInfo *) head->item; if (item->leaf) ++ (*leaves); if (preorder) { tmp = (pllNewickNodeInfo *) preorder->item; children = 0; while (item->depth < tmp->depth) { children = 1; depth = tmp->depth; pllStackPop (&preorder); tmp = preorder->item; while (tmp->depth == depth) { ++ children; pllStackPop (&preorder); tmp = (pllNewickNodeInfo *)preorder->item; } tmp->rank += children; } } ++ (*nodes); head = head->next; if (item->leaf) { if (!preorder) return; children = 1; tmp = preorder->item; while (tmp->depth == item->depth) { ++ children; pllStackPop (&preorder); assert (preorder); tmp = (pllNewickNodeInfo *)preorder->item; } tmp->rank += children; } else { pllStackPush (&preorder, item); } } while (preorder->item != stack->item) { item = (pllNewickNodeInfo *)pllStackPop (&preorder); tmp = (pllNewickNodeInfo *) preorder->item; children = 1; while (tmp->depth == item->depth) { ++ children; item = (pllNewickNodeInfo *) pllStackPop (&preorder); tmp = (pllNewickNodeInfo *) preorder->item; } tmp->rank += children; children = 0; } assert (preorder->item == stack->item); pllStackClear (&preorder); } /** @ingroup newickParseGroup @brief Validate if a newick tree is a valid phylogenetic tree A valid tree is one where the root node is binary or ternary and all other internal nodes are binary. In case the root is ternary then the tree must contain at least another internal node and the total number of nodes must be equal to \f$ 2l - 2\f$, where \f$l\f$ is the number of leaves. If the root is binary, then the total number of nodes must be equal to \f$2l - 1\f$. @param tree Newick tree wrapper structure which contains the stack representation of the parsed newick tree @return Returns \b 1 in case of success, otherwise \b 0 */ int pllValidateNewick (pllNewickTree * t) { pllStack * head; pllNewickNodeInfo * item; int correct = 0; item = t->tree->item; if (item->rank != 2 && item->rank != 3) return (0); head = t->tree->next; while (head) { item = head->item; if (item->rank != 2 && item->rank != 0) { return (0); } head = head->next; } item = t->tree->item; if (item->rank == 2) { correct = (t->nodes == 2 * t->tips -1); if (correct) { errno = PLL_NEWICK_ROOTED_TREE; } else { errno = PLL_NEWICK_BAD_STRUCTURE; } return (PLL_FALSE); } correct = ((t->nodes == 2 * t->tips - 2) && t->nodes != 4); if (correct) return (PLL_TRUE); errno = PLL_NEWICK_BAD_STRUCTURE; return (1); } /** @ingroup newickParseGroup @brief Convert a binary rooted trree to a binary unrooted tree Changes the root of the node to have 3 descendants instead of two, deletes its last immediate descendant internal node and takes the two children (of the deleted internal node) as its children. @param Newick tree @return \b PLL_TRUE in case of success, otherwise \b PLL_FALSE and \a errno is set */ int pllNewickUnroot (pllNewickTree * t) { pllStack * tmp; pllNewickNodeInfo * item; item = t->tree->item; if (item->rank == 2) { item->rank = 3; t->nodes--; item = t->tree->next->item; if (item->rank == 0) { tmp = t->tree->next->next; t->tree->next->next = t->tree->next->next->next; } else { tmp = t->tree->next; t->tree->next = t->tree->next->next; } item = tmp->item; rax_free (item->name); rax_free (tmp->item); rax_free (tmp); } return (pllValidateNewick (t)); } /** @ingroup newickParseGroup @brief Parse a newick tree string Parse a newick string and create a stack structure which represents the tree in a preorder traversal form. Each element of the stack represents one node and consists of its name, branch length, number of children and depth. The stack structure is finally wrapped in a \a pllNewickTree structure which also contains the number of nodes and leaves. @param newick String containing the newick tree @return Returns a pointer to the created \a pllNewickTree structure in case of success, otherwise \b NULL */ pllNewickTree * pllNewickParseString (const char * newick) { int n, input, rc; pllNewickTree * t; int nodes, leaves; t = (pllNewickTree *) rax_calloc (1, sizeof (pllNewickTree)); n = strlen (newick); init_lexan (newick, n); input = get_next_symbol(); rc = parse_newick (&(t->tree), &input); if (!rc) { /* TODO: properly clean t->tree */ rax_free (t); t = NULL; } else { assign_ranks (t->tree, &nodes, &leaves); t->nodes = nodes; t->tips = leaves; } return (t); } /** @ingroup newickParseGroup @brief Deallocate newick parser stack structure Deallocates the newick parser stack structure that represents the parsed tree. It also frees all memory allocated by elements of the stack structure. @param tree The tree stack structure */ void pllNewickParseDestroy (pllNewickTree ** t) { pllNewickNodeInfo * item; while ((item = (pllNewickNodeInfo *)pllStackPop (&((*t)->tree)))) { rax_free (item->name); rax_free (item->branch); rax_free (item); } rax_free (*t); (*t) = NULL; } /** @ingroup newickParseGroup @brief Parse a newick tree file Parse a newick file and create a stack structure which represents the tree in a preorder traversal form. Each element of the stack represents one node and consists of its name, branch length, number of children (rank) and depth. The stack structure is finally wrapped in a \a pllNewickTree structure which also contains the number of nodes and leaves. @param filename Filename containing the newick tree @return Returns a pointer to the created \a pllNewickTree structure in case of success, otherwise \b NULL */ pllNewickTree * pllNewickParseFile (const char * filename) { long n; char * rawdata; pllNewickTree * t; rawdata = pllReadFile (filename, &n); if (!rawdata) { fprintf (stderr, "Error while opening/reading file %s\n", filename); return (0); } //printf ("%s\n\n", rawdata); t = pllNewickParseString (rawdata); rax_free (rawdata); return (t); }