1# #START_LICENSE########################################################### 2# 3# 4# This file is part of the Environment for Tree Exploration program 5# (ETE). http://etetoolkit.org 6# 7# ETE is free software: you can redistribute it and/or modify it 8# under the terms of the GNU General Public License as published by 9# the Free Software Foundation, either version 3 of the License, or 10# (at your option) any later version. 11# 12# ETE is distributed in the hope that it will be useful, but WITHOUT 13# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 15# License for more details. 16# 17# You should have received a copy of the GNU General Public License 18# along with ETE. If not, see <http://www.gnu.org/licenses/>. 19# 20# 21# ABOUT THE ETE PACKAGE 22# ===================== 23# 24# ETE is distributed under the GPL copyleft license (2008-2015). 25# 26# If you make use of ETE in published work, please cite: 27# 28# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon. 29# ETE: a python Environment for Tree Exploration. Jaime BMC 30# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24 31# 32# Note that extra references to the specific methods implemented in 33# the toolkit may be available in the documentation. 34# 35# More info at http://etetoolkit.org. Contact: huerta@embl.de 36# 37# 38# #END_LICENSE############################################################# 39from __future__ import absolute_import 40from __future__ import print_function 41import re 42import os 43import six 44from six.moves import map 45 46__all__ = ["read_newick", "write_newick", "print_supported_formats"] 47 48ITERABLE_TYPES = set([list, set, tuple, frozenset]) 49 50# Regular expressions used for reading newick format 51_ILEGAL_NEWICK_CHARS = ":;(),\[\]\t\n\r=" 52_NON_PRINTABLE_CHARS_RE = "[\x00-\x1f]+" 53 54_NHX_RE = "\[&&NHX:[^\]]*\]" 55_FLOAT_RE = "\s*[+-]?\d+\.?\d*(?:[eE][-+]?\d+)?\s*" 56#_FLOAT_RE = "[+-]?\d+\.?\d*" 57#_NAME_RE = "[^():,;\[\]]+" 58_NAME_RE = "[^():,;]+?" 59 60# thanks to: http://stackoverflow.com/a/29452781/1006828 61_QUOTED_TEXT_RE = r"""((?=["'])(?:"[^"\\]*(?:\\[\s\S][^"\\]*)*"|'[^'\\]*(?:\\[\s\S][^'\\]*)*'))""" 62#_QUOTED_TEXT_RE = r"""["'](?:(?<=")[^"\\]*(?s:\\.[^"\\]*)*"|(?<=')[^'\\]*(?s:\\.[^'\\]*)*')""]"]""" 63#_QUOTED_TEXT_RE = r"""(?=["'])(?:"[^"\\]*(?:\\[\s\S][^"\\]*)*"|'[^'\\]*(?:\\[\s\S][^'\\]*)*')]"]")"]""" 64 65_QUOTED_TEXT_PREFIX='ete3_quotref_' 66 67DEFAULT_DIST = 1.0 68DEFAULT_NAME = '' 69DEFAULT_SUPPORT = 1.0 70FLOAT_FORMATTER = "%0.6g" 71#DIST_FORMATTER = ":"+FLOAT_FORMATTER 72NAME_FORMATTER = "%s" 73 74def set_float_format(formatter): 75 ''' Set the conversion format used to represent float distances and support 76 values in the newick representation of trees. 77 78 For example, use set_float_format('%0.32f') to specify 32 decimal numbers 79 when exporting node distances and bootstrap values. 80 81 Scientific notation (%e) or any other custom format is allowed. The 82 formatter string should not contain any character that may break newick 83 structure (i.e.: ":;,()") 84 85 ''' 86 global FLOAT_FORMATTER 87 FLOAT_FORMATTER = formatter 88 #DIST_FORMATTER = ":"+FLOAT_FORMATTER 89 90# Allowed formats. This table is used to read and write newick using 91# different convenctions. You can also add your own formats in an easy way. 92# 93# 94# FORMAT: [[LeafAttr1, LeafAttr1Type, Strict?], [LeafAttr2, LeafAttr2Type, Strict?],\ 95# [InternalAttr1, InternalAttr1Type, Strict?], [InternalAttr2, InternalAttr2Type, Strict?]] 96# 97# Attributes are placed in the newick as follows: 98# 99# .... ,LeafAttr1:LeafAttr2)InternalAttr1:InternalAttr2 ... 100# 101# 102# /-A 103# -NoName--| 104# | /-B 105# \C-------| 106# | /-D 107# \E-------| 108# \-G 109# 110# Format 0 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737); 111# Format 1 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737); 112# Format 2 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737); 113# Format 3 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737); 114# Format 4 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729))); 115# Format 5 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729):0.642905):0.567737); 116# Format 6 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E)C); 117# Format 7 = (A,(B,(D,G)E)C); 118# Format 8 = (A,(B,(D,G))); 119# Format 9 = (,(,(,))); 120 121NW_FORMAT = { 122 0: [['name', str, True], ["dist", float, True], ['support', float, True], ["dist", float, True]], # Flexible with support 123 1: [['name', str, True], ["dist", float, True], ['name', str, True], ["dist", float, True]], # Flexible with internal node names 124 2: [['name', str, False], ["dist", float, False], ['support', float, False], ["dist", float, False]],# Strict with support values 125 3: [['name', str, False], ["dist", float, False], ['name', str, False], ["dist", float, False]], # Strict with internal node names 126 4: [['name', str, False], ["dist", float, False], [None, None, False], [None, None, False]], 127 5: [['name', str, False], ["dist", float, False], [None, None, False], ["dist", float, False]], 128 6: [['name', str, False], [None, None, False], [None, None, False], ["dist", float, False]], 129 7: [['name', str, False], ["dist", float, False], ["name", str, False], [None, None, False]], 130 8: [['name', str, False], [None, None, False], ["name", str, False], [None, None, False]], 131 9: [['name', str, False], [None, None, False], [None, None, False], [None, None, False]], # Only topology with node names 132 100: [[None, None, False], [None, None, False], [None, None, False], [None, None, False]] # Only Topology 133} 134 135 136def format_node(node, node_type, format, dist_formatter=None, 137 support_formatter=None, name_formatter=None, 138 quoted_names=False): 139 if dist_formatter is None: dist_formatter = FLOAT_FORMATTER 140 if support_formatter is None: support_formatter = FLOAT_FORMATTER 141 if name_formatter is None: name_formatter = NAME_FORMATTER 142 143 if node_type == "leaf": 144 container1 = NW_FORMAT[format][0][0] # name 145 container2 = NW_FORMAT[format][1][0] # dists 146 converterFn1 = NW_FORMAT[format][0][1] 147 converterFn2 = NW_FORMAT[format][1][1] 148 flexible1 = NW_FORMAT[format][0][2] 149 else: 150 container1 = NW_FORMAT[format][2][0] #support/name 151 container2 = NW_FORMAT[format][3][0] #dist 152 converterFn1 = NW_FORMAT[format][2][1] 153 converterFn2 = NW_FORMAT[format][3][1] 154 flexible1 = NW_FORMAT[format][2][2] 155 156 if converterFn1 == str: 157 try: 158 if not quoted_names: 159 FIRST_PART = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \ 160 str(getattr(node, container1))) 161 else: 162 FIRST_PART = str(getattr(node, container1)) 163 if not FIRST_PART and container1 == 'name' and not flexible1: 164 FIRST_PART = "NoName" 165 166 except (AttributeError, TypeError): 167 FIRST_PART = "?" 168 169 FIRST_PART = name_formatter %FIRST_PART 170 if quoted_names: 171 #FIRST_PART = '"%s"' %FIRST_PART.decode('string_escape').replace('"', '\\"') 172 FIRST_PART = '"%s"' %FIRST_PART 173 174 elif converterFn1 is None: 175 FIRST_PART = "" 176 else: 177 try: 178 FIRST_PART = support_formatter %(converterFn2(getattr(node, container1))) 179 except (ValueError, TypeError): 180 FIRST_PART = "?" 181 182 if converterFn2 == str: 183 try: 184 SECOND_PART = ":"+re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \ 185 str(getattr(node, container2))) 186 except (ValueError, TypeError): 187 SECOND_PART = ":?" 188 elif converterFn2 is None: 189 SECOND_PART = "" 190 else: 191 try: 192 #SECOND_PART = ":%0.6f" %(converterFn2(getattr(node, container2))) 193 SECOND_PART = ":%s" %(dist_formatter %(converterFn2(getattr(node, container2)))) 194 except (ValueError, TypeError): 195 SECOND_PART = ":?" 196 197 return "%s%s" %(FIRST_PART, SECOND_PART) 198 199 200def print_supported_formats(): 201 from ..coretype.tree import TreeNode 202 t = TreeNode() 203 t.populate(4, "ABCDEFGHI") 204 print(t) 205 for f in NW_FORMAT: 206 print("Format", f,"=", write_newick(t, features=None, format=f)) 207 208class NewickError(Exception): 209 """Exception class designed for NewickIO errors.""" 210 def __init__(self, value): 211 if value is None: 212 value = '' 213 value += "\nYou may want to check other newick loading flags like 'format' or 'quoted_node_names'." 214 Exception.__init__(self, value) 215 216def read_newick(newick, root_node=None, format=0, quoted_names=False): 217 """ Reads a newick tree from either a string or a file, and returns 218 an ETE tree structure. 219 220 A previously existent node object can be passed as the root of the 221 tree, which means that all its new children will belong to the same 222 class as the root(This allows to work with custom TreeNode 223 objects). 224 225 You can also take advantage from this behaviour to concatenate 226 several tree structures. 227 """ 228 229 if root_node is None: 230 from ..coretype.tree import TreeNode 231 root_node = TreeNode() 232 233 if isinstance(newick, six.string_types): 234 235 # try to determine whether the file exists. 236 # For very large trees, if newick contains the content of the tree, rather than a file name, 237 # this may fail, at least on Windows, because the os fails to stat the file content, deeming it 238 # too long for testing with os.path.exists. This raises a ValueError with description 239 # "stat: path too long for Windows". This is described in issue #258 240 try: 241 file_exists = os.path.exists(newick) 242 except ValueError: # failed to stat 243 file_exists = False 244 245 # if newick refers to a file, read it from file; otherwise, regard it as a Newick content string. 246 if file_exists: 247 if newick.endswith('.gz'): 248 import gzip 249 with gzip.open(newick) as INPUT: 250 nw = INPUT.read() 251 else: 252 with open(newick) as INPUT: 253 nw = INPUT.read() 254 else: 255 nw = newick 256 257 258 matcher = compile_matchers(formatcode=format) 259 nw = nw.strip() 260 if not nw.startswith('(') and nw.endswith(';'): 261 #return _read_node_data(nw[:-1], root_node, "single", matcher, format) 262 return _read_newick_from_string(nw, root_node, matcher, format, quoted_names) 263 elif not nw.startswith('(') or not nw.endswith(';'): 264 raise NewickError('Unexisting tree file or Malformed newick tree structure.') 265 else: 266 return _read_newick_from_string(nw, root_node, matcher, format, quoted_names) 267 268 else: 269 raise NewickError("'newick' argument must be either a filename or a newick string.") 270 271def _read_newick_from_string(nw, root_node, matcher, formatcode, quoted_names): 272 """ Reads a newick string in the New Hampshire format. """ 273 274 if quoted_names: 275 # Quoted text is mapped to references 276 quoted_map = {} 277 unquoted_nw = '' 278 counter = 0 279 for token in re.split(_QUOTED_TEXT_RE, nw): 280 counter += 1 281 if counter % 2 == 1 : # normal newick tree structure data 282 unquoted_nw += token 283 else: # quoted text, add to dictionary and replace with reference 284 quoted_ref_id= _QUOTED_TEXT_PREFIX + str(int(counter/2)) 285 unquoted_nw += quoted_ref_id 286 quoted_map[quoted_ref_id]=token[1:-1] # without the quotes 287 nw = unquoted_nw 288 289 if not nw.startswith('(') and nw.endswith(';'): 290 _read_node_data(nw[:-1], root_node, "single", matcher, format) 291 if quoted_names: 292 if root_node.name.startswith(_QUOTED_TEXT_PREFIX): 293 root_node.name = quoted_map[root_node.name] 294 return root_node 295 296 if nw.count('(') != nw.count(')'): 297 raise NewickError('Parentheses do not match. Broken tree structure?') 298 299 # white spaces and separators are removed 300 nw = re.sub("[\n\r\t]+", "", nw) 301 302 current_parent = None 303 # Each chunk represents the content of a parent node, and it could contain 304 # leaves and closing parentheses. 305 # We may find: 306 # leaf, ..., leaf, 307 # leaf, ..., leaf))), 308 # leaf)), leaf, leaf)) 309 # leaf)) 310 # ) only if formatcode == 100 311 312 for chunk in nw.split("(")[1:]: 313 # If no node has been created so far, this is the root, so use the node. 314 current_parent = root_node if current_parent is None else current_parent.add_child() 315 316 subchunks = [ch.strip() for ch in chunk.split(",")] 317 # We should expect that the chunk finished with a comma (if next chunk 318 # is an internal sister node) or a subchunk containing closing parenthesis until the end of the tree. 319 #[leaf, leaf, ''] 320 #[leaf, leaf, ')))', leaf, leaf, ''] 321 #[leaf, leaf, ')))', leaf, leaf, ''] 322 #[leaf, leaf, ')))', leaf), leaf, 'leaf);'] 323 if subchunks[-1] != '' and not subchunks[-1].endswith(';'): 324 raise NewickError('Broken newick structure at: %s' %chunk) 325 326 # lets process the subchunks. Every closing parenthesis will close a 327 # node and go up one level. 328 for i, leaf in enumerate(subchunks): 329 if leaf.strip() == '' and i == len(subchunks) - 1: 330 continue # "blah blah ,( blah blah" 331 closing_nodes = leaf.split(")") 332 333 # first part after splitting by ) always contain leaf info 334 _read_node_data(closing_nodes[0], current_parent, "leaf", matcher, formatcode) 335 336 # next contain closing nodes and data about the internal nodes. 337 if len(closing_nodes)>1: 338 for closing_internal in closing_nodes[1:]: 339 closing_internal = closing_internal.rstrip(";") 340 # read internal node data and go up one level 341 _read_node_data(closing_internal, current_parent, "internal", matcher, formatcode) 342 current_parent = current_parent.up 343 344 # references in node names are replaced with quoted text before returning 345 if quoted_names: 346 for node in root_node.traverse(): 347 if node.name.startswith(_QUOTED_TEXT_PREFIX): 348 node.name = quoted_map[node.name] 349 350 return root_node 351 352def _parse_extra_features(node, NHX_string): 353 """ Reads node's extra data form its NHX string. NHX uses this 354 format: [&&NHX:prop1=value1:prop2=value2] """ 355 NHX_string = NHX_string.replace("[&&NHX:", "") 356 NHX_string = NHX_string.replace("]", "") 357 for field in NHX_string.split(":"): 358 try: 359 pname, pvalue = field.split("=") 360 except ValueError as e: 361 raise NewickError('Invalid NHX format %s' %field) 362 node.add_feature(pname, pvalue) 363 364def compile_matchers(formatcode): 365 matchers = {} 366 for node_type in ["leaf", "single", "internal"]: 367 if node_type == "leaf" or node_type == "single": 368 container1 = NW_FORMAT[formatcode][0][0] 369 container2 = NW_FORMAT[formatcode][1][0] 370 converterFn1 = NW_FORMAT[formatcode][0][1] 371 converterFn2 = NW_FORMAT[formatcode][1][1] 372 flexible1 = NW_FORMAT[formatcode][0][2] 373 flexible2 = NW_FORMAT[formatcode][1][2] 374 else: 375 container1 = NW_FORMAT[formatcode][2][0] 376 container2 = NW_FORMAT[formatcode][3][0] 377 converterFn1 = NW_FORMAT[formatcode][2][1] 378 converterFn2 = NW_FORMAT[formatcode][3][1] 379 flexible1 = NW_FORMAT[formatcode][2][2] 380 flexible2 = NW_FORMAT[formatcode][3][2] 381 382 if converterFn1 == str: 383 FIRST_MATCH = "("+_NAME_RE+")" 384 elif converterFn1 == float: 385 FIRST_MATCH = "("+_FLOAT_RE+")" 386 elif converterFn1 is None: 387 FIRST_MATCH = '()' 388 389 if converterFn2 == str: 390 SECOND_MATCH = "(:"+_NAME_RE+")" 391 elif converterFn2 == float: 392 SECOND_MATCH = "(:"+_FLOAT_RE+")" 393 elif converterFn2 is None: 394 SECOND_MATCH = '()' 395 396 if flexible1 and node_type != 'leaf': 397 FIRST_MATCH += "?" 398 if flexible2: 399 SECOND_MATCH += "?" 400 401 402 matcher_str= '^\s*%s\s*%s\s*(%s)?\s*$' % (FIRST_MATCH, SECOND_MATCH, _NHX_RE) 403 compiled_matcher = re.compile(matcher_str) 404 matchers[node_type] = [container1, container2, converterFn1, converterFn2, compiled_matcher] 405 406 return matchers 407 408def _read_node_data(subnw, current_node, node_type, matcher, formatcode): 409 """ Reads a leaf node from a subpart of the original newick 410 tree """ 411 412 if node_type == "leaf" or node_type == "single": 413 if node_type == "leaf": 414 node = current_node.add_child() 415 else: 416 node = current_node 417 else: 418 node = current_node 419 420 subnw = subnw.strip() 421 422 if not subnw and node_type == 'leaf' and formatcode != 100: 423 raise NewickError('Empty leaf node found') 424 elif not subnw: 425 return 426 427 container1, container2, converterFn1, converterFn2, compiled_matcher = matcher[node_type] 428 data = re.match(compiled_matcher, subnw) 429 if data: 430 data = data.groups() 431 # This prevents ignoring errors even in flexible nodes: 432 if subnw and data[0] is None and data[1] is None and data[2] is None: 433 raise NewickError("Unexpected newick format '%s'" %subnw) 434 435 if data[0] is not None and data[0] != '': 436 node.add_feature(container1, converterFn1(data[0].strip())) 437 438 if data[1] is not None and data[1] != '': 439 node.add_feature(container2, converterFn2(data[1][1:].strip())) 440 441 if data[2] is not None \ 442 and data[2].startswith("[&&NHX"): 443 _parse_extra_features(node, data[2]) 444 else: 445 raise NewickError("Unexpected newick format '%s' " %subnw[0:50]) 446 return 447 448def write_newick(rootnode, features=None, format=1, format_root_node=True, 449 is_leaf_fn=None, dist_formatter=None, support_formatter=None, 450 name_formatter=None, quoted_names=False): 451 """ Iteratively export a tree structure and returns its NHX 452 representation. """ 453 newick = [] 454 leaf = is_leaf_fn if is_leaf_fn else lambda n: not bool(n.children) 455 for postorder, node in rootnode.iter_prepostorder(is_leaf_fn=is_leaf_fn): 456 if postorder: 457 newick.append(")") 458 if node.up is not None or format_root_node: 459 newick.append(format_node(node, "internal", format, 460 dist_formatter=dist_formatter, 461 support_formatter=support_formatter, 462 name_formatter=name_formatter, 463 quoted_names=quoted_names)) 464 newick.append(_get_features_string(node, features)) 465 else: 466 if node is not rootnode and node != node.up.children[0]: 467 newick.append(",") 468 469 if leaf(node): 470 newick.append(format_node(node, "leaf", format, 471 dist_formatter=dist_formatter, 472 support_formatter=support_formatter, 473 name_formatter=name_formatter, 474 quoted_names=quoted_names)) 475 newick.append(_get_features_string(node, features)) 476 else: 477 newick.append("(") 478 479 newick.append(";") 480 return ''.join(newick) 481 482def _get_features_string(self, features=None): 483 """ Generates the extended newick string NHX with extra data about 484 a node. """ 485 string = "" 486 if features is None: 487 features = [] 488 elif features == []: 489 features = sorted(self.features) 490 491 for pr in features: 492 if hasattr(self, pr): 493 raw = getattr(self, pr) 494 if type(raw) in ITERABLE_TYPES: 495 raw = '|'.join(map(str, raw)) 496 elif type(raw) == dict: 497 raw = '|'.join(map(lambda x,y: "%s-%s" %(x, y), six.iteritems(raw))) 498 elif type(raw) == str: 499 pass 500 else: 501 raw = str(raw) 502 503 value = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \ 504 raw) 505 if string != "": 506 string +=":" 507 string +="%s=%s" %(pr, str(value)) 508 if string != "": 509 string = "[&&NHX:"+string+"]" 510 511 return string 512