1# #START_LICENSE###########################################################
2#
3#
4# This file is part of the Environment for Tree Exploration program
5# (ETE).  http://etetoolkit.org
6#
7# ETE is free software: you can redistribute it and/or modify it
8# under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# ETE is distributed in the hope that it will be useful, but WITHOUT
13# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15# License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
19#
20#
21#                     ABOUT THE ETE PACKAGE
22#                     =====================
23#
24# ETE is distributed under the GPL copyleft license (2008-2015).
25#
26# If you make use of ETE in published work, please cite:
27#
28# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
29# ETE: a python Environment for Tree Exploration. Jaime BMC
30# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
31#
32# Note that extra references to the specific methods implemented in
33# the toolkit may be available in the documentation.
34#
35# More info at http://etetoolkit.org. Contact: huerta@embl.de
36#
37#
38# #END_LICENSE#############################################################
39from __future__ import absolute_import
40from __future__ import print_function
41import re
42import os
43import six
44from six.moves import map
45
46__all__ = ["read_newick", "write_newick", "print_supported_formats"]
47
48ITERABLE_TYPES = set([list, set, tuple, frozenset])
49
50# Regular expressions used for reading newick format
51_ILEGAL_NEWICK_CHARS = ":;(),\[\]\t\n\r="
52_NON_PRINTABLE_CHARS_RE = "[\x00-\x1f]+"
53
54_NHX_RE = "\[&&NHX:[^\]]*\]"
55_FLOAT_RE = "\s*[+-]?\d+\.?\d*(?:[eE][-+]?\d+)?\s*"
56#_FLOAT_RE = "[+-]?\d+\.?\d*"
57#_NAME_RE = "[^():,;\[\]]+"
58_NAME_RE = "[^():,;]+?"
59
60# thanks to: http://stackoverflow.com/a/29452781/1006828
61_QUOTED_TEXT_RE = r"""((?=["'])(?:"[^"\\]*(?:\\[\s\S][^"\\]*)*"|'[^'\\]*(?:\\[\s\S][^'\\]*)*'))"""
62#_QUOTED_TEXT_RE = r"""["'](?:(?<=")[^"\\]*(?s:\\.[^"\\]*)*"|(?<=')[^'\\]*(?s:\\.[^'\\]*)*')""]"]"""
63#_QUOTED_TEXT_RE = r"""(?=["'])(?:"[^"\\]*(?:\\[\s\S][^"\\]*)*"|'[^'\\]*(?:\\[\s\S][^'\\]*)*')]"]")"]"""
64
65_QUOTED_TEXT_PREFIX='ete3_quotref_'
66
67DEFAULT_DIST = 1.0
68DEFAULT_NAME = ''
69DEFAULT_SUPPORT = 1.0
70FLOAT_FORMATTER = "%0.6g"
71#DIST_FORMATTER = ":"+FLOAT_FORMATTER
72NAME_FORMATTER = "%s"
73
74def set_float_format(formatter):
75    ''' Set the conversion format used to represent float distances and support
76    values in the newick representation of trees.
77
78    For example, use set_float_format('%0.32f') to specify 32 decimal numbers
79    when exporting node distances and bootstrap values.
80
81    Scientific notation (%e) or any other custom format is allowed. The
82    formatter string should not contain any character that may break newick
83    structure (i.e.: ":;,()")
84
85    '''
86    global FLOAT_FORMATTER
87    FLOAT_FORMATTER = formatter
88    #DIST_FORMATTER = ":"+FLOAT_FORMATTER
89
90# Allowed formats. This table is used to read and write newick using
91# different convenctions. You can also add your own formats in an easy way.
92#
93#
94# FORMAT: [[LeafAttr1, LeafAttr1Type, Strict?], [LeafAttr2, LeafAttr2Type, Strict?],\
95#    [InternalAttr1, InternalAttr1Type, Strict?], [InternalAttr2, InternalAttr2Type, Strict?]]
96#
97# Attributes are placed in the newick as follows:
98#
99# .... ,LeafAttr1:LeafAttr2)InternalAttr1:InternalAttr2 ...
100#
101#
102#           /-A
103# -NoName--|
104#          |          /-B
105#           \C-------|
106#                    |          /-D
107#                     \E-------|
108#                               \-G
109#
110# Format 0 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737);
111# Format 1 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);
112# Format 2 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737);
113# Format 3 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);
114# Format 4 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)));
115# Format 5 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729):0.642905):0.567737);
116# Format 6 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E)C);
117# Format 7 = (A,(B,(D,G)E)C);
118# Format 8 = (A,(B,(D,G)));
119# Format 9 = (,(,(,)));
120
121NW_FORMAT = {
122  0: [['name', str, True],  ["dist", float, True],    ['support', float, True],   ["dist", float, True]], # Flexible with support
123  1: [['name', str, True],  ["dist", float, True],    ['name', str, True],      ["dist", float, True]], # Flexible with internal node names
124  2: [['name', str, False], ["dist", float, False],   ['support', float, False],  ["dist", float, False]],# Strict with support values
125  3: [['name', str, False], ["dist", float, False],   ['name', str, False],     ["dist", float, False]], # Strict with internal node names
126  4: [['name', str, False], ["dist", float, False],   [None, None, False],        [None, None, False]],
127  5: [['name', str, False], ["dist", float, False],   [None, None, False],        ["dist", float, False]],
128  6: [['name', str, False], [None, None, False],      [None, None, False],        ["dist", float, False]],
129  7: [['name', str, False], ["dist", float, False],   ["name", str, False],       [None, None, False]],
130  8: [['name', str, False], [None, None, False],      ["name", str, False],       [None, None, False]],
131  9: [['name', str, False], [None, None, False],      [None, None, False],        [None, None, False]], # Only topology with node names
132  100: [[None, None, False],  [None, None, False],      [None, None, False],        [None, None, False]] # Only Topology
133}
134
135
136def format_node(node, node_type, format, dist_formatter=None,
137                support_formatter=None, name_formatter=None,
138                quoted_names=False):
139    if dist_formatter is None: dist_formatter = FLOAT_FORMATTER
140    if support_formatter is None: support_formatter = FLOAT_FORMATTER
141    if name_formatter is None: name_formatter = NAME_FORMATTER
142
143    if node_type == "leaf":
144        container1 = NW_FORMAT[format][0][0] # name
145        container2 = NW_FORMAT[format][1][0] # dists
146        converterFn1 = NW_FORMAT[format][0][1]
147        converterFn2 = NW_FORMAT[format][1][1]
148        flexible1 = NW_FORMAT[format][0][2]
149    else:
150        container1 = NW_FORMAT[format][2][0] #support/name
151        container2 = NW_FORMAT[format][3][0] #dist
152        converterFn1 = NW_FORMAT[format][2][1]
153        converterFn2 = NW_FORMAT[format][3][1]
154        flexible1 = NW_FORMAT[format][2][2]
155
156    if converterFn1 == str:
157        try:
158            if not quoted_names:
159                FIRST_PART = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
160                                    str(getattr(node, container1)))
161            else:
162                FIRST_PART = str(getattr(node, container1))
163            if not FIRST_PART and container1 == 'name' and not flexible1:
164                FIRST_PART = "NoName"
165
166        except (AttributeError, TypeError):
167            FIRST_PART = "?"
168
169        FIRST_PART = name_formatter %FIRST_PART
170        if quoted_names:
171            #FIRST_PART = '"%s"' %FIRST_PART.decode('string_escape').replace('"', '\\"')
172            FIRST_PART = '"%s"' %FIRST_PART
173
174    elif converterFn1 is None:
175        FIRST_PART = ""
176    else:
177        try:
178            FIRST_PART = support_formatter %(converterFn2(getattr(node, container1)))
179        except (ValueError, TypeError):
180            FIRST_PART = "?"
181
182    if converterFn2 == str:
183        try:
184            SECOND_PART = ":"+re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
185                                  str(getattr(node, container2)))
186        except (ValueError, TypeError):
187            SECOND_PART = ":?"
188    elif converterFn2 is None:
189        SECOND_PART = ""
190    else:
191        try:
192            #SECOND_PART = ":%0.6f" %(converterFn2(getattr(node, container2)))
193            SECOND_PART = ":%s" %(dist_formatter %(converterFn2(getattr(node, container2))))
194        except (ValueError, TypeError):
195            SECOND_PART = ":?"
196
197    return "%s%s" %(FIRST_PART, SECOND_PART)
198
199
200def print_supported_formats():
201    from ..coretype.tree import TreeNode
202    t = TreeNode()
203    t.populate(4, "ABCDEFGHI")
204    print(t)
205    for f in NW_FORMAT:
206        print("Format", f,"=", write_newick(t, features=None, format=f))
207
208class NewickError(Exception):
209    """Exception class designed for NewickIO errors."""
210    def __init__(self, value):
211        if value is None:
212            value = ''
213        value += "\nYou may want to check other newick loading flags like 'format' or 'quoted_node_names'."
214        Exception.__init__(self, value)
215
216def read_newick(newick, root_node=None, format=0, quoted_names=False):
217    """ Reads a newick tree from either a string or a file, and returns
218    an ETE tree structure.
219
220    A previously existent node object can be passed as the root of the
221    tree, which means that all its new children will belong to the same
222    class as the root(This allows to work with custom TreeNode
223    objects).
224
225    You can also take advantage from this behaviour to concatenate
226    several tree structures.
227    """
228
229    if root_node is None:
230        from ..coretype.tree import TreeNode
231        root_node = TreeNode()
232
233    if isinstance(newick, six.string_types):
234
235        # try to determine whether the file exists.
236        # For very large trees, if newick contains the content of the tree, rather than a file name,
237        # this may fail, at least on Windows, because the os fails to stat the file content, deeming it
238        # too long for testing with os.path.exists.  This raises a ValueError with description
239        # "stat: path too long for Windows".  This is described in issue #258
240        try:
241            file_exists = os.path.exists(newick)
242        except ValueError:      # failed to stat
243            file_exists = False
244
245        # if newick refers to a file, read it from file; otherwise, regard it as a Newick content string.
246        if file_exists:
247            if newick.endswith('.gz'):
248                import gzip
249                with gzip.open(newick) as INPUT:
250                    nw = INPUT.read()
251            else:
252                with open(newick) as INPUT:
253                    nw = INPUT.read()
254        else:
255            nw = newick
256
257
258        matcher = compile_matchers(formatcode=format)
259        nw = nw.strip()
260        if not nw.startswith('(') and nw.endswith(';'):
261            #return _read_node_data(nw[:-1], root_node, "single", matcher, format)
262            return _read_newick_from_string(nw, root_node, matcher, format, quoted_names)
263        elif not nw.startswith('(') or not nw.endswith(';'):
264            raise NewickError('Unexisting tree file or Malformed newick tree structure.')
265        else:
266            return _read_newick_from_string(nw, root_node, matcher, format, quoted_names)
267
268    else:
269        raise NewickError("'newick' argument must be either a filename or a newick string.")
270
271def _read_newick_from_string(nw, root_node, matcher, formatcode, quoted_names):
272    """ Reads a newick string in the New Hampshire format. """
273
274    if quoted_names:
275        # Quoted text is mapped to references
276        quoted_map = {}
277        unquoted_nw = ''
278        counter = 0
279        for token in re.split(_QUOTED_TEXT_RE, nw):
280            counter += 1
281            if counter % 2 == 1 : # normal newick tree structure data
282                unquoted_nw += token
283            else: # quoted text, add to dictionary and replace with reference
284                quoted_ref_id= _QUOTED_TEXT_PREFIX + str(int(counter/2))
285                unquoted_nw += quoted_ref_id
286                quoted_map[quoted_ref_id]=token[1:-1]  # without the quotes
287        nw = unquoted_nw
288
289    if not nw.startswith('(') and nw.endswith(';'):
290        _read_node_data(nw[:-1], root_node, "single", matcher, format)
291        if quoted_names:
292            if root_node.name.startswith(_QUOTED_TEXT_PREFIX):
293                root_node.name = quoted_map[root_node.name]
294        return root_node
295
296    if nw.count('(') != nw.count(')'):
297        raise NewickError('Parentheses do not match. Broken tree structure?')
298
299    # white spaces and separators are removed
300    nw = re.sub("[\n\r\t]+", "", nw)
301
302    current_parent = None
303    # Each chunk represents the content of a parent node, and it could contain
304    # leaves and closing parentheses.
305    # We may find:
306    # leaf, ..., leaf,
307    # leaf, ..., leaf))),
308    # leaf)), leaf, leaf))
309    # leaf))
310    # ) only if formatcode == 100
311
312    for chunk in nw.split("(")[1:]:
313        # If no node has been created so far, this is the root, so use the node.
314        current_parent = root_node if current_parent is None else current_parent.add_child()
315
316        subchunks = [ch.strip() for ch in chunk.split(",")]
317        # We should expect that the chunk finished with a comma (if next chunk
318        # is an internal sister node) or a subchunk containing closing parenthesis until the end of the tree.
319        #[leaf, leaf, '']
320        #[leaf, leaf, ')))', leaf, leaf, '']
321        #[leaf, leaf, ')))', leaf, leaf, '']
322        #[leaf, leaf, ')))', leaf), leaf, 'leaf);']
323        if subchunks[-1] != '' and not subchunks[-1].endswith(';'):
324            raise NewickError('Broken newick structure at: %s' %chunk)
325
326        # lets process the subchunks. Every closing parenthesis will close a
327        # node and go up one level.
328        for i, leaf in enumerate(subchunks):
329            if leaf.strip() == '' and i == len(subchunks) - 1:
330                continue # "blah blah ,( blah blah"
331            closing_nodes = leaf.split(")")
332
333            # first part after splitting by ) always contain leaf info
334            _read_node_data(closing_nodes[0], current_parent, "leaf", matcher, formatcode)
335
336            # next contain closing nodes and data about the internal nodes.
337            if len(closing_nodes)>1:
338                for closing_internal in closing_nodes[1:]:
339                    closing_internal =  closing_internal.rstrip(";")
340                    # read internal node data and go up one level
341                    _read_node_data(closing_internal, current_parent, "internal", matcher, formatcode)
342                    current_parent = current_parent.up
343
344    # references in node names are replaced with quoted text before returning
345    if quoted_names:
346        for node in root_node.traverse():
347            if node.name.startswith(_QUOTED_TEXT_PREFIX):
348                node.name = quoted_map[node.name]
349
350    return root_node
351
352def _parse_extra_features(node, NHX_string):
353    """ Reads node's extra data form its NHX string. NHX uses this
354    format:  [&&NHX:prop1=value1:prop2=value2] """
355    NHX_string = NHX_string.replace("[&&NHX:", "")
356    NHX_string = NHX_string.replace("]", "")
357    for field in NHX_string.split(":"):
358        try:
359            pname, pvalue = field.split("=")
360        except ValueError as e:
361            raise NewickError('Invalid NHX format %s' %field)
362        node.add_feature(pname, pvalue)
363
364def compile_matchers(formatcode):
365    matchers = {}
366    for node_type in ["leaf", "single", "internal"]:
367        if node_type == "leaf" or node_type == "single":
368            container1 = NW_FORMAT[formatcode][0][0]
369            container2 = NW_FORMAT[formatcode][1][0]
370            converterFn1 = NW_FORMAT[formatcode][0][1]
371            converterFn2 = NW_FORMAT[formatcode][1][1]
372            flexible1 = NW_FORMAT[formatcode][0][2]
373            flexible2 = NW_FORMAT[formatcode][1][2]
374        else:
375            container1 = NW_FORMAT[formatcode][2][0]
376            container2 = NW_FORMAT[formatcode][3][0]
377            converterFn1 = NW_FORMAT[formatcode][2][1]
378            converterFn2 = NW_FORMAT[formatcode][3][1]
379            flexible1 = NW_FORMAT[formatcode][2][2]
380            flexible2 = NW_FORMAT[formatcode][3][2]
381
382        if converterFn1 == str:
383            FIRST_MATCH = "("+_NAME_RE+")"
384        elif converterFn1 == float:
385            FIRST_MATCH = "("+_FLOAT_RE+")"
386        elif converterFn1 is None:
387            FIRST_MATCH = '()'
388
389        if converterFn2 == str:
390            SECOND_MATCH = "(:"+_NAME_RE+")"
391        elif converterFn2 == float:
392            SECOND_MATCH = "(:"+_FLOAT_RE+")"
393        elif converterFn2 is None:
394            SECOND_MATCH = '()'
395
396        if flexible1 and node_type != 'leaf':
397            FIRST_MATCH += "?"
398        if flexible2:
399            SECOND_MATCH += "?"
400
401
402        matcher_str= '^\s*%s\s*%s\s*(%s)?\s*$' % (FIRST_MATCH, SECOND_MATCH, _NHX_RE)
403        compiled_matcher = re.compile(matcher_str)
404        matchers[node_type] = [container1, container2, converterFn1, converterFn2, compiled_matcher]
405
406    return matchers
407
408def _read_node_data(subnw, current_node, node_type, matcher, formatcode):
409    """ Reads a leaf node from a subpart of the original newick
410    tree """
411
412    if node_type == "leaf" or node_type == "single":
413        if node_type == "leaf":
414            node = current_node.add_child()
415        else:
416            node = current_node
417    else:
418        node = current_node
419
420    subnw = subnw.strip()
421
422    if not subnw and node_type == 'leaf' and formatcode != 100:
423        raise NewickError('Empty leaf node found')
424    elif not subnw:
425        return
426
427    container1, container2, converterFn1, converterFn2, compiled_matcher = matcher[node_type]
428    data = re.match(compiled_matcher, subnw)
429    if data:
430        data = data.groups()
431        # This prevents ignoring errors even in flexible nodes:
432        if subnw and data[0] is None and data[1] is None and data[2] is None:
433            raise NewickError("Unexpected newick format '%s'" %subnw)
434
435        if data[0] is not None and data[0] != '':
436            node.add_feature(container1, converterFn1(data[0].strip()))
437
438        if data[1] is not None and data[1] != '':
439            node.add_feature(container2, converterFn2(data[1][1:].strip()))
440
441        if data[2] is not None \
442                and data[2].startswith("[&&NHX"):
443            _parse_extra_features(node, data[2])
444    else:
445        raise NewickError("Unexpected newick format '%s' " %subnw[0:50])
446    return
447
448def write_newick(rootnode, features=None, format=1, format_root_node=True,
449                 is_leaf_fn=None, dist_formatter=None, support_formatter=None,
450                 name_formatter=None, quoted_names=False):
451    """ Iteratively export a tree structure and returns its NHX
452    representation. """
453    newick = []
454    leaf = is_leaf_fn if is_leaf_fn else lambda n: not bool(n.children)
455    for postorder, node in rootnode.iter_prepostorder(is_leaf_fn=is_leaf_fn):
456        if postorder:
457            newick.append(")")
458            if node.up is not None or format_root_node:
459                newick.append(format_node(node, "internal", format,
460                                          dist_formatter=dist_formatter,
461                                          support_formatter=support_formatter,
462                                          name_formatter=name_formatter,
463                                          quoted_names=quoted_names))
464                newick.append(_get_features_string(node, features))
465        else:
466            if node is not rootnode and node != node.up.children[0]:
467                newick.append(",")
468
469            if leaf(node):
470                newick.append(format_node(node, "leaf", format,
471                                          dist_formatter=dist_formatter,
472                                          support_formatter=support_formatter,
473                                          name_formatter=name_formatter,
474                                          quoted_names=quoted_names))
475                newick.append(_get_features_string(node, features))
476            else:
477                newick.append("(")
478
479    newick.append(";")
480    return ''.join(newick)
481
482def _get_features_string(self, features=None):
483    """ Generates the extended newick string NHX with extra data about
484    a node. """
485    string = ""
486    if features is None:
487        features = []
488    elif features == []:
489        features = sorted(self.features)
490
491    for pr in features:
492        if hasattr(self, pr):
493            raw = getattr(self, pr)
494            if type(raw) in ITERABLE_TYPES:
495                raw = '|'.join(map(str, raw))
496            elif type(raw) == dict:
497                raw = '|'.join(map(lambda x,y: "%s-%s" %(x, y), six.iteritems(raw)))
498            elif type(raw) == str:
499                pass
500            else:
501                raw = str(raw)
502
503            value = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
504                             raw)
505            if string != "":
506                string +=":"
507            string +="%s=%s"  %(pr, str(value))
508    if string != "":
509        string = "[&&NHX:"+string+"]"
510
511    return string
512