1# #START_LICENSE###########################################################
2#
3#
4# This file is part of the Environment for Tree Exploration program
5# (ETE).  http://etetoolkit.org
6#
7# ETE is free software: you can redistribute it and/or modify it
8# under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# ETE is distributed in the hope that it will be useful, but WITHOUT
13# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15# License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
19#
20#
21#                     ABOUT THE ETE PACKAGE
22#                     =====================
23#
24# ETE is distributed under the GPL copyleft license (2008-2015).
25#
26# If you make use of ETE in published work, please cite:
27#
28# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
29# ETE: a python Environment for Tree Exploration. Jaime BMC
30# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
31#
32# Note that extra references to the specific methods implemented in
33# the toolkit may be available in the documentation.
34#
35# More info at http://etetoolkit.org. Contact: huerta@embl.de
36#
37#
38# #END_LICENSE#############################################################
39from __future__ import absolute_import
40from __future__ import print_function
41
42import os
43import string
44import textwrap
45from sys import stderr as STDERR
46from six.moves import map
47
48def read_fasta(source, obj=None, header_delimiter="\t", fix_duplicates=True):
49    """ Reads a collection of sequences econded in FASTA format."""
50
51    if obj is None:
52        from ..coretype import seqgroup
53        SC = seqgroup.SeqGroup()
54    else:
55        SC = obj
56
57    names = set([])
58    seq_id = -1
59
60    # Prepares handle from which read sequences
61    if os.path.isfile(source):
62        if source.endswith('.gz'):
63            import gzip
64            _source = gzip.open(source)
65        else:
66            _source = open(source, "r")
67    else:
68        _source = iter(source.split("\n"))
69
70    seq_name = None
71    for line in _source:
72        line = line.strip()
73        if line.startswith('#') or not line:
74            continue
75        # Reads seq number
76        elif line.startswith('>'):
77            # Checks if previous name had seq
78            if seq_id>-1 and SC.id2seq[seq_id] == "":
79                raise Exception("No sequence found for "+seq_name)
80
81            seq_id += 1
82            # Takes header info
83            seq_header_fields = [_f.strip() for _f in line[1:].split(header_delimiter)]
84            seq_name = seq_header_fields[0]
85
86            # Checks for duplicated seq names
87            if fix_duplicates and seq_name in names:
88                tag = str(len([k for k in list(SC.name2id.keys()) if k.endswith(seq_name)]))
89                old_name = seq_name
90                seq_name = tag+"_"+seq_name
91                print("Duplicated entry [%s] was renamed to [%s]" %(old_name, seq_name), file=STDERR)
92
93            # stores seq_name
94            SC.id2seq[seq_id] = ""
95            SC.id2name[seq_id] = seq_name
96            SC.name2id[seq_name] = seq_id
97            SC.id2comment[seq_id] = seq_header_fields[1:]
98            names.add(seq_name)
99
100        else:
101            if seq_name is None:
102                raise Exception("Error reading sequences: Wrong format.")
103
104            # removes all white spaces in line
105            s = line.strip().replace(" ","")
106
107            # append to seq_string
108            SC.id2seq[seq_id] += s
109
110    if seq_name and SC.id2seq[seq_id] == "":
111        print(seq_name,"has no sequence", file=STDERR)
112        return None
113
114    # Everything ok
115    return SC
116
117def write_fasta(sequences, outfile = None, seqwidth = 80):
118    """ Writes a SeqGroup python object using FASTA format. """
119
120    wrapper = textwrap.TextWrapper()
121    wrapper.break_on_hyphens = False
122    wrapper.replace_whitespace = False
123    wrapper.expand_tabs = False
124    wrapper.break_long_words = True
125    wrapper.width = 80
126    text =  '\n'.join([">%s\n%s\n" %( "\t".join([name]+comment), wrapper.fill(seq)) for
127                       name, seq, comment in sequences])
128
129    if outfile is not None:
130        OUT = open(outfile,"w")
131        OUT.write(text)
132        OUT.close()
133    else:
134        return text
135