1# #START_LICENSE########################################################### 2# 3# 4# This file is part of the Environment for Tree Exploration program 5# (ETE). http://etetoolkit.org 6# 7# ETE is free software: you can redistribute it and/or modify it 8# under the terms of the GNU General Public License as published by 9# the Free Software Foundation, either version 3 of the License, or 10# (at your option) any later version. 11# 12# ETE is distributed in the hope that it will be useful, but WITHOUT 13# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 15# License for more details. 16# 17# You should have received a copy of the GNU General Public License 18# along with ETE. If not, see <http://www.gnu.org/licenses/>. 19# 20# 21# ABOUT THE ETE PACKAGE 22# ===================== 23# 24# ETE is distributed under the GPL copyleft license (2008-2015). 25# 26# If you make use of ETE in published work, please cite: 27# 28# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon. 29# ETE: a python Environment for Tree Exploration. Jaime BMC 30# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24 31# 32# Note that extra references to the specific methods implemented in 33# the toolkit may be available in the documentation. 34# 35# More info at http://etetoolkit.org. Contact: huerta@embl.de 36# 37# 38# #END_LICENSE############################################################# 39from __future__ import absolute_import 40from __future__ import print_function 41 42import os 43import string 44import textwrap 45from sys import stderr as STDERR 46from six.moves import map 47 48def read_fasta(source, obj=None, header_delimiter="\t", fix_duplicates=True): 49 """ Reads a collection of sequences econded in FASTA format.""" 50 51 if obj is None: 52 from ..coretype import seqgroup 53 SC = seqgroup.SeqGroup() 54 else: 55 SC = obj 56 57 names = set([]) 58 seq_id = -1 59 60 # Prepares handle from which read sequences 61 if os.path.isfile(source): 62 if source.endswith('.gz'): 63 import gzip 64 _source = gzip.open(source) 65 else: 66 _source = open(source, "r") 67 else: 68 _source = iter(source.split("\n")) 69 70 seq_name = None 71 for line in _source: 72 line = line.strip() 73 if line.startswith('#') or not line: 74 continue 75 # Reads seq number 76 elif line.startswith('>'): 77 # Checks if previous name had seq 78 if seq_id>-1 and SC.id2seq[seq_id] == "": 79 raise Exception("No sequence found for "+seq_name) 80 81 seq_id += 1 82 # Takes header info 83 seq_header_fields = [_f.strip() for _f in line[1:].split(header_delimiter)] 84 seq_name = seq_header_fields[0] 85 86 # Checks for duplicated seq names 87 if fix_duplicates and seq_name in names: 88 tag = str(len([k for k in list(SC.name2id.keys()) if k.endswith(seq_name)])) 89 old_name = seq_name 90 seq_name = tag+"_"+seq_name 91 print("Duplicated entry [%s] was renamed to [%s]" %(old_name, seq_name), file=STDERR) 92 93 # stores seq_name 94 SC.id2seq[seq_id] = "" 95 SC.id2name[seq_id] = seq_name 96 SC.name2id[seq_name] = seq_id 97 SC.id2comment[seq_id] = seq_header_fields[1:] 98 names.add(seq_name) 99 100 else: 101 if seq_name is None: 102 raise Exception("Error reading sequences: Wrong format.") 103 104 # removes all white spaces in line 105 s = line.strip().replace(" ","") 106 107 # append to seq_string 108 SC.id2seq[seq_id] += s 109 110 if seq_name and SC.id2seq[seq_id] == "": 111 print(seq_name,"has no sequence", file=STDERR) 112 return None 113 114 # Everything ok 115 return SC 116 117def write_fasta(sequences, outfile = None, seqwidth = 80): 118 """ Writes a SeqGroup python object using FASTA format. """ 119 120 wrapper = textwrap.TextWrapper() 121 wrapper.break_on_hyphens = False 122 wrapper.replace_whitespace = False 123 wrapper.expand_tabs = False 124 wrapper.break_long_words = True 125 wrapper.width = 80 126 text = '\n'.join([">%s\n%s\n" %( "\t".join([name]+comment), wrapper.fill(seq)) for 127 name, seq, comment in sequences]) 128 129 if outfile is not None: 130 OUT = open(outfile,"w") 131 OUT.write(text) 132 OUT.close() 133 else: 134 return text 135