1# Copyright 2015-2015 by Eric Rasche. All rights reserved. 2# 3# This file is part of the Biopython distribution and governed by your 4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 5# Please see the LICENSE file that should have been included as part of this 6# package. 7"""Bio.AlignIO support for "xmfa" output from Mauve/ProgressiveMauve. 8 9You are expected to use this module via the Bio.AlignIO functions (or the 10Bio.SeqIO functions if you want to work directly with the gapped sequences). 11 12For example, consider a progressiveMauve alignment file containing the following:: 13 14 #FormatVersion Mauve1 15 #Sequence1File a.fa 16 #Sequence1Entry 1 17 #Sequence1Format FastA 18 #Sequence2File b.fa 19 #Sequence2Entry 2 20 #Sequence2Format FastA 21 #Sequence3File c.fa 22 #Sequence3Entry 3 23 #Sequence3Format FastA 24 #BackboneFile three.xmfa.bbcols 25 > 1:0-0 + a.fa 26 -------------------------------------------------------------------------------- 27 -------------------------------------------------------------------------------- 28 -------------------------------------------------------------------------------- 29 > 2:5417-5968 + b.fa 30 TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACGTGAGAGGAGCGCCCTAAGCTTTGGGAAATTCAAGC- 31 -------------------------------------------------------------------------------- 32 CTGGAACGTACTTGCTGGTTTCGCTACTATTTCAAACAAGTTAGAGGCCGTTACCTCGGGCGAACGTATAAACCATTCTG 33 > 3:9476-10076 - c.fa 34 TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-GGGAGGAGATCGCCCCAAACGTATGGTGAGTCGGGCG 35 TTTCCTATAGCTATAGGACCAATCCACTTACCATACGCCCGGCGTCGCCCAGTCCGGTTCGGTACCCTCCATGACCCACG 36 ---------------------------------------------------------AAATGAGGGCCCAGGGTATGCTT 37 = 38 > 2:5969-6015 + b.fa 39 ----------------------- 40 GGGCGAACGTATAAACCATTCTG 41 > 3:9429-9476 - c.fa 42 TTCGGTACCCTCCATGACCCACG 43 AAATGAGGGCCCAGGGTATGCTT 44 45This is a multiple sequence alignment with multiple aligned sections, so you 46would probably load this using the Bio.AlignIO.parse() function: 47 48 >>> from Bio import AlignIO 49 >>> align = AlignIO.parse("Mauve/simple_short.xmfa", "mauve") 50 >>> alignments = list(align) 51 >>> for aln in alignments: 52 ... print(aln) 53 ... 54 Alignment with 3 rows and 240 columns 55 --------------------------------------------...--- a.fa 56 TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACG...CTG b.fa/5416-5968 57 TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-G...CTT c.fa/9475-10076 58 Alignment with 2 rows and 46 columns 59 -----------------------GGGCGAACGTATAAACCATTCTG b.fa/5968-6015 60 TTCGGTACCCTCCATGACCCACGAAATGAGGGCCCAGGGTATGCTT c.fa/9428-9476 61 62Additional information is extracted from the XMFA file and available through 63the annotation attribute of each record:: 64 65 >>> for record in alignments[0]: 66 ... print(record.id, len(record)) 67 ... print(" start: %d, end: %d, strand: %d" %( 68 ... record.annotations['start'], record.annotations['end'], 69 ... record.annotations['strand'])) 70 ... 71 a.fa 240 72 start: 0, end: 0, strand: 1 73 b.fa/5416-5968 240 74 start: 5416, end: 5968, strand: 1 75 c.fa/9475-10076 240 76 start: 9475, end: 10076, strand: -1 77 78""" 79import re 80 81from Bio.Align import MultipleSeqAlignment 82from Bio.Seq import Seq 83from Bio.SeqRecord import SeqRecord 84 85from .Interfaces import AlignmentIterator 86from .Interfaces import SequentialAlignmentWriter 87 88 89XMFA_HEADER_REGEX = re.compile( 90 r"> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>.*)" 91) 92XMFA_HEADER_REGEX_BIOPYTHON = re.compile( 93 r"> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>[^#]*) # (?P<realname>.*)" 94) 95ID_LINE_FMT = "> {seq_name}:{start}-{end} {strand} {filename} # {ugly_hack}" 96 97 98def _identifier_split(identifier): 99 """Return (name, start, end) string tuple from an identifier (PRIVATE).""" 100 id, loc, strand = identifier.split(":") 101 start, end = map(int, loc.split("-")) 102 start -= 1 103 return id, start, end, strand 104 105 106class MauveWriter(SequentialAlignmentWriter): 107 """Mauve/XMFA alignment writer.""" 108 109 def __init__(self, *args, **kwargs): 110 """Initialize the class.""" 111 super().__init__(*args, **kwargs) 112 self._wrote_header = False 113 self._wrote_first = False 114 115 def write_alignment(self, alignment): 116 """Use this to write (another) single alignment to an open file. 117 118 Note that sequences and their annotation are recorded 119 together (rather than having a block of annotation followed 120 by a block of aligned sequences). 121 """ 122 count = len(alignment) 123 124 self._length_of_sequences = alignment.get_alignment_length() 125 126 # NOTE - For now, the alignment object does not hold any per column 127 # or per alignment annotation - only per sequence. 128 129 if count == 0: 130 raise ValueError("Must have at least one sequence") 131 if self._length_of_sequences == 0: 132 raise ValueError("Non-empty sequences are required") 133 134 if not self._wrote_header: 135 self._wrote_header = True 136 self.handle.write("#FormatVersion Mauve1\n") 137 # There are some more headers, but we ignore those for now. 138 # Sequence1File unknown.fa 139 # Sequence1Entry 1 140 # Sequence1Format FastA 141 for i in range(1, count + 1): 142 self.handle.write("#Sequence%sEntry\t%s\n" % (i, i)) 143 144 for idx, record in enumerate(alignment): 145 self._write_record(record, record_idx=idx) 146 self.handle.write("=\n") 147 148 def _write_record(self, record, record_idx=0): 149 """Write a single SeqRecord to the file (PRIVATE).""" 150 if self._length_of_sequences != len(record.seq): 151 raise ValueError("Sequences must all be the same length") 152 153 seq_name = record.name 154 try: 155 seq_name = str(int(record.name)) 156 except ValueError: 157 seq_name = str(record_idx + 1) 158 159 # We remove the "/{start}-{end}" before writing, as it cannot be part 160 # of the produced XMFA file. 161 if "start" in record.annotations and "end" in record.annotations: 162 suffix0 = "/%s-%s" % ( 163 record.annotations["start"], 164 record.annotations["end"], 165 ) 166 suffix1 = "/%s-%s" % ( 167 record.annotations["start"] + 1, 168 record.annotations["end"], 169 ) 170 if seq_name[-len(suffix0) :] == suffix0: 171 seq_name = seq_name[: -len(suffix0)] 172 if seq_name[-len(suffix1) :] == suffix1: 173 seq_name = seq_name[: -len(suffix1)] 174 175 if ( 176 "start" in record.annotations 177 and "end" in record.annotations 178 and "strand" in record.annotations 179 ): 180 id_line = ID_LINE_FMT.format( 181 seq_name=seq_name, 182 start=record.annotations["start"] + 1, 183 end=record.annotations["end"], 184 strand=("+" if record.annotations["strand"] == 1 else "-"), 185 filename=record.name + ".fa", 186 ugly_hack=record.id, 187 ) 188 lacking_annotations = False 189 else: 190 id_line = ID_LINE_FMT.format( 191 seq_name=seq_name, 192 start=0, 193 end=0, 194 strand="+", 195 filename=record.name + ".fa", 196 ugly_hack=record.id, 197 ) 198 lacking_annotations = True 199 200 # If the sequence is an empty one, skip writing it out 201 if (":0-0 " in id_line or ":1-0 " in id_line) and not lacking_annotations: 202 # Except in the first LCB 203 if not self._wrote_first: 204 self._wrote_first = True 205 # The first LCB we write out is special, and must list ALL 206 # sequences, for the Mauve GUI 207 # http://darlinglab.org/mauve/user-guide/files.html#non-standard-xmfa-formatting-used-by-the-mauve-gui 208 id_line = ID_LINE_FMT.format( 209 seq_name=seq_name, 210 start=0, 211 end=0, 212 strand="+", 213 filename=record.name + ".fa", 214 ugly_hack=record.id, 215 ) 216 id_line = id_line.replace("\n", " ").replace("\r", " ") 217 self.handle.write(id_line + "\n\n") 218 # Alignments lacking a start/stop/strand were generated by 219 # Biopython on load, and shouldn't exist according to XMFA 220 else: 221 # In other blocks, we only write sequences if they exist in a given 222 # alignment. 223 id_line = id_line.replace("\n", " ").replace("\r", " ") 224 self.handle.write(id_line + "\n") 225 for i in range(0, len(record.seq), 80): 226 self.handle.write("%s\n" % record.seq[i : i + 80]) 227 228 229class MauveIterator(AlignmentIterator): 230 """Mauve xmfa alignment iterator.""" 231 232 _ids = [] # for caching IDs between __next__ calls 233 234 def __next__(self): 235 """Parse the next alignment from the handle.""" 236 handle = self.handle 237 line = handle.readline() 238 239 if not line: 240 raise StopIteration 241 242 # Strip out header comments 243 while line and line.strip().startswith("#"): 244 line = handle.readline() 245 246 seqs = {} 247 seq_regions = {} 248 passed_end_alignment = False 249 250 latest_id = None 251 while True: 252 if not line: 253 break # end of file 254 line = line.strip() 255 256 if line.startswith("="): 257 # There may be more data, but we've reached the end of this 258 # alignment 259 break 260 elif line.startswith(">"): 261 m = XMFA_HEADER_REGEX_BIOPYTHON.match(line) 262 if not m: 263 m = XMFA_HEADER_REGEX.match(line) 264 if not m: 265 raise ValueError("Malformed header line: %s", line) 266 267 parsed_id = m.group("id") 268 parsed_data = {} 269 for key in ("start", "end", "id", "strand", "name", "realname"): 270 try: 271 value = m.group(key) 272 if key == "start": 273 value = int(value) 274 # Convert to zero based counting 275 if value > 0: 276 value -= 1 277 278 if key == "end": 279 value = int(value) 280 parsed_data[key] = value 281 except IndexError: 282 # This will occur if we're asking for a group that 283 # doesn't exist. It's fine. 284 pass 285 seq_regions[parsed_id] = parsed_data 286 287 if parsed_id not in self._ids: 288 self._ids.append(parsed_id) 289 290 seqs.setdefault(parsed_id, "") 291 latest_id = parsed_id 292 else: 293 assert not passed_end_alignment 294 if latest_id is None: 295 raise ValueError("Saw sequence before definition line") 296 seqs[latest_id] += line 297 line = handle.readline() 298 299 assert len(seqs) <= len(self._ids) 300 301 self.ids = self._ids 302 self.sequences = seqs 303 304 if self._ids and seqs: 305 alignment_length = max(map(len, list(seqs.values()))) 306 records = [] 307 for id in self._ids: 308 if id not in seqs or len(seqs[id]) == 0 or len(seqs[id]) == 0: 309 seq = "-" * alignment_length 310 else: 311 seq = seqs[id] 312 313 if alignment_length != len(seq): 314 raise ValueError( 315 "Sequences have different lengths, or repeated identifier" 316 ) 317 318 # Sometimes we don't see a particular sequence in the 319 # alignment, so we skip that record since it isn't present in 320 # that LCB/alignment 321 if id not in seq_regions: 322 continue 323 324 if seq_regions[id]["start"] != 0 or seq_regions[id]["end"] != 0: 325 suffix = "/{start}-{end}".format(**seq_regions[id]) 326 if "realname" in seq_regions[id]: 327 corrected_id = seq_regions[id]["realname"] 328 else: 329 corrected_id = seq_regions[id]["name"] 330 if corrected_id.count(suffix) == 0: 331 corrected_id += suffix 332 else: 333 if "realname" in seq_regions[id]: 334 corrected_id = seq_regions[id]["realname"] 335 else: 336 corrected_id = seq_regions[id]["name"] 337 338 record = SeqRecord(Seq(seq), id=corrected_id, name=id) 339 340 record.annotations["start"] = seq_regions[id]["start"] 341 record.annotations["end"] = seq_regions[id]["end"] 342 record.annotations["strand"] = ( 343 1 if seq_regions[id]["strand"] == "+" else -1 344 ) 345 346 records.append(record) 347 return MultipleSeqAlignment(records) 348 else: 349 raise StopIteration 350