1# Copyright 2015-2015 by Eric Rasche.  All rights reserved.
2#
3# This file is part of the Biopython distribution and governed by your
4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
5# Please see the LICENSE file that should have been included as part of this
6# package.
7"""Bio.AlignIO support for "xmfa" output from Mauve/ProgressiveMauve.
8
9You are expected to use this module via the Bio.AlignIO functions (or the
10Bio.SeqIO functions if you want to work directly with the gapped sequences).
11
12For example, consider a progressiveMauve alignment file containing the following::
13
14    #FormatVersion Mauve1
15    #Sequence1File	a.fa
16    #Sequence1Entry	1
17    #Sequence1Format	FastA
18    #Sequence2File	b.fa
19    #Sequence2Entry	2
20    #Sequence2Format	FastA
21    #Sequence3File	c.fa
22    #Sequence3Entry	3
23    #Sequence3Format	FastA
24    #BackboneFile	three.xmfa.bbcols
25    > 1:0-0 + a.fa
26    --------------------------------------------------------------------------------
27    --------------------------------------------------------------------------------
28    --------------------------------------------------------------------------------
29    > 2:5417-5968 + b.fa
30    TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACGTGAGAGGAGCGCCCTAAGCTTTGGGAAATTCAAGC-
31    --------------------------------------------------------------------------------
32    CTGGAACGTACTTGCTGGTTTCGCTACTATTTCAAACAAGTTAGAGGCCGTTACCTCGGGCGAACGTATAAACCATTCTG
33    > 3:9476-10076 - c.fa
34    TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-GGGAGGAGATCGCCCCAAACGTATGGTGAGTCGGGCG
35    TTTCCTATAGCTATAGGACCAATCCACTTACCATACGCCCGGCGTCGCCCAGTCCGGTTCGGTACCCTCCATGACCCACG
36    ---------------------------------------------------------AAATGAGGGCCCAGGGTATGCTT
37    =
38    > 2:5969-6015 + b.fa
39    -----------------------
40    GGGCGAACGTATAAACCATTCTG
41    > 3:9429-9476 - c.fa
42    TTCGGTACCCTCCATGACCCACG
43    AAATGAGGGCCCAGGGTATGCTT
44
45This is a multiple sequence alignment with multiple aligned sections, so you
46would probably load this using the Bio.AlignIO.parse() function:
47
48    >>> from Bio import AlignIO
49    >>> align = AlignIO.parse("Mauve/simple_short.xmfa", "mauve")
50    >>> alignments = list(align)
51    >>> for aln in alignments:
52    ...     print(aln)
53    ...
54    Alignment with 3 rows and 240 columns
55    --------------------------------------------...--- a.fa
56    TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACG...CTG b.fa/5416-5968
57    TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-G...CTT c.fa/9475-10076
58    Alignment with 2 rows and 46 columns
59    -----------------------GGGCGAACGTATAAACCATTCTG b.fa/5968-6015
60    TTCGGTACCCTCCATGACCCACGAAATGAGGGCCCAGGGTATGCTT c.fa/9428-9476
61
62Additional information is extracted from the XMFA file and available through
63the annotation attribute of each record::
64
65    >>> for record in alignments[0]:
66    ...     print(record.id, len(record))
67    ...     print("  start: %d, end: %d, strand: %d" %(
68    ...         record.annotations['start'], record.annotations['end'],
69    ...         record.annotations['strand']))
70    ...
71    a.fa 240
72      start: 0, end: 0, strand: 1
73    b.fa/5416-5968 240
74      start: 5416, end: 5968, strand: 1
75    c.fa/9475-10076 240
76      start: 9475, end: 10076, strand: -1
77
78"""
79import re
80
81from Bio.Align import MultipleSeqAlignment
82from Bio.Seq import Seq
83from Bio.SeqRecord import SeqRecord
84
85from .Interfaces import AlignmentIterator
86from .Interfaces import SequentialAlignmentWriter
87
88
89XMFA_HEADER_REGEX = re.compile(
90    r"> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>.*)"
91)
92XMFA_HEADER_REGEX_BIOPYTHON = re.compile(
93    r"> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>[^#]*) # (?P<realname>.*)"
94)
95ID_LINE_FMT = "> {seq_name}:{start}-{end} {strand} {filename} # {ugly_hack}"
96
97
98def _identifier_split(identifier):
99    """Return (name, start, end) string tuple from an identifier (PRIVATE)."""
100    id, loc, strand = identifier.split(":")
101    start, end = map(int, loc.split("-"))
102    start -= 1
103    return id, start, end, strand
104
105
106class MauveWriter(SequentialAlignmentWriter):
107    """Mauve/XMFA alignment writer."""
108
109    def __init__(self, *args, **kwargs):
110        """Initialize the class."""
111        super().__init__(*args, **kwargs)
112        self._wrote_header = False
113        self._wrote_first = False
114
115    def write_alignment(self, alignment):
116        """Use this to write (another) single alignment to an open file.
117
118        Note that sequences and their annotation are recorded
119        together (rather than having a block of annotation followed
120        by a block of aligned sequences).
121        """
122        count = len(alignment)
123
124        self._length_of_sequences = alignment.get_alignment_length()
125
126        # NOTE - For now, the alignment object does not hold any per column
127        # or per alignment annotation - only per sequence.
128
129        if count == 0:
130            raise ValueError("Must have at least one sequence")
131        if self._length_of_sequences == 0:
132            raise ValueError("Non-empty sequences are required")
133
134        if not self._wrote_header:
135            self._wrote_header = True
136            self.handle.write("#FormatVersion Mauve1\n")
137            # There are some more headers, but we ignore those for now.
138            # Sequence1File	unknown.fa
139            # Sequence1Entry	1
140            # Sequence1Format	FastA
141            for i in range(1, count + 1):
142                self.handle.write("#Sequence%sEntry\t%s\n" % (i, i))
143
144        for idx, record in enumerate(alignment):
145            self._write_record(record, record_idx=idx)
146        self.handle.write("=\n")
147
148    def _write_record(self, record, record_idx=0):
149        """Write a single SeqRecord to the file (PRIVATE)."""
150        if self._length_of_sequences != len(record.seq):
151            raise ValueError("Sequences must all be the same length")
152
153        seq_name = record.name
154        try:
155            seq_name = str(int(record.name))
156        except ValueError:
157            seq_name = str(record_idx + 1)
158
159        # We remove the "/{start}-{end}" before writing, as it cannot be part
160        # of the produced XMFA file.
161        if "start" in record.annotations and "end" in record.annotations:
162            suffix0 = "/%s-%s" % (
163                record.annotations["start"],
164                record.annotations["end"],
165            )
166            suffix1 = "/%s-%s" % (
167                record.annotations["start"] + 1,
168                record.annotations["end"],
169            )
170            if seq_name[-len(suffix0) :] == suffix0:
171                seq_name = seq_name[: -len(suffix0)]
172            if seq_name[-len(suffix1) :] == suffix1:
173                seq_name = seq_name[: -len(suffix1)]
174
175        if (
176            "start" in record.annotations
177            and "end" in record.annotations
178            and "strand" in record.annotations
179        ):
180            id_line = ID_LINE_FMT.format(
181                seq_name=seq_name,
182                start=record.annotations["start"] + 1,
183                end=record.annotations["end"],
184                strand=("+" if record.annotations["strand"] == 1 else "-"),
185                filename=record.name + ".fa",
186                ugly_hack=record.id,
187            )
188            lacking_annotations = False
189        else:
190            id_line = ID_LINE_FMT.format(
191                seq_name=seq_name,
192                start=0,
193                end=0,
194                strand="+",
195                filename=record.name + ".fa",
196                ugly_hack=record.id,
197            )
198            lacking_annotations = True
199
200        # If the sequence is an empty one, skip writing it out
201        if (":0-0 " in id_line or ":1-0 " in id_line) and not lacking_annotations:
202            # Except in the first LCB
203            if not self._wrote_first:
204                self._wrote_first = True
205                # The first LCB we write out is special, and must list ALL
206                # sequences, for the Mauve GUI
207                # http://darlinglab.org/mauve/user-guide/files.html#non-standard-xmfa-formatting-used-by-the-mauve-gui
208                id_line = ID_LINE_FMT.format(
209                    seq_name=seq_name,
210                    start=0,
211                    end=0,
212                    strand="+",
213                    filename=record.name + ".fa",
214                    ugly_hack=record.id,
215                )
216                id_line = id_line.replace("\n", " ").replace("\r", " ")
217                self.handle.write(id_line + "\n\n")
218            # Alignments lacking a start/stop/strand were generated by
219            # Biopython on load, and shouldn't exist according to XMFA
220        else:
221            # In other blocks, we only write sequences if they exist in a given
222            # alignment.
223            id_line = id_line.replace("\n", " ").replace("\r", " ")
224            self.handle.write(id_line + "\n")
225            for i in range(0, len(record.seq), 80):
226                self.handle.write("%s\n" % record.seq[i : i + 80])
227
228
229class MauveIterator(AlignmentIterator):
230    """Mauve xmfa alignment iterator."""
231
232    _ids = []  # for caching IDs between __next__ calls
233
234    def __next__(self):
235        """Parse the next alignment from the handle."""
236        handle = self.handle
237        line = handle.readline()
238
239        if not line:
240            raise StopIteration
241
242        # Strip out header comments
243        while line and line.strip().startswith("#"):
244            line = handle.readline()
245
246        seqs = {}
247        seq_regions = {}
248        passed_end_alignment = False
249
250        latest_id = None
251        while True:
252            if not line:
253                break  # end of file
254            line = line.strip()
255
256            if line.startswith("="):
257                # There may be more data, but we've reached the end of this
258                # alignment
259                break
260            elif line.startswith(">"):
261                m = XMFA_HEADER_REGEX_BIOPYTHON.match(line)
262                if not m:
263                    m = XMFA_HEADER_REGEX.match(line)
264                    if not m:
265                        raise ValueError("Malformed header line: %s", line)
266
267                parsed_id = m.group("id")
268                parsed_data = {}
269                for key in ("start", "end", "id", "strand", "name", "realname"):
270                    try:
271                        value = m.group(key)
272                        if key == "start":
273                            value = int(value)
274                            # Convert to zero based counting
275                            if value > 0:
276                                value -= 1
277
278                        if key == "end":
279                            value = int(value)
280                        parsed_data[key] = value
281                    except IndexError:
282                        # This will occur if we're asking for a group that
283                        # doesn't exist. It's fine.
284                        pass
285                seq_regions[parsed_id] = parsed_data
286
287                if parsed_id not in self._ids:
288                    self._ids.append(parsed_id)
289
290                seqs.setdefault(parsed_id, "")
291                latest_id = parsed_id
292            else:
293                assert not passed_end_alignment
294                if latest_id is None:
295                    raise ValueError("Saw sequence before definition line")
296                seqs[latest_id] += line
297            line = handle.readline()
298
299        assert len(seqs) <= len(self._ids)
300
301        self.ids = self._ids
302        self.sequences = seqs
303
304        if self._ids and seqs:
305            alignment_length = max(map(len, list(seqs.values())))
306            records = []
307            for id in self._ids:
308                if id not in seqs or len(seqs[id]) == 0 or len(seqs[id]) == 0:
309                    seq = "-" * alignment_length
310                else:
311                    seq = seqs[id]
312
313                if alignment_length != len(seq):
314                    raise ValueError(
315                        "Sequences have different lengths, or repeated identifier"
316                    )
317
318                # Sometimes we don't see a particular sequence in the
319                # alignment, so we skip that record since it isn't present in
320                # that LCB/alignment
321                if id not in seq_regions:
322                    continue
323
324                if seq_regions[id]["start"] != 0 or seq_regions[id]["end"] != 0:
325                    suffix = "/{start}-{end}".format(**seq_regions[id])
326                    if "realname" in seq_regions[id]:
327                        corrected_id = seq_regions[id]["realname"]
328                    else:
329                        corrected_id = seq_regions[id]["name"]
330                    if corrected_id.count(suffix) == 0:
331                        corrected_id += suffix
332                else:
333                    if "realname" in seq_regions[id]:
334                        corrected_id = seq_regions[id]["realname"]
335                    else:
336                        corrected_id = seq_regions[id]["name"]
337
338                record = SeqRecord(Seq(seq), id=corrected_id, name=id)
339
340                record.annotations["start"] = seq_regions[id]["start"]
341                record.annotations["end"] = seq_regions[id]["end"]
342                record.annotations["strand"] = (
343                    1 if seq_regions[id]["strand"] == "+" else -1
344                )
345
346                records.append(record)
347            return MultipleSeqAlignment(records)
348        else:
349            raise StopIteration
350