1# Copyright 2011, 2012 by Andrew Sczesnak.  All rights reserved.
2# Revisions Copyright 2011, 2017 by Peter Cock.  All rights reserved.
3# Revisions Copyright 2014, 2015 by Adam Novak.  All rights reserved.
4# Revisions Copyright 2015, 2017 by Blaise Li.  All rights reserved.
5#
6# This file is part of the Biopython distribution and governed by your
7# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
8# Please see the LICENSE file that should have been included as part of this
9# package.
10"""Bio.AlignIO support for the "maf" multiple alignment format.
11
12The Multiple Alignment Format, described by UCSC, stores a series of
13multiple alignments in a single file. It is suitable for whole-genome
14to whole-genome alignments, metadata such as source chromosome, start
15position, size, and strand can be stored.
16
17See http://genome.ucsc.edu/FAQ/FAQformat.html#format5
18
19You are expected to use this module via the Bio.AlignIO functions(or the
20Bio.SeqIO functions if you want to work directly with the gapped sequences).
21
22Coordinates in the MAF format are defined in terms of zero-based start
23positions (like Python) and aligning region sizes.
24
25A minimal aligned region of length one and starting at first position in the
26source sequence would have ``start == 0`` and ``size == 1``.
27
28As we can see on this example, ``start + size`` will give one more than the
29zero-based end position. We can therefore manipulate ``start`` and
30``start + size`` as python list slice boundaries.
31
32For an inclusive end coordinate, we need to use ``end = start + size - 1``.
33A 1-column wide alignment would have ``start == end``.
34"""
35import os
36
37from itertools import islice
38from sqlite3 import dbapi2
39
40from Bio.Align import MultipleSeqAlignment
41from Bio.Seq import Seq
42from Bio.SeqRecord import SeqRecord
43
44from .Interfaces import SequentialAlignmentWriter
45
46MAFINDEX_VERSION = 2
47
48
49class MafWriter(SequentialAlignmentWriter):
50    """Accepts a MultipleSeqAlignment object, writes a MAF file."""
51
52    def write_header(self):
53        """Write the MAF header."""
54        self.handle.write("##maf version=1 scoring=none\n")
55        self.handle.write("# generated by Biopython\n\n")
56
57    def _write_record(self, record):
58        """Write a single SeqRecord object to an 's' line in a MAF block (PRIVATE)."""
59        # convert biopython-style 1/-1 strand to MAF-style +/- strand
60        if record.annotations.get("strand") == 1:
61            strand = "+"
62        elif record.annotations.get("strand") == -1:
63            strand = "-"
64        else:
65            # TODO: issue warning?
66            strand = "+"
67
68        fields = [
69            "s",
70            # In the MAF file format, spaces are not allowed in the id
71            "%-40s" % record.id.replace(" ", "_"),
72            "%15s" % record.annotations.get("start", 0),
73            "%5s"
74            % record.annotations.get("size", len(str(record.seq).replace("-", ""))),
75            strand,
76            "%15s" % record.annotations.get("srcSize", 0),
77            str(record.seq),
78        ]
79        self.handle.write("%s\n" % " ".join(fields))
80
81    def write_alignment(self, alignment):
82        """Write a complete alignment to a MAF block.
83
84        Writes every SeqRecord in a MultipleSeqAlignment object to its own
85        MAF block (beginning with an 'a' line, containing 's' lines).
86        """
87        if not isinstance(alignment, MultipleSeqAlignment):
88            raise TypeError("Expected an alignment object")
89
90        if len({len(x) for x in alignment}) > 1:
91            raise ValueError("Sequences must all be the same length")
92
93        # We allow multiple sequences with the same IDs; for example, there may
94        # be a MAF aligning the + and - strands of the same sequence together.
95
96        # for now, use ._annotations private property, but restrict keys to those
97        # specifically supported by the MAF format, according to spec
98        try:
99            anno = " ".join(
100                [
101                    "%s=%s" % (x, y)
102                    for x, y in alignment._annotations.items()
103                    if x in ("score", "pass")
104                ]
105            )
106        except AttributeError:
107            anno = "score=0.00"
108
109        self.handle.write("a %s\n" % (anno,))
110
111        recs_out = 0
112
113        for record in alignment:
114            self._write_record(record)
115
116            recs_out += 1
117
118        self.handle.write("\n")
119
120        return recs_out
121
122
123# Invalid function name according to pylint, but kept for compatibility
124# with Bio* conventions.
125def MafIterator(handle, seq_count=None):
126    """Iterate over a MAF file handle as MultipleSeqAlignment objects.
127
128    Iterates over lines in a MAF file-like object (handle), yielding
129    MultipleSeqAlignment objects. SeqRecord IDs generally correspond to
130    species names.
131    """
132    in_a_bundle = False
133
134    annotations = []
135    records = []
136
137    while True:
138        # allows parsing of the last bundle without duplicating code
139        try:
140            line = next(handle)
141        except StopIteration:
142            line = ""
143
144        if in_a_bundle:
145            if line.startswith("s"):
146                # add a SeqRecord to the bundle
147                line_split = line.strip().split()
148
149                if len(line_split) != 7:
150                    raise ValueError(
151                        "Error parsing alignment - 's' line must have 7 fields"
152                    )
153
154                # convert MAF-style +/- strand to biopython-type 1/-1
155                if line_split[4] == "+":
156                    strand = 1
157                elif line_split[4] == "-":
158                    strand = -1
159                else:
160                    # TODO: issue warning, set to 0?
161                    strand = 1
162
163                # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
164                anno = {
165                    "start": int(line_split[2]),
166                    "size": int(line_split[3]),
167                    "strand": strand,
168                    "srcSize": int(line_split[5]),
169                }
170
171                sequence = line_split[6]
172
173                # interpret a dot/period to mean the same as the first sequence
174                if "." in sequence:
175                    if not records:
176                        raise ValueError(
177                            "Found dot/period in first sequence of alignment"
178                        )
179
180                    ref = records[0].seq
181                    new = []
182
183                    for (letter, ref_letter) in zip(sequence, ref):
184                        new.append(ref_letter if letter == "." else letter)
185
186                    sequence = "".join(new)
187
188                records.append(
189                    SeqRecord(
190                        Seq(sequence),
191                        id=line_split[1],
192                        name=line_split[1],
193                        description="",
194                        annotations=anno,
195                    )
196                )
197            elif line.startswith("i"):
198                # TODO: information about what is in the aligned species DNA before
199                # and after the immediately preceding "s" line
200                pass
201            elif line.startswith("e"):
202                # TODO: information about the size of the gap between the alignments
203                # that span the current block
204                pass
205            elif line.startswith("q"):
206                # TODO: quality of each aligned base for the species.
207                # Need to find documentation on this, looks like ASCII 0-9 or gap?
208                # Can then store in each SeqRecord's .letter_annotations dictionary,
209                # perhaps as the raw string or turned into integers / None for gap?
210                pass
211            elif line.startswith("#"):
212                # ignore comments
213                # (not sure whether comments
214                # are in the maf specification, though)
215                pass
216            elif not line.strip():
217                # end a bundle of records
218                if seq_count is not None:
219                    assert len(records) == seq_count
220
221                alignment = MultipleSeqAlignment(records)
222                # TODO - Introduce an annotated alignment class?
223                # See also Bio/AlignIO/FastaIO.py for same requirement.
224                # For now, store the annotation a new private property:
225                alignment._annotations = annotations
226
227                yield alignment
228
229                in_a_bundle = False
230
231                annotations = []
232                records = []
233            else:
234                raise ValueError(
235                    "Error parsing alignment - unexpected line:\n%s" % (line,)
236                )
237        elif line.startswith("a"):
238            # start a bundle of records
239            in_a_bundle = True
240            annot_strings = line.strip().split()[1:]
241            if len(annot_strings) != line.count("="):
242                raise ValueError("Error parsing alignment - invalid key in 'a' line")
243            annotations = dict(a_string.split("=") for a_string in annot_strings)
244        elif line.startswith("#"):
245            # ignore comments
246            pass
247        elif not line:
248            break
249
250
251class MafIndex:
252    """Index for a MAF file.
253
254    The index is a sqlite3 database that is built upon creation of the object
255    if necessary, and queried when methods *search* or *get_spliced* are
256    used.
257    """
258
259    def __init__(self, sqlite_file, maf_file, target_seqname):
260        """Indexes or loads the index of a MAF file."""
261        self._target_seqname = target_seqname
262        # example: Tests/MAF/ucsc_mm9_chr10.mafindex
263        self._index_filename = sqlite_file
264        # example: /home/bli/src/biopython/Tests/MAF
265        self._relative_path = os.path.abspath(os.path.dirname(sqlite_file))
266        # example: Tests/MAF/ucsc_mm9_chr10.maf
267        self._maf_file = maf_file
268
269        self._maf_fp = open(self._maf_file)
270
271        # if sqlite_file exists, use the existing db, otherwise index the file
272        if os.path.isfile(sqlite_file):
273            self._con = dbapi2.connect(sqlite_file)
274            self._record_count = self.__check_existing_db()
275        else:
276            self._con = dbapi2.connect(sqlite_file)
277            self._record_count = self.__make_new_index()
278
279        # lastly, setup a MafIterator pointing at the open maf_file
280        self._mafiter = MafIterator(self._maf_fp)
281
282    def __check_existing_db(self):
283        """Perform basic sanity checks upon loading an existing index (PRIVATE)."""
284        try:
285            idx_version = int(
286                self._con.execute(
287                    "SELECT value FROM meta_data WHERE key = 'version'"
288                ).fetchone()[0]
289            )
290            if idx_version != MAFINDEX_VERSION:
291                msg = "\n".join(
292                    [
293                        "Index version (%s) incompatible with this version "
294                        "of MafIndex" % idx_version,
295                        "You might erase the existing index %s "
296                        "for it to be rebuilt." % self._index_filename,
297                    ]
298                )
299                raise ValueError(msg)
300
301            filename = self._con.execute(
302                "SELECT value FROM meta_data WHERE key = 'filename'"
303            ).fetchone()[0]
304            # Compute absolute path of the original maf file
305            if os.path.isabs(filename):
306                # It was already stored as absolute
307                tmp_mafpath = filename
308            else:
309                # It should otherwise have been stored as relative to the index
310                # Would be stored with Unix / path separator, so convert
311                # it to the local OS path separator here:
312                tmp_mafpath = os.path.join(
313                    self._relative_path, filename.replace("/", os.path.sep)
314                )
315            if tmp_mafpath != os.path.abspath(self._maf_file):
316                # Original and given absolute paths differ.
317                raise ValueError(
318                    "Index uses a different file (%s != %s)"
319                    % (filename, self._maf_file)
320                )
321
322            db_target = self._con.execute(
323                "SELECT value FROM meta_data WHERE key = 'target_seqname'"
324            ).fetchone()[0]
325            if db_target != self._target_seqname:
326                raise ValueError(
327                    "Provided database indexed for %s, expected %s"
328                    % (db_target, self._target_seqname)
329                )
330
331            record_count = int(
332                self._con.execute(
333                    "SELECT value FROM meta_data WHERE key = 'record_count'"
334                ).fetchone()[0]
335            )
336            if record_count == -1:
337                raise ValueError("Unfinished/partial database provided")
338
339            records_found = int(
340                self._con.execute("SELECT COUNT(*) FROM offset_data").fetchone()[0]
341            )
342            if records_found != record_count:
343                raise ValueError(
344                    "Expected %s records, found %s.  Corrupt index?"
345                    % (record_count, records_found)
346                )
347
348            return records_found
349
350        except (dbapi2.OperationalError, dbapi2.DatabaseError) as err:
351            raise ValueError("Problem with SQLite database: %s" % err) from None
352
353    def __make_new_index(self):
354        """Read MAF file and generate SQLite index (PRIVATE)."""
355        # make the tables
356        self._con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
357        self._con.execute(
358            "INSERT INTO meta_data (key, value) VALUES ('version', %s);"
359            % MAFINDEX_VERSION
360        )
361        self._con.execute(
362            "INSERT INTO meta_data (key, value) VALUES ('record_count', -1);"
363        )
364        self._con.execute(
365            "INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');"
366            % (self._target_seqname,)
367        )
368        # Determine whether to store maf file as relative to the index or absolute
369        # See https://github.com/biopython/biopython/pull/381
370        if not os.path.isabs(self._maf_file) and not os.path.isabs(
371            self._index_filename
372        ):
373            # Since the user gave both maf file and index as relative paths,
374            # we will store the maf file relative to the index.
375            # Note for cross platform use (e.g. shared drive over SAMBA),
376            # convert any Windows slash into Unix style for rel paths.
377            # example: ucsc_mm9_chr10.maf
378            mafpath = os.path.relpath(self._maf_file, self._relative_path).replace(
379                os.path.sep, "/"
380            )
381        elif (
382            os.path.dirname(os.path.abspath(self._maf_file)) + os.path.sep
383        ).startswith(self._relative_path + os.path.sep):
384            # Since maf file is in same directory or sub directory,
385            # might as well make this into a relative path:
386            mafpath = os.path.relpath(self._maf_file, self._relative_path).replace(
387                os.path.sep, "/"
388            )
389        else:
390            # Default to storing as an absolute path
391            # example: /home/bli/src/biopython/Tests/MAF/ucsc_mm9_chr10.maf
392            mafpath = os.path.abspath(self._maf_file)
393        self._con.execute(
394            "INSERT INTO meta_data (key, value) VALUES ('filename', '%s');" % (mafpath,)
395        )
396        self._con.execute(
397            "CREATE TABLE offset_data (bin INTEGER, start INTEGER, end INTEGER, offset INTEGER);"
398        )
399
400        insert_count = 0
401
402        # iterate over the entire file and insert in batches
403        mafindex_func = self.__maf_indexer()
404
405        while True:
406            batch = list(islice(mafindex_func, 100))
407            if not batch:
408                break
409
410            # batch is made from self.__maf_indexer(),
411            # which yields zero-based "inclusive" start and end coordinates
412            self._con.executemany(
413                "INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);",
414                batch,
415            )
416            self._con.commit()
417            insert_count += len(batch)
418
419        # then make indexes on the relevant fields
420        self._con.execute("CREATE INDEX IF NOT EXISTS bin_index ON offset_data(bin);")
421        self._con.execute(
422            "CREATE INDEX IF NOT EXISTS start_index ON offset_data(start);"
423        )
424        self._con.execute("CREATE INDEX IF NOT EXISTS end_index ON offset_data(end);")
425
426        self._con.execute(
427            "UPDATE meta_data SET value = '%s' WHERE key = 'record_count'"
428            % (insert_count,)
429        )
430
431        self._con.commit()
432
433        return insert_count
434
435    def __maf_indexer(self):
436        """Return index information for each bundle (PRIVATE).
437
438        Yields index information for each bundle in the form of
439        (bin, start, end, offset) tuples where start and end are
440        0-based inclusive coordinates.
441        """
442        line = self._maf_fp.readline()
443
444        while line:
445            if line.startswith("a"):
446                # note the offset
447                offset = self._maf_fp.tell() - len(line)
448
449                # search the following lines for a match to target_seqname
450                while True:
451                    line = self._maf_fp.readline()
452
453                    if not line.strip() or line.startswith("a"):
454                        # Empty line or new alignment record
455                        raise ValueError(
456                            "Target for indexing (%s) not found in this bundle"
457                            % (self._target_seqname,)
458                        )
459                    elif line.startswith("s"):
460                        # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
461                        line_split = line.strip().split()
462
463                        if line_split[1] == self._target_seqname:
464                            start = int(line_split[2])
465                            size = int(line_split[3])
466                            if size != len(line_split[6].replace("-", "")):
467                                raise ValueError(
468                                    "Invalid length for target coordinates "
469                                    "(expected %s, found %s)"
470                                    % (size, len(line_split[6].replace("-", "")))
471                                )
472
473                            # "inclusive" end position is start + length - 1
474                            end = start + size - 1
475
476                            # _ucscbin takes end-exclusive coordinates
477                            yield (self._ucscbin(start, end + 1), start, end, offset)
478
479                            break
480
481            line = self._maf_fp.readline()
482
483    # TODO: check coordinate correctness for the two bin-related static methods
484    @staticmethod
485    def _region2bin(start, end):
486        """Find bins that a region may belong to (PRIVATE).
487
488        Converts a region to a list of bins that it may belong to, including largest
489        and smallest bins.
490        """
491        bins = [0, 1]
492
493        bins.extend(range(1 + (start >> 26), 2 + ((end - 1) >> 26)))
494        bins.extend(range(9 + (start >> 23), 10 + ((end - 1) >> 23)))
495        bins.extend(range(73 + (start >> 20), 74 + ((end - 1) >> 20)))
496        bins.extend(range(585 + (start >> 17), 586 + ((end - 1) >> 17)))
497
498        return set(bins)
499
500    @staticmethod
501    def _ucscbin(start, end):
502        """Return the smallest bin a given region will fit into (PRIVATE).
503
504        Adapted from http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
505        """
506        bin_offsets = [512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0]
507
508        _bin_first_shift = 17
509        _bin_next_shift = 3
510
511        start_bin = start
512        end_bin = end - 1
513
514        start_bin >>= _bin_first_shift
515        end_bin >>= _bin_first_shift
516
517        for bin_offset in bin_offsets:
518            if start_bin == end_bin:
519                return bin_offset + start_bin
520            start_bin >>= _bin_next_shift
521            end_bin >>= _bin_next_shift
522
523        return 0
524
525    def _get_record(self, offset):
526        """Retrieve a single MAF record located at the offset provided (PRIVATE)."""
527        self._maf_fp.seek(offset)
528        return next(self._mafiter)
529
530    def search(self, starts, ends):
531        """Search index database for MAF records overlapping ranges provided.
532
533        Returns *MultipleSeqAlignment* results in order by start, then end, then
534        internal offset field.
535
536        *starts* should be a list of 0-based start coordinates of segments in the reference.
537        *ends* should be the list of the corresponding segment ends
538        (in the half-open UCSC convention:
539        http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).
540        """
541        # verify the provided exon coordinates
542        if len(starts) != len(ends):
543            raise ValueError("Every position in starts must have a match in ends")
544
545        # Could it be safer to sort the (exonstart, exonend) pairs?
546        for exonstart, exonend in zip(starts, ends):
547            exonlen = exonend - exonstart
548            if exonlen < 1:
549                raise ValueError(
550                    "Exon coordinates (%d, %d) invalid: exon length (%d) < 1"
551                    % (exonstart, exonend, exonlen)
552                )
553        con = self._con
554
555        # Keep track of what blocks have already been yielded
556        # in order to avoid duplicating them
557        # (see https://github.com/biopython/biopython/issues/1083)
558        yielded_rec_coords = set()
559        # search for every exon
560        for exonstart, exonend in zip(starts, ends):
561            try:
562                possible_bins = ", ".join(
563                    map(str, self._region2bin(exonstart, exonend))
564                )
565            except TypeError:
566                raise TypeError(
567                    "Exon coordinates must be integers "
568                    "(start=%d, end=%d)" % (exonstart, exonend)
569                ) from None
570
571            # https://www.sqlite.org/lang_expr.html
572            # -----
573            # The BETWEEN operator
574            #
575            # The BETWEEN operator is logically equivalent to a pair of
576            # comparisons. "x BETWEEN y AND z" is equivalent to "x>=y AND x<=z"
577            # except that with BETWEEN, the x expression is only evaluated
578            # once. The precedence of the BETWEEN operator is the same as the
579            # precedence as operators == and != and LIKE and groups left to
580            # right.
581            # -----
582
583            # We are testing overlap between the query segment and records in
584            # the index, using non-strict coordinates comparisons.
585            # The query segment end must be passed as end-inclusive
586            # The index should also have been build with end-inclusive
587            # end coordinates.
588            # See https://github.com/biopython/biopython/pull/1086#issuecomment-285069073
589
590            result = con.execute(
591                "SELECT DISTINCT start, end, offset FROM offset_data "
592                "WHERE bin IN (%s) "
593                "AND (end BETWEEN %s AND %s OR %s BETWEEN start AND end) "
594                "ORDER BY start, end, offset ASC;"
595                % (possible_bins, exonstart, exonend - 1, exonend - 1)
596            )
597
598            rows = result.fetchall()
599
600            # rows come from the sqlite index,
601            # which should have been written using __make_new_index,
602            # so rec_start and rec_end should be zero-based "inclusive" coordinates
603            for rec_start, rec_end, offset in rows:
604                # Avoid yielding multiple time the same block
605                if (rec_start, rec_end) in yielded_rec_coords:
606                    continue
607                else:
608                    yielded_rec_coords.add((rec_start, rec_end))
609                # Iterate through hits, fetching alignments from the MAF file
610                # and checking to be sure we've retrieved the expected record.
611
612                fetched = self._get_record(int(offset))
613
614                for record in fetched:
615                    if record.id == self._target_seqname:
616                        # start and size come from the maf lines
617                        start = record.annotations["start"]
618                        # "inclusive" end is start + length - 1
619                        end = start + record.annotations["size"] - 1
620
621                        if not (start == rec_start and end == rec_end):
622                            raise ValueError(
623                                "Expected %s-%s @ offset %s, found %s-%s"
624                                % (rec_start, rec_end, offset, start, end)
625                            )
626
627                yield fetched
628
629    def get_spliced(self, starts, ends, strand=1):
630        """Return a multiple alignment of the exact sequence range provided.
631
632        Accepts two lists of start and end positions on target_seqname, representing
633        exons to be spliced in silico.  Returns a *MultipleSeqAlignment* of the
634        desired sequences spliced together.
635
636        *starts* should be a list of 0-based start coordinates of segments in the reference.
637        *ends* should be the list of the corresponding segment ends
638        (in the half-open UCSC convention:
639        http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).
640
641        To ask for the alignment portion corresponding to the first 100
642        nucleotides of the reference sequence, you would use
643        ``search([0], [100])``
644        """
645        # validate strand
646        if strand not in (1, -1):
647            raise ValueError("Strand must be 1 or -1, got %s" % strand)
648
649        # pull all alignments that span the desired intervals
650        fetched = list(self.search(starts, ends))
651
652        # keep track of the expected letter count
653        # (sum of lengths of [start, end) segments,
654        # where [start, end) half-open)
655        expected_letters = sum(end - start for start, end in zip(starts, ends))
656
657        # if there's no alignment, return filler for the assembly of the length given
658        if len(fetched) == 0:
659            return MultipleSeqAlignment(
660                [SeqRecord(Seq("N" * expected_letters), id=self._target_seqname)]
661            )
662
663        # find the union of all IDs in these alignments
664        all_seqnames = {sequence.id for multiseq in fetched for sequence in multiseq}
665
666        # split every record by base position
667        # key: sequence name
668        # value: dictionary
669        #        key: position in the reference sequence
670        #        value: letter(s) (including letters
671        #               aligned to the "-" preceding the letter
672        #               at the position in the reference, if any)
673        split_by_position = {seq_name: {} for seq_name in all_seqnames}
674
675        # keep track of what the total number of (unspliced) letters should be
676        total_rec_length = 0
677
678        # track first strand encountered on the target seqname
679        ref_first_strand = None
680
681        for multiseq in fetched:
682            # find the target_seqname in this MultipleSeqAlignment and use it to
683            # set the parameters for the rest of this iteration
684            for seqrec in multiseq:
685                if seqrec.id == self._target_seqname:
686                    try:
687                        if ref_first_strand is None:
688                            ref_first_strand = seqrec.annotations["strand"]
689
690                            if ref_first_strand not in (1, -1):
691                                raise ValueError("Strand must be 1 or -1")
692                        elif ref_first_strand != seqrec.annotations["strand"]:
693                            raise ValueError(
694                                "Encountered strand='%s' on target seqname, "
695                                "expected '%s'"
696                                % (seqrec.annotations["strand"], ref_first_strand)
697                            )
698                    except KeyError:
699                        raise ValueError(
700                            "No strand information for target seqname (%s)"
701                            % self._target_seqname
702                        ) from None
703                    # length including gaps (i.e. alignment length)
704                    rec_length = len(seqrec)
705                    rec_start = seqrec.annotations["start"]
706                    ungapped_length = seqrec.annotations["size"]
707                    # inclusive end in zero-based coordinates of the reference
708                    rec_end = rec_start + ungapped_length - 1
709                    # This is length in terms of actual letters in the reference
710                    total_rec_length += ungapped_length
711
712                    # blank out these positions for every seqname
713                    for seqrec in multiseq:
714                        for pos in range(rec_start, rec_end + 1):
715                            split_by_position[seqrec.id][pos] = ""
716
717                    break
718            # http://psung.blogspot.fr/2007/12/for-else-in-python.html
719            # https://docs.python.org/2/tutorial/controlflow.html#break-and-continue-statements-and-else-clauses-on-loops
720            else:
721                raise ValueError(
722                    "Did not find %s in alignment bundle" % (self._target_seqname,)
723                )
724
725            # the true, chromosome/contig/etc position in the target seqname
726            real_pos = rec_start
727
728            # loop over the alignment to fill split_by_position
729            for gapped_pos in range(0, rec_length):
730                for seqrec in multiseq:
731                    # keep track of this position's value for the target seqname
732                    if seqrec.id == self._target_seqname:
733                        track_val = seqrec.seq[gapped_pos]
734
735                    # Here, a real_pos that corresponds to just after a series of "-"
736                    # in the reference will "accumulate" the letters found in other sequences
737                    # in front of the "-"s
738                    split_by_position[seqrec.id][real_pos] += seqrec.seq[gapped_pos]
739
740                # increment the real_pos counter only when non-gaps are found in
741                # the target_seqname, and we haven't reached the end of the record
742                if track_val != "-" and real_pos < rec_end:
743                    real_pos += 1
744
745        # make sure the number of bp entries equals the sum of the record lengths
746        if len(split_by_position[self._target_seqname]) != total_rec_length:
747            raise ValueError(
748                "Target seqname (%s) has %s records, expected %s"
749                % (
750                    self._target_seqname,
751                    len(split_by_position[self._target_seqname]),
752                    total_rec_length,
753                )
754            )
755
756        # translates a position in the target_seqname sequence to its gapped length
757        realpos_to_len = {
758            pos: len(gapped_fragment)
759            for pos, gapped_fragment in split_by_position[self._target_seqname].items()
760            if len(gapped_fragment) > 1
761        }
762
763        # splice together the exons
764        subseq = {}
765
766        for seqid in all_seqnames:
767            seq_split = split_by_position[seqid]
768            seq_splice = []
769
770            filler_char = "N" if seqid == self._target_seqname else "-"
771
772            # iterate from start to end, taking bases from split_by_position when
773            # they exist, using N or - for gaps when there is no alignment.
774            append = seq_splice.append
775
776            for exonstart, exonend in zip(starts, ends):
777                # exonend is exclusive
778                for real_pos in range(exonstart, exonend):
779                    # if this seqname has this position, add it
780                    if real_pos in seq_split:
781                        append(seq_split[real_pos])
782                    # if not, but it's in the target_seqname, add length-matched filler
783                    elif real_pos in realpos_to_len:
784                        append(filler_char * realpos_to_len[real_pos])
785                    # it's not in either, so add a single filler character
786                    else:
787                        append(filler_char)
788
789            subseq[seqid] = "".join(seq_splice)
790
791        # make sure we're returning the right number of letters
792        if len(subseq[self._target_seqname].replace("-", "")) != expected_letters:
793            raise ValueError(
794                "Returning %s letters for target seqname (%s), expected %s"
795                % (
796                    len(subseq[self._target_seqname].replace("-", "")),
797                    self._target_seqname,
798                    expected_letters,
799                )
800            )
801
802        # check to make sure all sequences are the same length as the target seqname
803        ref_subseq_len = len(subseq[self._target_seqname])
804
805        for seqid, seq in subseq.items():
806            if len(seq) != ref_subseq_len:
807                raise ValueError(
808                    "Returning length %s for %s, expected %s"
809                    % (len(seq), seqid, ref_subseq_len)
810                )
811
812        # finally, build a MultipleSeqAlignment object for our final sequences
813        result_multiseq = []
814
815        for seqid, seq in subseq.items():
816            seq = Seq(seq)
817
818            seq = seq if strand == ref_first_strand else seq.reverse_complement()
819
820            result_multiseq.append(SeqRecord(seq, id=seqid, name=seqid, description=""))
821
822        return MultipleSeqAlignment(result_multiseq)
823
824    def __repr__(self):
825        """Return a string representation of the index."""
826        return "MafIO.MafIndex(%r, target_seqname=%r)" % (
827            self._maf_fp.name,
828            self._target_seqname,
829        )
830
831    def __len__(self):
832        """Return the number of records in the index."""
833        return self._record_count
834