1# Copyright 2011, 2012 by Andrew Sczesnak. All rights reserved. 2# Revisions Copyright 2011, 2017 by Peter Cock. All rights reserved. 3# Revisions Copyright 2014, 2015 by Adam Novak. All rights reserved. 4# Revisions Copyright 2015, 2017 by Blaise Li. All rights reserved. 5# 6# This file is part of the Biopython distribution and governed by your 7# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 8# Please see the LICENSE file that should have been included as part of this 9# package. 10"""Bio.AlignIO support for the "maf" multiple alignment format. 11 12The Multiple Alignment Format, described by UCSC, stores a series of 13multiple alignments in a single file. It is suitable for whole-genome 14to whole-genome alignments, metadata such as source chromosome, start 15position, size, and strand can be stored. 16 17See http://genome.ucsc.edu/FAQ/FAQformat.html#format5 18 19You are expected to use this module via the Bio.AlignIO functions(or the 20Bio.SeqIO functions if you want to work directly with the gapped sequences). 21 22Coordinates in the MAF format are defined in terms of zero-based start 23positions (like Python) and aligning region sizes. 24 25A minimal aligned region of length one and starting at first position in the 26source sequence would have ``start == 0`` and ``size == 1``. 27 28As we can see on this example, ``start + size`` will give one more than the 29zero-based end position. We can therefore manipulate ``start`` and 30``start + size`` as python list slice boundaries. 31 32For an inclusive end coordinate, we need to use ``end = start + size - 1``. 33A 1-column wide alignment would have ``start == end``. 34""" 35import os 36 37from itertools import islice 38from sqlite3 import dbapi2 39 40from Bio.Align import MultipleSeqAlignment 41from Bio.Seq import Seq 42from Bio.SeqRecord import SeqRecord 43 44from .Interfaces import SequentialAlignmentWriter 45 46MAFINDEX_VERSION = 2 47 48 49class MafWriter(SequentialAlignmentWriter): 50 """Accepts a MultipleSeqAlignment object, writes a MAF file.""" 51 52 def write_header(self): 53 """Write the MAF header.""" 54 self.handle.write("##maf version=1 scoring=none\n") 55 self.handle.write("# generated by Biopython\n\n") 56 57 def _write_record(self, record): 58 """Write a single SeqRecord object to an 's' line in a MAF block (PRIVATE).""" 59 # convert biopython-style 1/-1 strand to MAF-style +/- strand 60 if record.annotations.get("strand") == 1: 61 strand = "+" 62 elif record.annotations.get("strand") == -1: 63 strand = "-" 64 else: 65 # TODO: issue warning? 66 strand = "+" 67 68 fields = [ 69 "s", 70 # In the MAF file format, spaces are not allowed in the id 71 "%-40s" % record.id.replace(" ", "_"), 72 "%15s" % record.annotations.get("start", 0), 73 "%5s" 74 % record.annotations.get("size", len(str(record.seq).replace("-", ""))), 75 strand, 76 "%15s" % record.annotations.get("srcSize", 0), 77 str(record.seq), 78 ] 79 self.handle.write("%s\n" % " ".join(fields)) 80 81 def write_alignment(self, alignment): 82 """Write a complete alignment to a MAF block. 83 84 Writes every SeqRecord in a MultipleSeqAlignment object to its own 85 MAF block (beginning with an 'a' line, containing 's' lines). 86 """ 87 if not isinstance(alignment, MultipleSeqAlignment): 88 raise TypeError("Expected an alignment object") 89 90 if len({len(x) for x in alignment}) > 1: 91 raise ValueError("Sequences must all be the same length") 92 93 # We allow multiple sequences with the same IDs; for example, there may 94 # be a MAF aligning the + and - strands of the same sequence together. 95 96 # for now, use ._annotations private property, but restrict keys to those 97 # specifically supported by the MAF format, according to spec 98 try: 99 anno = " ".join( 100 [ 101 "%s=%s" % (x, y) 102 for x, y in alignment._annotations.items() 103 if x in ("score", "pass") 104 ] 105 ) 106 except AttributeError: 107 anno = "score=0.00" 108 109 self.handle.write("a %s\n" % (anno,)) 110 111 recs_out = 0 112 113 for record in alignment: 114 self._write_record(record) 115 116 recs_out += 1 117 118 self.handle.write("\n") 119 120 return recs_out 121 122 123# Invalid function name according to pylint, but kept for compatibility 124# with Bio* conventions. 125def MafIterator(handle, seq_count=None): 126 """Iterate over a MAF file handle as MultipleSeqAlignment objects. 127 128 Iterates over lines in a MAF file-like object (handle), yielding 129 MultipleSeqAlignment objects. SeqRecord IDs generally correspond to 130 species names. 131 """ 132 in_a_bundle = False 133 134 annotations = [] 135 records = [] 136 137 while True: 138 # allows parsing of the last bundle without duplicating code 139 try: 140 line = next(handle) 141 except StopIteration: 142 line = "" 143 144 if in_a_bundle: 145 if line.startswith("s"): 146 # add a SeqRecord to the bundle 147 line_split = line.strip().split() 148 149 if len(line_split) != 7: 150 raise ValueError( 151 "Error parsing alignment - 's' line must have 7 fields" 152 ) 153 154 # convert MAF-style +/- strand to biopython-type 1/-1 155 if line_split[4] == "+": 156 strand = 1 157 elif line_split[4] == "-": 158 strand = -1 159 else: 160 # TODO: issue warning, set to 0? 161 strand = 1 162 163 # s (literal), src (ID), start, size, strand, srcSize, text (sequence) 164 anno = { 165 "start": int(line_split[2]), 166 "size": int(line_split[3]), 167 "strand": strand, 168 "srcSize": int(line_split[5]), 169 } 170 171 sequence = line_split[6] 172 173 # interpret a dot/period to mean the same as the first sequence 174 if "." in sequence: 175 if not records: 176 raise ValueError( 177 "Found dot/period in first sequence of alignment" 178 ) 179 180 ref = records[0].seq 181 new = [] 182 183 for (letter, ref_letter) in zip(sequence, ref): 184 new.append(ref_letter if letter == "." else letter) 185 186 sequence = "".join(new) 187 188 records.append( 189 SeqRecord( 190 Seq(sequence), 191 id=line_split[1], 192 name=line_split[1], 193 description="", 194 annotations=anno, 195 ) 196 ) 197 elif line.startswith("i"): 198 # TODO: information about what is in the aligned species DNA before 199 # and after the immediately preceding "s" line 200 pass 201 elif line.startswith("e"): 202 # TODO: information about the size of the gap between the alignments 203 # that span the current block 204 pass 205 elif line.startswith("q"): 206 # TODO: quality of each aligned base for the species. 207 # Need to find documentation on this, looks like ASCII 0-9 or gap? 208 # Can then store in each SeqRecord's .letter_annotations dictionary, 209 # perhaps as the raw string or turned into integers / None for gap? 210 pass 211 elif line.startswith("#"): 212 # ignore comments 213 # (not sure whether comments 214 # are in the maf specification, though) 215 pass 216 elif not line.strip(): 217 # end a bundle of records 218 if seq_count is not None: 219 assert len(records) == seq_count 220 221 alignment = MultipleSeqAlignment(records) 222 # TODO - Introduce an annotated alignment class? 223 # See also Bio/AlignIO/FastaIO.py for same requirement. 224 # For now, store the annotation a new private property: 225 alignment._annotations = annotations 226 227 yield alignment 228 229 in_a_bundle = False 230 231 annotations = [] 232 records = [] 233 else: 234 raise ValueError( 235 "Error parsing alignment - unexpected line:\n%s" % (line,) 236 ) 237 elif line.startswith("a"): 238 # start a bundle of records 239 in_a_bundle = True 240 annot_strings = line.strip().split()[1:] 241 if len(annot_strings) != line.count("="): 242 raise ValueError("Error parsing alignment - invalid key in 'a' line") 243 annotations = dict(a_string.split("=") for a_string in annot_strings) 244 elif line.startswith("#"): 245 # ignore comments 246 pass 247 elif not line: 248 break 249 250 251class MafIndex: 252 """Index for a MAF file. 253 254 The index is a sqlite3 database that is built upon creation of the object 255 if necessary, and queried when methods *search* or *get_spliced* are 256 used. 257 """ 258 259 def __init__(self, sqlite_file, maf_file, target_seqname): 260 """Indexes or loads the index of a MAF file.""" 261 self._target_seqname = target_seqname 262 # example: Tests/MAF/ucsc_mm9_chr10.mafindex 263 self._index_filename = sqlite_file 264 # example: /home/bli/src/biopython/Tests/MAF 265 self._relative_path = os.path.abspath(os.path.dirname(sqlite_file)) 266 # example: Tests/MAF/ucsc_mm9_chr10.maf 267 self._maf_file = maf_file 268 269 self._maf_fp = open(self._maf_file) 270 271 # if sqlite_file exists, use the existing db, otherwise index the file 272 if os.path.isfile(sqlite_file): 273 self._con = dbapi2.connect(sqlite_file) 274 self._record_count = self.__check_existing_db() 275 else: 276 self._con = dbapi2.connect(sqlite_file) 277 self._record_count = self.__make_new_index() 278 279 # lastly, setup a MafIterator pointing at the open maf_file 280 self._mafiter = MafIterator(self._maf_fp) 281 282 def __check_existing_db(self): 283 """Perform basic sanity checks upon loading an existing index (PRIVATE).""" 284 try: 285 idx_version = int( 286 self._con.execute( 287 "SELECT value FROM meta_data WHERE key = 'version'" 288 ).fetchone()[0] 289 ) 290 if idx_version != MAFINDEX_VERSION: 291 msg = "\n".join( 292 [ 293 "Index version (%s) incompatible with this version " 294 "of MafIndex" % idx_version, 295 "You might erase the existing index %s " 296 "for it to be rebuilt." % self._index_filename, 297 ] 298 ) 299 raise ValueError(msg) 300 301 filename = self._con.execute( 302 "SELECT value FROM meta_data WHERE key = 'filename'" 303 ).fetchone()[0] 304 # Compute absolute path of the original maf file 305 if os.path.isabs(filename): 306 # It was already stored as absolute 307 tmp_mafpath = filename 308 else: 309 # It should otherwise have been stored as relative to the index 310 # Would be stored with Unix / path separator, so convert 311 # it to the local OS path separator here: 312 tmp_mafpath = os.path.join( 313 self._relative_path, filename.replace("/", os.path.sep) 314 ) 315 if tmp_mafpath != os.path.abspath(self._maf_file): 316 # Original and given absolute paths differ. 317 raise ValueError( 318 "Index uses a different file (%s != %s)" 319 % (filename, self._maf_file) 320 ) 321 322 db_target = self._con.execute( 323 "SELECT value FROM meta_data WHERE key = 'target_seqname'" 324 ).fetchone()[0] 325 if db_target != self._target_seqname: 326 raise ValueError( 327 "Provided database indexed for %s, expected %s" 328 % (db_target, self._target_seqname) 329 ) 330 331 record_count = int( 332 self._con.execute( 333 "SELECT value FROM meta_data WHERE key = 'record_count'" 334 ).fetchone()[0] 335 ) 336 if record_count == -1: 337 raise ValueError("Unfinished/partial database provided") 338 339 records_found = int( 340 self._con.execute("SELECT COUNT(*) FROM offset_data").fetchone()[0] 341 ) 342 if records_found != record_count: 343 raise ValueError( 344 "Expected %s records, found %s. Corrupt index?" 345 % (record_count, records_found) 346 ) 347 348 return records_found 349 350 except (dbapi2.OperationalError, dbapi2.DatabaseError) as err: 351 raise ValueError("Problem with SQLite database: %s" % err) from None 352 353 def __make_new_index(self): 354 """Read MAF file and generate SQLite index (PRIVATE).""" 355 # make the tables 356 self._con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);") 357 self._con.execute( 358 "INSERT INTO meta_data (key, value) VALUES ('version', %s);" 359 % MAFINDEX_VERSION 360 ) 361 self._con.execute( 362 "INSERT INTO meta_data (key, value) VALUES ('record_count', -1);" 363 ) 364 self._con.execute( 365 "INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');" 366 % (self._target_seqname,) 367 ) 368 # Determine whether to store maf file as relative to the index or absolute 369 # See https://github.com/biopython/biopython/pull/381 370 if not os.path.isabs(self._maf_file) and not os.path.isabs( 371 self._index_filename 372 ): 373 # Since the user gave both maf file and index as relative paths, 374 # we will store the maf file relative to the index. 375 # Note for cross platform use (e.g. shared drive over SAMBA), 376 # convert any Windows slash into Unix style for rel paths. 377 # example: ucsc_mm9_chr10.maf 378 mafpath = os.path.relpath(self._maf_file, self._relative_path).replace( 379 os.path.sep, "/" 380 ) 381 elif ( 382 os.path.dirname(os.path.abspath(self._maf_file)) + os.path.sep 383 ).startswith(self._relative_path + os.path.sep): 384 # Since maf file is in same directory or sub directory, 385 # might as well make this into a relative path: 386 mafpath = os.path.relpath(self._maf_file, self._relative_path).replace( 387 os.path.sep, "/" 388 ) 389 else: 390 # Default to storing as an absolute path 391 # example: /home/bli/src/biopython/Tests/MAF/ucsc_mm9_chr10.maf 392 mafpath = os.path.abspath(self._maf_file) 393 self._con.execute( 394 "INSERT INTO meta_data (key, value) VALUES ('filename', '%s');" % (mafpath,) 395 ) 396 self._con.execute( 397 "CREATE TABLE offset_data (bin INTEGER, start INTEGER, end INTEGER, offset INTEGER);" 398 ) 399 400 insert_count = 0 401 402 # iterate over the entire file and insert in batches 403 mafindex_func = self.__maf_indexer() 404 405 while True: 406 batch = list(islice(mafindex_func, 100)) 407 if not batch: 408 break 409 410 # batch is made from self.__maf_indexer(), 411 # which yields zero-based "inclusive" start and end coordinates 412 self._con.executemany( 413 "INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);", 414 batch, 415 ) 416 self._con.commit() 417 insert_count += len(batch) 418 419 # then make indexes on the relevant fields 420 self._con.execute("CREATE INDEX IF NOT EXISTS bin_index ON offset_data(bin);") 421 self._con.execute( 422 "CREATE INDEX IF NOT EXISTS start_index ON offset_data(start);" 423 ) 424 self._con.execute("CREATE INDEX IF NOT EXISTS end_index ON offset_data(end);") 425 426 self._con.execute( 427 "UPDATE meta_data SET value = '%s' WHERE key = 'record_count'" 428 % (insert_count,) 429 ) 430 431 self._con.commit() 432 433 return insert_count 434 435 def __maf_indexer(self): 436 """Return index information for each bundle (PRIVATE). 437 438 Yields index information for each bundle in the form of 439 (bin, start, end, offset) tuples where start and end are 440 0-based inclusive coordinates. 441 """ 442 line = self._maf_fp.readline() 443 444 while line: 445 if line.startswith("a"): 446 # note the offset 447 offset = self._maf_fp.tell() - len(line) 448 449 # search the following lines for a match to target_seqname 450 while True: 451 line = self._maf_fp.readline() 452 453 if not line.strip() or line.startswith("a"): 454 # Empty line or new alignment record 455 raise ValueError( 456 "Target for indexing (%s) not found in this bundle" 457 % (self._target_seqname,) 458 ) 459 elif line.startswith("s"): 460 # s (literal), src (ID), start, size, strand, srcSize, text (sequence) 461 line_split = line.strip().split() 462 463 if line_split[1] == self._target_seqname: 464 start = int(line_split[2]) 465 size = int(line_split[3]) 466 if size != len(line_split[6].replace("-", "")): 467 raise ValueError( 468 "Invalid length for target coordinates " 469 "(expected %s, found %s)" 470 % (size, len(line_split[6].replace("-", ""))) 471 ) 472 473 # "inclusive" end position is start + length - 1 474 end = start + size - 1 475 476 # _ucscbin takes end-exclusive coordinates 477 yield (self._ucscbin(start, end + 1), start, end, offset) 478 479 break 480 481 line = self._maf_fp.readline() 482 483 # TODO: check coordinate correctness for the two bin-related static methods 484 @staticmethod 485 def _region2bin(start, end): 486 """Find bins that a region may belong to (PRIVATE). 487 488 Converts a region to a list of bins that it may belong to, including largest 489 and smallest bins. 490 """ 491 bins = [0, 1] 492 493 bins.extend(range(1 + (start >> 26), 2 + ((end - 1) >> 26))) 494 bins.extend(range(9 + (start >> 23), 10 + ((end - 1) >> 23))) 495 bins.extend(range(73 + (start >> 20), 74 + ((end - 1) >> 20))) 496 bins.extend(range(585 + (start >> 17), 586 + ((end - 1) >> 17))) 497 498 return set(bins) 499 500 @staticmethod 501 def _ucscbin(start, end): 502 """Return the smallest bin a given region will fit into (PRIVATE). 503 504 Adapted from http://genomewiki.ucsc.edu/index.php/Bin_indexing_system 505 """ 506 bin_offsets = [512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0] 507 508 _bin_first_shift = 17 509 _bin_next_shift = 3 510 511 start_bin = start 512 end_bin = end - 1 513 514 start_bin >>= _bin_first_shift 515 end_bin >>= _bin_first_shift 516 517 for bin_offset in bin_offsets: 518 if start_bin == end_bin: 519 return bin_offset + start_bin 520 start_bin >>= _bin_next_shift 521 end_bin >>= _bin_next_shift 522 523 return 0 524 525 def _get_record(self, offset): 526 """Retrieve a single MAF record located at the offset provided (PRIVATE).""" 527 self._maf_fp.seek(offset) 528 return next(self._mafiter) 529 530 def search(self, starts, ends): 531 """Search index database for MAF records overlapping ranges provided. 532 533 Returns *MultipleSeqAlignment* results in order by start, then end, then 534 internal offset field. 535 536 *starts* should be a list of 0-based start coordinates of segments in the reference. 537 *ends* should be the list of the corresponding segment ends 538 (in the half-open UCSC convention: 539 http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/). 540 """ 541 # verify the provided exon coordinates 542 if len(starts) != len(ends): 543 raise ValueError("Every position in starts must have a match in ends") 544 545 # Could it be safer to sort the (exonstart, exonend) pairs? 546 for exonstart, exonend in zip(starts, ends): 547 exonlen = exonend - exonstart 548 if exonlen < 1: 549 raise ValueError( 550 "Exon coordinates (%d, %d) invalid: exon length (%d) < 1" 551 % (exonstart, exonend, exonlen) 552 ) 553 con = self._con 554 555 # Keep track of what blocks have already been yielded 556 # in order to avoid duplicating them 557 # (see https://github.com/biopython/biopython/issues/1083) 558 yielded_rec_coords = set() 559 # search for every exon 560 for exonstart, exonend in zip(starts, ends): 561 try: 562 possible_bins = ", ".join( 563 map(str, self._region2bin(exonstart, exonend)) 564 ) 565 except TypeError: 566 raise TypeError( 567 "Exon coordinates must be integers " 568 "(start=%d, end=%d)" % (exonstart, exonend) 569 ) from None 570 571 # https://www.sqlite.org/lang_expr.html 572 # ----- 573 # The BETWEEN operator 574 # 575 # The BETWEEN operator is logically equivalent to a pair of 576 # comparisons. "x BETWEEN y AND z" is equivalent to "x>=y AND x<=z" 577 # except that with BETWEEN, the x expression is only evaluated 578 # once. The precedence of the BETWEEN operator is the same as the 579 # precedence as operators == and != and LIKE and groups left to 580 # right. 581 # ----- 582 583 # We are testing overlap between the query segment and records in 584 # the index, using non-strict coordinates comparisons. 585 # The query segment end must be passed as end-inclusive 586 # The index should also have been build with end-inclusive 587 # end coordinates. 588 # See https://github.com/biopython/biopython/pull/1086#issuecomment-285069073 589 590 result = con.execute( 591 "SELECT DISTINCT start, end, offset FROM offset_data " 592 "WHERE bin IN (%s) " 593 "AND (end BETWEEN %s AND %s OR %s BETWEEN start AND end) " 594 "ORDER BY start, end, offset ASC;" 595 % (possible_bins, exonstart, exonend - 1, exonend - 1) 596 ) 597 598 rows = result.fetchall() 599 600 # rows come from the sqlite index, 601 # which should have been written using __make_new_index, 602 # so rec_start and rec_end should be zero-based "inclusive" coordinates 603 for rec_start, rec_end, offset in rows: 604 # Avoid yielding multiple time the same block 605 if (rec_start, rec_end) in yielded_rec_coords: 606 continue 607 else: 608 yielded_rec_coords.add((rec_start, rec_end)) 609 # Iterate through hits, fetching alignments from the MAF file 610 # and checking to be sure we've retrieved the expected record. 611 612 fetched = self._get_record(int(offset)) 613 614 for record in fetched: 615 if record.id == self._target_seqname: 616 # start and size come from the maf lines 617 start = record.annotations["start"] 618 # "inclusive" end is start + length - 1 619 end = start + record.annotations["size"] - 1 620 621 if not (start == rec_start and end == rec_end): 622 raise ValueError( 623 "Expected %s-%s @ offset %s, found %s-%s" 624 % (rec_start, rec_end, offset, start, end) 625 ) 626 627 yield fetched 628 629 def get_spliced(self, starts, ends, strand=1): 630 """Return a multiple alignment of the exact sequence range provided. 631 632 Accepts two lists of start and end positions on target_seqname, representing 633 exons to be spliced in silico. Returns a *MultipleSeqAlignment* of the 634 desired sequences spliced together. 635 636 *starts* should be a list of 0-based start coordinates of segments in the reference. 637 *ends* should be the list of the corresponding segment ends 638 (in the half-open UCSC convention: 639 http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/). 640 641 To ask for the alignment portion corresponding to the first 100 642 nucleotides of the reference sequence, you would use 643 ``search([0], [100])`` 644 """ 645 # validate strand 646 if strand not in (1, -1): 647 raise ValueError("Strand must be 1 or -1, got %s" % strand) 648 649 # pull all alignments that span the desired intervals 650 fetched = list(self.search(starts, ends)) 651 652 # keep track of the expected letter count 653 # (sum of lengths of [start, end) segments, 654 # where [start, end) half-open) 655 expected_letters = sum(end - start for start, end in zip(starts, ends)) 656 657 # if there's no alignment, return filler for the assembly of the length given 658 if len(fetched) == 0: 659 return MultipleSeqAlignment( 660 [SeqRecord(Seq("N" * expected_letters), id=self._target_seqname)] 661 ) 662 663 # find the union of all IDs in these alignments 664 all_seqnames = {sequence.id for multiseq in fetched for sequence in multiseq} 665 666 # split every record by base position 667 # key: sequence name 668 # value: dictionary 669 # key: position in the reference sequence 670 # value: letter(s) (including letters 671 # aligned to the "-" preceding the letter 672 # at the position in the reference, if any) 673 split_by_position = {seq_name: {} for seq_name in all_seqnames} 674 675 # keep track of what the total number of (unspliced) letters should be 676 total_rec_length = 0 677 678 # track first strand encountered on the target seqname 679 ref_first_strand = None 680 681 for multiseq in fetched: 682 # find the target_seqname in this MultipleSeqAlignment and use it to 683 # set the parameters for the rest of this iteration 684 for seqrec in multiseq: 685 if seqrec.id == self._target_seqname: 686 try: 687 if ref_first_strand is None: 688 ref_first_strand = seqrec.annotations["strand"] 689 690 if ref_first_strand not in (1, -1): 691 raise ValueError("Strand must be 1 or -1") 692 elif ref_first_strand != seqrec.annotations["strand"]: 693 raise ValueError( 694 "Encountered strand='%s' on target seqname, " 695 "expected '%s'" 696 % (seqrec.annotations["strand"], ref_first_strand) 697 ) 698 except KeyError: 699 raise ValueError( 700 "No strand information for target seqname (%s)" 701 % self._target_seqname 702 ) from None 703 # length including gaps (i.e. alignment length) 704 rec_length = len(seqrec) 705 rec_start = seqrec.annotations["start"] 706 ungapped_length = seqrec.annotations["size"] 707 # inclusive end in zero-based coordinates of the reference 708 rec_end = rec_start + ungapped_length - 1 709 # This is length in terms of actual letters in the reference 710 total_rec_length += ungapped_length 711 712 # blank out these positions for every seqname 713 for seqrec in multiseq: 714 for pos in range(rec_start, rec_end + 1): 715 split_by_position[seqrec.id][pos] = "" 716 717 break 718 # http://psung.blogspot.fr/2007/12/for-else-in-python.html 719 # https://docs.python.org/2/tutorial/controlflow.html#break-and-continue-statements-and-else-clauses-on-loops 720 else: 721 raise ValueError( 722 "Did not find %s in alignment bundle" % (self._target_seqname,) 723 ) 724 725 # the true, chromosome/contig/etc position in the target seqname 726 real_pos = rec_start 727 728 # loop over the alignment to fill split_by_position 729 for gapped_pos in range(0, rec_length): 730 for seqrec in multiseq: 731 # keep track of this position's value for the target seqname 732 if seqrec.id == self._target_seqname: 733 track_val = seqrec.seq[gapped_pos] 734 735 # Here, a real_pos that corresponds to just after a series of "-" 736 # in the reference will "accumulate" the letters found in other sequences 737 # in front of the "-"s 738 split_by_position[seqrec.id][real_pos] += seqrec.seq[gapped_pos] 739 740 # increment the real_pos counter only when non-gaps are found in 741 # the target_seqname, and we haven't reached the end of the record 742 if track_val != "-" and real_pos < rec_end: 743 real_pos += 1 744 745 # make sure the number of bp entries equals the sum of the record lengths 746 if len(split_by_position[self._target_seqname]) != total_rec_length: 747 raise ValueError( 748 "Target seqname (%s) has %s records, expected %s" 749 % ( 750 self._target_seqname, 751 len(split_by_position[self._target_seqname]), 752 total_rec_length, 753 ) 754 ) 755 756 # translates a position in the target_seqname sequence to its gapped length 757 realpos_to_len = { 758 pos: len(gapped_fragment) 759 for pos, gapped_fragment in split_by_position[self._target_seqname].items() 760 if len(gapped_fragment) > 1 761 } 762 763 # splice together the exons 764 subseq = {} 765 766 for seqid in all_seqnames: 767 seq_split = split_by_position[seqid] 768 seq_splice = [] 769 770 filler_char = "N" if seqid == self._target_seqname else "-" 771 772 # iterate from start to end, taking bases from split_by_position when 773 # they exist, using N or - for gaps when there is no alignment. 774 append = seq_splice.append 775 776 for exonstart, exonend in zip(starts, ends): 777 # exonend is exclusive 778 for real_pos in range(exonstart, exonend): 779 # if this seqname has this position, add it 780 if real_pos in seq_split: 781 append(seq_split[real_pos]) 782 # if not, but it's in the target_seqname, add length-matched filler 783 elif real_pos in realpos_to_len: 784 append(filler_char * realpos_to_len[real_pos]) 785 # it's not in either, so add a single filler character 786 else: 787 append(filler_char) 788 789 subseq[seqid] = "".join(seq_splice) 790 791 # make sure we're returning the right number of letters 792 if len(subseq[self._target_seqname].replace("-", "")) != expected_letters: 793 raise ValueError( 794 "Returning %s letters for target seqname (%s), expected %s" 795 % ( 796 len(subseq[self._target_seqname].replace("-", "")), 797 self._target_seqname, 798 expected_letters, 799 ) 800 ) 801 802 # check to make sure all sequences are the same length as the target seqname 803 ref_subseq_len = len(subseq[self._target_seqname]) 804 805 for seqid, seq in subseq.items(): 806 if len(seq) != ref_subseq_len: 807 raise ValueError( 808 "Returning length %s for %s, expected %s" 809 % (len(seq), seqid, ref_subseq_len) 810 ) 811 812 # finally, build a MultipleSeqAlignment object for our final sequences 813 result_multiseq = [] 814 815 for seqid, seq in subseq.items(): 816 seq = Seq(seq) 817 818 seq = seq if strand == ref_first_strand else seq.reverse_complement() 819 820 result_multiseq.append(SeqRecord(seq, id=seqid, name=seqid, description="")) 821 822 return MultipleSeqAlignment(result_multiseq) 823 824 def __repr__(self): 825 """Return a string representation of the index.""" 826 return "MafIO.MafIndex(%r, target_seqname=%r)" % ( 827 self._maf_fp.name, 828 self._target_seqname, 829 ) 830 831 def __len__(self): 832 """Return the number of records in the index.""" 833 return self._record_count 834