1# ----------------------------------------------------------------------------- 2# Copyright (c) 2013--, scikit-bio development team. 3# 4# Distributed under the terms of the Modified BSD License. 5# 6# The full license is in the file COPYING.txt, distributed with this software. 7# ----------------------------------------------------------------------------- 8 9from cpython cimport bool 10import numpy as np 11cimport numpy as cnp 12from skbio.sequence import Protein, Sequence 13 14cdef extern from "_lib/ssw.h": 15 16 ctypedef struct s_align: 17 cnp.uint16_t score1 18 cnp.uint16_t score2 19 cnp.int32_t ref_begin1 20 cnp.int32_t ref_end1 21 cnp.int32_t read_begin1 22 cnp.int32_t read_end1 23 cnp.int32_t ref_end2 24 cnp.uint32_t* cigar 25 cnp.int32_t cigarLen 26 27 ctypedef struct s_profile: 28 pass 29 30 cdef s_profile* ssw_init(const cnp.int8_t* read, 31 const cnp.int32_t readLen, 32 const cnp.int8_t* mat, 33 const cnp.int32_t n, 34 const cnp.int8_t score_size) 35 36 cdef void init_destroy(s_profile* p) 37 38 cdef s_align* ssw_align(const s_profile* prof, 39 const cnp.int8_t* ref, 40 cnp.int32_t refLen, 41 const cnp.uint8_t weight_gapO, 42 const cnp.uint8_t weight_gapE, 43 const cnp.uint8_t flag, 44 const cnp.uint16_t filters, 45 const cnp.int32_t filterd, 46 const cnp.int32_t maskLen) 47 48 cdef void align_destroy(s_align* a) 49 50np_aa_table = np.array([ 51 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 52 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 53 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 54 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 55 23, 0, 20, 4, 3, 6, 13, 7, 8, 9, 23, 11, 10, 12, 2, 23, 56 14, 5, 1, 15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23, 57 23, 0, 20, 4, 3, 6, 13, 7, 8, 9, 23, 11, 10, 12, 2, 23, 58 14, 5, 1, 15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23]) 59 60np_nt_table = np.array([ 61 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 62 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 63 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 64 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 65 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 66 4, 4, 4, 4, 3, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 67 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 68 4, 4, 4, 4, 3, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]) 69 70mid_table = np.array(['M', 'I', 'D']) 71 72 73cdef class AlignmentStructure: 74 """Wraps the result of an alignment c struct so it is accessible to Python 75 76 Notes 77 ----- 78 `cigar` may be empty depending on parameters used. 79 80 `target_begin` and `query_begin` may be -1 depending on parameters used. 81 82 Developer note: `read_sequence` is an alias for `query_sequence` used by 83 ssw.c as is `reference_sequence` for `target_sequence` 84 """ 85 cdef s_align *p 86 cdef str read_sequence 87 cdef str reference_sequence 88 cdef int index_starts_at 89 cdef str _cigar_string 90 91 def __cinit__(self, read_sequence, reference_sequence, index_starts_at): 92 # We use `read_sequence` and `reference_sequence` here as they are 93 # treated sematically as a private output of ssw.c like the `s_align` 94 # struct 95 self.read_sequence = read_sequence 96 self.reference_sequence = reference_sequence 97 self.index_starts_at = index_starts_at 98 99 cdef __constructor(self, s_align* pointer): 100 self.p = pointer 101 102 def __dealloc__(self): 103 if self.p is not NULL: 104 align_destroy(self.p) 105 106 def __getitem__(self, key): 107 return getattr(self, key) 108 109 def __repr__(self): 110 data = ['optimal_alignment_score', 'suboptimal_alignment_score', 111 'query_begin', 'query_end', 'target_begin', 112 'target_end_optimal', 'target_end_suboptimal', 'cigar', 113 'query_sequence', 'target_sequence'] 114 return "{\n%s\n}" % ',\n'.join([ 115 " {!r}: {!r}".format(k, self[k]) for k in data]) 116 117 def __str__(self): 118 score = "Score: %d" % self.optimal_alignment_score 119 if self.query_sequence and self.cigar: 120 target = self.aligned_target_sequence 121 query = self.aligned_query_sequence 122 align_len = len(query) 123 if align_len > 13: 124 target = target[:10] + "..." 125 query = query[:10] + "..." 126 127 length = "Length: %d" % align_len 128 return "\n".join([query, target, score, length]) 129 return score 130 131 @property 132 def optimal_alignment_score(self): 133 """Optimal alignment score 134 135 Returns 136 ------- 137 int 138 The optimal alignment score 139 140 """ 141 return self.p.score1 142 143 @property 144 def suboptimal_alignment_score(self): 145 """Suboptimal alignment score 146 147 Returns 148 ------- 149 int 150 The suboptimal alignment score 151 152 """ 153 return self.p.score2 154 155 @property 156 def target_begin(self): 157 """Character index where the target's alignment begins 158 159 Returns 160 ------- 161 int 162 The character index of the target sequence's alignment's beginning 163 164 Notes 165 ----- 166 The result is a 0-based index by default 167 168 """ 169 return self.p.ref_begin1 + self.index_starts_at if (self.p.ref_begin1 170 >= 0) else -1 171 172 @property 173 def target_end_optimal(self): 174 """Character index where the target's optimal alignment ends 175 176 Returns 177 ------- 178 int 179 The character index of the target sequence's optimal alignment's 180 end 181 182 Notes 183 ----- 184 The result is a 0-based index by default 185 186 """ 187 return self.p.ref_end1 + self.index_starts_at 188 189 @property 190 def target_end_suboptimal(self): 191 """Character index where the target's suboptimal alignment ends 192 193 Returns 194 ------- 195 int 196 The character index of the target sequence's suboptimal alignment's 197 end 198 199 Notes 200 ----- 201 The result is a 0-based index by default 202 203 """ 204 return self.p.ref_end2 + self.index_starts_at 205 206 @property 207 def query_begin(self): 208 """Returns the character index at which the query sequence begins 209 210 Returns 211 ------- 212 int 213 The character index of the query sequence beginning 214 215 Notes 216 ----- 217 The result is a 0-based index by default 218 219 """ 220 return self.p.read_begin1 + self.index_starts_at if (self.p.read_begin1 221 >= 0) else -1 222 223 @property 224 def query_end(self): 225 """Character index at where query sequence ends 226 227 Returns 228 ------- 229 int 230 The character index of the query sequence ending 231 232 Notes 233 ----- 234 The result is a 0-based index by default 235 236 """ 237 return self.p.read_end1 + self.index_starts_at 238 239 @property 240 def cigar(self): 241 """Cigar formatted string for the optimal alignment 242 243 Returns 244 ------- 245 str 246 The cigar string of the optimal alignment 247 248 Notes 249 ----- 250 The cigar string format is described in [1]_ and [2]_. 251 252 If there is no cigar or optimal alignment, this will return an empty 253 string 254 255 References 256 ---------- 257 .. [1] http://genome.sph.umich.edu/wiki/SAM 258 .. [2] http://samtools.github.io/hts-specs/SAMv1.pdf 259 260 """ 261 # Memoization! (1/2) 262 if self._cigar_string is not None: 263 return self._cigar_string 264 cigar_list = [] 265 for i in range(self.p.cigarLen): 266 # stored the same as that in BAM format, 267 # high 28 bits: length, low 4 bits: M/I/D (0/1/2) 268 269 # Length, remove first 4 bits 270 cigar_list.append(str(self.p.cigar[i] >> 4)) 271 # M/I/D, lookup first 4 bits in the mid_table 272 cigar_list.append(mid_table[self.p.cigar[i] & 0xf]) 273 # Memoization! (2/2) 274 self._cigar_string = "".join(cigar_list) 275 return self._cigar_string 276 277 @property 278 def query_sequence(self): 279 """Query sequence 280 281 Returns 282 ------- 283 str 284 The query sequence 285 286 """ 287 return self.read_sequence 288 289 @property 290 def target_sequence(self): 291 """Target sequence 292 293 Returns 294 ------- 295 str 296 The target sequence 297 298 """ 299 return self.reference_sequence 300 301 @property 302 def aligned_query_sequence(self): 303 """Returns the query sequence aligned by the cigar 304 305 Returns 306 ------- 307 str 308 Aligned query sequence 309 310 Notes 311 ----- 312 This will return `None` if `suppress_sequences` was True when this 313 object was created 314 315 """ 316 if self.query_sequence: 317 return self._get_aligned_sequence(self.query_sequence, 318 self._tuples_from_cigar(), 319 self.query_begin, self.query_end, 320 "D") 321 return None 322 323 @property 324 def aligned_target_sequence(self): 325 """Returns the target sequence aligned by the cigar 326 327 Returns 328 ------- 329 str 330 Aligned target sequence 331 332 Notes 333 ----- 334 This will return `None` if `suppress_sequences` was True when this 335 object was created 336 337 """ 338 if self.target_sequence: 339 return self._get_aligned_sequence(self.target_sequence, 340 self._tuples_from_cigar(), 341 self.target_begin, 342 self.target_end_optimal, 343 "I") 344 return None 345 346 def set_zero_based(self, is_zero_based): 347 """Set the aligment indices to start at 0 if True else 1 if False 348 349 """ 350 if is_zero_based: 351 self.index_starts_at = 0 352 else: 353 self.index_starts_at = 1 354 355 def is_zero_based(self): 356 """Returns True if alignment inidices start at 0 else False 357 358 Returns 359 ------- 360 bool 361 Whether the alignment inidices start at 0 362 363 """ 364 return self.index_starts_at == 0 365 366 def _get_aligned_sequence(self, sequence, tuple_cigar, begin, end, 367 gap_type): 368 # Save the original index scheme and then set it to 0 (1/2) 369 orig_z_base = self.is_zero_based() 370 self.set_zero_based(True) 371 aligned_sequence = [] 372 seq = sequence[begin:end + 1] 373 index = 0 374 for length, mid in tuple_cigar: 375 if mid == gap_type: 376 aligned_sequence += ['-' * length] 377 else: 378 aligned_sequence += [seq[index:index + length]] 379 index += length 380 # Our sequence end is sometimes beyond the cigar: 381 aligned_sequence += [seq[index:end - begin + 1]] 382 # Revert our index scheme to the original (2/2) 383 self.set_zero_based(orig_z_base) 384 return "".join(aligned_sequence) 385 386 def _tuples_from_cigar(self): 387 tuples = [] 388 length_stack = [] 389 for character in self.cigar: 390 if character.isdigit(): 391 length_stack.append(character) 392 else: 393 tuples.append((int("".join(length_stack)), character)) 394 length_stack = [] 395 return tuples 396 397cdef class StripedSmithWaterman: 398 """Performs a striped (banded) Smith Waterman Alignment. 399 400 First a StripedSmithWaterman object must be instantiated with a query 401 sequence. The resulting object is then callable with a target sequence and 402 may be reused on a large collection of target sequences. 403 404 Parameters 405 ---------- 406 query_sequence : string 407 The query sequence, this may be upper or lowercase from the set of 408 {A, C, G, T, N} (nucleotide) or from the set of 409 {A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V, B, Z, X, * 410 } (protein) 411 gap_open_penalty : int, optional 412 The penalty applied to creating a gap in the alignment. This CANNOT 413 be 0. 414 Default is 5. 415 gap_extend_penalty : int, optional 416 The penalty applied to extending a gap in the alignment. This CANNOT 417 be 0. 418 Default is 2. 419 score_size : int, optional 420 If your estimated best alignment score is < 255 this should be 0. 421 If your estimated best alignment score is >= 255, this should be 1. 422 If you don't know, this should be 2. 423 Default is 2. 424 mask_length : int, optional 425 The distance between the optimal and suboptimal alignment ending 426 position >= mask_length. We suggest to use len(query_sequence)/2, if 427 you don't have special concerns. 428 Detailed description of mask_length: After locating the optimal 429 alignment ending position, the suboptimal alignment score can be 430 heuristically found by checking the second largest score in the array 431 that contains the maximal score of each column of the SW matrix. In 432 order to avoid picking the scores that belong to the alignments 433 sharing the partial best alignment, SSW C library masks the reference 434 loci nearby (mask length = mask_length) the best alignment ending 435 position and locates the second largest score from the unmasked 436 elements. 437 Default is 15. 438 mask_auto : bool, optional 439 This will automatically set the used mask length to be 440 max(int(len(`query_sequence`)/2), `mask_length`). 441 Default is True. 442 score_only : bool, optional 443 This will prevent the best alignment beginning positions (BABP) and the 444 cigar from being returned as a result. This overrides any setting on 445 `score_filter`, `distance_filter`, and `override_skip_babp`. It has the 446 highest precedence. 447 Default is False. 448 score_filter : int, optional 449 If set, this will prevent the cigar and best alignment beginning 450 positions (BABP) from being returned if the optimal alignment score is 451 less than `score_filter` saving some time computationally. This filter 452 may be overridden by `score_only` (prevents BABP and cigar, regardless 453 of other arguments), `distance_filter` (may prevent cigar, but will 454 cause BABP to be calculated), and `override_skip_babp` (will ensure 455 BABP) returned. 456 Default is None. 457 distance_filter : int, optional 458 If set, this will prevent the cigar from being returned if the length 459 of the `query_sequence` or the `target_sequence` is less than 460 `distance_filter` saving some time computationally. The results of 461 this filter may be overridden by `score_only` (prevents BABP and cigar, 462 regardless of other arguments), and `score_filter` (may prevent cigar). 463 `override_skip_babp` has no effect with this filter applied, as BABP 464 must be calculated to perform the filter. 465 Default is None. 466 override_skip_babp : bool, optional 467 When True, the best alignment beginning positions (BABP) will always be 468 returned unless `score_only` is set to True. 469 Default is False. 470 protein : bool, optional 471 When True, the `query_sequence` and `target_sequence` will be read as 472 protein sequence. When False, the `query_sequence` and 473 `target_sequence` will be read as nucleotide sequence. If True, a 474 `substitution_matrix` must be supplied. 475 Default is False. 476 match_score : int, optional 477 When using a nucleotide sequence, the match_score is the score added 478 when a match occurs. This is ignored if `substitution_matrix` is 479 provided. 480 Default is 2. 481 mismatch_score : int, optional 482 When using a nucleotide sequence, the mismatch is the score subtracted 483 when a mismatch occurs. This should be a negative integer. 484 This is ignored if `substitution_matrix` is provided. 485 Default is -3. 486 substitution_matrix : 2D dict, optional 487 Provides the score for each possible substitution of sequence 488 characters. This may be used for protein or nucleotide sequences. The 489 entire set of possible combinations for the relevant sequence type MUST 490 be enumerated in the dict of dicts. This will override `match_score` 491 and `mismatch_score`. Required when `protein` is True. 492 Default is None. 493 suppress_sequences : bool, optional 494 If True, the query and target sequences will not be returned for 495 convenience. 496 Default is False. 497 zero_index : bool, optional 498 If True, all inidices will start at 0. If False, all inidices will 499 start at 1. 500 Default is True. 501 502 Notes 503 ----- 504 This is a wrapper for the SSW package [1]_. 505 506 `mask_length` has to be >= 15, otherwise the suboptimal alignment 507 information will NOT be returned. 508 509 `match_score` is a positive integer and `mismatch_score` is a negative 510 integer. 511 512 `match_score` and `mismatch_score` are only meaningful in the context of 513 nucleotide sequences. 514 515 A substitution matrix must be provided when working with protein sequences. 516 517 References 518 ---------- 519 .. [1] Zhao, Mengyao, Wan-Ping Lee, Erik P. Garrison, & Gabor T. 520 Marth. "SSW Library: An SIMD Smith-Waterman C/C++ Library for 521 Applications". PLOS ONE (2013). Web. 11 July 2014. 522 http://www.plosone.org/article/info:doi/10.1371/journal.pone.0082138 523 524 """ 525 cdef s_profile *profile 526 cdef cnp.uint8_t gap_open_penalty 527 cdef cnp.uint8_t gap_extend_penalty 528 cdef cnp.uint8_t bit_flag 529 cdef cnp.uint16_t score_filter 530 cdef cnp.int32_t distance_filter 531 cdef cnp.int32_t mask_length 532 cdef str read_sequence 533 cdef int index_starts_at 534 cdef bool is_protein 535 cdef bool suppress_sequences 536 cdef cnp.ndarray __KEEP_IT_IN_SCOPE_read 537 cdef cnp.ndarray __KEEP_IT_IN_SCOPE_matrix 538 539 def __cinit__(self, query_sequence, 540 gap_open_penalty=5, # BLASTN Default 541 gap_extend_penalty=2, # BLASTN Default 542 score_size=2, # BLASTN Default 543 mask_length=15, # Minimum length for a suboptimal alignment 544 mask_auto=True, 545 score_only=False, 546 score_filter=None, 547 distance_filter=None, 548 override_skip_babp=False, 549 protein=False, 550 match_score=2, # BLASTN Default 551 mismatch_score=-3, # BLASTN Default 552 substitution_matrix=None, 553 suppress_sequences=False, 554 zero_index=True): 555 # initalize our values 556 self.read_sequence = query_sequence 557 if gap_open_penalty <= 0: 558 raise ValueError("`gap_open_penalty` must be > 0") 559 self.gap_open_penalty = gap_open_penalty 560 if gap_extend_penalty <= 0: 561 raise ValueError("`gap_extend_penalty` must be > 0") 562 self.gap_extend_penalty = gap_extend_penalty 563 self.distance_filter = 0 if distance_filter is None else \ 564 distance_filter 565 self.score_filter = 0 if score_filter is None else score_filter 566 self.suppress_sequences = suppress_sequences 567 self.is_protein = protein 568 self.bit_flag = self._get_bit_flag(override_skip_babp, score_only) 569 # http://www.cs.utexas.edu/users/EWD/transcriptions/EWD08xx/EWD831.html 570 # Dijkstra knows what's up: 571 self.index_starts_at = 0 if zero_index else 1 572 # set up our matrix 573 cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] matrix 574 if substitution_matrix is None: 575 if protein: 576 raise Exception("Must provide a substitution matrix for" 577 " protein sequences") 578 matrix = self._build_match_matrix(match_score, mismatch_score) 579 else: 580 matrix = self._convert_dict2d_to_matrix(substitution_matrix) 581 # Set up our mask_length 582 # Mask is recommended to be max(query_sequence/2, 15) 583 if mask_auto: 584 self.mask_length = len(query_sequence) / 2 585 if self.mask_length < mask_length: 586 self.mask_length = mask_length 587 else: 588 self.mask_length = mask_length 589 590 cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] read_seq 591 read_seq = self._seq_converter(query_sequence) 592 593 cdef cnp.int32_t read_length 594 read_length = len(query_sequence) 595 596 cdef cnp.int8_t s_size 597 s_size = score_size 598 599 cdef cnp.int32_t m_width 600 m_width = 24 if self.is_protein else 5 601 602 cdef s_profile* p 603 self.profile = ssw_init(<cnp.int8_t*> read_seq.data, 604 read_length, 605 <cnp.int8_t*> matrix.data, 606 m_width, 607 s_size) 608 609 # A hack to keep the python GC from eating our data 610 self.__KEEP_IT_IN_SCOPE_read = read_seq 611 self.__KEEP_IT_IN_SCOPE_matrix = matrix 612 613 def __call__(self, target_sequence): 614 """Align `target_sequence` to `query_sequence` 615 616 Parameters 617 ---------- 618 target_sequence : str 619 620 Returns 621 ------- 622 skbio.alignment.AlignmentStructure 623 The resulting alignment. 624 625 """ 626 reference_sequence = target_sequence 627 cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] reference 628 reference = self._seq_converter(reference_sequence) 629 630 cdef cnp.int32_t ref_length 631 ref_length = len(reference_sequence) 632 633 cdef s_align *align 634 align = ssw_align(self.profile, <cnp.int8_t*> reference.data, 635 ref_length, self.gap_open_penalty, 636 self.gap_extend_penalty, self.bit_flag, 637 self.score_filter, self.distance_filter, 638 self.mask_length) 639 640 # Cython won't let me do this correctly, so duplicate code ahoy: 641 if self.suppress_sequences: 642 alignment = AlignmentStructure("", "", self.index_starts_at) 643 else: 644 alignment = AlignmentStructure(self.read_sequence, 645 reference_sequence, 646 self.index_starts_at) 647 alignment.__constructor(align) # Hack to get a pointer through 648 return alignment 649 650 def __dealloc__(self): 651 if self.profile is not NULL: 652 init_destroy(self.profile) 653 654 def _get_bit_flag(self, override_skip_babp, score_only): 655 bit_flag = 0 656 if score_only: 657 return bit_flag 658 if override_skip_babp: 659 bit_flag = bit_flag | 0x8 660 if self.distance_filter != 0: 661 bit_flag = bit_flag | 0x4 662 if self.score_filter != 0: 663 bit_flag = bit_flag | 0x2 664 if bit_flag == 0 or bit_flag == 8: 665 bit_flag = bit_flag | 0x1 666 return bit_flag 667 668 cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] _seq_converter( 669 self, 670 sequence): 671 cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] seq 672 seq = np.empty(len(sequence), dtype=np.int8) 673 if self.is_protein: 674 for i, char in enumerate(sequence): 675 seq[i] = np_aa_table[ord(char)] 676 else: 677 for i, char in enumerate(sequence): 678 seq[i] = np_nt_table[ord(char)] 679 return seq 680 681 cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] \ 682 _build_match_matrix(self, match_score, mismatch_score): 683 sequence_order = "ACGTN" 684 dict2d = {} 685 for row in sequence_order: 686 dict2d[row] = {} 687 for column in sequence_order: 688 if column == 'N' or row == 'N': 689 dict2d[row][column] = 0 690 else: 691 dict2d[row][column] = match_score if row == column \ 692 else mismatch_score 693 return self._convert_dict2d_to_matrix(dict2d) 694 695 cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] \ 696 _convert_dict2d_to_matrix(self, dict2d): 697 if self.is_protein: 698 sequence_order = "ARNDCQEGHILKMFPSTWYVBZX*" 699 else: 700 sequence_order = "ACGTN" 701 cdef int i = 0 702 length = len(sequence_order) 703 cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] py_list_matrix = \ 704 np.empty(length*length, dtype=np.int8) 705 for row in sequence_order: 706 for column in sequence_order: 707 py_list_matrix[i] = dict2d[row][column] 708 i += 1 709 return py_list_matrix 710