# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

# Special thanks to http://www.faculty.ucr.edu/~mmaduro/random.htm for the
# random DNA generator.

# These tests confirm that StripedSmithWaterman returns the same results as
# SSW. We don't test for correctness of those results (i.e., we assume that
# ssw.c and ssw.h are correct) as that testing is beyond the scope of skbio.
# Furthermore, all expected results are created by running
# StripedSmithWaterman; the resulting alignments are verified by hand.
# Creating tests from the base C API is impractical at this time.

from unittest import TestCase, main

from skbio import (local_pairwise_align_ssw, Sequence, DNA, RNA, Protein,
                   TabularMSA)
from skbio.alignment import StripedSmithWaterman, AlignmentStructure
from skbio.alignment._pairwise import blosum50


class TestSSW(TestCase):
    """Shared helpers for the StripedSmithWaterman test cases below."""

    # Attributes compared by ``_check_alignment`` and inspected by
    # ``_check_bit_flag_sets_properties_falsy_or_negative``.
    align_attributes = [
        "optimal_alignment_score", "suboptimal_alignment_score",
        "target_begin", "target_end_optimal", "target_end_suboptimal",
        "query_begin", "query_end", "cigar", "query_sequence",
        "target_sequence"
    ]

    def _check_alignment(self, alignment, expected):
        """Assert every entry of ``align_attributes`` matches ``expected``.

        Parameters
        ----------
        alignment : object supporting ``alignment[name]`` lookup
            The alignment produced by the code under test.
        expected : dict
            Maps each name in ``align_attributes`` to its expected value.
        """
        for attribute in self.align_attributes:
            # The first element of this tuple is to identify
            # the broken sequence if one should fail
            self.assertEqual((expected['target_sequence'],
                              expected[attribute]),
                             (alignment['target_sequence'],
                              alignment[attribute]))

    def _check_argument_with_inequality_on_optimal_align_score(
            self,
            query_sequences=None,
            target_sequences=None,
            arg=None,
            default=None,
            i_range=None,
            compare_lt=None,
            compare_gt=None):
        """Compare scores obtained with ``arg=i`` against ``arg=default``.

        For every query/target pair and every ``i`` in ``i_range`` the
        optimal alignment score obtained with ``arg=i`` is compared with the
        score obtained with ``arg=default``.

        Parameters
        ----------
        query_sequences, target_sequences : iterable of str
            Sequences to align pairwise (every query against every target).
        arg : str
            Name of the ``StripedSmithWaterman`` keyword argument to vary.
        default : int
            Reference value of ``arg``.
        i_range : iterable of int
            Values of ``arg`` to try. It is iterated once per query/target
            pair, so it must be re-iterable (e.g. a ``range``).
        compare_lt, compare_gt : callable
            Assertions called as ``f(score_with_i, score_with_default)``
            when ``i < default`` and ``i > default`` respectively. When
            ``i == default`` the two scores must be equal.
        """
        for query_sequence in query_sequences:
            # The reference aligner does not depend on ``i``; build it once
            # per query instead of once per inner iteration.
            default_query = StripedSmithWaterman(query_sequence,
                                                 **{arg: default})
            for target_sequence in target_sequences:
                default_score = default_query(
                    target_sequence).optimal_alignment_score
                for i in i_range:
                    query = StripedSmithWaterman(query_sequence, **{arg: i})
                    score = query(target_sequence).optimal_alignment_score

                    if i == default:
                        self.assertEqual(score, default_score)
                    elif i < default:
                        compare_lt(score, default_score)
                    else:
                        compare_gt(score, default_score)

    def _check_bit_flag_sets_properties_falsy_or_negative(
            self,
            query_sequences=None,
            target_sequences=None,
            arg_settings=(),
            properties_to_null=()):
        """Assert that the given settings null out exactly some properties.

        Parameters
        ----------
        query_sequences, target_sequences : iterable of str
            Sequences to align pairwise (every query against every target).
        arg_settings : iterable of (str, object)
            Keyword arguments passed together to ``StripedSmithWaterman``.
        properties_to_null : iterable of str
            Names from ``align_attributes`` that must be falsy or negative
            in the resulting alignment. Every other name in
            ``align_attributes`` must be neither.
        """
        # Immutable defaults above avoid the shared-mutable-default pitfall;
        # everything below is invariant across the loops, so build it once.
        kwargs = dict(arg_settings)
        nulled = list(properties_to_null)
        kept = [p for p in self.align_attributes if p not in nulled]

        def falsy_or_negative(alignment, prop):
            # Exact ``int`` values are tested for negativity; any other
            # value is tested for falsiness.
            value = alignment[prop]
            if type(value) is int:
                return value < 0
            return not value

        for query_sequence in query_sequences:
            query = StripedSmithWaterman(query_sequence, **kwargs)
            for target_sequence in target_sequences:
                alignment = query(target_sequence)
                for prop in nulled:
                    self.assertTrue(falsy_or_negative(alignment, prop))
                # Every property not in our null list
                for prop in kept:
                    self.assertFalse(falsy_or_negative(alignment, prop))


class TestStripedSmithWaterman(TestSSW):
    """Regression tests for the ``StripedSmithWaterman`` query object."""

    def test_object_is_reusable(self):
        q_seq = "AGGGTAATTAGGCGTGTTCACCTA"
        expected_alignments = [
            {
                'optimal_alignment_score': 10,
                'suboptimal_alignment_score': 10,
                'query_begin': 4,
                'query_end': 8,
                'target_begin': 3,
                'target_end_optimal': 7,
                'target_end_suboptimal': 34,
                'cigar': '5M',
                'query_sequence': q_seq,
                'target_sequence': ('TTATAATTTTCTTATTATTATCAATATTTATAATTTGATTT'
                                    'TGTTGTAAT')
            },
            {
                'optimal_alignment_score': 36,
                'suboptimal_alignment_score': 16,
                'query_begin': 0,
                'query_end': 23,
                'target_begin': 6,
                'target_end_optimal': 29,
                'target_end_suboptimal': 13,
                'cigar': '8M1D8M1I7M',
                'query_sequence': q_seq,
                'target_sequence': 'AGTCGAAGGGTAATATAGGCGTGTCACCTA'
            },
            {
                'optimal_alignment_score': 16,
                'suboptimal_alignment_score': 0,
                'query_begin': 0,
                'query_end': 7,
                'target_begin': 6,
                'target_end_optimal': 13,
                'target_end_suboptimal': 0,
                'cigar': '8M',
                'query_sequence': q_seq,
                'target_sequence': 'AGTCGAAGGGTAATA'
            },
            {
                'optimal_alignment_score': 8,
                'suboptimal_alignment_score': 8,
                'query_begin': 0,
                'query_end': 3,
                'target_begin': 7,
                'target_end_optimal': 10,
                'target_end_suboptimal': 42,
                'cigar': '4M',
                'query_sequence': q_seq,
                'target_sequence': ('CTGCCTCAGGGGGAGGAAAGCGTCAGCGCGGCTGCCGTCGG'
                                    'CGCAGGGGC')
            },
            {
                'optimal_alignment_score': 48,
                'suboptimal_alignment_score': 16,
                'query_begin': 0,
                'query_end': 23,
                'target_begin': 0,
                'target_end_optimal': 23,
                'target_end_suboptimal': 7,
                'cigar': '24M',
                'query_sequence': q_seq,
                'target_sequence': q_seq
            }
        ]
        query = StripedSmithWaterman(q_seq)
        # Run every alignment before checking any of them, so a later call
        # that altered an earlier result would be caught.
        results = [query(expected['target_sequence'])
                   for expected in expected_alignments]

        for result, expected in zip(results, expected_alignments):
            self._check_alignment(result, expected)

    def test_regression_on_instantiation_arguments(self):
        expected = {
            'optimal_alignment_score': 23,
            'suboptimal_alignment_score': 10,
            'query_begin': 0,
            'query_end': 16,
            'target_begin': 0,
            'target_end_optimal': 20,
            'target_end_suboptimal': 4,
            'cigar': '6M4D11M',
            'query_sequence': 'AAACGATAAATCCGCGTA',
            'target_sequence': 'AAACGACTACTAAATCCGCGTGATAGGGGA'
        }
        query = StripedSmithWaterman(expected['query_sequence'],
                                     gap_open_penalty=5,
                                     gap_extend_penalty=2,
                                     score_size=2,
                                     mask_length=15,
                                     mask_auto=True,
                                     score_only=False,
                                     score_filter=None,
                                     distance_filter=None,
                                     override_skip_babp=False,
                                     protein=False,
                                     match_score=2,
                                     mismatch_score=-3,
                                     substitution_matrix=None,
                                     suppress_sequences=False,
                                     zero_index=True)
        alignment = query(expected['target_sequence'])
        self._check_alignment(alignment, expected)

    def test_protein_sequence_is_usable(self):
        expected = {
            'optimal_alignment_score': 316,
            'suboptimal_alignment_score': 95,
            'query_begin': 0,
            'query_end': 52,
            'target_begin': 0,
            'target_end_optimal': 52,
            'target_end_suboptimal': 18,
            'cigar': '15M1D15M1I22M',
            'query_sequence': ('VHLTGEEKSAVAALWGKVNVDEVGGEALGRXLLVVYPWTQRFFESF'
                               'SDLSTPDABVMSNPKVKAHGK'),
            'target_sequence': ('VHLTPEEKSAVTALWBGKVNVDEVGGEALGRLLVVYPWTQRFFES'
                                'FGDLSTPD*')
        }
        query = StripedSmithWaterman(expected['query_sequence'],
                                     protein=True,
                                     substitution_matrix=blosum50)
        alignment = query(expected['target_sequence'])
        self._check_alignment(alignment, expected)

    def test_lowercase_is_valid_sequence(self):
        expected = {
            'optimal_alignment_score': 23,
            'suboptimal_alignment_score': 10,
            'query_begin': 0,
            'query_end': 16,
            'target_begin': 0,
            'target_end_optimal': 20,
            'target_end_suboptimal': 4,
            'cigar': '6M4D11M',
            'query_sequence': 'aaacgataaatccgcgta',
            'target_sequence': 'aaacgactactaaatccgcgtgatagggga'
        }
        query = StripedSmithWaterman(expected['query_sequence'])
        alignment = query(expected['target_sequence'])
        self._check_alignment(alignment, expected)

    def test_align_with_N_in_nucleotide_sequence(self):
        expected = {
            'optimal_alignment_score': 9,
            'suboptimal_alignment_score': 0,
            'query_begin': 0,
            'query_end': 8,
            'target_begin': 0,
            'target_end_optimal': 9,
            'target_end_suboptimal': 0,
            'cigar': '4M1D5M',
            'query_sequence': 'ACTCANNATCGANCTAGC',
            'target_sequence': 'ACTCGAAAATGTNNGCA'
        }
        query = StripedSmithWaterman(expected['query_sequence'])
        alignment = query(expected['target_sequence'])
        self._check_alignment(alignment, expected)

    def test_arg_match_score(self):
        query_sequences = [
            "TTTTTTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTCAATATAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "CTGCCTCAAGGGGGAGGAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
            "AGGGTAATTTTAGGCGTGTTCACCTA"
        ]
        target_sequences = query_sequences
        self._check_argument_with_inequality_on_optimal_align_score(
            query_sequences=query_sequences,
            target_sequences=target_sequences,
            arg='match_score',
            default=2,
            i_range=range(0, 5),
            compare_lt=self.assertLess,
            compare_gt=self.assertGreater
        )
        # The above is a strict bound, so we don't need an expected
        # alignment

    def test_arg_mismatch_score(self):
        query_sequences = [
            "TTATAATTAATTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAAGGGGTATAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "CTGCCTCAGGGGCGAGGAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
            "AGGGTAATTAGCGCGTGTTCACCTA"
        ]
        target_sequences = query_sequences
        self._check_argument_with_inequality_on_optimal_align_score(
            query_sequences=query_sequences,
            target_sequences=target_sequences,
            arg='mismatch_score',
            default=-3,
            i_range=range(-6, 1),
            # Non-strict bounds: a lower mismatch score must not raise the
            # optimal score, and a higher one must not lower it.
            compare_lt=self.assertLessEqual,
            compare_gt=self.assertGreaterEqual
        )
        # The above is not a strict bound, so let's use an expected
        # alignment to plug the hole where every alignment is exactly equal
        # to the default
        expected = {
            'optimal_alignment_score': 8,
            'suboptimal_alignment_score': 0,
            'query_begin': 5,
            'query_end': 8,
            'target_begin': 10,
            'target_end_optimal': 13,
            'target_end_suboptimal': 0,
            'cigar': '4M',
            'query_sequence': 'AGAGGGTAATCAGCCGTGTCCACCGGAACACAACGCTATCGGGCGA',
            'target_sequence': 'GTTCGCCCCAGTAAAGTTGCTACCAAATCCGCATG'
        }
        query = StripedSmithWaterman(expected['query_sequence'],
                                     mismatch_score=-8)
        alignment = query(expected['target_sequence'])
        self._check_alignment(alignment, expected)

    def test_arg_matrix_overrides_match_and_mismatch(self):
        query_sequences = [
            "TTATAATTAATTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAAGGGGTATAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "CTGCCTCAGGGGCGAGGAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
            "AGGGTAATTAGCGCGTGTTCACCTA"
        ]
        target_sequences = query_sequences
        matrix = {  # This is a biologically meaningless matrix
            "A": {"A": 4,  "T": -1, "C": -2, "G": -3, "N": 4},
            "T": {"A": -1, "T": 1,  "C": -1, "G": -4, "N": 1},
            "C": {"A": -2, "T": -1, "C": 10, "G": 1,  "N": 1},
            "G": {"A": -3, "T": -4, "C": 1,  "G": 3,  "N": 1},
            "N": {"A": 4,  "T": 1,  "C": 1,  "G": 1,  "N": 0}
        }
        for query_sequence in query_sequences:
            for target_sequence in target_sequences:
                query1 = StripedSmithWaterman(query_sequence)
                align1 = query1(target_sequence)

                query2 = StripedSmithWaterman(query_sequence,
                                              substitution_matrix=matrix)
                align2 = query2(target_sequence)

                self.assertNotEqual(align1.optimal_alignment_score,
                                    align2.optimal_alignment_score)

    def test_arg_gap_open_penalty(self):
        query_sequences = [
            "TTATAATTTTCTTAGTTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCCGAAGGGTAATATAGGCGTGTCACCTA",
            "AGTCGAAGGCGGTAATA",
            "CTGCCTCGGCAGGGGGAGGAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
            "AGGGTAATTAAAGGCGTGTTCACCTA"
        ]
        target_sequences = query_sequences
        self._check_argument_with_inequality_on_optimal_align_score(
            query_sequences=query_sequences,
            target_sequences=target_sequences,
            arg='gap_open_penalty',
            default=5,
            i_range=range(1, 12),
            # These are intentionally inverted
            compare_lt=self.assertGreaterEqual,
            compare_gt=self.assertLessEqual
        )
        # The above is not a strict bound, so let's use an expected
        # alignment to plug the hole where every alignment is exactly equal
        # to the default
        expected = {
            'optimal_alignment_score': 51,
            'suboptimal_alignment_score': 20,
            'query_begin': 0,
            'query_end': 37,
            'target_begin': 0,
            'target_end_optimal': 29,
            'target_end_suboptimal': 9,
            'cigar': '5M4I3M3I1M1I21M',
            'query_sequence': 'TAGAGATTAATTGCCACATTGCCACTGCCAAAATTCTG',
            'target_sequence': 'TAGAGATTAATTGCCACTGCCAAAATTCTG'
        }
        query = StripedSmithWaterman(expected['query_sequence'],
                                     gap_open_penalty=1)
        alignment = query(expected['target_sequence'])
        self._check_alignment(alignment, expected)

    def test_arg_gap_extend_penalty(self):
        query_sequences = [
            "TTATAATTTTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAATACTAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "CTGCCTCAGGGGGAGGCAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
            "AGGGTAATTAGGCGTGTTCACCTA"
        ]
        target_sequences = query_sequences
        self._check_argument_with_inequality_on_optimal_align_score(
            query_sequences=query_sequences,
            target_sequences=target_sequences,
            arg='gap_extend_penalty',
            default=2,
            i_range=range(1, 10),
            # These are intentionally inverted
            compare_lt=self.assertGreaterEqual,
            compare_gt=self.assertLessEqual
        )
        # The above is not a strict bound, so let's use an expected
        # alignment to plug the hole where every alignment is exactly equal
        # to the default
        expected = {
            'optimal_alignment_score': 9,
            'suboptimal_alignment_score': 8,
            'query_begin': 6,
            'query_end': 12,
            'target_begin': 7,
            'target_end_optimal': 13,
            'target_end_suboptimal': 38,
            'cigar': '7M',
            'query_sequence': 'TCTATAAGATTCCGCATGCGTTACTTATAAGATGTCTCAACGG',
            'target_sequence':
                'GCCCAGTAGCTTCCCAATATGAGAGCATCAATTGTAGATCGGGCC'
        }
        query = StripedSmithWaterman(expected['query_sequence'],
                                     gap_extend_penalty=10)
        alignment = query(expected['target_sequence'])
        self._check_alignment(alignment, expected)

    def test_arg_score_only(self):
        query_sequences = [
            "TTATCGTGATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAATACTATAAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "AGGGTAATTAGGCGTGCGTGCGTGTTCACCTA",
            "AGGGTATTAGGCGTGTTCACCTA"
        ]
        target_sequences = query_sequences
        self._check_bit_flag_sets_properties_falsy_or_negative(
            query_sequences=query_sequences,
            target_sequences=target_sequences,
            arg_settings=[('score_only', True)],
            properties_to_null=['query_begin', 'target_begin', 'cigar']
        )

    def test_arg_score_filter_is_used(self):
        query_sequences = [
            "TTATCGTGATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAATACTATAAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "AGGGTAATTAGGCGTGCGTGCGTGTTCACCTA",
            "AGGGTATTAGGCGTGTTCACCTA"
        ]
        target_sequences = query_sequences
        self._check_bit_flag_sets_properties_falsy_or_negative(
            query_sequences=query_sequences,
            target_sequences=target_sequences,
            # score_filter will force a BABP and cigar to be falsy
            arg_settings=[('score_filter', 9001)],
            properties_to_null=['query_begin', 'target_begin', 'cigar']
        )

    def test_arg_distance_filter_is_used(self):
        query_sequences = [
            "TTATCGTGATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAATACTATAAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "AGGGTAATTAGGCGTGCGTGCGTGTTCACCTA",
            "AGGGTATTAGGCGTGTTCACCTA"
        ]
        target_sequences = query_sequences
        self._check_bit_flag_sets_properties_falsy_or_negative(
            query_sequences=query_sequences,
            target_sequences=target_sequences,
            # distance_filter will force cigar to be falsy only
            arg_settings=[('distance_filter', 1)],
            properties_to_null=['cigar']
        )

    def test_arg_override_skip_babp(self):
        query_sequences = [
            "TTATCGTGATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAATACTATAAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "AGGGTAATTAGGCGTGCGTGCGTGTTCACCTA",
            "AGGGTATTAGGCGTGTTCACCTA"
        ]
        target_sequences = query_sequences
        self._check_bit_flag_sets_properties_falsy_or_negative(
            query_sequences=query_sequences,
            target_sequences=target_sequences,
            # score_filter will force a BABP and cigar to be falsy if not for
            # override_skip_babp preventing this for all but the cigar
            arg_settings=[('override_skip_babp', True),
                          ('score_filter', 9001)],
            properties_to_null=['cigar']
        )

    def test_arg_zero_index_changes_base_of_index_to_0_or_1(self):
        expected_alignments = [
            ({
                'optimal_alignment_score': 100,
                'suboptimal_alignment_score': 44,
                'query_begin': 5,
                'query_end': 54,
                'target_begin': 0,
                'target_end_optimal': 49,
                'target_end_suboptimal': 21,
                'cigar': '50M',
                'query_sequence': ('AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCG'
                                   'CCCCGGGCGGGGC'),
                'target_sequence': ('CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCC'
                                    'GGGCGGGGC')
            }, True),
            ({
                'optimal_alignment_score': 100,
                'suboptimal_alignment_score': 44,
                'query_begin': 6,
                'query_end': 55,
                'target_begin': 1,
                'target_end_optimal': 50,
                'target_end_suboptimal': 22,
                'cigar': '50M',
                'query_sequence': ('AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCG'
                                   'CCCCGGGCGGGGC'),
                'target_sequence': ('CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCC'
                                    'GGGCGGGGC')
            }, False)
        ]
        for expected, z in expected_alignments:
            query = StripedSmithWaterman(expected['query_sequence'],
                                         zero_index=z)
            alignment = query(expected['target_sequence'])
            self._check_alignment(alignment, expected)

    def test_arg_suppress_sequences(self):
        expected = {
            'optimal_alignment_score': 100,
            'suboptimal_alignment_score': 44,
            'query_begin': 5,
            'query_end': 54,
            'target_begin': 0,
            'target_end_optimal': 49,
            'target_end_suboptimal': 21,
            'cigar': '50M',
            'query_sequence': '',
            'target_sequence': ''
        }
        query = StripedSmithWaterman(
            "AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCCGGGCGGGGC",
            suppress_sequences=True)
        alignment = query("CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCCGGGCGGGGC")
        self._check_alignment(alignment, expected)


class TestAlignStripedSmithWaterman(TestSSW):
    """Tests for ``local_pairwise_align_ssw``."""

    def _check_TabularMSA_to_AlignmentStructure(self, alignment, structure,
                                                expected_dtype):
        """Assert an ``(msa, score, start_end)`` result matches ``structure``.

        Parameters
        ----------
        alignment : tuple
            ``(msa, score, start_end)`` as unpacked below.
        structure : AlignmentStructure
            Result of calling a ``StripedSmithWaterman`` object directly.
        expected_dtype : type
            Sequence class used to wrap the aligned query and target
            sequences of ``structure`` before comparing them to ``msa``.
        """
        msa, score, start_end = alignment

        self.assertEqual(score, structure.optimal_alignment_score)
        self.assertEqual(
            msa,
            TabularMSA([expected_dtype(structure.aligned_query_sequence),
                        expected_dtype(structure.aligned_target_sequence)]))
        if structure.query_begin == -1:
            self.assertIsNone(start_end)
        else:
            expected_positions = [
                (structure.query_begin, structure.query_end),
                (structure.target_begin, structure.target_end_optimal)
            ]
            for (start, end), (expected_start, expected_end) in \
                    zip(start_end, expected_positions):
                self.assertEqual(start, expected_start)
                self.assertEqual(end, expected_end)

    def test_same_as_using_StripedSmithWaterman_object_DNA(self):
        query_sequence = 'ATGGAAGCTATAAGCGCGGGTGAG'
        target_sequence = 'AACTTATATAATAAAAATTATATATTCGTTGGGTTCTTTTGATATAAATC'
        query = StripedSmithWaterman(query_sequence)
        align1 = query(target_sequence)
        align2 = local_pairwise_align_ssw(DNA(query_sequence),
                                          DNA(target_sequence))
        self._check_TabularMSA_to_AlignmentStructure(align2, align1, DNA)

    def test_same_as_using_StripedSmithWaterman_object_Protein(self):
        query_sequence = 'HEAGAWGHEE'
        target_sequence = 'PAWHEAE'
        query = StripedSmithWaterman(query_sequence,
                                     protein=True,
                                     substitution_matrix=blosum50)
        align1 = query(target_sequence)
        align2 = local_pairwise_align_ssw(Protein(query_sequence),
                                          Protein(target_sequence),
                                          substitution_matrix=blosum50)
        self._check_TabularMSA_to_AlignmentStructure(align2, align1, Protein)

    def test_kwargs_are_usable(self):
        kwargs = {}
        kwargs['mismatch_score'] = -2
        kwargs['match_score'] = 5
        query_sequence = 'AGGGTAATTAGGCGTGTTCACCTA'
        target_sequence = 'TACTTATAAGATGTCTCAACGGCATGCGCAACTTGTGAAGTG'
        query = StripedSmithWaterman(query_sequence, **kwargs)
        align1 = query(target_sequence)
        align2 = local_pairwise_align_ssw(DNA(query_sequence),
                                          DNA(target_sequence), **kwargs)
        self._check_TabularMSA_to_AlignmentStructure(align2, align1, DNA)

    def test_invalid_type(self):
        with self.assertRaisesRegex(TypeError, r"not type 'Sequence'"):
            local_pairwise_align_ssw(DNA('ACGT'), Sequence('ACGT'))

        with self.assertRaisesRegex(TypeError, r"not type 'str'"):
            local_pairwise_align_ssw('ACGU', RNA('ACGU'))

    def test_type_mismatch(self):
        with self.assertRaisesRegex(TypeError, r"same type: 'DNA' != 'RNA'"):
            local_pairwise_align_ssw(DNA('ACGT'), RNA('ACGU'))


class TestAlignmentStructure(TestSSW):
    """Tests for ``AlignmentStructure``."""

    def mock_object_factory(self, dictionary):
        """Return an ``AlignmentStructure`` subclass instance for testing.

        The subclass replaces ``__init__`` so that it only copies every
        key/value pair of ``dictionary`` onto the instance as attributes.
        """
        class MockAlignmentStructure(AlignmentStructure):
            def __init__(self, _a, _b, _c):
                for key, value in dictionary.items():
                    setattr(self, key, value)
        return MockAlignmentStructure(None, None, 0)

    def test_works_for_dot_and_square_bracket_access(self):
        q_seq = "AGGGTAATTAGGCGTGTTCACCTA"
        query = StripedSmithWaterman(q_seq)
        alignment = query("TACTTATAAGATGTCTCAACGGCATGCGCAACTTGTGAAGTG")
        for accessible in self.align_attributes:
            self.assertEqual(getattr(alignment, accessible),
                             alignment[accessible])

    def test_is_zero_based_returns_true_if_index_base_is_zero(self):
        query_sequence = ('AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCG'
                          'CCCCGGGCGGGGC')
        target_sequence = ('CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCC'
                           'GGGCGGGGC')
        for z in (True, False):
            query = StripedSmithWaterman(query_sequence, zero_index=z)
            alignment = query(target_sequence)
            self.assertEqual(z, alignment.is_zero_based())

    def test_set_zero_based_changes_the_index_base(self):
        query_sequence = ('AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCG'
                          'CCCCGGGCGGGGC')
        target_sequence = ('CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCC'
                           'GGGCGGGGC')
        for z in (True, False):
            query = StripedSmithWaterman(query_sequence, zero_index=z)
            alignment = query(target_sequence)
            alignment.set_zero_based(not z)
            self.assertEqual(not z, alignment.is_zero_based())

    def test__get_aligned_sequences(self):
        generic_sequence = "123456789abcdefghijklmnopqrstuvwxyz"
        tests = [  # `end_after_cigar` is how far end extends beyond the cigar.
                   # Negative values on this should not be possible with SSW
            {
                'cigar_tuples': [
                    (4, 'M'), (3, 'I'), (1, 'D'), (15, 'M')
                ],
                'begin': 4,
                'end_after_cigar': 2,
                'gap_type': 'I',
                'expected': "5678---9abcdefghijklmnopq"
            },
            {
                'cigar_tuples': [
                    (12, 'M')
                ],
                'begin': 10,
                'end_after_cigar': 0,
                'gap_type': 'D',
                'expected': "bcdefghijklm"
            },
            {
                'cigar_tuples': [
                    (10, 'D'), (1, 'M'), (3, 'I'), (2, 'M')
                ],
                'begin': 0,
                'end_after_cigar': 5,
                'gap_type': 'I',
                'expected': "123456789ab---cdefghi"
            },
            {
                'cigar_tuples': [
                    (10, 'D'), (1, 'M'), (3, 'I'), (2, 'M')
                ],
                'begin': 3,
                'end_after_cigar': 0,
                'gap_type': 'D',
                'expected': "----------456789"
            },
            {
                'cigar_tuples': [
                    (1, 'I'), (4, 'M'), (3, 'I'), (1, 'D'), (8, 'M'), (8, 'D'),
                    (2, 'I'), (6, 'M'), (1, 'I')
                ],
                'begin': 4,
                'end_after_cigar': 3,
                'gap_type': 'I',
                'expected': "-5678---9abcdefghijklmnop--qrstuv-wxy"
            }
        ]
        for test in tests:
            mock_object = self.mock_object_factory({})
            # Because SSW's output is [a, b] and Python's list ranges use
            # [a, b) a 1 is added in the calculation of aligned sequences.
            # We just have to subtract 1 while we are testing with the easy to
            # verify interface of `end_after_cigar` to cancel this range effect
            # out.
            end = test['end_after_cigar'] - 1 + test['begin'] + \
                sum(le if t != test['gap_type'] else 0
                    for le, t in test['cigar_tuples'])
            self.assertEqual(test['expected'],
                             AlignmentStructure._get_aligned_sequence(
                                 mock_object, generic_sequence,
                                 test['cigar_tuples'], test['begin'],
                                 end, test['gap_type']))

    def test_aligned_query_target_sequence(self):
        query = StripedSmithWaterman("AGGGTAATTAGGCGTGTTCACCTA")
        alignment = query("AGTCGAAGGGTAATATAGGCGTGTCACCTA")
        self.assertEqual("AGGGTAATATAGGCGTG-TCACCTA",
                         alignment.aligned_target_sequence)
        self.assertEqual("AGGGTAAT-TAGGCGTGTTCACCTA",
                         alignment.aligned_query_sequence)

    def test_aligned_query_target_sequence_with_suppressed_sequences(self):
        query = StripedSmithWaterman("AGGGTAATTAGGCGTGTTCACCTA",
                                     suppress_sequences=True)
        alignment = query("AGTCGAAGGGTAATATAGGCGTGTCACCTA")
        self.assertEqual(None, alignment.aligned_target_sequence)
        self.assertEqual(None, alignment.aligned_query_sequence)


if __name__ == '__main__':
    main()