1# ----------------------------------------------------------------------------
2# Copyright (c) 2013--, scikit-bio development team.
3#
4# Distributed under the terms of the Modified BSD License.
5#
6# The full license is in the file COPYING.txt, distributed with this software.
7# ----------------------------------------------------------------------------
8
9# Special thanks to http://www.faculty.ucr.edu/~mmaduro/random.htm for the
10# random DNA generator.
11
12# These tests confirm that StripedSmithWaterman returns the same results as
13# SSW. We don't test for correctness of those results (i.e., we assume that
14# ssw.c and ssw.h are correct) as that testing is beyond the scope of skbio.
# Furthermore, all expected results were created by running
# StripedSmithWaterman; the resulting alignments were then verified by hand.
# Creating tests from the base C API is impractical at this time.
18
19from unittest import TestCase, main
20
21from skbio import (local_pairwise_align_ssw, Sequence, DNA, RNA, Protein,
22                   TabularMSA)
23from skbio.alignment import StripedSmithWaterman, AlignmentStructure
24from skbio.alignment._pairwise import blosum50
25
26
class TestSSW(TestCase):
    """Base class with assertion helpers shared by the SSW test cases.

    It defines no tests of its own; subclasses call the ``_check_*`` helpers.
    """

    # Names read from every alignment result when comparing or probing it.
    align_attributes = [
        "optimal_alignment_score", "suboptimal_alignment_score",
        "target_begin", "target_end_optimal", "target_end_suboptimal",
        "query_begin", "query_end", "cigar", "query_sequence",
        "target_sequence"
    ]

    def _check_alignment(self, alignment, expected):
        """Assert ``alignment`` equals ``expected`` on every attribute.

        Both arguments are indexed with ``[]`` using the names listed in
        ``align_attributes``.
        """
        for attribute in self.align_attributes:
            # The first element of each tuple is the target sequence, so a
            # failure message identifies which alignment is broken.
            self.assertEqual((expected['target_sequence'],
                              expected[attribute]),
                             (alignment['target_sequence'],
                              alignment[attribute]))

    def _check_argument_with_inequality_on_optimal_align_score(
            self,
            query_sequences=None,
            target_sequences=None,
            arg=None,
            default=None,
            i_range=None,
            compare_lt=None,
            compare_gt=None):
        """Compare optimal scores for ``arg=i`` against ``arg=default``.

        For every query/target pair and every ``i`` in ``i_range`` an aligner
        built with ``arg=i`` is compared with one built with ``arg=default``:

        * ``i == default``: the two optimal scores must be equal;
        * ``i < default``: ``compare_lt(score_i, score_default)`` is called;
        * ``i > default``: ``compare_gt(score_i, score_default)`` is called.

        ``compare_lt`` and ``compare_gt`` are expected to be assertion
        callables such as ``self.assertLess``.
        """
        default_kwarg = {arg: default}
        for query_sequence in query_sequences:
            # The reference aligner depends only on the query, so it is built
            # once per query rather than once per value of ``i``.
            reference_query = StripedSmithWaterman(query_sequence,
                                                   **default_kwarg)
            for target_sequence in target_sequences:
                reference_score = reference_query(
                    target_sequence).optimal_alignment_score
                for i in i_range:
                    varied_query = StripedSmithWaterman(query_sequence,
                                                        **{arg: i})
                    varied_score = varied_query(
                        target_sequence).optimal_alignment_score

                    if i == default:
                        self.assertEqual(varied_score, reference_score)
                    elif i < default:
                        compare_lt(varied_score, reference_score)
                    elif i > default:
                        compare_gt(varied_score, reference_score)

    def _check_bit_flag_sets_properties_falsy_or_negative(
            self,
            query_sequences=None,
            target_sequences=None,
            arg_settings=(),
            properties_to_null=()):
        """Assert which alignment properties an argument setting nulls out.

        ``arg_settings`` is an iterable of ``(argument_name, value)`` pairs
        passed as keyword arguments to ``StripedSmithWaterman``. For every
        query/target pair, each property in ``properties_to_null`` must be
        "falsy or negative" and every other name in ``align_attributes`` must
        not be. An ``int`` counts as null only when negative; any other value
        counts as null when it is falsy.
        """
        kwarg = dict(arg_settings)
        nulled = tuple(properties_to_null)
        # Every property not in our null list
        populated = [p for p in self.align_attributes if p not in nulled]

        def falsy_or_negative(alignment, prop):
            value = alignment[prop]
            if isinstance(value, int):
                # Zero is a legitimate score/index, so only negatives count.
                return value < 0
            return not value

        for query_sequence in query_sequences:
            # The settings are the same for every target, so one aligner per
            # query is enough.
            query = StripedSmithWaterman(query_sequence, **kwarg)
            for target_sequence in target_sequences:
                alignment = query(target_sequence)
                for prop in nulled:
                    self.assertTrue(falsy_or_negative(alignment, prop), prop)
                for prop in populated:
                    self.assertFalse(falsy_or_negative(alignment, prop), prop)
105
106
class TestStripedSmithWaterman(TestSSW):
    # Regression tests for the StripedSmithWaterman query object. Expected
    # values are stored as dicts keyed by the names in `align_attributes`.

    def test_object_is_reusable(self):
        # One query object is called on several targets in a row.
        shared_query = "AGGGTAATTAGGCGTGTTCACCTA"
        cases = [
            {
                "optimal_alignment_score": 10,
                "suboptimal_alignment_score": 10,
                "query_begin": 4,
                "query_end": 8,
                "target_begin": 3,
                "target_end_optimal": 7,
                "target_end_suboptimal": 34,
                "cigar": "5M",
                "query_sequence": shared_query,
                "target_sequence": ("TTATAATTTTCTTATTATTATCAATATTTATAATTTGATTT"
                                    "TGTTGTAAT")
            },
            {
                "optimal_alignment_score": 36,
                "suboptimal_alignment_score": 16,
                "query_begin": 0,
                "query_end": 23,
                "target_begin": 6,
                "target_end_optimal": 29,
                "target_end_suboptimal": 13,
                "cigar": "8M1D8M1I7M",
                "query_sequence": shared_query,
                "target_sequence": "AGTCGAAGGGTAATATAGGCGTGTCACCTA"
            },
            {
                "optimal_alignment_score": 16,
                "suboptimal_alignment_score": 0,
                "query_begin": 0,
                "query_end": 7,
                "target_begin": 6,
                "target_end_optimal": 13,
                "target_end_suboptimal": 0,
                "cigar": "8M",
                "query_sequence": shared_query,
                "target_sequence": "AGTCGAAGGGTAATA"
            },
            {
                "optimal_alignment_score": 8,
                "suboptimal_alignment_score": 8,
                "query_begin": 0,
                "query_end": 3,
                "target_begin": 7,
                "target_end_optimal": 10,
                "target_end_suboptimal": 42,
                "cigar": "4M",
                "query_sequence": shared_query,
                "target_sequence": ("CTGCCTCAGGGGGAGGAAAGCGTCAGCGCGGCTGCCGTCGG"
                                    "CGCAGGGGC")
            },
            {
                "optimal_alignment_score": 48,
                "suboptimal_alignment_score": 16,
                "query_begin": 0,
                "query_end": 23,
                "target_begin": 0,
                "target_end_optimal": 23,
                "target_end_suboptimal": 7,
                "cigar": "24M",
                "query_sequence": shared_query,
                "target_sequence": shared_query
            }
        ]
        aligner = StripedSmithWaterman(shared_query)
        # Run every alignment first and only then compare, so that earlier
        # results are checked after the object has been reused.
        produced = [aligner(case["target_sequence"]) for case in cases]

        for outcome, case in zip(produced, cases):
            self._check_alignment(outcome, case)

    def test_regression_on_instantiation_arguments(self):
        # Every constructor keyword is spelled out explicitly.
        want = {
            "optimal_alignment_score": 23,
            "suboptimal_alignment_score": 10,
            "query_begin": 0,
            "query_end": 16,
            "target_begin": 0,
            "target_end_optimal": 20,
            "target_end_suboptimal": 4,
            "cigar": "6M4D11M",
            "query_sequence": "AAACGATAAATCCGCGTA",
            "target_sequence": "AAACGACTACTAAATCCGCGTGATAGGGGA"
        }
        settings = {
            "gap_open_penalty": 5,
            "gap_extend_penalty": 2,
            "score_size": 2,
            "mask_length": 15,
            "mask_auto": True,
            "score_only": False,
            "score_filter": None,
            "distance_filter": None,
            "override_skip_babp": False,
            "protein": False,
            "match_score": 2,
            "mismatch_score": -3,
            "substitution_matrix": None,
            "suppress_sequences": False,
            "zero_index": True,
        }
        aligner = StripedSmithWaterman(want["query_sequence"], **settings)
        outcome = aligner(want["target_sequence"])
        self._check_alignment(outcome, want)

    def test_protein_sequence_is_usable(self):
        # Protein mode with the blosum50 substitution matrix.
        want = {
            "optimal_alignment_score": 316,
            "suboptimal_alignment_score": 95,
            "query_begin": 0,
            "query_end": 52,
            "target_begin": 0,
            "target_end_optimal": 52,
            "target_end_suboptimal": 18,
            "cigar": "15M1D15M1I22M",
            "query_sequence": ("VHLTGEEKSAVAALWGKVNVDEVGGEALGRXLLVVYPWTQRFFESF"
                               "SDLSTPDABVMSNPKVKAHGK"),
            "target_sequence": ("VHLTPEEKSAVTALWBGKVNVDEVGGEALGRLLVVYPWTQRFFES"
                                "FGDLSTPD*")
        }
        aligner = StripedSmithWaterman(want["query_sequence"],
                                       protein=True,
                                       substitution_matrix=blosum50)
        outcome = aligner(want["target_sequence"])
        self._check_alignment(outcome, want)

    def test_lowercase_is_valid_sequence(self):
        # Lowercase input is accepted and reported back unchanged.
        want = {
            "optimal_alignment_score": 23,
            "suboptimal_alignment_score": 10,
            "query_begin": 0,
            "query_end": 16,
            "target_begin": 0,
            "target_end_optimal": 20,
            "target_end_suboptimal": 4,
            "cigar": "6M4D11M",
            "query_sequence": "aaacgataaatccgcgta",
            "target_sequence": "aaacgactactaaatccgcgtgatagggga"
        }
        aligner = StripedSmithWaterman(want["query_sequence"])
        outcome = aligner(want["target_sequence"])
        self._check_alignment(outcome, want)

    def test_align_with_N_in_nucleotide_sequence(self):
        # Both sequences contain the character N.
        want = {
            "optimal_alignment_score": 9,
            "suboptimal_alignment_score": 0,
            "query_begin": 0,
            "query_end": 8,
            "target_begin": 0,
            "target_end_optimal": 9,
            "target_end_suboptimal": 0,
            "cigar": "4M1D5M",
            "query_sequence": "ACTCANNATCGANCTAGC",
            "target_sequence": "ACTCGAAAATGTNNGCA"
        }
        aligner = StripedSmithWaterman(want["query_sequence"])
        outcome = aligner(want["target_sequence"])
        self._check_alignment(outcome, want)

    def test_arg_match_score(self):
        # Every sequence is aligned against every sequence (itself included).
        sequences = [
            "TTTTTTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTCAATATAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "CTGCCTCAAGGGGGAGGAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
            "AGGGTAATTTTAGGCGTGTTCACCTA"
        ]
        self._check_argument_with_inequality_on_optimal_align_score(
            query_sequences=sequences,
            target_sequences=sequences,
            arg="match_score",
            default=2,
            i_range=range(0, 5),
            compare_lt=self.assertLess,
            compare_gt=self.assertGreater
        )
        # The bound above is strict, so no expected alignment is needed.

    def test_arg_mismatch_score(self):
        # Every sequence is aligned against every sequence (itself included).
        sequences = [
            "TTATAATTAATTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAAGGGGTATAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "CTGCCTCAGGGGCGAGGAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
            "AGGGTAATTAGCGCGTGTTCACCTA"
        ]
        self._check_argument_with_inequality_on_optimal_align_score(
            query_sequences=sequences,
            target_sequences=sequences,
            arg="mismatch_score",
            default=-3,
            i_range=range(-6, 1),
            # Non-strict comparisons in the natural (not inverted) direction:
            # a value below the default must not give a higher score.
            compare_lt=self.assertLessEqual,
            compare_gt=self.assertGreaterEqual
        )
        # The bound above is not strict, so an expected alignment plugs the
        # hole where every alignment is exactly equal to the default.
        want = {
            "optimal_alignment_score": 8,
            "suboptimal_alignment_score": 0,
            "query_begin": 5,
            "query_end": 8,
            "target_begin": 10,
            "target_end_optimal": 13,
            "target_end_suboptimal": 0,
            "cigar": "4M",
            "query_sequence": "AGAGGGTAATCAGCCGTGTCCACCGGAACACAACGCTATCGGGCGA",
            "target_sequence": "GTTCGCCCCAGTAAAGTTGCTACCAAATCCGCATG"
        }
        aligner = StripedSmithWaterman(want["query_sequence"],
                                       mismatch_score=-8)
        outcome = aligner(want["target_sequence"])
        self._check_alignment(outcome, want)

    def test_arg_matrix_overrides_match_and_mismatch(self):
        # Supplying a substitution matrix must change the optimal score
        # compared with the default scoring, for every pair of sequences.
        sequences = [
            "TTATAATTAATTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAAGGGGTATAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "CTGCCTCAGGGGCGAGGAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
            "AGGGTAATTAGCGCGTGTTCACCTA"
        ]
        # This is a biologically meaningless matrix
        scoring = {
            "A": {"A": 4, "T": -1, "C": -2, "G": -3, "N": 4},
            "T": {"A": -1, "T": 1, "C": -1, "G": -4, "N": 1},
            "C": {"A": -2, "T": -1, "C": 10, "G": 1, "N": 1},
            "G": {"A": -3, "T": -4, "C": 1, "G": 3, "N": 1},
            "N": {"A": 4, "T": 1, "C": 1, "G": 1, "N": 0}
        }
        pairs = [(query_text, target_text)
                 for query_text in sequences
                 for target_text in sequences]
        for query_text, target_text in pairs:
            default_aligner = StripedSmithWaterman(query_text)
            default_outcome = default_aligner(target_text)

            matrix_aligner = StripedSmithWaterman(
                query_text, substitution_matrix=scoring)
            matrix_outcome = matrix_aligner(target_text)

            self.assertNotEqual(default_outcome.optimal_alignment_score,
                                matrix_outcome.optimal_alignment_score)

    def test_arg_gap_open_penalty(self):
        # Every sequence is aligned against every sequence (itself included).
        sequences = [
            "TTATAATTTTCTTAGTTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCCGAAGGGTAATATAGGCGTGTCACCTA",
            "AGTCGAAGGCGGTAATA",
            "CTGCCTCGGCAGGGGGAGGAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
            "AGGGTAATTAAAGGCGTGTTCACCTA"
        ]
        self._check_argument_with_inequality_on_optimal_align_score(
            query_sequences=sequences,
            target_sequences=sequences,
            arg="gap_open_penalty",
            default=5,
            i_range=range(1, 12),
            # Intentionally inverted: a penalty below the default must not
            # give a lower score, and vice versa.
            compare_lt=self.assertGreaterEqual,
            compare_gt=self.assertLessEqual
        )
        # The bound above is not strict, so an expected alignment plugs the
        # hole where every alignment is exactly equal to the default.
        want = {
            "optimal_alignment_score": 51,
            "suboptimal_alignment_score": 20,
            "query_begin": 0,
            "query_end": 37,
            "target_begin": 0,
            "target_end_optimal": 29,
            "target_end_suboptimal": 9,
            "cigar": "5M4I3M3I1M1I21M",
            "query_sequence": "TAGAGATTAATTGCCACATTGCCACTGCCAAAATTCTG",
            "target_sequence": "TAGAGATTAATTGCCACTGCCAAAATTCTG"
        }
        aligner = StripedSmithWaterman(want["query_sequence"],
                                       gap_open_penalty=1)
        outcome = aligner(want["target_sequence"])
        self._check_alignment(outcome, want)

    def test_arg_gap_extend_penalty(self):
        # Every sequence is aligned against every sequence (itself included).
        sequences = [
            "TTATAATTTTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAATACTAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "CTGCCTCAGGGGGAGGCAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
            "AGGGTAATTAGGCGTGTTCACCTA"
        ]
        self._check_argument_with_inequality_on_optimal_align_score(
            query_sequences=sequences,
            target_sequences=sequences,
            arg="gap_extend_penalty",
            default=2,
            i_range=range(1, 10),
            # Intentionally inverted: a penalty below the default must not
            # give a lower score, and vice versa.
            compare_lt=self.assertGreaterEqual,
            compare_gt=self.assertLessEqual
        )
        # The bound above is not strict, so an expected alignment plugs the
        # hole where every alignment is exactly equal to the default.
        want = {
            "optimal_alignment_score": 9,
            "suboptimal_alignment_score": 8,
            "query_begin": 6,
            "query_end": 12,
            "target_begin": 7,
            "target_end_optimal": 13,
            "target_end_suboptimal": 38,
            "cigar": "7M",
            "query_sequence": "TCTATAAGATTCCGCATGCGTTACTTATAAGATGTCTCAACGG",
            "target_sequence": "GCCCAGTAGCTTCCCAATATGAGAGCATCAATTGTAGATCGGGCC"
        }
        aligner = StripedSmithWaterman(want["query_sequence"],
                                       gap_extend_penalty=10)
        outcome = aligner(want["target_sequence"])
        self._check_alignment(outcome, want)

    def test_arg_score_only(self):
        # With score_only set, the begin positions and the cigar are nulled.
        sequences = [
            "TTATCGTGATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAATACTATAAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "AGGGTAATTAGGCGTGCGTGCGTGTTCACCTA",
            "AGGGTATTAGGCGTGTTCACCTA"
        ]
        self._check_bit_flag_sets_properties_falsy_or_negative(
            query_sequences=sequences,
            target_sequences=sequences,
            arg_settings=[("score_only", True)],
            properties_to_null=["query_begin", "target_begin", "cigar"]
        )

    def test_arg_score_filter_is_used(self):
        sequences = [
            "TTATCGTGATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAATACTATAAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "AGGGTAATTAGGCGTGCGTGCGTGTTCACCTA",
            "AGGGTATTAGGCGTGTTCACCTA"
        ]
        self._check_bit_flag_sets_properties_falsy_or_negative(
            query_sequences=sequences,
            target_sequences=sequences,
            # score_filter will force a BABP and cigar to be falsy
            arg_settings=[("score_filter", 9001)],
            properties_to_null=["query_begin", "target_begin", "cigar"]
        )

    def test_arg_distance_filter_is_used(self):
        sequences = [
            "TTATCGTGATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAATACTATAAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "AGGGTAATTAGGCGTGCGTGCGTGTTCACCTA",
            "AGGGTATTAGGCGTGTTCACCTA"
        ]
        self._check_bit_flag_sets_properties_falsy_or_negative(
            query_sequences=sequences,
            target_sequences=sequences,
            # distance_filter will force cigar to be falsy only
            arg_settings=[("distance_filter", 1)],
            properties_to_null=["cigar"]
        )

    def test_arg_override_skip_babp(self):
        sequences = [
            "TTATCGTGATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAATACTATAAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "AGGGTAATTAGGCGTGCGTGCGTGTTCACCTA",
            "AGGGTATTAGGCGTGTTCACCTA"
        ]
        self._check_bit_flag_sets_properties_falsy_or_negative(
            query_sequences=sequences,
            target_sequences=sequences,
            # score_filter alone would force a BABP and cigar to be falsy;
            # override_skip_babp prevents this for all but the cigar
            arg_settings=[("override_skip_babp", True),
                          ("score_filter", 9001)],
            properties_to_null=["cigar"]
        )

    def test_arg_zero_index_changes_base_of_index_to_0_or_1(self):
        # The same pair is aligned with zero_index True and then False; the
        # expected positions of the second case are each one higher.
        query_text = ("AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCG"
                      "CCCCGGGCGGGGC")
        target_text = ("CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCC"
                       "GGGCGGGGC")
        zero_based_case = {
            "optimal_alignment_score": 100,
            "suboptimal_alignment_score": 44,
            "query_begin": 5,
            "query_end": 54,
            "target_begin": 0,
            "target_end_optimal": 49,
            "target_end_suboptimal": 21,
            "cigar": "50M",
            "query_sequence": query_text,
            "target_sequence": target_text
        }
        one_based_case = {
            "optimal_alignment_score": 100,
            "suboptimal_alignment_score": 44,
            "query_begin": 6,
            "query_end": 55,
            "target_begin": 1,
            "target_end_optimal": 50,
            "target_end_suboptimal": 22,
            "cigar": "50M",
            "query_sequence": query_text,
            "target_sequence": target_text
        }
        for want, zero_based in ((zero_based_case, True),
                                 (one_based_case, False)):
            aligner = StripedSmithWaterman(want["query_sequence"],
                                           zero_index=zero_based)
            outcome = aligner(want["target_sequence"])
            self._check_alignment(outcome, want)

    def test_arg_suppress_sequences(self):
        # With suppress_sequences set, both sequences are reported as ''.
        want = {
            "optimal_alignment_score": 100,
            "suboptimal_alignment_score": 44,
            "query_begin": 5,
            "query_end": 54,
            "target_begin": 0,
            "target_end_optimal": 49,
            "target_end_suboptimal": 21,
            "cigar": "50M",
            "query_sequence": "",
            "target_sequence": ""
        }
        aligner = StripedSmithWaterman(
            "AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCCGGGCGGGGC",
            suppress_sequences=True)
        outcome = aligner("CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCCGGGCGGGGC")
        self._check_alignment(outcome, want)
557
558
class TestAlignStripedSmithWaterman(TestSSW):
    # Tests that local_pairwise_align_ssw agrees with calling a
    # StripedSmithWaterman object directly.

    def _check_TabularMSA_to_AlignmentStructure(self, alignment, structure,
                                                expected_dtype):
        """Assert an ``(msa, score, start_end)`` triple matches ``structure``.

        ``alignment`` is unpacked into three items; ``structure`` supplies the
        reference score, aligned sequences and begin/end positions, and
        ``expected_dtype`` is called to wrap each aligned sequence before it
        is placed in the reference ``TabularMSA``.
        """
        msa, score, start_end = alignment

        self.assertEqual(score, structure.optimal_alignment_score)

        reference_msa = TabularMSA(
            [expected_dtype(structure.aligned_query_sequence),
             expected_dtype(structure.aligned_target_sequence)])
        self.assertEqual(msa, reference_msa)

        # A query_begin of -1 means there are no positions to compare.
        if structure.query_begin == -1:
            self.assertEqual(start_end, None)
            return

        reference_positions = [
            (structure.query_begin, structure.query_end),
            (structure.target_begin, structure.target_end_optimal),
        ]
        for found, reference in zip(start_end, reference_positions):
            start, end = found
            expected_start, expected_end = reference
            self.assertEqual(start, expected_start)
            self.assertEqual(end, expected_end)

    def test_same_as_using_StripedSmithWaterman_object_DNA(self):
        query_text = "ATGGAAGCTATAAGCGCGGGTGAG"
        target_text = "AACTTATATAATAAAAATTATATATTCGTTGGGTTCTTTTGATATAAATC"
        aligner = StripedSmithWaterman(query_text)
        structure = aligner(target_text)
        wrapped = local_pairwise_align_ssw(DNA(query_text),
                                           DNA(target_text))
        self._check_TabularMSA_to_AlignmentStructure(wrapped, structure, DNA)

    def test_same_as_using_StripedSmithWaterman_object_Protein(self):
        query_text = "HEAGAWGHEE"
        target_text = "PAWHEAE"
        aligner = StripedSmithWaterman(query_text,
                                       protein=True,
                                       substitution_matrix=blosum50)
        structure = aligner(target_text)
        wrapped = local_pairwise_align_ssw(Protein(query_text),
                                           Protein(target_text),
                                           substitution_matrix=blosum50)
        self._check_TabularMSA_to_AlignmentStructure(wrapped, structure,
                                                     Protein)

    def test_kwargs_are_usable(self):
        # The same keyword arguments are forwarded to both entry points.
        scoring = {"mismatch_score": -2, "match_score": 5}
        query_text = "AGGGTAATTAGGCGTGTTCACCTA"
        target_text = "TACTTATAAGATGTCTCAACGGCATGCGCAACTTGTGAAGTG"
        aligner = StripedSmithWaterman(query_text, **scoring)
        structure = aligner(target_text)
        wrapped = local_pairwise_align_ssw(DNA(query_text),
                                           DNA(target_text), **scoring)
        self._check_TabularMSA_to_AlignmentStructure(wrapped, structure, DNA)

    def test_invalid_type(self):
        # A plain Sequence as the second argument is rejected...
        with self.assertRaisesRegex(TypeError, r"not type 'Sequence'"):
            local_pairwise_align_ssw(DNA("ACGT"), Sequence("ACGT"))

        # ...and so is a str as the first argument.
        with self.assertRaisesRegex(TypeError, r"not type 'str'"):
            local_pairwise_align_ssw("ACGU", RNA("ACGU"))

    def test_type_mismatch(self):
        # Mixing DNA and RNA is rejected.
        with self.assertRaisesRegex(TypeError, r"same type: 'DNA' != 'RNA'"):
            local_pairwise_align_ssw(DNA("ACGT"), RNA("ACGU"))
624
625
class TestAlignmentStructure(TestSSW):
    # Tests for the AlignmentStructure objects returned by calling a
    # StripedSmithWaterman query.

    def mock_object_factory(self, dictionary):
        """Return an ``AlignmentStructure`` subclass instance for testing.

        The subclass replaces ``__init__`` so that it only copies each
        key/value pair of ``dictionary`` onto the instance as an attribute.
        """
        class MockAlignmentStructure(AlignmentStructure):
            def __init__(self, _a, _b, _c):
                for name, value in dictionary.items():
                    setattr(self, name, value)

        return MockAlignmentStructure(None, None, 0)

    def test_works_for_dot_and_square_bracket_access(self):
        # Attribute access and item access must return equal values.
        aligner = StripedSmithWaterman("AGGGTAATTAGGCGTGTTCACCTA")
        outcome = aligner("TACTTATAAGATGTCTCAACGGCATGCGCAACTTGTGAAGTG")
        for name in self.align_attributes:
            self.assertEqual(getattr(outcome, name), outcome[name])

    def test_is_zero_based_returns_true_if_index_base_is_zero(self):
        query_text = ("AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCG"
                      "CCCCGGGCGGGGC")
        target_text = ("CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCC"
                       "GGGCGGGGC")
        # The same pair is aligned once per index base.
        for zero_based in (True, False):
            aligner = StripedSmithWaterman(query_text,
                                           zero_index=zero_based)
            outcome = aligner(target_text)
            self.assertEqual(zero_based, outcome.is_zero_based())

    def test_set_zero_based_changes_the_index_base(self):
        query_text = ("AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCG"
                      "CCCCGGGCGGGGC")
        target_text = ("CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCC"
                       "GGGCGGGGC")
        # Start from each index base and flip it on the result.
        for zero_based in (True, False):
            aligner = StripedSmithWaterman(query_text,
                                           zero_index=zero_based)
            outcome = aligner(target_text)
            flipped = not zero_based
            outcome.set_zero_based(flipped)
            self.assertEqual(flipped, outcome.is_zero_based())

    def test__get_aligned_sequences(self):
        alphabet = "123456789abcdefghijklmnopqrstuvwxyz"
        # Each case is
        #   (cigar_tuples, begin, end_after_cigar, gap_type, expected)
        # where `end_after_cigar` is how far end extends beyond the cigar.
        # Negative values on this should not be possible with SSW.
        cases = [
            ([(4, "M"), (3, "I"), (1, "D"), (15, "M")],
             4, 2, "I",
             "5678---9abcdefghijklmnopq"),
            ([(12, "M")],
             10, 0, "D",
             "bcdefghijklm"),
            ([(10, "D"), (1, "M"), (3, "I"), (2, "M")],
             0, 5, "I",
             "123456789ab---cdefghi"),
            ([(10, "D"), (1, "M"), (3, "I"), (2, "M")],
             3, 0, "D",
             "----------456789"),
            ([(1, "I"), (4, "M"), (3, "I"), (1, "D"), (8, "M"), (8, "D"),
              (2, "I"), (6, "M"), (1, "I")],
             4, 3, "I",
             "-5678---9abcdefghijklmnop--qrstuv-wxy"),
        ]
        for cigar_tuples, begin, end_after_cigar, gap_type, wanted in cases:
            stand_in = self.mock_object_factory({})
            # Because SSW's output is [a, b] and Python's list ranges use
            # [a, b) a 1 is added in the calculation of aligned sequences.
            # Subtracting 1 here cancels that out while the cases keep the
            # easy to verify `end_after_cigar` form.
            consumed = sum(length for length, operation in cigar_tuples
                           if operation != gap_type)
            end = end_after_cigar - 1 + begin + consumed
            produced = AlignmentStructure._get_aligned_sequence(
                stand_in, alphabet, cigar_tuples, begin, end, gap_type)
            self.assertEqual(wanted, produced)

    def test_aligned_query_target_sequence(self):
        aligner = StripedSmithWaterman("AGGGTAATTAGGCGTGTTCACCTA")
        outcome = aligner("AGTCGAAGGGTAATATAGGCGTGTCACCTA")
        self.assertEqual("AGGGTAATATAGGCGTG-TCACCTA",
                         outcome.aligned_target_sequence)
        self.assertEqual("AGGGTAAT-TAGGCGTGTTCACCTA",
                         outcome.aligned_query_sequence)

    def test_aligned_query_target_sequence_with_suppressed_sequences(self):
        # With suppressed sequences, both aligned sequences are None.
        aligner = StripedSmithWaterman("AGGGTAATTAGGCGTGTTCACCTA",
                                       suppress_sequences=True)
        outcome = aligner("AGTCGAAGGGTAATATAGGCGTGTCACCTA")
        self.assertEqual(None, outcome.aligned_target_sequence)
        self.assertEqual(None, outcome.aligned_query_sequence)
767
768
if __name__ == "__main__":
    # Run the tests with unittest's command-line runner when this file is
    # executed as a script.
    main()
771