1# ----------------------------------------------------------------------------
2# Copyright (c) 2013--, scikit-bio development team.
3#
4# Distributed under the terms of the Modified BSD License.
5#
6# The full license is in the file COPYING.txt, distributed with this software.
7# ----------------------------------------------------------------------------
8
9import itertools
10import unittest
11
12import numpy as np
13import numpy.testing as npt
14
15from skbio import Sequence, DNA, RNA, Protein, GeneticCode
16from skbio.sequence._genetic_code import _ncbi_genetic_codes
17
18
19class TestGeneticCode(unittest.TestCase):
20    def setUp(self):
21        self.sgc = GeneticCode.from_ncbi(1)
22
23    def test_from_ncbi_valid_table_ids(self):
24        # spot check a few tables
25        self.assertEqual(GeneticCode.from_ncbi().name,
26                         'Standard')
27        self.assertEqual(GeneticCode.from_ncbi(2).name,
28                         'Vertebrate Mitochondrial')
29        self.assertEqual(GeneticCode.from_ncbi(12).name,
30                         'Alternative Yeast Nuclear')
31        self.assertEqual(GeneticCode.from_ncbi(25).name,
32                         'Candidate Division SR1 and Gracilibacteria')
33
34    def test_from_ncbi_invalid_input(self):
35        with self.assertRaisesRegex(ValueError, r'table_id.*7'):
36            GeneticCode.from_ncbi(7)
37        with self.assertRaisesRegex(ValueError, r'table_id.*42'):
38            GeneticCode.from_ncbi(42)
39
40    def test_reading_frames(self):
41        exp = [1, 2, 3, -1, -2, -3]
42        self.assertEqual(GeneticCode.reading_frames, exp)
43        self.assertEqual(self.sgc.reading_frames, exp)
44
45        GeneticCode.reading_frames.append(42)
46
47        self.assertEqual(GeneticCode.reading_frames, exp)
48        self.assertEqual(self.sgc.reading_frames, exp)
49
50        with self.assertRaises(AttributeError):
51            self.sgc.reading_frames = [1, 2, 42]
52
53    def test_name(self):
54        self.assertEqual(self.sgc.name, 'Standard')
55        self.assertEqual(GeneticCode('M' * 64, '-' * 64).name, '')
56        self.assertEqual(GeneticCode('M' * 64, '-' * 64, 'foo').name, 'foo')
57
58        with self.assertRaises(AttributeError):
59            self.sgc.name = 'foo'
60
61    def test_init_varied_equivalent_input(self):
62        for args in (('M' * 64, '-' * 64),
63                     (Protein('M' * 64), Protein('-' * 64)),
64                     (Sequence('M' * 64), Sequence('-' * 64))):
65            gc = GeneticCode(*args)
66            self.assertEqual(gc.name, '')
67            self.assertEqual(gc._amino_acids, Protein('M' * 64))
68            self.assertEqual(gc._starts, Protein('-' * 64))
69            npt.assert_array_equal(gc._m_character_codon,
70                                   np.asarray([0, 0, 0], dtype=np.uint8))
71            self.assertEqual(len(gc._start_codons), 0)
72
73    def test_init_invalid_input(self):
74        # `amino_acids` invalid protein
75        with self.assertRaisesRegex(ValueError, r'Invalid character.*&'):
76            GeneticCode('&' * 64, '-' * 64)
77
78        # wrong number of amino acids
79        with self.assertRaisesRegex(ValueError, r'amino_acids.*64.*42'):
80            GeneticCode('M' * 42, '-' * 64)
81
82        # `amino_acids` missing M
83        with self.assertRaisesRegex(ValueError, r'amino_acids.*M.*character'):
84            GeneticCode('A' * 64, '-' * 64)
85
86        # `starts` invalid protein
87        with self.assertRaisesRegex(ValueError, r'Invalid character.*&'):
88            GeneticCode('M' * 64, '&' * 64)
89
90        # wrong number of starts
91        with self.assertRaisesRegex(ValueError, r'starts.*64.*42'):
92            GeneticCode('M' * 64, '-' * 42)
93
94        # invalid characters in `starts`
95        with self.assertRaisesRegex(ValueError, r'starts.*M and - characters'):
96            GeneticCode('M' * 64, '-M' * 30 + '*AQR')
97
98    def test_str(self):
99        # predefined
100        exp = (
101            '  AAs  = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAA'
102            'DDEEGGGG\n'
103            'Starts = ---M---------------M---------------M--------------------'
104            '--------\n'
105            'Base1  = UUUUUUUUUUUUUUUUCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGG'
106            'GGGGGGGG\n'
107            'Base2  = UUUUCCCCAAAAGGGGUUUUCCCCAAAAGGGGUUUUCCCCAAAAGGGGUUUUCCCC'
108            'AAAAGGGG\n'
109            'Base3  = UCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAG'
110            'UCAGUCAG'
111        )
112        self.assertEqual(str(self.sgc), exp)
113
114        # custom, no name
115        obs = str(GeneticCode('M' * 64, '-' * 64))
116        self.assertIn('M' * 64, obs)
117        self.assertIn('-' * 64, obs)
118
119    def test_repr(self):
120        # predefined
121        exp = (
122            'GeneticCode (Standard)\n'
123            '-----------------------------------------------------------------'
124            '--------\n'
125            '  AAs  = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAA'
126            'DDEEGGGG\n'
127            'Starts = ---M---------------M---------------M--------------------'
128            '--------\n'
129            'Base1  = UUUUUUUUUUUUUUUUCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGG'
130            'GGGGGGGG\n'
131            'Base2  = UUUUCCCCAAAAGGGGUUUUCCCCAAAAGGGGUUUUCCCCAAAAGGGGUUUUCCCC'
132            'AAAAGGGG\n'
133            'Base3  = UCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAG'
134            'UCAGUCAG'
135        )
136        self.assertEqual(repr(self.sgc), exp)
137
138        # custom, no name
139        obs = repr(GeneticCode('M' * 64, '-' * 64))
140        self.assertTrue(obs.startswith('GeneticCode\n'))
141        self.assertIn('M' * 64, obs)
142        self.assertIn('-' * 64, obs)
143
144    def test_eq(self):
145        amino_acids = 'AMPM' * 16
146        starts = '--M-' * 16
147
148        equal_gcs = [
149            GeneticCode(amino_acids, starts),
150            # name should be ignored
151            GeneticCode(amino_acids, starts, 'foo'),
152            # metadata/positional metadata should be ignored if Sequence
153            # subclass is provided
154            GeneticCode(
155                Protein(amino_acids, metadata={'foo': 'bar'}),
156                Protein(starts, positional_metadata={'foo': range(64)}))
157        ]
158
159        # every gc should be equal to itself
160        for gc in equal_gcs:
161            self.assertTrue(gc == gc)
162            self.assertFalse(gc != gc)
163
164        # every pair of gcs should be equal. use permutations instead of
165        # combinations to test that comparing gc1 to gc2 and gc2 to gc1 are
166        # both equal
167        for gc1, gc2 in itertools.permutations(equal_gcs, 2):
168            self.assertTrue(gc1 == gc2)
169            self.assertFalse(gc1 != gc2)
170
171    def test_ne(self):
172        class GeneticCodeSubclass(GeneticCode):
173            pass
174
175        amino_acids = 'AMPM' * 16
176        starts = '--M-' * 16
177
178        unequal_gcs = [
179            GeneticCode(amino_acids, starts),
180            # type must match
181            GeneticCodeSubclass(amino_acids, starts),
182            # completely different type
183            'foo'
184        ]
185        # none of the NCBI genetic codes should be equal to each other
186        unequal_gcs.extend(_ncbi_genetic_codes.values())
187
188        for gc in unequal_gcs:
189            self.assertTrue(gc == gc)
190            self.assertFalse(gc != gc)
191
192        for gc1, gc2 in itertools.permutations(unequal_gcs, 2):
193            self.assertTrue(gc1 != gc2)
194            self.assertFalse(gc1 == gc2)
195
196    def test_translate_preserves_metadata(self):
197        obs = self.sgc.translate(
198            RNA('AUG', metadata={'foo': 'bar', 'baz': 42},
199                positional_metadata={'foo': range(3)}))
200        # metadata retained, positional metadata dropped
201        self.assertEqual(obs, Protein('M',
202                                      metadata={'foo': 'bar', 'baz': 42}))
203
204    def test_translate_default_behavior(self):
205        # empty translation
206        exp = Protein('')
207        for seq in RNA(''), RNA('A'), RNA('AU'):
208            obs = self.sgc.translate(seq)
209            self.assertEqual(obs, exp)
210
211        # no start or stop codons
212        obs = self.sgc.translate(RNA('CCU'))
213        self.assertEqual(obs, Protein('P'))
214
215        # multiple alternative start codons, no stop codons, length is multiple
216        # of 3
217        obs = self.sgc.translate(RNA('CAUUUGCUGAAA'))
218        self.assertEqual(obs, Protein('HLLK'))
219
220        # multiple stop codons, length isn't multiple of 3
221        obs = self.sgc.translate(RNA('UUUUUUUAAAGUUAAGGGAU'))
222        self.assertEqual(obs, Protein('FF*S*G'))
223
224    def test_translate_reading_frame_empty_translation(self):
225        exp = Protein('')
226        for seq in RNA(''), RNA('A'), RNA('AU'):
227            for reading_frame in GeneticCode.reading_frames:
228                obs = self.sgc.translate(seq, reading_frame=reading_frame)
229                self.assertEqual(obs, exp)
230
231        # reading frames that yield a partial codon
232        for reading_frame in 2, 3, -2, -3:
233            obs = self.sgc.translate(RNA('AUG'), reading_frame=reading_frame)
234            self.assertEqual(obs, exp)
235
236    def test_translate_reading_frame_non_empty_translation(self):
237        seq = RNA('AUGGUGGAA')  # rc = UUCCACCAU
238        for reading_frame, exp_str in ((1, 'MVE'), (2, 'WW'), (3, 'GG'),
239                                       (-1, 'FHH'), (-2, 'ST'), (-3, 'PP')):
240            exp = Protein(exp_str)
241            obs = self.sgc.translate(seq, reading_frame=reading_frame)
242            self.assertEqual(obs, exp)
243
244    def test_translate_start_empty_translation(self):
245        exp = Protein('')
246        for seq in RNA(''), RNA('A'), RNA('AU'):
247            for start in {'optional', 'ignore'}:
248                obs = self.sgc.translate(seq, start=start)
249                self.assertEqual(obs, exp)
250
251            with self.assertRaisesRegex(ValueError,
252                                        r'reading_frame=1.*start=\'require\''):
253                self.sgc.translate(seq, start='require')
254
255    def test_translate_start_with_start_codon(self):
256        # trim before start codon, replace with M. ensure alternative start
257        # codons following the start codon aren't replaced with M. ensure
258        # default behavior for handling stop codons is retained
259        seq = RNA('CAUUUGCUGAAAUGA')
260        exp = Protein('MLK*')
261        for start in {'require', 'optional'}:
262            obs = self.sgc.translate(seq, start=start)
263            self.assertEqual(obs, exp)
264
265        # ignore start codon replacement and trimming; just translate
266        exp = Protein('HLLK*')
267        obs = self.sgc.translate(seq, start='ignore')
268        self.assertEqual(obs, exp)
269
270        # just a start codon, no replacement necessary
271        seq = RNA('AUG')
272        exp = Protein('M')
273        for start in {'require', 'optional', 'ignore'}:
274            obs = self.sgc.translate(seq, start=start)
275            self.assertEqual(obs, exp)
276
277        # single alternative start codon
278        seq = RNA('CUG')
279        exp = Protein('M')
280        for start in {'require', 'optional'}:
281            obs = self.sgc.translate(seq, start=start)
282            self.assertEqual(obs, exp)
283
284        exp = Protein('L')
285        obs = self.sgc.translate(seq, start='ignore')
286        self.assertEqual(obs, exp)
287
288    def test_translate_start_no_start_codon(self):
289        seq = RNA('CAACAACAGCAA')
290        exp = Protein('QQQQ')
291        for start in {'ignore', 'optional'}:
292            obs = self.sgc.translate(seq, start=start)
293            self.assertEqual(obs, exp)
294
295        with self.assertRaisesRegex(ValueError,
296                                    r'reading_frame=1.*start=\'require\''):
297            self.sgc.translate(seq, start='require')
298
299        # non-start codon that translates to an AA that start codons also map
300        # to. should catch bug if code attempts to search and trim *after*
301        # translation -- this must happen *before* translation
302        seq = RNA('UUACAA')
303        exp = Protein('LQ')
304        for start in {'ignore', 'optional'}:
305            obs = self.sgc.translate(seq, start=start)
306            self.assertEqual(obs, exp)
307
308        with self.assertRaisesRegex(ValueError,
309                                    r'reading_frame=1.*start=\'require\''):
310            self.sgc.translate(seq, start='require')
311
312    def test_translate_start_no_accidental_mutation(self):
313        # `start` mutates a vector in-place that is derived from
314        # GeneticCode._offset_table. the current code doesn't perform an
315        # explicit copy because numpy's advanced indexing is used, which always
316        # returns a copy. test this assumption here in case that behavior
317        # changes in the future
318        offset_table = self.sgc._offset_table.copy()
319
320        seq = RNA('CAUUUGCUGAAAUGA')
321        obs = self.sgc.translate(seq, start='require')
322        self.assertEqual(obs, Protein('MLK*'))
323
324        npt.assert_array_equal(self.sgc._offset_table, offset_table)
325
326    def test_translate_stop_empty_translation(self):
327        exp = Protein('')
328        for seq in RNA(''), RNA('A'), RNA('AU'):
329            for stop in {'optional', 'ignore'}:
330                obs = self.sgc.translate(seq, stop=stop)
331                self.assertEqual(obs, exp)
332
333            with self.assertRaisesRegex(ValueError,
334                                        r'reading_frame=1.*stop=\'require\''):
335                self.sgc.translate(seq, stop='require')
336
337    def test_translate_stop_with_stop_codon(self):
338        # multiple stop codons with trailing codons
339        seq = RNA('UGGACUUGAUAUCGUUAGGAU')
340        exp = Protein('WT')
341        for stop in {'require', 'optional'}:
342            obs = self.sgc.translate(seq, stop=stop)
343            self.assertEqual(obs, exp)
344
345        # ignore stop codon trimming; just translate
346        exp = Protein('WT*YR*D')
347        obs = self.sgc.translate(seq, stop='ignore')
348        self.assertEqual(obs, exp)
349
350        # ends with single stop codon
351        seq = RNA('UGUCUGUAA')
352        exp = Protein('CL')
353        for stop in {'require', 'optional'}:
354            obs = self.sgc.translate(seq, stop=stop)
355            self.assertEqual(obs, exp)
356
357        exp = Protein('CL*')
358        obs = self.sgc.translate(seq, stop='ignore')
359        self.assertEqual(obs, exp)
360
361        # just a stop codon
362        seq = RNA('UAG')
363        exp = Protein('')
364        for stop in {'require', 'optional'}:
365            obs = self.sgc.translate(seq, stop=stop)
366            self.assertEqual(obs, exp)
367
368        exp = Protein('*')
369        obs = self.sgc.translate(seq, stop='ignore')
370        self.assertEqual(obs, exp)
371
372    def test_translate_stop_no_stop_codon(self):
373        seq = RNA('GAAUCU')
374        exp = Protein('ES')
375        for stop in {'ignore', 'optional'}:
376            obs = self.sgc.translate(seq, stop=stop)
377            self.assertEqual(obs, exp)
378
379        with self.assertRaisesRegex(ValueError,
380                                    r'reading_frame=1.*stop=\'require\''):
381            self.sgc.translate(seq, stop='require')
382
383    def test_translate_trim_to_cds(self):
384        seq = RNA('UAAUUGCCUCAUUAAUAACAAUGA')
385
386        # find first start codon, trim all before it, convert alternative start
387        # codon to M, finally trim to first stop codon following the start
388        # codon
389        exp = Protein('MPH')
390        for param in {'require', 'optional'}:
391            obs = self.sgc.translate(seq, start=param, stop=param)
392            self.assertEqual(obs, exp)
393
394        exp = Protein('*LPH**Q*')
395        obs = self.sgc.translate(seq, start='ignore', stop='ignore')
396        self.assertEqual(obs, exp)
397
398        # alternative reading frame disrupts cds:
399        #     AAUUGCCUCAUUAAUAACAAUGA
400        #     NCLINNN
401        with self.assertRaisesRegex(ValueError,
402                                    r'reading_frame=2.*start=\'require\''):
403            self.sgc.translate(seq, reading_frame=2, start='require')
404        with self.assertRaisesRegex(ValueError,
405                                    r'reading_frame=2.*stop=\'require\''):
406            self.sgc.translate(seq, reading_frame=2, stop='require')
407
408        exp = Protein('NCLINNN')
409        for param in {'ignore', 'optional'}:
410            obs = self.sgc.translate(seq, reading_frame=2, start=param,
411                                     stop=param)
412            self.assertEqual(obs, exp)
413
414    def test_translate_invalid_input(self):
415        # invalid sequence type
416        with self.assertRaisesRegex(TypeError, r'RNA.*DNA'):
417            self.sgc.translate(DNA('ACG'))
418        with self.assertRaisesRegex(TypeError, r'RNA.*str'):
419            self.sgc.translate('ACG')
420
421        # invalid reading frame
422        with self.assertRaisesRegex(ValueError, r'\[1, 2, 3, -1, -2, -3\].*0'):
423            self.sgc.translate(RNA('AUG'), reading_frame=0)
424
425        # invalid start
426        with self.assertRaisesRegex(ValueError, r'start.*foo'):
427            self.sgc.translate(RNA('AUG'), start='foo')
428
429        # invalid stop
430        with self.assertRaisesRegex(ValueError, r'stop.*foo'):
431            self.sgc.translate(RNA('AUG'), stop='foo')
432
433        # gapped sequence
434        with self.assertRaisesRegex(ValueError, r'gapped'):
435            self.sgc.translate(RNA('UU-G'))
436
437        # degenerate sequence
438        with self.assertRaisesRegex(NotImplementedError, r'degenerate'):
439            self.sgc.translate(RNA('RUG'))
440
441    def test_translate_varied_genetic_codes(self):
442        # spot check using a few NCBI and custom genetic codes to translate
443        seq = RNA('AAUGAUGUGACUAUCAGAAGG')
444
445        # table_id=2
446        exp = Protein('NDVTI**')
447        obs = GeneticCode.from_ncbi(2).translate(seq)
448        self.assertEqual(obs, exp)
449
450        exp = Protein('MTI')
451        obs = GeneticCode.from_ncbi(2).translate(seq, start='require',
452                                                 stop='require')
453        self.assertEqual(obs, exp)
454
455        # table_id=22
456        exp = Protein('NDVTIRR')
457        obs = GeneticCode.from_ncbi(22).translate(seq)
458        self.assertEqual(obs, exp)
459
460        with self.assertRaisesRegex(ValueError,
461                                    r'reading_frame=1.*start=\'require\''):
462            GeneticCode.from_ncbi(22).translate(seq, start='require',
463                                                stop='require')
464
465        # custom, no start codons
466        gc = GeneticCode('MWN*' * 16, '-' * 64)
467        exp = Protein('MM*MWN*')
468        obs = gc.translate(seq)
469        self.assertEqual(obs, exp)
470
471        with self.assertRaisesRegex(ValueError,
472                                    r'reading_frame=1.*start=\'require\''):
473            gc.translate(seq, start='require', stop='require')
474
475    def test_translate_six_frames(self):
476        seq = RNA('AUGCUAACAUAAA')  # rc = UUUAUGUUAGCAU
477
478        # test default behavior
479        exp = [Protein('MLT*'), Protein('C*HK'), Protein('ANI'),
480               Protein('FMLA'), Protein('LC*H'), Protein('YVS')]
481        obs = list(self.sgc.translate_six_frames(seq))
482        self.assertEqual(obs, exp)
483
484        # test that start/stop are respected
485        exp = [Protein('MLT'), Protein('C'), Protein('ANI'),
486               Protein('MLA'), Protein('LC'), Protein('YVS')]
487        obs = list(self.sgc.translate_six_frames(seq, start='optional',
488                                                 stop='optional'))
489        self.assertEqual(obs, exp)
490
491    def test_translate_six_frames_preserves_metadata(self):
492        seq = RNA('AUG', metadata={'foo': 'bar', 'baz': 42},
493                  positional_metadata={'foo': range(3)})
494        obs = list(self.sgc.translate_six_frames(seq))[:2]
495        # metadata retained, positional metadata dropped
496        self.assertEqual(
497            obs,
498            [Protein('M', metadata={'foo': 'bar', 'baz': 42}),
499             Protein('', metadata={'foo': 'bar', 'baz': 42})])
500
501
502if __name__ == '__main__':
503    unittest.main()
504