1# ---------------------------------------------------------------------------- 2# Copyright (c) 2013--, scikit-bio development team. 3# 4# Distributed under the terms of the Modified BSD License. 5# 6# The full license is in the file COPYING.txt, distributed with this software. 7# ---------------------------------------------------------------------------- 8 9import itertools 10import unittest 11 12import numpy as np 13import numpy.testing as npt 14 15from skbio import Sequence, DNA, RNA, Protein, GeneticCode 16from skbio.sequence._genetic_code import _ncbi_genetic_codes 17 18 19class TestGeneticCode(unittest.TestCase): 20 def setUp(self): 21 self.sgc = GeneticCode.from_ncbi(1) 22 23 def test_from_ncbi_valid_table_ids(self): 24 # spot check a few tables 25 self.assertEqual(GeneticCode.from_ncbi().name, 26 'Standard') 27 self.assertEqual(GeneticCode.from_ncbi(2).name, 28 'Vertebrate Mitochondrial') 29 self.assertEqual(GeneticCode.from_ncbi(12).name, 30 'Alternative Yeast Nuclear') 31 self.assertEqual(GeneticCode.from_ncbi(25).name, 32 'Candidate Division SR1 and Gracilibacteria') 33 34 def test_from_ncbi_invalid_input(self): 35 with self.assertRaisesRegex(ValueError, r'table_id.*7'): 36 GeneticCode.from_ncbi(7) 37 with self.assertRaisesRegex(ValueError, r'table_id.*42'): 38 GeneticCode.from_ncbi(42) 39 40 def test_reading_frames(self): 41 exp = [1, 2, 3, -1, -2, -3] 42 self.assertEqual(GeneticCode.reading_frames, exp) 43 self.assertEqual(self.sgc.reading_frames, exp) 44 45 GeneticCode.reading_frames.append(42) 46 47 self.assertEqual(GeneticCode.reading_frames, exp) 48 self.assertEqual(self.sgc.reading_frames, exp) 49 50 with self.assertRaises(AttributeError): 51 self.sgc.reading_frames = [1, 2, 42] 52 53 def test_name(self): 54 self.assertEqual(self.sgc.name, 'Standard') 55 self.assertEqual(GeneticCode('M' * 64, '-' * 64).name, '') 56 self.assertEqual(GeneticCode('M' * 64, '-' * 64, 'foo').name, 'foo') 57 58 with self.assertRaises(AttributeError): 59 self.sgc.name = 'foo' 60 61 def test_init_varied_equivalent_input(self): 62 for args in (('M' * 64, '-' * 64), 63 (Protein('M' * 64), Protein('-' * 64)), 64 (Sequence('M' * 64), Sequence('-' * 64))): 65 gc = GeneticCode(*args) 66 self.assertEqual(gc.name, '') 67 self.assertEqual(gc._amino_acids, Protein('M' * 64)) 68 self.assertEqual(gc._starts, Protein('-' * 64)) 69 npt.assert_array_equal(gc._m_character_codon, 70 np.asarray([0, 0, 0], dtype=np.uint8)) 71 self.assertEqual(len(gc._start_codons), 0) 72 73 def test_init_invalid_input(self): 74 # `amino_acids` invalid protein 75 with self.assertRaisesRegex(ValueError, r'Invalid character.*&'): 76 GeneticCode('&' * 64, '-' * 64) 77 78 # wrong number of amino acids 79 with self.assertRaisesRegex(ValueError, r'amino_acids.*64.*42'): 80 GeneticCode('M' * 42, '-' * 64) 81 82 # `amino_acids` missing M 83 with self.assertRaisesRegex(ValueError, r'amino_acids.*M.*character'): 84 GeneticCode('A' * 64, '-' * 64) 85 86 # `starts` invalid protein 87 with self.assertRaisesRegex(ValueError, r'Invalid character.*&'): 88 GeneticCode('M' * 64, '&' * 64) 89 90 # wrong number of starts 91 with self.assertRaisesRegex(ValueError, r'starts.*64.*42'): 92 GeneticCode('M' * 64, '-' * 42) 93 94 # invalid characters in `starts` 95 with self.assertRaisesRegex(ValueError, r'starts.*M and - characters'): 96 GeneticCode('M' * 64, '-M' * 30 + '*AQR') 97 98 def test_str(self): 99 # predefined 100 exp = ( 101 ' AAs = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAA' 102 'DDEEGGGG\n' 103 'Starts = ---M---------------M---------------M--------------------' 104 '--------\n' 105 'Base1 = UUUUUUUUUUUUUUUUCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGG' 106 'GGGGGGGG\n' 107 'Base2 = UUUUCCCCAAAAGGGGUUUUCCCCAAAAGGGGUUUUCCCCAAAAGGGGUUUUCCCC' 108 'AAAAGGGG\n' 109 'Base3 = UCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAG' 110 'UCAGUCAG' 111 ) 112 self.assertEqual(str(self.sgc), exp) 113 114 # custom, no name 115 obs = str(GeneticCode('M' * 64, '-' * 64)) 116 self.assertIn('M' * 64, obs) 117 self.assertIn('-' * 64, obs) 118 119 def test_repr(self): 120 # predefined 121 exp = ( 122 'GeneticCode (Standard)\n' 123 '-----------------------------------------------------------------' 124 '--------\n' 125 ' AAs = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAA' 126 'DDEEGGGG\n' 127 'Starts = ---M---------------M---------------M--------------------' 128 '--------\n' 129 'Base1 = UUUUUUUUUUUUUUUUCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGG' 130 'GGGGGGGG\n' 131 'Base2 = UUUUCCCCAAAAGGGGUUUUCCCCAAAAGGGGUUUUCCCCAAAAGGGGUUUUCCCC' 132 'AAAAGGGG\n' 133 'Base3 = UCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAGUCAG' 134 'UCAGUCAG' 135 ) 136 self.assertEqual(repr(self.sgc), exp) 137 138 # custom, no name 139 obs = repr(GeneticCode('M' * 64, '-' * 64)) 140 self.assertTrue(obs.startswith('GeneticCode\n')) 141 self.assertIn('M' * 64, obs) 142 self.assertIn('-' * 64, obs) 143 144 def test_eq(self): 145 amino_acids = 'AMPM' * 16 146 starts = '--M-' * 16 147 148 equal_gcs = [ 149 GeneticCode(amino_acids, starts), 150 # name should be ignored 151 GeneticCode(amino_acids, starts, 'foo'), 152 # metadata/positional metadata should be ignored if Sequence 153 # subclass is provided 154 GeneticCode( 155 Protein(amino_acids, metadata={'foo': 'bar'}), 156 Protein(starts, positional_metadata={'foo': range(64)})) 157 ] 158 159 # every gc should be equal to itself 160 for gc in equal_gcs: 161 self.assertTrue(gc == gc) 162 self.assertFalse(gc != gc) 163 164 # every pair of gcs should be equal. use permutations instead of 165 # combinations to test that comparing gc1 to gc2 and gc2 to gc1 are 166 # both equal 167 for gc1, gc2 in itertools.permutations(equal_gcs, 2): 168 self.assertTrue(gc1 == gc2) 169 self.assertFalse(gc1 != gc2) 170 171 def test_ne(self): 172 class GeneticCodeSubclass(GeneticCode): 173 pass 174 175 amino_acids = 'AMPM' * 16 176 starts = '--M-' * 16 177 178 unequal_gcs = [ 179 GeneticCode(amino_acids, starts), 180 # type must match 181 GeneticCodeSubclass(amino_acids, starts), 182 # completely different type 183 'foo' 184 ] 185 # none of the NCBI genetic codes should be equal to each other 186 unequal_gcs.extend(_ncbi_genetic_codes.values()) 187 188 for gc in unequal_gcs: 189 self.assertTrue(gc == gc) 190 self.assertFalse(gc != gc) 191 192 for gc1, gc2 in itertools.permutations(unequal_gcs, 2): 193 self.assertTrue(gc1 != gc2) 194 self.assertFalse(gc1 == gc2) 195 196 def test_translate_preserves_metadata(self): 197 obs = self.sgc.translate( 198 RNA('AUG', metadata={'foo': 'bar', 'baz': 42}, 199 positional_metadata={'foo': range(3)})) 200 # metadata retained, positional metadata dropped 201 self.assertEqual(obs, Protein('M', 202 metadata={'foo': 'bar', 'baz': 42})) 203 204 def test_translate_default_behavior(self): 205 # empty translation 206 exp = Protein('') 207 for seq in RNA(''), RNA('A'), RNA('AU'): 208 obs = self.sgc.translate(seq) 209 self.assertEqual(obs, exp) 210 211 # no start or stop codons 212 obs = self.sgc.translate(RNA('CCU')) 213 self.assertEqual(obs, Protein('P')) 214 215 # multiple alternative start codons, no stop codons, length is multiple 216 # of 3 217 obs = self.sgc.translate(RNA('CAUUUGCUGAAA')) 218 self.assertEqual(obs, Protein('HLLK')) 219 220 # multiple stop codons, length isn't multiple of 3 221 obs = self.sgc.translate(RNA('UUUUUUUAAAGUUAAGGGAU')) 222 self.assertEqual(obs, Protein('FF*S*G')) 223 224 def test_translate_reading_frame_empty_translation(self): 225 exp = Protein('') 226 for seq in RNA(''), RNA('A'), RNA('AU'): 227 for reading_frame in GeneticCode.reading_frames: 228 obs = self.sgc.translate(seq, reading_frame=reading_frame) 229 self.assertEqual(obs, exp) 230 231 # reading frames that yield a partial codon 232 for reading_frame in 2, 3, -2, -3: 233 obs = self.sgc.translate(RNA('AUG'), reading_frame=reading_frame) 234 self.assertEqual(obs, exp) 235 236 def test_translate_reading_frame_non_empty_translation(self): 237 seq = RNA('AUGGUGGAA') # rc = UUCCACCAU 238 for reading_frame, exp_str in ((1, 'MVE'), (2, 'WW'), (3, 'GG'), 239 (-1, 'FHH'), (-2, 'ST'), (-3, 'PP')): 240 exp = Protein(exp_str) 241 obs = self.sgc.translate(seq, reading_frame=reading_frame) 242 self.assertEqual(obs, exp) 243 244 def test_translate_start_empty_translation(self): 245 exp = Protein('') 246 for seq in RNA(''), RNA('A'), RNA('AU'): 247 for start in {'optional', 'ignore'}: 248 obs = self.sgc.translate(seq, start=start) 249 self.assertEqual(obs, exp) 250 251 with self.assertRaisesRegex(ValueError, 252 r'reading_frame=1.*start=\'require\''): 253 self.sgc.translate(seq, start='require') 254 255 def test_translate_start_with_start_codon(self): 256 # trim before start codon, replace with M. ensure alternative start 257 # codons following the start codon aren't replaced with M. ensure 258 # default behavior for handling stop codons is retained 259 seq = RNA('CAUUUGCUGAAAUGA') 260 exp = Protein('MLK*') 261 for start in {'require', 'optional'}: 262 obs = self.sgc.translate(seq, start=start) 263 self.assertEqual(obs, exp) 264 265 # ignore start codon replacement and trimming; just translate 266 exp = Protein('HLLK*') 267 obs = self.sgc.translate(seq, start='ignore') 268 self.assertEqual(obs, exp) 269 270 # just a start codon, no replacement necessary 271 seq = RNA('AUG') 272 exp = Protein('M') 273 for start in {'require', 'optional', 'ignore'}: 274 obs = self.sgc.translate(seq, start=start) 275 self.assertEqual(obs, exp) 276 277 # single alternative start codon 278 seq = RNA('CUG') 279 exp = Protein('M') 280 for start in {'require', 'optional'}: 281 obs = self.sgc.translate(seq, start=start) 282 self.assertEqual(obs, exp) 283 284 exp = Protein('L') 285 obs = self.sgc.translate(seq, start='ignore') 286 self.assertEqual(obs, exp) 287 288 def test_translate_start_no_start_codon(self): 289 seq = RNA('CAACAACAGCAA') 290 exp = Protein('QQQQ') 291 for start in {'ignore', 'optional'}: 292 obs = self.sgc.translate(seq, start=start) 293 self.assertEqual(obs, exp) 294 295 with self.assertRaisesRegex(ValueError, 296 r'reading_frame=1.*start=\'require\''): 297 self.sgc.translate(seq, start='require') 298 299 # non-start codon that translates to an AA that start codons also map 300 # to. should catch bug if code attempts to search and trim *after* 301 # translation -- this must happen *before* translation 302 seq = RNA('UUACAA') 303 exp = Protein('LQ') 304 for start in {'ignore', 'optional'}: 305 obs = self.sgc.translate(seq, start=start) 306 self.assertEqual(obs, exp) 307 308 with self.assertRaisesRegex(ValueError, 309 r'reading_frame=1.*start=\'require\''): 310 self.sgc.translate(seq, start='require') 311 312 def test_translate_start_no_accidental_mutation(self): 313 # `start` mutates a vector in-place that is derived from 314 # GeneticCode._offset_table. the current code doesn't perform an 315 # explicit copy because numpy's advanced indexing is used, which always 316 # returns a copy. test this assumption here in case that behavior 317 # changes in the future 318 offset_table = self.sgc._offset_table.copy() 319 320 seq = RNA('CAUUUGCUGAAAUGA') 321 obs = self.sgc.translate(seq, start='require') 322 self.assertEqual(obs, Protein('MLK*')) 323 324 npt.assert_array_equal(self.sgc._offset_table, offset_table) 325 326 def test_translate_stop_empty_translation(self): 327 exp = Protein('') 328 for seq in RNA(''), RNA('A'), RNA('AU'): 329 for stop in {'optional', 'ignore'}: 330 obs = self.sgc.translate(seq, stop=stop) 331 self.assertEqual(obs, exp) 332 333 with self.assertRaisesRegex(ValueError, 334 r'reading_frame=1.*stop=\'require\''): 335 self.sgc.translate(seq, stop='require') 336 337 def test_translate_stop_with_stop_codon(self): 338 # multiple stop codons with trailing codons 339 seq = RNA('UGGACUUGAUAUCGUUAGGAU') 340 exp = Protein('WT') 341 for stop in {'require', 'optional'}: 342 obs = self.sgc.translate(seq, stop=stop) 343 self.assertEqual(obs, exp) 344 345 # ignore stop codon trimming; just translate 346 exp = Protein('WT*YR*D') 347 obs = self.sgc.translate(seq, stop='ignore') 348 self.assertEqual(obs, exp) 349 350 # ends with single stop codon 351 seq = RNA('UGUCUGUAA') 352 exp = Protein('CL') 353 for stop in {'require', 'optional'}: 354 obs = self.sgc.translate(seq, stop=stop) 355 self.assertEqual(obs, exp) 356 357 exp = Protein('CL*') 358 obs = self.sgc.translate(seq, stop='ignore') 359 self.assertEqual(obs, exp) 360 361 # just a stop codon 362 seq = RNA('UAG') 363 exp = Protein('') 364 for stop in {'require', 'optional'}: 365 obs = self.sgc.translate(seq, stop=stop) 366 self.assertEqual(obs, exp) 367 368 exp = Protein('*') 369 obs = self.sgc.translate(seq, stop='ignore') 370 self.assertEqual(obs, exp) 371 372 def test_translate_stop_no_stop_codon(self): 373 seq = RNA('GAAUCU') 374 exp = Protein('ES') 375 for stop in {'ignore', 'optional'}: 376 obs = self.sgc.translate(seq, stop=stop) 377 self.assertEqual(obs, exp) 378 379 with self.assertRaisesRegex(ValueError, 380 r'reading_frame=1.*stop=\'require\''): 381 self.sgc.translate(seq, stop='require') 382 383 def test_translate_trim_to_cds(self): 384 seq = RNA('UAAUUGCCUCAUUAAUAACAAUGA') 385 386 # find first start codon, trim all before it, convert alternative start 387 # codon to M, finally trim to first stop codon following the start 388 # codon 389 exp = Protein('MPH') 390 for param in {'require', 'optional'}: 391 obs = self.sgc.translate(seq, start=param, stop=param) 392 self.assertEqual(obs, exp) 393 394 exp = Protein('*LPH**Q*') 395 obs = self.sgc.translate(seq, start='ignore', stop='ignore') 396 self.assertEqual(obs, exp) 397 398 # alternative reading frame disrupts cds: 399 # AAUUGCCUCAUUAAUAACAAUGA 400 # NCLINNN 401 with self.assertRaisesRegex(ValueError, 402 r'reading_frame=2.*start=\'require\''): 403 self.sgc.translate(seq, reading_frame=2, start='require') 404 with self.assertRaisesRegex(ValueError, 405 r'reading_frame=2.*stop=\'require\''): 406 self.sgc.translate(seq, reading_frame=2, stop='require') 407 408 exp = Protein('NCLINNN') 409 for param in {'ignore', 'optional'}: 410 obs = self.sgc.translate(seq, reading_frame=2, start=param, 411 stop=param) 412 self.assertEqual(obs, exp) 413 414 def test_translate_invalid_input(self): 415 # invalid sequence type 416 with self.assertRaisesRegex(TypeError, r'RNA.*DNA'): 417 self.sgc.translate(DNA('ACG')) 418 with self.assertRaisesRegex(TypeError, r'RNA.*str'): 419 self.sgc.translate('ACG') 420 421 # invalid reading frame 422 with self.assertRaisesRegex(ValueError, r'\[1, 2, 3, -1, -2, -3\].*0'): 423 self.sgc.translate(RNA('AUG'), reading_frame=0) 424 425 # invalid start 426 with self.assertRaisesRegex(ValueError, r'start.*foo'): 427 self.sgc.translate(RNA('AUG'), start='foo') 428 429 # invalid stop 430 with self.assertRaisesRegex(ValueError, r'stop.*foo'): 431 self.sgc.translate(RNA('AUG'), stop='foo') 432 433 # gapped sequence 434 with self.assertRaisesRegex(ValueError, r'gapped'): 435 self.sgc.translate(RNA('UU-G')) 436 437 # degenerate sequence 438 with self.assertRaisesRegex(NotImplementedError, r'degenerate'): 439 self.sgc.translate(RNA('RUG')) 440 441 def test_translate_varied_genetic_codes(self): 442 # spot check using a few NCBI and custom genetic codes to translate 443 seq = RNA('AAUGAUGUGACUAUCAGAAGG') 444 445 # table_id=2 446 exp = Protein('NDVTI**') 447 obs = GeneticCode.from_ncbi(2).translate(seq) 448 self.assertEqual(obs, exp) 449 450 exp = Protein('MTI') 451 obs = GeneticCode.from_ncbi(2).translate(seq, start='require', 452 stop='require') 453 self.assertEqual(obs, exp) 454 455 # table_id=22 456 exp = Protein('NDVTIRR') 457 obs = GeneticCode.from_ncbi(22).translate(seq) 458 self.assertEqual(obs, exp) 459 460 with self.assertRaisesRegex(ValueError, 461 r'reading_frame=1.*start=\'require\''): 462 GeneticCode.from_ncbi(22).translate(seq, start='require', 463 stop='require') 464 465 # custom, no start codons 466 gc = GeneticCode('MWN*' * 16, '-' * 64) 467 exp = Protein('MM*MWN*') 468 obs = gc.translate(seq) 469 self.assertEqual(obs, exp) 470 471 with self.assertRaisesRegex(ValueError, 472 r'reading_frame=1.*start=\'require\''): 473 gc.translate(seq, start='require', stop='require') 474 475 def test_translate_six_frames(self): 476 seq = RNA('AUGCUAACAUAAA') # rc = UUUAUGUUAGCAU 477 478 # test default behavior 479 exp = [Protein('MLT*'), Protein('C*HK'), Protein('ANI'), 480 Protein('FMLA'), Protein('LC*H'), Protein('YVS')] 481 obs = list(self.sgc.translate_six_frames(seq)) 482 self.assertEqual(obs, exp) 483 484 # test that start/stop are respected 485 exp = [Protein('MLT'), Protein('C'), Protein('ANI'), 486 Protein('MLA'), Protein('LC'), Protein('YVS')] 487 obs = list(self.sgc.translate_six_frames(seq, start='optional', 488 stop='optional')) 489 self.assertEqual(obs, exp) 490 491 def test_translate_six_frames_preserves_metadata(self): 492 seq = RNA('AUG', metadata={'foo': 'bar', 'baz': 42}, 493 positional_metadata={'foo': range(3)}) 494 obs = list(self.sgc.translate_six_frames(seq))[:2] 495 # metadata retained, positional metadata dropped 496 self.assertEqual( 497 obs, 498 [Protein('M', metadata={'foo': 'bar', 'baz': 42}), 499 Protein('', metadata={'foo': 'bar', 'baz': 42})]) 500 501 502if __name__ == '__main__': 503 unittest.main() 504