1 /* 2 * Jalview - A Sequence Alignment Editor and Viewer (2.11.1.4) 3 * Copyright (C) 2021 The Jalview Authors 4 * 5 * This file is part of Jalview. 6 * 7 * Jalview is free software: you can redistribute it and/or 8 * modify it under the terms of the GNU General Public License 9 * as published by the Free Software Foundation, either version 3 10 * of the License, or (at your option) any later version. 11 * 12 * Jalview is distributed in the hope that it will be useful, but 13 * WITHOUT ANY WARRANTY; without even the implied warranty 14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 15 * PURPOSE. See the GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>. 19 * The Jalview Authors are detailed in the 'AUTHORS' file. 20 */ 21 package jalview.analysis; 22 23 import static org.junit.Assert.assertNotEquals; 24 import static org.testng.AssertJUnit.assertEquals; 25 import static org.testng.AssertJUnit.assertFalse; 26 import static org.testng.AssertJUnit.assertNotNull; 27 import static org.testng.AssertJUnit.assertNull; 28 import static org.testng.AssertJUnit.assertSame; 29 import static org.testng.AssertJUnit.assertTrue; 30 31 import jalview.analysis.AlignmentUtils.DnaVariant; 32 import jalview.datamodel.AlignedCodonFrame; 33 import jalview.datamodel.Alignment; 34 import jalview.datamodel.AlignmentAnnotation; 35 import jalview.datamodel.AlignmentI; 36 import jalview.datamodel.Annotation; 37 import jalview.datamodel.DBRefEntry; 38 import jalview.datamodel.GeneLociI; 39 import jalview.datamodel.Mapping; 40 import jalview.datamodel.SearchResultMatchI; 41 import jalview.datamodel.SearchResultsI; 42 import jalview.datamodel.Sequence; 43 import jalview.datamodel.SequenceFeature; 44 import jalview.datamodel.SequenceI; 45 import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping; 46 import jalview.datamodel.features.SequenceFeatures; 47 import jalview.gui.JvOptionPane; 48 import jalview.io.AppletFormatAdapter; 49 import jalview.io.DataSourceType; 50 import jalview.io.FileFormat; 51 import jalview.io.FileFormatI; 52 import jalview.io.FormatAdapter; 53 import jalview.io.gff.SequenceOntologyI; 54 import jalview.util.MapList; 55 import jalview.util.MappingUtils; 56 57 import java.io.IOException; 58 import java.util.ArrayList; 59 import java.util.Arrays; 60 import java.util.Iterator; 61 import java.util.LinkedHashMap; 62 import java.util.List; 63 import java.util.Map; 64 import java.util.TreeMap; 65 66 import org.testng.annotations.BeforeClass; 67 import org.testng.annotations.Test; 68 69 public class AlignmentUtilsTests 70 { 71 private static Sequence ts = new Sequence("short", 72 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm"); 73 74 @BeforeClass(alwaysRun = true) setUpJvOptionPane()75 public void setUpJvOptionPane() 76 { 77 JvOptionPane.setInteractiveMode(false); 78 JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION); 79 } 80 81 @Test(groups = { "Functional" }) testExpandContext()82 public void testExpandContext() 83 { 84 AlignmentI al = new Alignment(new Sequence[] {}); 85 for (int i = 4; i < 14; i += 2) 86 { 87 SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7); 88 al.addSequence(s1); 89 } 90 System.out.println(new AppletFormatAdapter().formatSequences( 91 FileFormat.Clustal, 92 al, true)); 93 for (int flnk = -1; flnk < 25; flnk++) 94 { 95 AlignmentI exp = AlignmentUtils.expandContext(al, flnk); 96 System.out.println("\nFlank size: " + flnk); 97 System.out.println(new AppletFormatAdapter().formatSequences( 98 FileFormat.Clustal, exp, true)); 99 if (flnk == -1) 100 { 101 /* 102 * Full expansion to complete sequences 103 */ 104 for (SequenceI sq : exp.getSequences()) 105 { 106 String ung = sq.getSequenceAsString().replaceAll("-+", ""); 107 final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n" 108 + ung 109 + "\n" 110 + sq.getDatasetSequence().getSequenceAsString(); 111 assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence() 112 .getSequenceAsString())); 113 } 114 } 115 else if (flnk == 24) 116 { 117 /* 118 * Last sequence is fully expanded, others have leading gaps to match 119 */ 120 assertTrue(exp.getSequenceAt(4).getSequenceAsString() 121 .startsWith("abc")); 122 assertTrue(exp.getSequenceAt(3).getSequenceAsString() 123 .startsWith("--abc")); 124 assertTrue(exp.getSequenceAt(2).getSequenceAsString() 125 .startsWith("----abc")); 126 assertTrue(exp.getSequenceAt(1).getSequenceAsString() 127 .startsWith("------abc")); 128 assertTrue(exp.getSequenceAt(0).getSequenceAsString() 129 .startsWith("--------abc")); 130 } 131 } 132 } 133 134 /** 135 * Test that annotations are correctly adjusted by expandContext 136 */ 137 @Test(groups = { "Functional" }) testExpandContext_annotation()138 public void testExpandContext_annotation() 139 { 140 AlignmentI al = new Alignment(new Sequence[] {}); 141 SequenceI ds = new Sequence("Seq1", "ABCDEFGHI"); 142 // subsequence DEF: 143 SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6); 144 al.addSequence(seq1); 145 146 /* 147 * Annotate DEF with 4/5/6 respectively 148 */ 149 Annotation[] anns = new Annotation[] { new Annotation(4), 150 new Annotation(5), new Annotation(6) }; 151 AlignmentAnnotation ann = new AlignmentAnnotation("SS", 152 "secondary structure", anns); 153 seq1.addAlignmentAnnotation(ann); 154 155 /* 156 * The annotations array should match aligned positions 157 */ 158 assertEquals(3, ann.annotations.length); 159 assertEquals(4, ann.annotations[0].value, 0.001); 160 assertEquals(5, ann.annotations[1].value, 0.001); 161 assertEquals(6, ann.annotations[2].value, 0.001); 162 163 /* 164 * Check annotation to sequence position mappings before expanding the 165 * sequence; these are set up in Sequence.addAlignmentAnnotation -> 166 * Annotation.setSequenceRef -> createSequenceMappings 167 */ 168 assertNull(ann.getAnnotationForPosition(1)); 169 assertNull(ann.getAnnotationForPosition(2)); 170 assertNull(ann.getAnnotationForPosition(3)); 171 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001); 172 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001); 173 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001); 174 assertNull(ann.getAnnotationForPosition(7)); 175 assertNull(ann.getAnnotationForPosition(8)); 176 assertNull(ann.getAnnotationForPosition(9)); 177 178 /* 179 * Expand the subsequence to the full sequence abcDEFghi 180 */ 181 AlignmentI expanded = AlignmentUtils.expandContext(al, -1); 182 assertEquals("abcDEFghi", expanded.getSequenceAt(0) 183 .getSequenceAsString()); 184 185 /* 186 * Confirm the alignment and sequence have the same SS annotation, 187 * referencing the expanded sequence 188 */ 189 ann = expanded.getSequenceAt(0).getAnnotation()[0]; 190 assertSame(ann, expanded.getAlignmentAnnotation()[0]); 191 assertSame(expanded.getSequenceAt(0), ann.sequenceRef); 192 193 /* 194 * The annotations array should have null values except for annotated 195 * positions 196 */ 197 assertNull(ann.annotations[0]); 198 assertNull(ann.annotations[1]); 199 assertNull(ann.annotations[2]); 200 assertEquals(4, ann.annotations[3].value, 0.001); 201 assertEquals(5, ann.annotations[4].value, 0.001); 202 assertEquals(6, ann.annotations[5].value, 0.001); 203 assertNull(ann.annotations[6]); 204 assertNull(ann.annotations[7]); 205 assertNull(ann.annotations[8]); 206 207 /* 208 * sequence position mappings should be unchanged 209 */ 210 assertNull(ann.getAnnotationForPosition(1)); 211 assertNull(ann.getAnnotationForPosition(2)); 212 assertNull(ann.getAnnotationForPosition(3)); 213 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001); 214 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001); 215 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001); 216 assertNull(ann.getAnnotationForPosition(7)); 217 assertNull(ann.getAnnotationForPosition(8)); 218 assertNull(ann.getAnnotationForPosition(9)); 219 } 220 221 /** 222 * Test method that returns a map of lists of sequences by sequence name. 223 * 224 * @throws IOException 225 */ 226 @Test(groups = { "Functional" }) testGetSequencesByName()227 public void testGetSequencesByName() throws IOException 228 { 229 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n" 230 + ">Seq1Name\nABCD\n"; 231 AlignmentI al = loadAlignment(data, FileFormat.Fasta); 232 Map<String, List<SequenceI>> map = AlignmentUtils 233 .getSequencesByName(al); 234 assertEquals(2, map.keySet().size()); 235 assertEquals(2, map.get("Seq1Name").size()); 236 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString()); 237 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString()); 238 assertEquals(1, map.get("Seq2Name").size()); 239 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString()); 240 } 241 242 /** 243 * Helper method to load an alignment and ensure dataset sequences are set up. 244 * 245 * @param data 246 * @param format 247 * TODO 248 * @return 249 * @throws IOException 250 */ loadAlignment(final String data, FileFormatI format)251 protected AlignmentI loadAlignment(final String data, FileFormatI format) 252 throws IOException 253 { 254 AlignmentI a = new FormatAdapter().readFile(data, 255 DataSourceType.PASTE, format); 256 a.setDataset(null); 257 return a; 258 } 259 260 /** 261 * Test mapping of protein to cDNA, for the case where we have no sequence 262 * cross-references, so mappings are made first-served 1-1 where sequences 263 * translate. 264 * 265 * @throws IOException 266 */ 267 @Test(groups = { "Functional" }) testMapProteinAlignmentToCdna_noXrefs()268 public void testMapProteinAlignmentToCdna_noXrefs() throws IOException 269 { 270 List<SequenceI> protseqs = new ArrayList<>(); 271 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); 272 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); 273 protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); 274 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); 275 protein.setDataset(null); 276 277 List<SequenceI> dnaseqs = new ArrayList<>(); 278 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR 279 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ 280 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ 281 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ 282 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4])); 283 cdna.setDataset(null); 284 285 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna)); 286 287 // 3 mappings made, each from 1 to 1 sequence 288 assertEquals(3, protein.getCodonFrames().size()); 289 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size()); 290 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size()); 291 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size()); 292 293 // V12345 mapped to A22222 294 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0)) 295 .get(0); 296 assertEquals(1, acf.getdnaSeqs().length); 297 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(), 298 acf.getdnaSeqs()[0]); 299 Mapping[] protMappings = acf.getProtMappings(); 300 assertEquals(1, protMappings.length); 301 MapList mapList = protMappings[0].getMap(); 302 assertEquals(3, mapList.getFromRatio()); 303 assertEquals(1, mapList.getToRatio()); 304 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges() 305 .get(0))); 306 assertEquals(1, mapList.getFromRanges().size()); 307 assertTrue(Arrays.equals(new int[] { 1, 3 }, 308 mapList.getToRanges().get(0))); 309 assertEquals(1, mapList.getToRanges().size()); 310 311 // V12346 mapped to A33333 312 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0); 313 assertEquals(1, acf.getdnaSeqs().length); 314 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(), 315 acf.getdnaSeqs()[0]); 316 317 // V12347 mapped to A11111 318 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0); 319 assertEquals(1, acf.getdnaSeqs().length); 320 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(), 321 acf.getdnaSeqs()[0]); 322 323 // no mapping involving the 'extra' A44444 324 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty()); 325 } 326 327 /** 328 * Test for the alignSequenceAs method that takes two sequences and a mapping. 329 */ 330 @Test(groups = { "Functional" }) testAlignSequenceAs_withMapping_noIntrons()331 public void testAlignSequenceAs_withMapping_noIntrons() 332 { 333 MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1); 334 335 /* 336 * No existing gaps in dna: 337 */ 338 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map, 339 "---GGG---AAA"); 340 341 /* 342 * Now introduce gaps in dna but ignore them when realigning. 343 */ 344 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map, 345 "---GGG---AAA"); 346 347 /* 348 * Now include gaps in dna when realigning. First retaining 'mapped' gaps 349 * only, i.e. those within the exon region. 350 */ 351 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map, 352 "---G-G--G---A--A-A"); 353 354 /* 355 * Include all gaps in dna when realigning (within and without the exon 356 * region). The leading gap, and the gaps between codons, are subsumed by 357 * the protein alignment gap. 358 */ 359 checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map, 360 "---G-GG---AA-A---"); 361 362 /* 363 * Include only unmapped gaps in dna when realigning (outside the exon 364 * region). The leading gap, and the gaps between codons, are subsumed by 365 * the protein alignment gap. 366 */ 367 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map, 368 "---GGG---AAA---"); 369 } 370 371 /** 372 * Test for the alignSequenceAs method that takes two sequences and a mapping. 373 */ 374 @Test(groups = { "Functional" }) testAlignSequenceAs_withMapping_withIntrons()375 public void testAlignSequenceAs_withMapping_withIntrons() 376 { 377 /* 378 * Exons at codon 2 (AAA) and 4 (TTT) 379 */ 380 MapList map = new MapList(new int[] { 4, 6, 10, 12 }, 381 new int[] { 1, 2 }, 3, 1); 382 383 /* 384 * Simple case: no gaps in dna 385 */ 386 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map, 387 "GGG---AAACCCTTTGGG"); 388 389 /* 390 * Add gaps to dna - but ignore when realigning. 391 */ 392 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-", 393 false, false, map, "GGG---AAACCCTTTGGG"); 394 395 /* 396 * Add gaps to dna - include within exons only when realigning. 397 */ 398 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", 399 true, false, map, "GGG---A--A---ACCCT-TTGGG"); 400 401 /* 402 * Include gaps outside exons only when realigning. 403 */ 404 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", 405 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-"); 406 407 /* 408 * Include gaps following first intron if we are 'preserving mapped gaps' 409 */ 410 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", 411 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-"); 412 413 /* 414 * Include all gaps in dna when realigning. 415 */ 416 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", 417 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-"); 418 } 419 420 /** 421 * Test for the case where not all of the protein sequence is mapped to cDNA. 422 */ 423 @Test(groups = { "Functional" }) testAlignSequenceAs_withMapping_withUnmappedProtein()424 public void testAlignSequenceAs_withMapping_withUnmappedProtein() 425 { 426 /* 427 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P 428 */ 429 final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 430 1, 1, 3, 3 }, 3, 1); 431 432 /* 433 * -L- 'aligns' ccc------ 434 */ 435 checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map, 436 "gggAAAccc------TTTggg"); 437 } 438 439 /** 440 * Helper method that performs and verifies the method under test. 441 * 442 * @param alignee 443 * the sequence to be realigned 444 * @param alignModel 445 * the sequence whose alignment is to be copied 446 * @param preserveMappedGaps 447 * @param preserveUnmappedGaps 448 * @param map 449 * @param expected 450 */ checkAlignSequenceAs(final String alignee, final String alignModel, final boolean preserveMappedGaps, final boolean preserveUnmappedGaps, MapList map, final String expected)451 protected void checkAlignSequenceAs(final String alignee, 452 final String alignModel, final boolean preserveMappedGaps, 453 final boolean preserveUnmappedGaps, MapList map, 454 final String expected) 455 { 456 SequenceI alignMe = new Sequence("Seq1", alignee); 457 alignMe.createDatasetSequence(); 458 SequenceI alignFrom = new Sequence("Seq2", alignModel); 459 alignFrom.createDatasetSequence(); 460 AlignedCodonFrame acf = new AlignedCodonFrame(); 461 acf.addMap(alignMe.getDatasetSequence(), 462 alignFrom.getDatasetSequence(), map); 463 464 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-', 465 preserveMappedGaps, preserveUnmappedGaps); 466 assertEquals(expected, alignMe.getSequenceAsString()); 467 } 468 469 /** 470 * Test for the alignSequenceAs method where we preserve gaps in introns only. 471 */ 472 @Test(groups = { "Functional" }) testAlignSequenceAs_keepIntronGapsOnly()473 public void testAlignSequenceAs_keepIntronGapsOnly() 474 { 475 476 /* 477 * Intron GGGAAA followed by exon CCCTTT 478 */ 479 MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1); 480 481 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map, 482 "GG-G-AA-ACCCTTT"); 483 } 484 485 /** 486 * Test the method that realigns protein to match mapped codon alignment. 487 */ 488 @Test(groups = { "Functional" }) testAlignProteinAsDna()489 public void testAlignProteinAsDna() 490 { 491 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12] 492 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-"); 493 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13] 494 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG"); 495 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13] 496 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG"); 497 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 }); 498 dna.setDataset(null); 499 500 // protein alignment will be realigned like dna 501 SequenceI prot1 = new Sequence("Seq1", "CHYQ"); 502 SequenceI prot2 = new Sequence("Seq2", "CHYQ"); 503 SequenceI prot3 = new Sequence("Seq3", "CHYQ"); 504 SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged 505 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2, 506 prot3, prot4 }); 507 protein.setDataset(null); 508 509 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1); 510 AlignedCodonFrame acf = new AlignedCodonFrame(); 511 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map); 512 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map); 513 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map); 514 ArrayList<AlignedCodonFrame> acfs = new ArrayList<>(); 515 acfs.add(acf); 516 protein.setCodonFrames(acfs); 517 518 /* 519 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9] 520 * [8,9,10] [10,11,12] [11,12,13] 521 */ 522 AlignmentUtils.alignProteinAsDna(protein, dna); 523 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString()); 524 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString()); 525 assertEquals("C--H--Y-Q", prot3.getSequenceAsString()); 526 assertEquals("R-QSV", prot4.getSequenceAsString()); 527 } 528 529 /** 530 * Test the method that tests whether a CDNA sequence translates to a protein 531 * sequence 532 */ 533 @Test(groups = { "Functional" }) testTranslatesAs()534 public void testTranslatesAs() 535 { 536 // null arguments check 537 assertFalse(AlignmentUtils.translatesAs(null, 0, null)); 538 assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null)); 539 assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' })); 540 541 // straight translation 542 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0, 543 "FPKG".toCharArray())); 544 // with extra start codon (not in protein) 545 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(), 546 3, "FPKG".toCharArray())); 547 // with stop codon1 (not in protein) 548 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(), 549 0, "FPKG".toCharArray())); 550 // with stop codon1 (in protein as *) 551 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(), 552 0, "FPKG*".toCharArray())); 553 // with stop codon2 (not in protein) 554 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(), 555 0, "FPKG".toCharArray())); 556 // with stop codon3 (not in protein) 557 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(), 558 0, "FPKG".toCharArray())); 559 // with start and stop codon1 560 assertTrue(AlignmentUtils.translatesAs( 561 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray())); 562 // with start and stop codon1 (in protein as *) 563 assertTrue(AlignmentUtils.translatesAs( 564 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray())); 565 // with start and stop codon2 566 assertTrue(AlignmentUtils.translatesAs( 567 "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray())); 568 // with start and stop codon3 569 assertTrue(AlignmentUtils.translatesAs( 570 "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray())); 571 572 // with embedded stop codons 573 assertTrue(AlignmentUtils.translatesAs( 574 "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3, 575 "F*PK*G".toCharArray())); 576 577 // wrong protein 578 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 579 0, "FPMG".toCharArray())); 580 581 // truncated dna 582 assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0, 583 "FPKG".toCharArray())); 584 585 // truncated protein 586 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 587 0, "FPK".toCharArray())); 588 589 // overlong dna (doesn't end in stop codon) 590 assertFalse(AlignmentUtils.translatesAs( 591 "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray())); 592 593 // dna + stop codon + more 594 assertFalse(AlignmentUtils.translatesAs( 595 "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray())); 596 597 // overlong protein 598 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 599 0, "FPKGQ".toCharArray())); 600 } 601 602 /** 603 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or 604 * stop codons in addition to the protein coding sequence. 605 * 606 * @throws IOException 607 */ 608 @Test(groups = { "Functional" }) testMapProteinAlignmentToCdna_withStartAndStopCodons()609 public void testMapProteinAlignmentToCdna_withStartAndStopCodons() 610 throws IOException 611 { 612 List<SequenceI> protseqs = new ArrayList<>(); 613 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); 614 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); 615 protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); 616 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); 617 protein.setDataset(null); 618 619 List<SequenceI> dnaseqs = new ArrayList<>(); 620 // start + SAR: 621 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC")); 622 // = EIQ + stop 623 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA")); 624 // = start +EIQ + stop 625 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG")); 626 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); 627 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4])); 628 cdna.setDataset(null); 629 630 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna)); 631 632 // 3 mappings made, each from 1 to 1 sequence 633 assertEquals(3, protein.getCodonFrames().size()); 634 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size()); 635 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size()); 636 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size()); 637 638 // V12345 mapped from A22222 639 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0)) 640 .get(0); 641 assertEquals(1, acf.getdnaSeqs().length); 642 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(), 643 acf.getdnaSeqs()[0]); 644 Mapping[] protMappings = acf.getProtMappings(); 645 assertEquals(1, protMappings.length); 646 MapList mapList = protMappings[0].getMap(); 647 assertEquals(3, mapList.getFromRatio()); 648 assertEquals(1, mapList.getToRatio()); 649 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges() 650 .get(0))); 651 assertEquals(1, mapList.getFromRanges().size()); 652 assertTrue(Arrays.equals(new int[] { 1, 3 }, 653 mapList.getToRanges().get(0))); 654 assertEquals(1, mapList.getToRanges().size()); 655 656 // V12346 mapped from A33333 starting position 4 657 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0); 658 assertEquals(1, acf.getdnaSeqs().length); 659 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(), 660 acf.getdnaSeqs()[0]); 661 protMappings = acf.getProtMappings(); 662 assertEquals(1, protMappings.length); 663 mapList = protMappings[0].getMap(); 664 assertEquals(3, mapList.getFromRatio()); 665 assertEquals(1, mapList.getToRatio()); 666 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges() 667 .get(0))); 668 assertEquals(1, mapList.getFromRanges().size()); 669 assertTrue(Arrays.equals(new int[] { 1, 3 }, 670 mapList.getToRanges().get(0))); 671 assertEquals(1, mapList.getToRanges().size()); 672 673 // V12347 mapped to A11111 starting position 4 674 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0); 675 assertEquals(1, acf.getdnaSeqs().length); 676 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(), 677 acf.getdnaSeqs()[0]); 678 protMappings = acf.getProtMappings(); 679 assertEquals(1, protMappings.length); 680 mapList = protMappings[0].getMap(); 681 assertEquals(3, mapList.getFromRatio()); 682 assertEquals(1, mapList.getToRatio()); 683 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges() 684 .get(0))); 685 assertEquals(1, mapList.getFromRanges().size()); 686 assertTrue(Arrays.equals(new int[] { 1, 3 }, 687 mapList.getToRanges().get(0))); 688 assertEquals(1, mapList.getToRanges().size()); 689 690 // no mapping involving the 'extra' A44444 691 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty()); 692 } 693 694 /** 695 * Test mapping of protein to cDNA, for the case where we have some sequence 696 * cross-references. Verify that 1-to-many mappings are made where 697 * cross-references exist and sequences are mappable. 698 * 699 * @throws IOException 700 */ 701 @Test(groups = { "Functional" }) testMapProteinAlignmentToCdna_withXrefs()702 public void testMapProteinAlignmentToCdna_withXrefs() throws IOException 703 { 704 List<SequenceI> protseqs = new ArrayList<>(); 705 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); 706 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); 707 protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); 708 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); 709 protein.setDataset(null); 710 711 List<SequenceI> dnaseqs = new ArrayList<>(); 712 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR 713 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ 714 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ 715 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ 716 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ 717 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5])); 718 cdna.setDataset(null); 719 720 // Xref A22222 to V12345 (should get mapped) 721 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345")); 722 // Xref V12345 to A44444 (should get mapped) 723 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444")); 724 // Xref A33333 to V12347 (sequence mismatch - should not get mapped) 725 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347")); 726 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped. 727 // it should get paired up with the unmapped A33333 728 // A11111 should be mapped to V12347 729 // A55555 is spare and has no xref so is not mapped 730 731 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna)); 732 733 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7 734 assertEquals(3, protein.getCodonFrames().size()); 735 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size()); 736 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size()); 737 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size()); 738 739 // one mapping for each of the first 4 cDNA sequences 740 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size()); 741 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size()); 742 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size()); 743 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size()); 744 745 // V12345 mapped to A22222 and A44444 746 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0)) 747 .get(0); 748 assertEquals(2, acf.getdnaSeqs().length); 749 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(), 750 acf.getdnaSeqs()[0]); 751 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(), 752 acf.getdnaSeqs()[1]); 753 754 // V12346 mapped to A33333 755 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0); 756 assertEquals(1, acf.getdnaSeqs().length); 757 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(), 758 acf.getdnaSeqs()[0]); 759 760 // V12347 mapped to A11111 761 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0); 762 assertEquals(1, acf.getdnaSeqs().length); 763 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(), 764 acf.getdnaSeqs()[0]); 765 766 // no mapping involving the 'extra' A55555 767 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty()); 768 } 769 770 /** 771 * Test mapping of protein to cDNA, for the case where we have some sequence 772 * cross-references. Verify that once we have made an xref mapping we don't 773 * also map un-xrefd sequeces. 774 * 775 * @throws IOException 776 */ 777 @Test(groups = { "Functional" }) testMapProteinAlignmentToCdna_prioritiseXrefs()778 public void testMapProteinAlignmentToCdna_prioritiseXrefs() 779 throws IOException 780 { 781 List<SequenceI> protseqs = new ArrayList<>(); 782 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); 783 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); 784 AlignmentI protein = new Alignment( 785 protseqs.toArray(new SequenceI[protseqs.size()])); 786 protein.setDataset(null); 787 788 List<SequenceI> dnaseqs = new ArrayList<>(); 789 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ 790 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ 791 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs 792 .size()])); 793 cdna.setDataset(null); 794 795 // Xref A22222 to V12345 (should get mapped) 796 // A11111 should then be mapped to the unmapped V12346 797 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345")); 798 799 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna)); 800 801 // 2 protein mappings made 802 assertEquals(2, protein.getCodonFrames().size()); 803 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size()); 804 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size()); 805 806 // one mapping for each of the cDNA sequences 807 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size()); 808 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size()); 809 810 // V12345 mapped to A22222 811 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0)) 812 .get(0); 813 assertEquals(1, acf.getdnaSeqs().length); 814 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(), 815 acf.getdnaSeqs()[0]); 816 817 // V12346 mapped to A11111 818 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0); 819 assertEquals(1, acf.getdnaSeqs().length); 820 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(), 821 acf.getdnaSeqs()[0]); 822 } 823 824 /** 825 * Test the method that shows or hides sequence annotations by type(s) and 826 * selection group. 827 */ 828 @Test(groups = { "Functional" }) testShowOrHideSequenceAnnotations()829 public void testShowOrHideSequenceAnnotations() 830 { 831 SequenceI seq1 = new Sequence("Seq1", "AAA"); 832 SequenceI seq2 = new Sequence("Seq2", "BBB"); 833 SequenceI seq3 = new Sequence("Seq3", "CCC"); 834 Annotation[] anns = new Annotation[] { new Annotation(2f) }; 835 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1", 836 anns); 837 ann1.setSequenceRef(seq1); 838 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2", 839 anns); 840 ann2.setSequenceRef(seq2); 841 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3", 842 anns); 843 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns); 844 ann4.setSequenceRef(seq1); 845 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns); 846 ann5.setSequenceRef(seq2); 847 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns); 848 AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 }); 849 al.addAnnotation(ann1); // Structure for Seq1 850 al.addAnnotation(ann2); // Structure for Seq2 851 al.addAnnotation(ann3); // Structure for no sequence 852 al.addAnnotation(ann4); // Temp for seq1 853 al.addAnnotation(ann5); // Temp for seq2 854 al.addAnnotation(ann6); // Temp for no sequence 855 List<String> types = new ArrayList<>(); 856 List<SequenceI> scope = new ArrayList<>(); 857 858 /* 859 * Set all sequence related Structure to hidden (ann1, ann2) 860 */ 861 types.add("Structure"); 862 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false, 863 false); 864 assertFalse(ann1.visible); 865 assertFalse(ann2.visible); 866 assertTrue(ann3.visible); // not sequence-related, not affected 867 assertTrue(ann4.visible); // not Structure, not affected 868 assertTrue(ann5.visible); // " 869 assertTrue(ann6.visible); // not sequence-related, not affected 870 871 /* 872 * Set Temp in {seq1, seq3} to hidden 873 */ 874 types.clear(); 875 types.add("Temp"); 876 scope.add(seq1); 877 scope.add(seq3); 878 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false, 879 false); 880 assertFalse(ann1.visible); // unchanged 881 assertFalse(ann2.visible); // unchanged 882 assertTrue(ann3.visible); // not sequence-related, not affected 883 assertFalse(ann4.visible); // Temp for seq1 hidden 884 assertTrue(ann5.visible); // not in scope, not affected 885 assertTrue(ann6.visible); // not sequence-related, not affected 886 887 /* 888 * Set Temp in all sequences to hidden 889 */ 890 types.clear(); 891 types.add("Temp"); 892 scope.add(seq1); 893 scope.add(seq3); 894 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false, 895 false); 896 assertFalse(ann1.visible); // unchanged 897 assertFalse(ann2.visible); // unchanged 898 assertTrue(ann3.visible); // not sequence-related, not affected 899 assertFalse(ann4.visible); // Temp for seq1 hidden 900 assertFalse(ann5.visible); // Temp for seq2 hidden 901 assertTrue(ann6.visible); // not sequence-related, not affected 902 903 /* 904 * Set all types in {seq1, seq3} to visible 905 */ 906 types.clear(); 907 scope.clear(); 908 scope.add(seq1); 909 scope.add(seq3); 910 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true, 911 true); 912 assertTrue(ann1.visible); // Structure for seq1 set visible 913 assertFalse(ann2.visible); // not in scope, unchanged 914 assertTrue(ann3.visible); // not sequence-related, not affected 915 assertTrue(ann4.visible); // Temp for seq1 set visible 916 assertFalse(ann5.visible); // not in scope, unchanged 917 assertTrue(ann6.visible); // not sequence-related, not affected 918 919 /* 920 * Set all types in all scope to hidden 921 */ 922 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true, 923 false); 924 assertFalse(ann1.visible); 925 assertFalse(ann2.visible); 926 assertTrue(ann3.visible); // not sequence-related, not affected 927 assertFalse(ann4.visible); 928 assertFalse(ann5.visible); 929 assertTrue(ann6.visible); // not sequence-related, not affected 930 } 931 932 /** 933 * Tests for the method that checks if one sequence cross-references another 934 */ 935 @Test(groups = { "Functional" }) testHasCrossRef()936 public void testHasCrossRef() 937 { 938 assertFalse(AlignmentUtils.hasCrossRef(null, null)); 939 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF"); 940 assertFalse(AlignmentUtils.hasCrossRef(seq1, null)); 941 assertFalse(AlignmentUtils.hasCrossRef(null, seq1)); 942 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF"); 943 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2)); 944 945 // different ref 946 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193")); 947 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2)); 948 949 // case-insensitive; version number is ignored 950 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192")); 951 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2)); 952 953 // right case! 954 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192")); 955 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2)); 956 // test is one-way only 957 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1)); 958 } 959 960 /** 961 * Tests for the method that checks if either sequence cross-references the 962 * other 963 */ 964 @Test(groups = { "Functional" }) testHaveCrossRef()965 public void testHaveCrossRef() 966 { 967 assertFalse(AlignmentUtils.hasCrossRef(null, null)); 968 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF"); 969 assertFalse(AlignmentUtils.haveCrossRef(seq1, null)); 970 assertFalse(AlignmentUtils.haveCrossRef(null, seq1)); 971 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF"); 972 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2)); 973 974 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192")); 975 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2)); 976 // next is true for haveCrossRef, false for hasCrossRef 977 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1)); 978 979 // now the other way round 980 seq1.setDBRefs(null); 981 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345")); 982 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2)); 983 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1)); 984 985 // now both ways 986 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192")); 987 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2)); 988 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1)); 989 } 990 991 /** 992 * Test the method that extracts the cds-only part of a dna alignment. 993 */ 994 @Test(groups = { "Functional" }) testMakeCdsAlignment()995 public void testMakeCdsAlignment() 996 { 997 /* 998 * scenario: 999 * dna1 --> [4, 6] [10,12] --> pep1 1000 * dna2 --> [1, 3] [7, 9] [13,15] --> pep2 1001 */ 1002 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa"); 1003 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC"); 1004 SequenceI pep1 = new Sequence("pep1", "GF"); 1005 SequenceI pep2 = new Sequence("pep2", "GFP"); 1006 pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "pep1")); 1007 pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "pep2")); 1008 dna1.createDatasetSequence(); 1009 dna2.createDatasetSequence(); 1010 pep1.createDatasetSequence(); 1011 pep2.createDatasetSequence(); 1012 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 }); 1013 dna.setDataset(null); 1014 1015 /* 1016 * put a variant feature on dna2 base 8 1017 * - should transfer to cds2 base 5 1018 */ 1019 dna2.addSequenceFeature(new SequenceFeature("variant", "hgmd", 8, 8, 1020 0f, null)); 1021 1022 /* 1023 * need a sourceDbRef if we are to construct dbrefs to the CDS 1024 * sequence from the dna contig sequences 1025 */ 1026 DBRefEntry dbref = new DBRefEntry("ENSEMBL", "0", "dna1"); 1027 dna1.getDatasetSequence().addDBRef(dbref); 1028 org.testng.Assert.assertEquals(dbref, dna1.getPrimaryDBRefs().get(0)); 1029 dbref = new DBRefEntry("ENSEMBL", "0", "dna2"); 1030 dna2.getDatasetSequence().addDBRef(dbref); 1031 org.testng.Assert.assertEquals(dbref, dna2.getPrimaryDBRefs().get(0)); 1032 1033 /* 1034 * CDS sequences are 'discovered' from dna-to-protein mappings on the alignment 1035 * dataset (e.g. added from dbrefs by CrossRef.findXrefSequences) 1036 */ 1037 MapList mapfordna1 = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1038 1, 2 }, 3, 1); 1039 AlignedCodonFrame acf = new AlignedCodonFrame(); 1040 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), 1041 mapfordna1); 1042 dna.addCodonFrame(acf); 1043 MapList mapfordna2 = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, 1044 new int[] { 1, 3 }, 3, 1); 1045 acf = new AlignedCodonFrame(); 1046 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), 1047 mapfordna2); 1048 dna.addCodonFrame(acf); 1049 1050 /* 1051 * In this case, mappings originally came from matching Uniprot accessions 1052 * - so need an xref on dna involving those regions. 1053 * These are normally constructed from CDS annotation 1054 */ 1055 DBRefEntry dna1xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep1", 1056 new Mapping(mapfordna1)); 1057 dna1.addDBRef(dna1xref); 1058 assertEquals(2, dna1.getDBRefs().length); // to self and to pep1 1059 DBRefEntry dna2xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep2", 1060 new Mapping(mapfordna2)); 1061 dna2.addDBRef(dna2xref); 1062 assertEquals(2, dna2.getDBRefs().length); // to self and to pep2 1063 1064 /* 1065 * execute method under test: 1066 */ 1067 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] { 1068 dna1, dna2 }, dna.getDataset(), null); 1069 1070 /* 1071 * verify cds sequences 1072 */ 1073 assertEquals(2, cds.getSequences().size()); 1074 assertEquals("GGGTTT", cds.getSequenceAt(0).getSequenceAsString()); 1075 assertEquals("GGGTTTCCC", cds.getSequenceAt(1).getSequenceAsString()); 1076 1077 /* 1078 * verify shared, extended alignment dataset 1079 */ 1080 assertSame(dna.getDataset(), cds.getDataset()); 1081 SequenceI cds1Dss = cds.getSequenceAt(0).getDatasetSequence(); 1082 SequenceI cds2Dss = cds.getSequenceAt(1).getDatasetSequence(); 1083 assertTrue(dna.getDataset().getSequences().contains(cds1Dss)); 1084 assertTrue(dna.getDataset().getSequences().contains(cds2Dss)); 1085 1086 /* 1087 * verify CDS has a dbref with mapping to peptide 1088 */ 1089 assertNotNull(cds1Dss.getDBRefs()); 1090 assertEquals(2, cds1Dss.getDBRefs().length); 1091 dbref = cds1Dss.getDBRefs()[0]; 1092 assertEquals(dna1xref.getSource(), dbref.getSource()); 1093 // version is via ensembl's primary ref 1094 assertEquals(dna1xref.getVersion(), dbref.getVersion()); 1095 assertEquals(dna1xref.getAccessionId(), dbref.getAccessionId()); 1096 assertNotNull(dbref.getMap()); 1097 assertSame(pep1.getDatasetSequence(), dbref.getMap().getTo()); 1098 MapList cdsMapping = new MapList(new int[] { 1, 6 }, 1099 new int[] { 1, 2 }, 3, 1); 1100 assertEquals(cdsMapping, dbref.getMap().getMap()); 1101 1102 /* 1103 * verify peptide has added a dbref with reverse mapping to CDS 1104 */ 1105 assertNotNull(pep1.getDBRefs()); 1106 // FIXME pep1.getDBRefs() is 1 - is that the correct behaviour ? 1107 assertEquals(2, pep1.getDBRefs().length); 1108 dbref = pep1.getDBRefs()[1]; 1109 assertEquals("ENSEMBL", dbref.getSource()); 1110 assertEquals("0", dbref.getVersion()); 1111 assertEquals("CDS|dna1", dbref.getAccessionId()); 1112 assertNotNull(dbref.getMap()); 1113 assertSame(cds1Dss, dbref.getMap().getTo()); 1114 assertEquals(cdsMapping.getInverse(), dbref.getMap().getMap()); 1115 1116 /* 1117 * verify cDNA has added a dbref with mapping to CDS 1118 */ 1119 assertEquals(3, dna1.getDBRefs().length); 1120 DBRefEntry dbRefEntry = dna1.getDBRefs()[2]; 1121 assertSame(cds1Dss, dbRefEntry.getMap().getTo()); 1122 MapList dnaToCdsMapping = new MapList(new int[] { 4, 6, 10, 12 }, 1123 new int[] { 1, 6 }, 1, 1); 1124 assertEquals(dnaToCdsMapping, dbRefEntry.getMap().getMap()); 1125 assertEquals(3, dna2.getDBRefs().length); 1126 dbRefEntry = dna2.getDBRefs()[2]; 1127 assertSame(cds2Dss, dbRefEntry.getMap().getTo()); 1128 dnaToCdsMapping = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, 1129 new int[] { 1, 9 }, 1, 1); 1130 assertEquals(dnaToCdsMapping, dbRefEntry.getMap().getMap()); 1131 1132 /* 1133 * verify CDS has added a dbref with mapping to cDNA 1134 */ 1135 assertEquals(2, cds1Dss.getDBRefs().length); 1136 dbRefEntry = cds1Dss.getDBRefs()[1]; 1137 assertSame(dna1.getDatasetSequence(), dbRefEntry.getMap().getTo()); 1138 MapList cdsToDnaMapping = new MapList(new int[] { 1, 6 }, new int[] { 1139 4, 6, 10, 12 }, 1, 1); 1140 assertEquals(cdsToDnaMapping, dbRefEntry.getMap().getMap()); 1141 assertEquals(2, cds2Dss.getDBRefs().length); 1142 dbRefEntry = cds2Dss.getDBRefs()[1]; 1143 assertSame(dna2.getDatasetSequence(), dbRefEntry.getMap().getTo()); 1144 cdsToDnaMapping = new MapList(new int[] { 1, 9 }, new int[] { 1, 3, 7, 1145 9, 13, 15 }, 1, 1); 1146 assertEquals(cdsToDnaMapping, dbRefEntry.getMap().getMap()); 1147 1148 /* 1149 * Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide 1150 * the mappings are on the shared alignment dataset 1151 * 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep) 1152 */ 1153 List<AlignedCodonFrame> cdsMappings = cds.getDataset().getCodonFrames(); 1154 assertEquals(6, cdsMappings.size()); 1155 1156 /* 1157 * verify that mapping sets for dna and cds alignments are different 1158 * [not current behaviour - all mappings are on the alignment dataset] 1159 */ 1160 // select -> subselect type to test. 1161 // Assert.assertNotSame(dna.getCodonFrames(), cds.getCodonFrames()); 1162 // assertEquals(4, dna.getCodonFrames().size()); 1163 // assertEquals(4, cds.getCodonFrames().size()); 1164 1165 /* 1166 * Two mappings involve pep1 (dna to pep1, cds to pep1) 1167 * Mapping from pep1 to GGGTTT in first new exon sequence 1168 */ 1169 List<AlignedCodonFrame> pep1Mappings = MappingUtils 1170 .findMappingsForSequence(pep1, cdsMappings); 1171 assertEquals(2, pep1Mappings.size()); 1172 List<AlignedCodonFrame> mappings = MappingUtils 1173 .findMappingsForSequence(cds.getSequenceAt(0), pep1Mappings); 1174 assertEquals(1, mappings.size()); 1175 1176 // map G to GGG 1177 SearchResultsI sr = MappingUtils.buildSearchResults(pep1, 1, mappings); 1178 assertEquals(1, sr.getResults().size()); 1179 SearchResultMatchI m = sr.getResults().get(0); 1180 assertSame(cds1Dss, m.getSequence()); 1181 assertEquals(1, m.getStart()); 1182 assertEquals(3, m.getEnd()); 1183 // map F to TTT 1184 sr = MappingUtils.buildSearchResults(pep1, 2, mappings); 1185 m = sr.getResults().get(0); 1186 assertSame(cds1Dss, m.getSequence()); 1187 assertEquals(4, m.getStart()); 1188 assertEquals(6, m.getEnd()); 1189 1190 /* 1191 * Two mappings involve pep2 (dna to pep2, cds to pep2) 1192 * Verify mapping from pep2 to GGGTTTCCC in second new exon sequence 1193 */ 1194 List<AlignedCodonFrame> pep2Mappings = MappingUtils 1195 .findMappingsForSequence(pep2, cdsMappings); 1196 assertEquals(2, pep2Mappings.size()); 1197 mappings = MappingUtils.findMappingsForSequence(cds.getSequenceAt(1), 1198 pep2Mappings); 1199 assertEquals(1, mappings.size()); 1200 // map G to GGG 1201 sr = MappingUtils.buildSearchResults(pep2, 1, mappings); 1202 assertEquals(1, sr.getResults().size()); 1203 m = sr.getResults().get(0); 1204 assertSame(cds2Dss, m.getSequence()); 1205 assertEquals(1, m.getStart()); 1206 assertEquals(3, m.getEnd()); 1207 // map F to TTT 1208 sr = MappingUtils.buildSearchResults(pep2, 2, mappings); 1209 m = sr.getResults().get(0); 1210 assertSame(cds2Dss, m.getSequence()); 1211 assertEquals(4, m.getStart()); 1212 assertEquals(6, m.getEnd()); 1213 // map P to CCC 1214 sr = MappingUtils.buildSearchResults(pep2, 3, mappings); 1215 m = sr.getResults().get(0); 1216 assertSame(cds2Dss, m.getSequence()); 1217 assertEquals(7, m.getStart()); 1218 assertEquals(9, m.getEnd()); 1219 1220 /* 1221 * check cds2 acquired a variant feature in position 5 1222 */ 1223 List<SequenceFeature> sfs = cds2Dss.getSequenceFeatures(); 1224 assertNotNull(sfs); 1225 assertEquals(1, sfs.size()); 1226 assertEquals("variant", sfs.get(0).type); 1227 assertEquals(5, sfs.get(0).begin); 1228 assertEquals(5, sfs.get(0).end); 1229 } 1230 1231 /** 1232 * Test the method that makes a cds-only alignment from a DNA sequence and its 1233 * product mappings, for the case where there are multiple exon mappings to 1234 * different protein products. 1235 */ 1236 @Test(groups = { "Functional" }) testMakeCdsAlignment_multipleProteins()1237 public void testMakeCdsAlignment_multipleProteins() 1238 { 1239 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa"); 1240 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT 1241 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc 1242 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT 1243 dna1.createDatasetSequence(); 1244 pep1.createDatasetSequence(); 1245 pep2.createDatasetSequence(); 1246 pep3.createDatasetSequence(); 1247 pep1.getDatasetSequence().addDBRef( 1248 new DBRefEntry("EMBLCDS", "2", "A12345")); 1249 pep2.getDatasetSequence().addDBRef( 1250 new DBRefEntry("EMBLCDS", "3", "A12346")); 1251 pep3.getDatasetSequence().addDBRef( 1252 new DBRefEntry("EMBLCDS", "4", "A12347")); 1253 1254 /* 1255 * Create the CDS alignment 1256 */ 1257 AlignmentI dna = new Alignment(new SequenceI[] { dna1 }); 1258 dna.setDataset(null); 1259 1260 /* 1261 * Make the mappings from dna to protein 1262 */ 1263 // map ...GGG...TTT to GF 1264 MapList map = new MapList(new int[] { 4, 6, 10, 12 }, 1265 new int[] { 1, 2 }, 3, 1); 1266 AlignedCodonFrame acf = new AlignedCodonFrame(); 1267 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map); 1268 dna.addCodonFrame(acf); 1269 1270 // map aaa...ccc to KP 1271 map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1); 1272 acf = new AlignedCodonFrame(); 1273 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map); 1274 dna.addCodonFrame(acf); 1275 1276 // map aaa......TTT to KF 1277 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1); 1278 acf = new AlignedCodonFrame(); 1279 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map); 1280 dna.addCodonFrame(acf); 1281 1282 /* 1283 * execute method under test 1284 */ 1285 AlignmentI cdsal = AlignmentUtils.makeCdsAlignment( 1286 new SequenceI[] { dna1 }, dna.getDataset(), null); 1287 1288 /* 1289 * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively 1290 */ 1291 List<SequenceI> cds = cdsal.getSequences(); 1292 assertEquals(3, cds.size()); 1293 1294 /* 1295 * verify shared, extended alignment dataset 1296 */ 1297 assertSame(cdsal.getDataset(), dna.getDataset()); 1298 assertTrue(dna.getDataset().getSequences() 1299 .contains(cds.get(0).getDatasetSequence())); 1300 assertTrue(dna.getDataset().getSequences() 1301 .contains(cds.get(1).getDatasetSequence())); 1302 assertTrue(dna.getDataset().getSequences() 1303 .contains(cds.get(2).getDatasetSequence())); 1304 1305 /* 1306 * verify aligned cds sequences and their xrefs 1307 */ 1308 SequenceI cdsSeq = cds.get(0); 1309 assertEquals("GGGTTT", cdsSeq.getSequenceAsString()); 1310 // assertEquals("dna1|A12345", cdsSeq.getName()); 1311 assertEquals("CDS|dna1", cdsSeq.getName()); 1312 // assertEquals(1, cdsSeq.getDBRefs().length); 1313 // DBRefEntry cdsRef = cdsSeq.getDBRefs()[0]; 1314 // assertEquals("EMBLCDS", cdsRef.getSource()); 1315 // assertEquals("2", cdsRef.getVersion()); 1316 // assertEquals("A12345", cdsRef.getAccessionId()); 1317 1318 cdsSeq = cds.get(1); 1319 assertEquals("aaaccc", cdsSeq.getSequenceAsString()); 1320 // assertEquals("dna1|A12346", cdsSeq.getName()); 1321 assertEquals("CDS|dna1", cdsSeq.getName()); 1322 // assertEquals(1, cdsSeq.getDBRefs().length); 1323 // cdsRef = cdsSeq.getDBRefs()[0]; 1324 // assertEquals("EMBLCDS", cdsRef.getSource()); 1325 // assertEquals("3", cdsRef.getVersion()); 1326 // assertEquals("A12346", cdsRef.getAccessionId()); 1327 1328 cdsSeq = cds.get(2); 1329 assertEquals("aaaTTT", cdsSeq.getSequenceAsString()); 1330 // assertEquals("dna1|A12347", cdsSeq.getName()); 1331 assertEquals("CDS|dna1", cdsSeq.getName()); 1332 // assertEquals(1, cdsSeq.getDBRefs().length); 1333 // cdsRef = cdsSeq.getDBRefs()[0]; 1334 // assertEquals("EMBLCDS", cdsRef.getSource()); 1335 // assertEquals("4", cdsRef.getVersion()); 1336 // assertEquals("A12347", cdsRef.getAccessionId()); 1337 1338 /* 1339 * Verify there are mappings from each cds sequence to its protein product 1340 * and also to its dna source 1341 */ 1342 List<AlignedCodonFrame> newMappings = cdsal.getCodonFrames(); 1343 1344 /* 1345 * 6 mappings involve dna1 (to pep1/2/3, cds1/2/3) 1346 */ 1347 List<AlignedCodonFrame> dnaMappings = MappingUtils 1348 .findMappingsForSequence(dna1, newMappings); 1349 assertEquals(6, dnaMappings.size()); 1350 1351 /* 1352 * dna1 to pep1 1353 */ 1354 List<AlignedCodonFrame> mappings = MappingUtils 1355 .findMappingsForSequence(pep1, dnaMappings); 1356 assertEquals(1, mappings.size()); 1357 assertEquals(1, mappings.get(0).getMappings().size()); 1358 assertSame(pep1.getDatasetSequence(), mappings.get(0).getMappings() 1359 .get(0).getMapping().getTo()); 1360 1361 /* 1362 * dna1 to cds1 1363 */ 1364 List<AlignedCodonFrame> dnaToCds1Mappings = MappingUtils 1365 .findMappingsForSequence(cds.get(0), dnaMappings); 1366 Mapping mapping = dnaToCds1Mappings.get(0).getMappings().get(0) 1367 .getMapping(); 1368 assertSame(cds.get(0).getDatasetSequence(), mapping.getTo()); 1369 assertEquals("G(1) in CDS should map to G(4) in DNA", 4, mapping 1370 .getMap().getToPosition(1)); 1371 1372 /* 1373 * dna1 to pep2 1374 */ 1375 mappings = MappingUtils.findMappingsForSequence(pep2, dnaMappings); 1376 assertEquals(1, mappings.size()); 1377 assertEquals(1, mappings.get(0).getMappings().size()); 1378 assertSame(pep2.getDatasetSequence(), mappings.get(0).getMappings() 1379 .get(0).getMapping().getTo()); 1380 1381 /* 1382 * dna1 to cds2 1383 */ 1384 List<AlignedCodonFrame> dnaToCds2Mappings = MappingUtils 1385 .findMappingsForSequence(cds.get(1), dnaMappings); 1386 mapping = dnaToCds2Mappings.get(0).getMappings().get(0).getMapping(); 1387 assertSame(cds.get(1).getDatasetSequence(), mapping.getTo()); 1388 assertEquals("c(4) in CDS should map to c(7) in DNA", 7, mapping 1389 .getMap().getToPosition(4)); 1390 1391 /* 1392 * dna1 to pep3 1393 */ 1394 mappings = MappingUtils.findMappingsForSequence(pep3, dnaMappings); 1395 assertEquals(1, mappings.size()); 1396 assertEquals(1, mappings.get(0).getMappings().size()); 1397 assertSame(pep3.getDatasetSequence(), mappings.get(0).getMappings() 1398 .get(0).getMapping().getTo()); 1399 1400 /* 1401 * dna1 to cds3 1402 */ 1403 List<AlignedCodonFrame> dnaToCds3Mappings = MappingUtils 1404 .findMappingsForSequence(cds.get(2), dnaMappings); 1405 mapping = dnaToCds3Mappings.get(0).getMappings().get(0).getMapping(); 1406 assertSame(cds.get(2).getDatasetSequence(), mapping.getTo()); 1407 assertEquals("T(4) in CDS should map to T(10) in DNA", 10, mapping 1408 .getMap().getToPosition(4)); 1409 } 1410 1411 @Test(groups = { "Functional" }) testIsMappable()1412 public void testIsMappable() 1413 { 1414 SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT"); 1415 SequenceI aa1 = new Sequence("aa1", "RSG"); 1416 AlignmentI al1 = new Alignment(new SequenceI[] { dna1 }); 1417 AlignmentI al2 = new Alignment(new SequenceI[] { aa1 }); 1418 1419 assertFalse(AlignmentUtils.isMappable(null, null)); 1420 assertFalse(AlignmentUtils.isMappable(al1, null)); 1421 assertFalse(AlignmentUtils.isMappable(null, al1)); 1422 assertFalse(AlignmentUtils.isMappable(al1, al1)); 1423 assertFalse(AlignmentUtils.isMappable(al2, al2)); 1424 1425 assertTrue(AlignmentUtils.isMappable(al1, al2)); 1426 assertTrue(AlignmentUtils.isMappable(al2, al1)); 1427 } 1428 1429 /** 1430 * Test creating a mapping when the sequences involved do not start at residue 1431 * 1 1432 * 1433 * @throws IOException 1434 */ 1435 @Test(groups = { "Functional" }) testMapCdnaToProtein_forSubsequence()1436 public void testMapCdnaToProtein_forSubsequence() throws IOException 1437 { 1438 SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12); 1439 prot.createDatasetSequence(); 1440 1441 SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48); 1442 dna.createDatasetSequence(); 1443 1444 MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna); 1445 assertEquals(10, map.getToLowest()); 1446 assertEquals(12, map.getToHighest()); 1447 assertEquals(40, map.getFromLowest()); 1448 assertEquals(48, map.getFromHighest()); 1449 } 1450 1451 /** 1452 * Test for the alignSequenceAs method where we have protein mapped to protein 1453 */ 1454 @Test(groups = { "Functional" }) testAlignSequenceAs_mappedProteinProtein()1455 public void testAlignSequenceAs_mappedProteinProtein() 1456 { 1457 1458 SequenceI alignMe = new Sequence("Match", "MGAASEV"); 1459 alignMe.createDatasetSequence(); 1460 SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR"); 1461 alignFrom.createDatasetSequence(); 1462 1463 AlignedCodonFrame acf = new AlignedCodonFrame(); 1464 // this is like a domain or motif match of part of a peptide sequence 1465 MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1); 1466 acf.addMap(alignFrom.getDatasetSequence(), 1467 alignMe.getDatasetSequence(), map); 1468 1469 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true, 1470 true); 1471 assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString()); 1472 } 1473 1474 /** 1475 * Test for the alignSequenceAs method where there are trailing unmapped 1476 * residues in the model sequence 1477 */ 1478 @Test(groups = { "Functional" }) testAlignSequenceAs_withTrailingPeptide()1479 public void testAlignSequenceAs_withTrailingPeptide() 1480 { 1481 // map first 3 codons to KPF; G is a trailing unmapped residue 1482 MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1); 1483 1484 checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map, 1485 "AAA---CCCTTT---"); 1486 } 1487 1488 /** 1489 * Tests for transferring features between mapped sequences 1490 */ 1491 @Test(groups = { "Functional" }) testTransferFeatures()1492 public void testTransferFeatures() 1493 { 1494 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt"); 1495 SequenceI cds = new Sequence("cds/10-15", "TAGGCC"); 1496 1497 // no overlap 1498 dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f, 1499 null)); 1500 // partial overlap - to [1, 1] 1501 dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f, 1502 null)); 1503 // exact overlap - to [1, 3] 1504 dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f, 1505 null)); 1506 // spanning overlap - to [2, 5] 1507 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f, 1508 null)); 1509 // exactly overlaps whole mapped range [1, 6] 1510 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f, 1511 null)); 1512 // no overlap (internal) 1513 dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f, 1514 null)); 1515 // no overlap (3' end) 1516 dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15, 1517 7f, null)); 1518 // overlap (3' end) - to [6, 6] 1519 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12, 1520 8f, null)); 1521 // extended overlap - to [6, +] 1522 dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13, 1523 9f, null)); 1524 1525 MapList map = new MapList(new int[] { 4, 6, 10, 12 }, 1526 new int[] { 1, 6 }, 1, 1); 1527 1528 /* 1529 * transferFeatures() will build 'partial overlap' for regions 1530 * that partially overlap 5' or 3' (start or end) of target sequence 1531 */ 1532 AlignmentUtils.transferFeatures(dna, cds, map, null); 1533 List<SequenceFeature> sfs = cds.getSequenceFeatures(); 1534 assertEquals(6, sfs.size()); 1535 1536 SequenceFeature sf = sfs.get(0); 1537 assertEquals("type2", sf.getType()); 1538 assertEquals("desc2", sf.getDescription()); 1539 assertEquals(2f, sf.getScore()); 1540 assertEquals(1, sf.getBegin()); 1541 assertEquals(1, sf.getEnd()); 1542 1543 sf = sfs.get(1); 1544 assertEquals("type3", sf.getType()); 1545 assertEquals("desc3", sf.getDescription()); 1546 assertEquals(3f, sf.getScore()); 1547 assertEquals(1, sf.getBegin()); 1548 assertEquals(3, sf.getEnd()); 1549 1550 sf = sfs.get(2); 1551 assertEquals("type4", sf.getType()); 1552 assertEquals(2, sf.getBegin()); 1553 assertEquals(5, sf.getEnd()); 1554 1555 sf = sfs.get(3); 1556 assertEquals("type5", sf.getType()); 1557 assertEquals(1, sf.getBegin()); 1558 assertEquals(6, sf.getEnd()); 1559 1560 sf = sfs.get(4); 1561 assertEquals("type8", sf.getType()); 1562 assertEquals(6, sf.getBegin()); 1563 assertEquals(6, sf.getEnd()); 1564 1565 sf = sfs.get(5); 1566 assertEquals("type9", sf.getType()); 1567 assertEquals(6, sf.getBegin()); 1568 assertEquals(6, sf.getEnd()); 1569 } 1570 1571 /** 1572 * Tests for transferring features between mapped sequences 1573 */ 1574 @Test(groups = { "Functional" }) testTransferFeatures_withOmit()1575 public void testTransferFeatures_withOmit() 1576 { 1577 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt"); 1578 SequenceI cds = new Sequence("cds/10-15", "TAGGCC"); 1579 1580 MapList map = new MapList(new int[] { 4, 6, 10, 12 }, 1581 new int[] { 1, 6 }, 1, 1); 1582 1583 // [5, 11] maps to [2, 5] 1584 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f, 1585 null)); 1586 // [4, 12] maps to [1, 6] 1587 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f, 1588 null)); 1589 // [12, 12] maps to [6, 6] 1590 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12, 1591 8f, null)); 1592 1593 // desc4 and desc8 are the 'omit these' varargs 1594 AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8"); 1595 List<SequenceFeature> sfs = cds.getSequenceFeatures(); 1596 assertEquals(1, sfs.size()); 1597 1598 SequenceFeature sf = sfs.get(0); 1599 assertEquals("type5", sf.getType()); 1600 assertEquals(1, sf.getBegin()); 1601 assertEquals(6, sf.getEnd()); 1602 } 1603 1604 /** 1605 * Tests for transferring features between mapped sequences 1606 */ 1607 @Test(groups = { "Functional" }) testTransferFeatures_withSelect()1608 public void testTransferFeatures_withSelect() 1609 { 1610 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt"); 1611 SequenceI cds = new Sequence("cds/10-15", "TAGGCC"); 1612 1613 MapList map = new MapList(new int[] { 4, 6, 10, 12 }, 1614 new int[] { 1, 6 }, 1, 1); 1615 1616 // [5, 11] maps to [2, 5] 1617 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f, 1618 null)); 1619 // [4, 12] maps to [1, 6] 1620 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f, 1621 null)); 1622 // [12, 12] maps to [6, 6] 1623 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12, 1624 8f, null)); 1625 1626 // "type5" is the 'select this type' argument 1627 AlignmentUtils.transferFeatures(dna, cds, map, "type5"); 1628 List<SequenceFeature> sfs = cds.getSequenceFeatures(); 1629 assertEquals(1, sfs.size()); 1630 1631 SequenceFeature sf = sfs.get(0); 1632 assertEquals("type5", sf.getType()); 1633 assertEquals(1, sf.getBegin()); 1634 assertEquals(6, sf.getEnd()); 1635 } 1636 1637 /** 1638 * Test the method that extracts the cds-only part of a dna alignment, for the 1639 * case where the cds should be aligned to match its nucleotide sequence. 1640 */ 1641 @Test(groups = { "Functional" }) testMakeCdsAlignment_alternativeTranscripts()1642 public void testMakeCdsAlignment_alternativeTranscripts() 1643 { 1644 SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG"); 1645 // alternative transcript of same dna skips CCC codon 1646 SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG"); 1647 // dna3 has no mapping (protein product) so should be ignored here 1648 SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG"); 1649 SequenceI pep1 = new Sequence("pep1", "GPFG"); 1650 SequenceI pep2 = new Sequence("pep2", "GPG"); 1651 dna1.createDatasetSequence(); 1652 dna2.createDatasetSequence(); 1653 dna3.createDatasetSequence(); 1654 pep1.createDatasetSequence(); 1655 pep2.createDatasetSequence(); 1656 1657 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 }); 1658 dna.setDataset(null); 1659 1660 MapList map = new MapList(new int[] { 4, 12, 16, 18 }, 1661 new int[] { 1, 4 }, 3, 1); 1662 AlignedCodonFrame acf = new AlignedCodonFrame(); 1663 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map); 1664 dna.addCodonFrame(acf); 1665 map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 }, 1666 new int[] { 1, 3 }, 3, 1); 1667 acf = new AlignedCodonFrame(); 1668 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map); 1669 dna.addCodonFrame(acf); 1670 1671 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] { 1672 dna1, dna2, dna3 }, dna.getDataset(), null); 1673 List<SequenceI> cdsSeqs = cds.getSequences(); 1674 assertEquals(2, cdsSeqs.size()); 1675 assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString()); 1676 assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString()); 1677 1678 /* 1679 * verify shared, extended alignment dataset 1680 */ 1681 assertSame(dna.getDataset(), cds.getDataset()); 1682 assertTrue(dna.getDataset().getSequences() 1683 .contains(cdsSeqs.get(0).getDatasetSequence())); 1684 assertTrue(dna.getDataset().getSequences() 1685 .contains(cdsSeqs.get(1).getDatasetSequence())); 1686 1687 /* 1688 * Verify 6 mappings: dna1 to cds1, cds1 to pep1, dna1 to pep1 1689 * and the same for dna2/cds2/pep2 1690 */ 1691 List<AlignedCodonFrame> mappings = cds.getCodonFrames(); 1692 assertEquals(6, mappings.size()); 1693 1694 /* 1695 * 2 mappings involve pep1 1696 */ 1697 List<AlignedCodonFrame> pep1Mappings = MappingUtils 1698 .findMappingsForSequence(pep1, mappings); 1699 assertEquals(2, pep1Mappings.size()); 1700 1701 /* 1702 * Get mapping of pep1 to cds1 and verify it 1703 * maps GPFG to 1-3,4-6,7-9,10-12 1704 */ 1705 List<AlignedCodonFrame> pep1CdsMappings = MappingUtils 1706 .findMappingsForSequence(cds.getSequenceAt(0), pep1Mappings); 1707 assertEquals(1, pep1CdsMappings.size()); 1708 SearchResultsI sr = MappingUtils.buildSearchResults(pep1, 1, 1709 pep1CdsMappings); 1710 assertEquals(1, sr.getResults().size()); 1711 SearchResultMatchI m = sr.getResults().get(0); 1712 assertEquals(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence()); 1713 assertEquals(1, m.getStart()); 1714 assertEquals(3, m.getEnd()); 1715 sr = MappingUtils.buildSearchResults(pep1, 2, pep1CdsMappings); 1716 m = sr.getResults().get(0); 1717 assertEquals(4, m.getStart()); 1718 assertEquals(6, m.getEnd()); 1719 sr = MappingUtils.buildSearchResults(pep1, 3, pep1CdsMappings); 1720 m = sr.getResults().get(0); 1721 assertEquals(7, m.getStart()); 1722 assertEquals(9, m.getEnd()); 1723 sr = MappingUtils.buildSearchResults(pep1, 4, pep1CdsMappings); 1724 m = sr.getResults().get(0); 1725 assertEquals(10, m.getStart()); 1726 assertEquals(12, m.getEnd()); 1727 1728 /* 1729 * Get mapping of pep2 to cds2 and verify it 1730 * maps GPG in pep2 to 1-3,4-6,7-9 in second CDS sequence 1731 */ 1732 List<AlignedCodonFrame> pep2Mappings = MappingUtils 1733 .findMappingsForSequence(pep2, mappings); 1734 assertEquals(2, pep2Mappings.size()); 1735 List<AlignedCodonFrame> pep2CdsMappings = MappingUtils 1736 .findMappingsForSequence(cds.getSequenceAt(1), pep2Mappings); 1737 assertEquals(1, pep2CdsMappings.size()); 1738 sr = MappingUtils.buildSearchResults(pep2, 1, pep2CdsMappings); 1739 assertEquals(1, sr.getResults().size()); 1740 m = sr.getResults().get(0); 1741 assertEquals(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence()); 1742 assertEquals(1, m.getStart()); 1743 assertEquals(3, m.getEnd()); 1744 sr = MappingUtils.buildSearchResults(pep2, 2, pep2CdsMappings); 1745 m = sr.getResults().get(0); 1746 assertEquals(4, m.getStart()); 1747 assertEquals(6, m.getEnd()); 1748 sr = MappingUtils.buildSearchResults(pep2, 3, pep2CdsMappings); 1749 m = sr.getResults().get(0); 1750 assertEquals(7, m.getStart()); 1751 assertEquals(9, m.getEnd()); 1752 } 1753 1754 /** 1755 * Test the method that realigns protein to match mapped codon alignment. 1756 */ 1757 @Test(groups = { "Functional" }) testAlignProteinAsDna_incompleteStartCodon()1758 public void testAlignProteinAsDna_incompleteStartCodon() 1759 { 1760 // seq1: incomplete start codon (not mapped), then [3, 11] 1761 SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-"); 1762 // seq2 codons are [4, 5], [8, 11] 1763 SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-"); 1764 // seq3 incomplete start codon at 'tt' 1765 SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-"); 1766 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 }); 1767 dna.setDataset(null); 1768 1769 // prot1 has 'X' for incomplete start codon (not mapped) 1770 SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start 1771 SequenceI prot2 = new Sequence("Seq2", "NG"); 1772 SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start 1773 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2, 1774 prot3 }); 1775 protein.setDataset(null); 1776 1777 // map dna1 [3, 11] to prot1 [2, 4] KFG 1778 MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1); 1779 AlignedCodonFrame acf = new AlignedCodonFrame(); 1780 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map); 1781 1782 // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG 1783 map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1); 1784 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map); 1785 1786 // map dna3 [9, 11] to prot3 [2, 2] G 1787 map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1); 1788 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map); 1789 1790 ArrayList<AlignedCodonFrame> acfs = new ArrayList<>(); 1791 acfs.add(acf); 1792 protein.setCodonFrames(acfs); 1793 Iterator<SequenceI> protseq = protein.getSequences().iterator(); 1794 for (SequenceI dnaseq:dna.getSequences()) { 1795 assertCanResolveProteinCDS(dnaseq,protseq.next(),protein); 1796 } 1797 /* 1798 * verify X is included in the aligned proteins, and placed just 1799 * before the first mapped residue 1800 * CCT is between CCC and TTT 1801 */ 1802 AlignmentUtils.alignProteinAsDna(protein, dna); 1803 assertEquals("XK-FG", prot1.getSequenceAsString()); 1804 assertEquals("--N-G", prot2.getSequenceAsString()); 1805 assertEquals("---XG", prot3.getSequenceAsString()); 1806 } 1807 1808 /** 1809 * assert that we can resolve the protein product in the given alignment given a DNA sequence with CDS mapping 1810 * @param dnaseq 1811 * @param protein 1812 */ assertCanResolveProteinCDS(SequenceI dnaseq, SequenceI expProtein, AlignmentI protein)1813 private void assertCanResolveProteinCDS(SequenceI dnaseq, SequenceI expProtein, AlignmentI protein) 1814 { 1815 // try a few different methods to check all work 1816 SequenceI aprot=null; 1817 for (AlignedCodonFrame cf:protein.getCodonFrame(dnaseq)) 1818 { 1819 aprot=cf.getAaForDnaSeq(dnaseq); 1820 if (aprot!=null) 1821 { 1822 assertTrue("getAaForDnaSeq didn't return expected protein sequence",aprot!=expProtein); 1823 break; 1824 } 1825 } 1826 assertNotNull("Didn't locate any proteins via AlignmentI.getCodonFrame .. AlignCodonFrame.getAaForDnaSeq", aprot); 1827 // try mapping utils - 1828 List<AlignedCodonFrame> mu_mappings=MappingUtils.findMappingsForSequence(dnaseq, protein.getCodonFrames()); 1829 assertNotNull("No mappings found for dnaseq in protein alignment via MappingUtils.findMappingsForSequence",mu_mappings); 1830 assertNotEquals("No mappings found for dnaseq in protein alignment via MappingUtils.findMappingsForSequence",0,mu_mappings.size()); 1831 SequenceI mu_alignedprot=null; 1832 List<SequenceToSequenceMapping> foundMap=null; 1833 for (AlignedCodonFrame cf:mu_mappings) 1834 { 1835 foundMap=new ArrayList<>(); 1836 mu_alignedprot = cf.findAlignedSequence(dnaseq, protein,foundMap); 1837 if (mu_alignedprot!=null) { 1838 break; 1839 } 1840 } 1841 assertNotNull("Didn't locate proteins via MappingUtils.findMappingsForSequence",mu_alignedprot); 1842 assertTrue("findAlignedSequence didn't return expected protein sequence",mu_alignedprot==expProtein); 1843 } 1844 1845 /** 1846 * Tests for the method that maps the subset of a dna sequence that has CDS 1847 * (or subtype) feature - case where the start codon is incomplete. 1848 */ 1849 @Test(groups = "Functional") testFindCdsPositions_fivePrimeIncomplete()1850 public void testFindCdsPositions_fivePrimeIncomplete() 1851 { 1852 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt"); 1853 dnaSeq.createDatasetSequence(); 1854 SequenceI ds = dnaSeq.getDatasetSequence(); 1855 1856 // CDS for dna 5-6 (incomplete codon), 7-9 1857 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null); 1858 sf.setPhase("2"); // skip 2 bases to start of next codon 1859 ds.addSequenceFeature(sf); 1860 // CDS for dna 13-15 1861 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null); 1862 ds.addSequenceFeature(sf); 1863 1864 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq); 1865 1866 /* 1867 * check the mapping starts with the first complete codon 1868 */ 1869 assertEquals(6, MappingUtils.getLength(ranges)); 1870 assertEquals(2, ranges.size()); 1871 assertEquals(7, ranges.get(0)[0]); 1872 assertEquals(9, ranges.get(0)[1]); 1873 assertEquals(13, ranges.get(1)[0]); 1874 assertEquals(15, ranges.get(1)[1]); 1875 } 1876 1877 /** 1878 * Tests for the method that maps the subset of a dna sequence that has CDS 1879 * (or subtype) feature. 1880 */ 1881 @Test(groups = "Functional") testFindCdsPositions()1882 public void testFindCdsPositions() 1883 { 1884 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt"); 1885 dnaSeq.createDatasetSequence(); 1886 SequenceI ds = dnaSeq.getDatasetSequence(); 1887 1888 // CDS for dna 10-12 1889 SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12, 1890 0f, null); 1891 sf.setStrand("+"); 1892 ds.addSequenceFeature(sf); 1893 // CDS for dna 4-6 1894 sf = new SequenceFeature("CDS", "", 4, 6, 0f, null); 1895 sf.setStrand("+"); 1896 ds.addSequenceFeature(sf); 1897 // exon feature should be ignored here 1898 sf = new SequenceFeature("exon", "", 7, 9, 0f, null); 1899 ds.addSequenceFeature(sf); 1900 1901 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq); 1902 /* 1903 * verify ranges { [4-6], [12-10] } 1904 * note CDS ranges are ordered ascending even if the CDS 1905 * features are not 1906 */ 1907 assertEquals(6, MappingUtils.getLength(ranges)); 1908 assertEquals(2, ranges.size()); 1909 assertEquals(4, ranges.get(0)[0]); 1910 assertEquals(6, ranges.get(0)[1]); 1911 assertEquals(10, ranges.get(1)[0]); 1912 assertEquals(12, ranges.get(1)[1]); 1913 } 1914 1915 /** 1916 * Tests for the method that maps the subset of a dna sequence that has CDS 1917 * (or subtype) feature, with CDS strand = '-' (reverse) 1918 */ 1919 // test turned off as currently findCdsPositions is not strand-dependent 1920 // left in case it comes around again... 1921 @Test(groups = "Functional", enabled = false) testFindCdsPositions_reverseStrand()1922 public void testFindCdsPositions_reverseStrand() 1923 { 1924 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt"); 1925 dnaSeq.createDatasetSequence(); 1926 SequenceI ds = dnaSeq.getDatasetSequence(); 1927 1928 // CDS for dna 4-6 1929 SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null); 1930 sf.setStrand("-"); 1931 ds.addSequenceFeature(sf); 1932 // exon feature should be ignored here 1933 sf = new SequenceFeature("exon", "", 7, 9, 0f, null); 1934 ds.addSequenceFeature(sf); 1935 // CDS for dna 10-12 1936 sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null); 1937 sf.setStrand("-"); 1938 ds.addSequenceFeature(sf); 1939 1940 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq); 1941 /* 1942 * verify ranges { [12-10], [6-4] } 1943 */ 1944 assertEquals(6, MappingUtils.getLength(ranges)); 1945 assertEquals(2, ranges.size()); 1946 assertEquals(12, ranges.get(0)[0]); 1947 assertEquals(10, ranges.get(0)[1]); 1948 assertEquals(6, ranges.get(1)[0]); 1949 assertEquals(4, ranges.get(1)[1]); 1950 } 1951 1952 /** 1953 * Tests for the method that maps the subset of a dna sequence that has CDS 1954 * (or subtype) feature - reverse strand case where the start codon is 1955 * incomplete. 1956 */ 1957 @Test(groups = "Functional", enabled = false) 1958 // test turned off as currently findCdsPositions is not strand-dependent 1959 // left in case it comes around again... testFindCdsPositions_reverseStrandThreePrimeIncomplete()1960 public void testFindCdsPositions_reverseStrandThreePrimeIncomplete() 1961 { 1962 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt"); 1963 dnaSeq.createDatasetSequence(); 1964 SequenceI ds = dnaSeq.getDatasetSequence(); 1965 1966 // CDS for dna 5-9 1967 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null); 1968 sf.setStrand("-"); 1969 ds.addSequenceFeature(sf); 1970 // CDS for dna 13-15 1971 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null); 1972 sf.setStrand("-"); 1973 sf.setPhase("2"); // skip 2 bases to start of next codon 1974 ds.addSequenceFeature(sf); 1975 1976 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq); 1977 1978 /* 1979 * check the mapping starts with the first complete codon 1980 * expect ranges [13, 13], [9, 5] 1981 */ 1982 assertEquals(6, MappingUtils.getLength(ranges)); 1983 assertEquals(2, ranges.size()); 1984 assertEquals(13, ranges.get(0)[0]); 1985 assertEquals(13, ranges.get(0)[1]); 1986 assertEquals(9, ranges.get(1)[0]); 1987 assertEquals(5, ranges.get(1)[1]); 1988 } 1989 1990 @Test(groups = "Functional") testAlignAs_alternateTranscriptsUngapped()1991 public void testAlignAs_alternateTranscriptsUngapped() 1992 { 1993 SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa"); 1994 SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA"); 1995 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 }); 1996 ((Alignment) dna).createDatasetAlignment(); 1997 SequenceI cds1 = new Sequence("cds1", "GGGTTT"); 1998 SequenceI cds2 = new Sequence("cds2", "CCCAAA"); 1999 AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 }); 2000 ((Alignment) cds).createDatasetAlignment(); 2001 2002 AlignedCodonFrame acf = new AlignedCodonFrame(); 2003 MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1); 2004 acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map); 2005 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1); 2006 acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map); 2007 2008 /* 2009 * verify CDS alignment is as: 2010 * cccGGGTTTaaa (cdna) 2011 * CCCgggtttAAA (cdna) 2012 * 2013 * ---GGGTTT--- (cds) 2014 * CCC------AAA (cds) 2015 */ 2016 dna.addCodonFrame(acf); 2017 AlignmentUtils.alignAs(cds, dna); 2018 assertEquals("---GGGTTT", cds.getSequenceAt(0).getSequenceAsString()); 2019 assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString()); 2020 } 2021 2022 @Test(groups = { "Functional" }) testAddMappedPositions()2023 public void testAddMappedPositions() 2024 { 2025 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g"); 2026 SequenceI seq1 = new Sequence("cds", "AAATTT"); 2027 from.createDatasetSequence(); 2028 seq1.createDatasetSequence(); 2029 Mapping mapping = new Mapping(seq1, new MapList( 2030 new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1)); 2031 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<>(); 2032 AlignmentUtils.addMappedPositions(seq1, from, mapping, map); 2033 2034 /* 2035 * verify map has seq1 residues in columns 3,4,6,7,11,12 2036 */ 2037 assertEquals(6, map.size()); 2038 assertEquals('A', map.get(3).get(seq1).charValue()); 2039 assertEquals('A', map.get(4).get(seq1).charValue()); 2040 assertEquals('A', map.get(6).get(seq1).charValue()); 2041 assertEquals('T', map.get(7).get(seq1).charValue()); 2042 assertEquals('T', map.get(11).get(seq1).charValue()); 2043 assertEquals('T', map.get(12).get(seq1).charValue()); 2044 2045 /* 2046 * 2047 */ 2048 } 2049 2050 /** 2051 * Test case where the mapping 'from' range includes a stop codon which is 2052 * absent in the 'to' range 2053 */ 2054 @Test(groups = { "Functional" }) testAddMappedPositions_withStopCodon()2055 public void testAddMappedPositions_withStopCodon() 2056 { 2057 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g"); 2058 SequenceI seq1 = new Sequence("cds", "AAATTT"); 2059 from.createDatasetSequence(); 2060 seq1.createDatasetSequence(); 2061 Mapping mapping = new Mapping(seq1, new MapList( 2062 new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1)); 2063 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<>(); 2064 AlignmentUtils.addMappedPositions(seq1, from, mapping, map); 2065 2066 /* 2067 * verify map has seq1 residues in columns 3,4,6,7,11,12 2068 */ 2069 assertEquals(6, map.size()); 2070 assertEquals('A', map.get(3).get(seq1).charValue()); 2071 assertEquals('A', map.get(4).get(seq1).charValue()); 2072 assertEquals('A', map.get(6).get(seq1).charValue()); 2073 assertEquals('T', map.get(7).get(seq1).charValue()); 2074 assertEquals('T', map.get(11).get(seq1).charValue()); 2075 assertEquals('T', map.get(12).get(seq1).charValue()); 2076 } 2077 2078 /** 2079 * Test for the case where the products for which we want CDS are specified. 2080 * This is to represent the case where EMBL has CDS mappings to both Uniprot 2081 * and EMBLCDSPROTEIN. makeCdsAlignment() should only return the mappings for 2082 * the protein sequences specified. 2083 */ 2084 @Test(groups = { "Functional" }) testMakeCdsAlignment_filterProducts()2085 public void testMakeCdsAlignment_filterProducts() 2086 { 2087 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa"); 2088 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC"); 2089 SequenceI pep1 = new Sequence("Uniprot|pep1", "GF"); 2090 SequenceI pep2 = new Sequence("Uniprot|pep2", "GFP"); 2091 SequenceI pep3 = new Sequence("EMBL|pep3", "GF"); 2092 SequenceI pep4 = new Sequence("EMBL|pep4", "GFP"); 2093 dna1.createDatasetSequence(); 2094 dna2.createDatasetSequence(); 2095 pep1.createDatasetSequence(); 2096 pep2.createDatasetSequence(); 2097 pep3.createDatasetSequence(); 2098 pep4.createDatasetSequence(); 2099 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 }); 2100 dna.setDataset(null); 2101 AlignmentI emblPeptides = new Alignment(new SequenceI[] { pep3, pep4 }); 2102 emblPeptides.setDataset(null); 2103 2104 AlignedCodonFrame acf = new AlignedCodonFrame(); 2105 MapList map = new MapList(new int[] { 4, 6, 10, 12 }, 2106 new int[] { 1, 2 }, 3, 1); 2107 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map); 2108 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map); 2109 dna.addCodonFrame(acf); 2110 2111 acf = new AlignedCodonFrame(); 2112 map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 }, 2113 3, 1); 2114 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map); 2115 acf.addMap(dna2.getDatasetSequence(), pep4.getDatasetSequence(), map); 2116 dna.addCodonFrame(acf); 2117 2118 /* 2119 * execute method under test to find CDS for EMBL peptides only 2120 */ 2121 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] { 2122 dna1, dna2 }, dna.getDataset(), emblPeptides.getSequencesArray()); 2123 2124 assertEquals(2, cds.getSequences().size()); 2125 assertEquals("GGGTTT", cds.getSequenceAt(0).getSequenceAsString()); 2126 assertEquals("GGGTTTCCC", cds.getSequenceAt(1).getSequenceAsString()); 2127 2128 /* 2129 * verify shared, extended alignment dataset 2130 */ 2131 assertSame(dna.getDataset(), cds.getDataset()); 2132 assertTrue(dna.getDataset().getSequences() 2133 .contains(cds.getSequenceAt(0).getDatasetSequence())); 2134 assertTrue(dna.getDataset().getSequences() 2135 .contains(cds.getSequenceAt(1).getDatasetSequence())); 2136 2137 /* 2138 * Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide 2139 * the mappings are on the shared alignment dataset 2140 */ 2141 List<AlignedCodonFrame> cdsMappings = cds.getDataset().getCodonFrames(); 2142 /* 2143 * 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep) 2144 */ 2145 assertEquals(6, cdsMappings.size()); 2146 2147 /* 2148 * verify that mapping sets for dna and cds alignments are different 2149 * [not current behaviour - all mappings are on the alignment dataset] 2150 */ 2151 // select -> subselect type to test. 2152 // Assert.assertNotSame(dna.getCodonFrames(), cds.getCodonFrames()); 2153 // assertEquals(4, dna.getCodonFrames().size()); 2154 // assertEquals(4, cds.getCodonFrames().size()); 2155 2156 /* 2157 * Two mappings involve pep3 (dna to pep3, cds to pep3) 2158 * Mapping from pep3 to GGGTTT in first new exon sequence 2159 */ 2160 List<AlignedCodonFrame> pep3Mappings = MappingUtils 2161 .findMappingsForSequence(pep3, cdsMappings); 2162 assertEquals(2, pep3Mappings.size()); 2163 List<AlignedCodonFrame> mappings = MappingUtils 2164 .findMappingsForSequence(cds.getSequenceAt(0), pep3Mappings); 2165 assertEquals(1, mappings.size()); 2166 2167 // map G to GGG 2168 SearchResultsI sr = MappingUtils.buildSearchResults(pep3, 1, mappings); 2169 assertEquals(1, sr.getResults().size()); 2170 SearchResultMatchI m = sr.getResults().get(0); 2171 assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence()); 2172 assertEquals(1, m.getStart()); 2173 assertEquals(3, m.getEnd()); 2174 // map F to TTT 2175 sr = MappingUtils.buildSearchResults(pep3, 2, mappings); 2176 m = sr.getResults().get(0); 2177 assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence()); 2178 assertEquals(4, m.getStart()); 2179 assertEquals(6, m.getEnd()); 2180 2181 /* 2182 * Two mappings involve pep4 (dna to pep4, cds to pep4) 2183 * Verify mapping from pep4 to GGGTTTCCC in second new exon sequence 2184 */ 2185 List<AlignedCodonFrame> pep4Mappings = MappingUtils 2186 .findMappingsForSequence(pep4, cdsMappings); 2187 assertEquals(2, pep4Mappings.size()); 2188 mappings = MappingUtils.findMappingsForSequence(cds.getSequenceAt(1), 2189 pep4Mappings); 2190 assertEquals(1, mappings.size()); 2191 // map G to GGG 2192 sr = MappingUtils.buildSearchResults(pep4, 1, mappings); 2193 assertEquals(1, sr.getResults().size()); 2194 m = sr.getResults().get(0); 2195 assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence()); 2196 assertEquals(1, m.getStart()); 2197 assertEquals(3, m.getEnd()); 2198 // map F to TTT 2199 sr = MappingUtils.buildSearchResults(pep4, 2, mappings); 2200 m = sr.getResults().get(0); 2201 assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence()); 2202 assertEquals(4, m.getStart()); 2203 assertEquals(6, m.getEnd()); 2204 // map P to CCC 2205 sr = MappingUtils.buildSearchResults(pep4, 3, mappings); 2206 m = sr.getResults().get(0); 2207 assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence()); 2208 assertEquals(7, m.getStart()); 2209 assertEquals(9, m.getEnd()); 2210 } 2211 2212 /** 2213 * Test the method that just copies aligned sequences, provided all sequences 2214 * to be aligned share the aligned sequence's dataset 2215 */ 2216 @Test(groups = "Functional") testAlignAsSameSequences()2217 public void testAlignAsSameSequences() 2218 { 2219 SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa"); 2220 SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA"); 2221 AlignmentI al1 = new Alignment(new SequenceI[] { dna1, dna2 }); 2222 ((Alignment) al1).createDatasetAlignment(); 2223 2224 SequenceI dna3 = new Sequence(dna1); 2225 SequenceI dna4 = new Sequence(dna2); 2226 assertSame(dna3.getDatasetSequence(), dna1.getDatasetSequence()); 2227 assertSame(dna4.getDatasetSequence(), dna2.getDatasetSequence()); 2228 String seq1 = "-cc-GG-GT-TT--aaa"; 2229 dna3.setSequence(seq1); 2230 String seq2 = "C--C-Cgg--gtt-tAA-A-"; 2231 dna4.setSequence(seq2); 2232 AlignmentI al2 = new Alignment(new SequenceI[] { dna3, dna4 }); 2233 ((Alignment) al2).createDatasetAlignment(); 2234 2235 /* 2236 * alignment removes gapped columns (two internal, two trailing) 2237 */ 2238 assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2)); 2239 String aligned1 = "-cc-GG-GTTT-aaa"; 2240 assertEquals(aligned1, 2241 al1.getSequenceAt(0).getSequenceAsString()); 2242 String aligned2 = "C--C-Cgg-gtttAAA"; 2243 assertEquals(aligned2, 2244 al1.getSequenceAt(1).getSequenceAsString()); 2245 2246 /* 2247 * add another sequence to 'aligned' - should still succeed, since 2248 * unaligned sequences still share a dataset with aligned sequences 2249 */ 2250 SequenceI dna5 = new Sequence("dna5", "CCCgggtttAAA"); 2251 dna5.createDatasetSequence(); 2252 al2.addSequence(dna5); 2253 assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2)); 2254 assertEquals(aligned1, al1.getSequenceAt(0).getSequenceAsString()); 2255 assertEquals(aligned2, al1.getSequenceAt(1).getSequenceAsString()); 2256 2257 /* 2258 * add another sequence to 'unaligned' - should fail, since now not 2259 * all unaligned sequences share a dataset with aligned sequences 2260 */ 2261 SequenceI dna6 = new Sequence("dna6", "CCCgggtttAAA"); 2262 dna6.createDatasetSequence(); 2263 al1.addSequence(dna6); 2264 // JAL-2110 JBP Comment: what's the use case for this behaviour ? 2265 assertFalse(AlignmentUtils.alignAsSameSequences(al1, al2)); 2266 } 2267 2268 @Test(groups = "Functional") testAlignAsSameSequencesMultipleSubSeq()2269 public void testAlignAsSameSequencesMultipleSubSeq() 2270 { 2271 SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa"); 2272 SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA"); 2273 SequenceI as1 = dna1.deriveSequence(); // cccGGGTTTaaa/1-12 2274 SequenceI as2 = dna1.deriveSequence().getSubSequence(3, 7); // GGGT/4-7 2275 SequenceI as3 = dna2.deriveSequence(); // CCCgggtttAAA/1-12 2276 as1.insertCharAt(6, 5, '-'); 2277 assertEquals("cccGGG-----TTTaaa", as1.getSequenceAsString()); 2278 as2.insertCharAt(6, 5, '-'); 2279 assertEquals("GGGT-----", as2.getSequenceAsString()); 2280 as3.insertCharAt(3, 5, '-'); 2281 assertEquals("CCC-----gggtttAAA", as3.getSequenceAsString()); 2282 AlignmentI aligned = new Alignment(new SequenceI[] { as1, as2, as3 }); 2283 2284 // why do we need to cast this still ? 2285 ((Alignment) aligned).createDatasetAlignment(); 2286 SequenceI uas1 = dna1.deriveSequence(); 2287 SequenceI uas2 = dna1.deriveSequence().getSubSequence(3, 7); 2288 SequenceI uas3 = dna2.deriveSequence(); 2289 AlignmentI tobealigned = new Alignment(new SequenceI[] { uas1, uas2, 2290 uas3 }); 2291 ((Alignment) tobealigned).createDatasetAlignment(); 2292 2293 /* 2294 * alignAs lines up dataset sequences and removes empty columns (two) 2295 */ 2296 assertTrue(AlignmentUtils.alignAsSameSequences(tobealigned, aligned)); 2297 assertEquals("cccGGG---TTTaaa", uas1.getSequenceAsString()); 2298 assertEquals("GGGT", uas2.getSequenceAsString()); 2299 assertEquals("CCC---gggtttAAA", uas3.getSequenceAsString()); 2300 } 2301 2302 @Test(groups = { "Functional" }) testTransferGeneLoci()2303 public void testTransferGeneLoci() 2304 { 2305 SequenceI from = new Sequence("transcript", 2306 "aaacccgggTTTAAACCCGGGtttaaacccgggttt"); 2307 SequenceI to = new Sequence("CDS", "TTTAAACCCGGG"); 2308 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 10, 21 }, 1, 2309 1); 2310 2311 /* 2312 * first with nothing to transfer 2313 */ 2314 AlignmentUtils.transferGeneLoci(from, map, to); 2315 assertNull(to.getGeneLoci()); 2316 2317 /* 2318 * next with gene loci set on 'from' sequence 2319 */ 2320 int[] exons = new int[] { 100, 105, 155, 164, 210, 229 }; 2321 MapList geneMap = new MapList(new int[] { 1, 36 }, exons, 1, 1); 2322 from.setGeneLoci("human", "GRCh38", "7", geneMap); 2323 AlignmentUtils.transferGeneLoci(from, map, to); 2324 2325 GeneLociI toLoci = to.getGeneLoci(); 2326 assertNotNull(toLoci); 2327 // DBRefEntry constructor upper-cases 'source' 2328 assertEquals("HUMAN", toLoci.getSpeciesId()); 2329 assertEquals("GRCh38", toLoci.getAssemblyId()); 2330 assertEquals("7", toLoci.getChromosomeId()); 2331 2332 /* 2333 * transcript 'exons' are 1-6, 7-16, 17-36 2334 * CDS 1:12 is transcript 10-21 2335 * transcript 'CDS' is 10-16, 17-21 2336 * which is 'gene' 158-164, 210-214 2337 */ 2338 MapList toMap = toLoci.getMapping(); 2339 assertEquals(1, toMap.getFromRanges().size()); 2340 assertEquals(2, toMap.getFromRanges().get(0).length); 2341 assertEquals(1, toMap.getFromRanges().get(0)[0]); 2342 assertEquals(12, toMap.getFromRanges().get(0)[1]); 2343 assertEquals(2, toMap.getToRanges().size()); 2344 assertEquals(2, toMap.getToRanges().get(0).length); 2345 assertEquals(158, toMap.getToRanges().get(0)[0]); 2346 assertEquals(164, toMap.getToRanges().get(0)[1]); 2347 assertEquals(210, toMap.getToRanges().get(1)[0]); 2348 assertEquals(214, toMap.getToRanges().get(1)[1]); 2349 // or summarised as (but toString might change in future): 2350 assertEquals("[ [1, 12] ] 1:1 to [ [158, 164] [210, 214] ]", 2351 toMap.toString()); 2352 2353 /* 2354 * an existing value is not overridden 2355 */ 2356 geneMap = new MapList(new int[] { 1, 36 }, new int[] { 36, 1 }, 1, 1); 2357 from.setGeneLoci("inhuman", "GRCh37", "6", geneMap); 2358 AlignmentUtils.transferGeneLoci(from, map, to); 2359 assertEquals("GRCh38", toLoci.getAssemblyId()); 2360 assertEquals("7", toLoci.getChromosomeId()); 2361 toMap = toLoci.getMapping(); 2362 assertEquals("[ [1, 12] ] 1:1 to [ [158, 164] [210, 214] ]", 2363 toMap.toString()); 2364 } 2365 2366 /** 2367 * Tests for the method that maps nucleotide to protein based on CDS features 2368 */ 2369 @Test(groups = "Functional") testMapCdsToProtein()2370 public void testMapCdsToProtein() 2371 { 2372 SequenceI peptide = new Sequence("pep", "KLQ"); 2373 2374 /* 2375 * Case 1: CDS 3 times length of peptide 2376 * NB method only checks lengths match, not translation 2377 */ 2378 SequenceI dna = new Sequence("dna", "AACGacgtCTCCT"); 2379 dna.createDatasetSequence(); 2380 dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); 2381 dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 13, null)); 2382 MapList ml = AlignmentUtils.mapCdsToProtein(dna, peptide); 2383 assertEquals(3, ml.getFromRatio()); 2384 assertEquals(1, ml.getToRatio()); 2385 assertEquals("[[1, 3]]", 2386 Arrays.deepToString(ml.getToRanges().toArray())); 2387 assertEquals("[[1, 4], [9, 13]]", 2388 Arrays.deepToString(ml.getFromRanges().toArray())); 2389 2390 /* 2391 * Case 2: CDS 3 times length of peptide + stop codon 2392 * (note code does not currently check trailing codon is a stop codon) 2393 */ 2394 dna = new Sequence("dna", "AACGacgtCTCCTCCC"); 2395 dna.createDatasetSequence(); 2396 dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); 2397 dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 16, null)); 2398 ml = AlignmentUtils.mapCdsToProtein(dna, peptide); 2399 assertEquals(3, ml.getFromRatio()); 2400 assertEquals(1, ml.getToRatio()); 2401 assertEquals("[[1, 3]]", 2402 Arrays.deepToString(ml.getToRanges().toArray())); 2403 assertEquals("[[1, 4], [9, 13]]", 2404 Arrays.deepToString(ml.getFromRanges().toArray())); 2405 2406 /* 2407 * Case 3: CDS longer than 3 * peptide + stop codon - no mapping is made 2408 */ 2409 dna = new Sequence("dna", "AACGacgtCTCCTTGATCA"); 2410 dna.createDatasetSequence(); 2411 dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); 2412 dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 19, null)); 2413 ml = AlignmentUtils.mapCdsToProtein(dna, peptide); 2414 assertNull(ml); 2415 2416 /* 2417 * Case 4: CDS shorter than 3 * peptide - no mapping is made 2418 */ 2419 dna = new Sequence("dna", "AACGacgtCTCC"); 2420 dna.createDatasetSequence(); 2421 dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); 2422 dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 12, null)); 2423 ml = AlignmentUtils.mapCdsToProtein(dna, peptide); 2424 assertNull(ml); 2425 2426 /* 2427 * Case 5: CDS 3 times length of peptide + part codon - mapping is truncated 2428 */ 2429 dna = new Sequence("dna", "AACGacgtCTCCTTG"); 2430 dna.createDatasetSequence(); 2431 dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); 2432 dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, null)); 2433 ml = AlignmentUtils.mapCdsToProtein(dna, peptide); 2434 assertEquals(3, ml.getFromRatio()); 2435 assertEquals(1, ml.getToRatio()); 2436 assertEquals("[[1, 3]]", 2437 Arrays.deepToString(ml.getToRanges().toArray())); 2438 assertEquals("[[1, 4], [9, 13]]", 2439 Arrays.deepToString(ml.getFromRanges().toArray())); 2440 2441 /* 2442 * Case 6: incomplete start codon corresponding to X in peptide 2443 */ 2444 dna = new Sequence("dna", "ACGacgtCTCCTTGG"); 2445 dna.createDatasetSequence(); 2446 SequenceFeature sf = new SequenceFeature("CDS", "", 1, 3, null); 2447 sf.setPhase("2"); // skip 2 positions (AC) to start of next codon (GCT) 2448 dna.addSequenceFeature(sf); 2449 dna.addSequenceFeature(new SequenceFeature("CDS", "", 8, 15, null)); 2450 peptide = new Sequence("pep", "XLQ"); 2451 ml = AlignmentUtils.mapCdsToProtein(dna, peptide); 2452 assertEquals("[[2, 3]]", 2453 Arrays.deepToString(ml.getToRanges().toArray())); 2454 assertEquals("[[3, 3], [8, 12]]", 2455 Arrays.deepToString(ml.getFromRanges().toArray())); 2456 } 2457 2458 /** 2459 * Tests for the method that locates the CDS sequence that has a mapping to 2460 * the given protein. That is, given a transcript-to-peptide mapping, find the 2461 * cds-to-peptide mapping that relates to both, and return the CDS sequence. 2462 */ 2463 @Test testFindCdsForProtein()2464 public void testFindCdsForProtein() 2465 { 2466 List<AlignedCodonFrame> mappings = new ArrayList<>(); 2467 AlignedCodonFrame acf1 = new AlignedCodonFrame(); 2468 mappings.add(acf1); 2469 2470 SequenceI dna1 = new Sequence("dna1", "cgatATcgGCTATCTATGacg"); 2471 dna1.createDatasetSequence(); 2472 2473 // NB we currently exclude STOP codon from CDS sequences 2474 // the test would need to change if this changes in future 2475 SequenceI cds1 = new Sequence("cds1", "ATGCTATCT"); 2476 cds1.createDatasetSequence(); 2477 2478 SequenceI pep1 = new Sequence("pep1", "MLS"); 2479 pep1.createDatasetSequence(); 2480 List<AlignedCodonFrame> seqMappings = new ArrayList<>(); 2481 MapList mapList = new MapList( 2482 new int[] 2483 { 5, 6, 9, 15 }, new int[] { 1, 3 }, 3, 1); 2484 Mapping dnaToPeptide = new Mapping(pep1.getDatasetSequence(), mapList); 2485 2486 // add dna to peptide mapping 2487 seqMappings.add(acf1); 2488 acf1.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), 2489 mapList); 2490 2491 /* 2492 * first case - no dna-to-CDS mapping exists - search fails 2493 */ 2494 SequenceI seq = AlignmentUtils.findCdsForProtein(mappings, dna1, 2495 seqMappings, dnaToPeptide); 2496 assertNull(seq); 2497 2498 /* 2499 * second case - CDS-to-peptide mapping exists but no dna-to-CDS 2500 * - search fails 2501 */ 2502 // todo this test fails if the mapping is added to acf1, not acf2 2503 // need to tidy up use of lists of mappings in AlignedCodonFrame 2504 AlignedCodonFrame acf2 = new AlignedCodonFrame(); 2505 mappings.add(acf2); 2506 MapList cdsToPeptideMapping = new MapList(new int[] 2507 { 1, 9 }, new int[] { 1, 3 }, 3, 1); 2508 acf2.addMap(cds1.getDatasetSequence(), pep1.getDatasetSequence(), 2509 cdsToPeptideMapping); 2510 assertNull(AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, 2511 dnaToPeptide)); 2512 2513 /* 2514 * third case - add dna-to-CDS mapping - CDS is now found! 2515 */ 2516 MapList dnaToCdsMapping = new MapList(new int[] { 5, 6, 9, 15 }, 2517 new int[] 2518 { 1, 9 }, 1, 1); 2519 acf1.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), 2520 dnaToCdsMapping); 2521 seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, 2522 dnaToPeptide); 2523 assertSame(seq, cds1.getDatasetSequence()); 2524 } 2525 2526 /** 2527 * Tests for the method that locates the CDS sequence that has a mapping to 2528 * the given protein. That is, given a transcript-to-peptide mapping, find the 2529 * cds-to-peptide mapping that relates to both, and return the CDS sequence. 2530 * This test is for the case where transcript and CDS are the same length. 2531 */ 2532 @Test testFindCdsForProtein_noUTR()2533 public void testFindCdsForProtein_noUTR() 2534 { 2535 List<AlignedCodonFrame> mappings = new ArrayList<>(); 2536 AlignedCodonFrame acf1 = new AlignedCodonFrame(); 2537 mappings.add(acf1); 2538 2539 SequenceI dna1 = new Sequence("dna1", "ATGCTATCTTAA"); 2540 dna1.createDatasetSequence(); 2541 2542 // NB we currently exclude STOP codon from CDS sequences 2543 // the test would need to change if this changes in future 2544 SequenceI cds1 = new Sequence("cds1", "ATGCTATCT"); 2545 cds1.createDatasetSequence(); 2546 2547 SequenceI pep1 = new Sequence("pep1", "MLS"); 2548 pep1.createDatasetSequence(); 2549 List<AlignedCodonFrame> seqMappings = new ArrayList<>(); 2550 MapList mapList = new MapList( 2551 new int[] 2552 { 1, 9 }, new int[] { 1, 3 }, 3, 1); 2553 Mapping dnaToPeptide = new Mapping(pep1.getDatasetSequence(), mapList); 2554 2555 // add dna to peptide mapping 2556 seqMappings.add(acf1); 2557 acf1.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), 2558 mapList); 2559 2560 /* 2561 * first case - transcript lacks CDS features - it appears to be 2562 * the CDS sequence and is returned 2563 */ 2564 SequenceI seq = AlignmentUtils.findCdsForProtein(mappings, dna1, 2565 seqMappings, dnaToPeptide); 2566 assertSame(seq, dna1.getDatasetSequence()); 2567 2568 /* 2569 * second case - transcript has CDS feature - this means it is 2570 * not returned as a match for CDS (CDS sequences don't have CDS features) 2571 */ 2572 dna1.addSequenceFeature( 2573 new SequenceFeature(SequenceOntologyI.CDS, "cds", 1, 12, null)); 2574 seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, 2575 dnaToPeptide); 2576 assertNull(seq); 2577 2578 /* 2579 * third case - CDS-to-peptide mapping exists but no dna-to-CDS 2580 * - search fails 2581 */ 2582 // todo this test fails if the mapping is added to acf1, not acf2 2583 // need to tidy up use of lists of mappings in AlignedCodonFrame 2584 AlignedCodonFrame acf2 = new AlignedCodonFrame(); 2585 mappings.add(acf2); 2586 MapList cdsToPeptideMapping = new MapList(new int[] 2587 { 1, 9 }, new int[] { 1, 3 }, 3, 1); 2588 acf2.addMap(cds1.getDatasetSequence(), pep1.getDatasetSequence(), 2589 cdsToPeptideMapping); 2590 assertNull(AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, 2591 dnaToPeptide)); 2592 2593 /* 2594 * fourth case - add dna-to-CDS mapping - CDS is now found! 2595 */ 2596 MapList dnaToCdsMapping = new MapList(new int[] { 1, 9 }, 2597 new int[] 2598 { 1, 9 }, 1, 1); 2599 acf1.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), 2600 dnaToCdsMapping); 2601 seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, 2602 dnaToPeptide); 2603 assertSame(seq, cds1.getDatasetSequence()); 2604 } 2605 } 2606