/*
 * Jalview - A Sequence Alignment Editor and Viewer (2.11.1.4)
 * Copyright (C) 2021 The Jalview Authors
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License 
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *  
 * Jalview is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
 * PURPOSE.  See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
package jalview.analysis;

import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertFalse;
import static org.testng.AssertJUnit.assertNotNull;
import static org.testng.AssertJUnit.assertNotSame;
import static org.testng.AssertJUnit.assertNull;
import static org.testng.AssertJUnit.assertSame;
import static org.testng.AssertJUnit.assertTrue;

import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.gui.JvOptionPane;
import jalview.util.DBRefUtils;
import jalview.util.MapList;
import jalview.ws.SequenceFetcher;
import jalview.ws.SequenceFetcherFactory;

import java.util.ArrayList;
import java.util.List;

import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

/**
 * Tests for {@link CrossRef}: discovery of cross-reference source databases
 * and retrieval of cross-referenced sequences, either from the alignment
 * itself or via a (stubbed) {@link SequenceFetcher}.
 */
public class CrossRefTest
{

  /**
   * Puts JvOptionPane into non-interactive mode with a canned CANCEL response
   * before any test in this class runs.
   */
  @BeforeClass(alwaysRun = true)
  public void setUpJvOptionPane()
  {
    JvOptionPane.setInteractiveMode(false);
    JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION);
  }

  /**
   * Tests {@link DBRefUtils#selectDbRefs(boolean, DBRefEntry[])} for both the
   * nucleotide and the protein selection, checking the identity and order of
   * the returned references.
   */
  @Test(groups = { "Functional" })
  public void testFindXDbRefs()
  {
    DBRefEntry ref1 = new DBRefEntry("UNIPROT", "1", "A123");
    DBRefEntry ref2 = new DBRefEntry("UNIPROTKB/TREMBL", "1", "A123");
    DBRefEntry ref3 = new DBRefEntry("pdb", "1", "A123");
    DBRefEntry ref4 = new DBRefEntry("EMBLCDSPROTEIN", "1", "A123");
    DBRefEntry ref5 = new DBRefEntry("embl", "1", "A123");
    DBRefEntry ref6 = new DBRefEntry("emblCDS", "1", "A123");
    DBRefEntry ref7 = new DBRefEntry("GeneDB", "1", "A123");
    DBRefEntry ref8 = new DBRefEntry("PFAM", "1", "A123");
    // ENSEMBL is a source of either dna or protein sequence data
    DBRefEntry ref9 = new DBRefEntry("ENSEMBL", "1", "A123");
    DBRefEntry[] refs = new DBRefEntry[] { ref1, ref2, ref3, ref4, ref5,
        ref6, ref7, ref8, ref9 };

    /*
     * Just the DNA refs:
     */
    DBRefEntry[] found = DBRefUtils.selectDbRefs(true, refs);
    assertEquals(4, found.length);
    assertSame(ref5, found[0]);
    assertSame(ref6, found[1]);
    assertSame(ref7, found[2]);
    assertSame(ref9, found[3]);

    /*
     * Just the protein refs:
     */
    found = DBRefUtils.selectDbRefs(false, refs);
    assertEquals(4, found.length);
    assertSame(ref1, found[0]);
    assertSame(ref2, found[1]);
    assertSame(ref4, found[2]);
    assertSame(ref9, found[3]);
  }

  /**
   * Test the method that finds a sequence's "product" xref source databases,
   * which may be direct (dbrefs on the sequence), or indirect (dbrefs on
   * sequences which share a dbref with the sequence)
   */
  @Test(groups = { "Functional" }, enabled = true)
  public void testFindXrefSourcesForSequence_proteinToDna()
  {
    SequenceI seq = new Sequence("Seq1", "MGKYQARLSS");
    AlignmentI al = new Alignment(new SequenceI[] {});

    /*
     * first with no dbrefs to search
     */
    List<String> sources = new CrossRef(new SequenceI[] { seq }, al)
            .findXrefSourcesForSequences(false);
    assertTrue(sources.isEmpty());

    /*
     * add some dbrefs to sequence
     */
    // protein db is not a candidate for findXrefSources
    seq.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
    // dna coding databases are
    seq.addDBRef(new DBRefEntry("EMBL", "0", "E2345"));
    // a second EMBL xref should not result in a duplicate
    seq.addDBRef(new DBRefEntry("EMBL", "0", "E2346"));
    seq.addDBRef(new DBRefEntry("EMBLCDS", "0", "E2347"));
    seq.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
    seq.addDBRef(new DBRefEntry("ENSEMBL", "0", "E2349"));
    seq.addDBRef(new DBRefEntry("ENSEMBLGENOMES", "0", "E2350"));
    sources = new CrossRef(new SequenceI[] { seq }, al)
            .findXrefSourcesForSequences(false);
    // method is patched to remove EMBL from the sources to match
    assertEquals(4, sources.size());
    assertEquals("[EMBLCDS, GENEDB, ENSEMBL, ENSEMBLGENOMES]",
            sources.toString());

    /*
     * add a sequence to the alignment which has a dbref to UNIPROT|A1234
     * and others to dna coding databases
     */
    seq.setDBRefs(null);
    seq.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
    seq.addDBRef(new DBRefEntry("EMBLCDS", "0", "E2347"));
    SequenceI seq2 = new Sequence("Seq2", "MGKYQARLSS");
    seq2.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
    seq2.addDBRef(new DBRefEntry("EMBL", "0", "E2345"));
    seq2.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
    // TODO include ENSEMBLGENOMES in DBRefSource.DNACODINGDBS ?
    al.addSequence(seq2);
    sources = new CrossRef(new SequenceI[] { seq, seq2 }, al)
            .findXrefSourcesForSequences(false);
    // method removed EMBL from sources to match
    assertEquals(2, sources.size());
    assertEquals("[EMBLCDS, GENEDB]", sources.toString());
  }

  /**
   * Test for finding 'product' sequences for the case where only an indirect
   * xref is found - not on the nucleotide sequence but on a peptide sequence in
   * the alignment with which it shares a nucleotide dbref
   */
  @Test(groups = { "Functional" }, enabled = true)
  public void testFindXrefSequences_indirectDbrefToProtein()
  {
    /*
     * Alignment setup:
     *   - nucleotide dbref  EMBL|AF039662
     *   - peptide    dbrefs EMBL|AF039662, UNIPROT|Q9ZTS2
     */
    SequenceI emblSeq = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
    emblSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
    SequenceI uniprotSeq = new Sequence("Q9ZTS2", "MASVSATMISTS");
    uniprotSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
    uniprotSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));

    /*
     * Find UNIPROT xrefs for nucleotide 
     * - it has no UNIPROT dbref of its own
     * - but peptide with matching nucleotide dbref does, so is returned
     */
    AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
    Alignment xrefs = new CrossRef(new SequenceI[] { emblSeq }, al)
            .findXrefSequences("UNIPROT", true);
    assertEquals(1, xrefs.getHeight());
    assertSame(uniprotSeq, xrefs.getSequenceAt(0));
  }

  /**
   * Test for finding 'product' sequences for the case where only an indirect
   * xref is found - not on the peptide sequence but on a nucleotide sequence in
   * the alignment with which it shares a protein dbref
   */
  @Test(groups = { "Functional" }, enabled = true)
  public void testFindXrefSequences_indirectDbrefToNucleotide()
  {
    /*
     * Alignment setup:
     *   - peptide    dbref  UNIPROT|Q9ZTS2
     *   - nucleotide dbref  EMBL|AF039662, UNIPROT|Q9ZTS2
     */
    SequenceI uniprotSeq = new Sequence("Q9ZTS2", "MASVSATMISTS");
    uniprotSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
    SequenceI emblSeq = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
    emblSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
    emblSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));

    /*
     * Find EMBL xrefs for peptide 
     * - it has no EMBL dbref of its own
     * - but nucleotide with matching peptide dbref does, so is returned
     */
    AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
    Alignment xrefs = new CrossRef(new SequenceI[] { uniprotSeq }, al)
            .findXrefSequences("EMBL", false);
    assertEquals(1, xrefs.getHeight());
    assertSame(emblSeq, xrefs.getSequenceAt(0));
  }

  /**
   * Test for finding 'product' sequences for the case where the selected
   * sequence has no dbref to the desired source, and there are no indirect
   * references via another sequence in the alignment
   */
  @Test(groups = { "Functional" })
  public void testFindXrefSequences_noDbrefs()
  {
    /*
     * two nucleotide sequences, one with UNIPROT dbref
     */
    SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
    dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
    SequenceI dna2 = new Sequence("AJ307031", "AAACCCTTT");

    /*
     * find UNIPROT xrefs for the second nucleotide sequence - it has no
     * direct dbrefs, and the other sequence (which has a UNIPROT dbref) is
     * not equatable to it, so no results found
     */
    AlignmentI al = new Alignment(new SequenceI[] { dna1, dna2 });
    Alignment xrefs = new CrossRef(new SequenceI[] { dna2 }, al)
            .findXrefSequences("UNIPROT", true);
    assertNull(xrefs);
  }

  /**
   * Tests for the method that searches an alignment (with one sequence
   * excluded) for protein/nucleotide sequences with a given cross-reference
   */
  @Test(groups = { "Functional" }, enabled = true)
  public void testSearchDataset()
  {
    /*
     * nucleotide sequence with UNIPROT AND EMBL dbref
     * peptide sequence with UNIPROT dbref
     */
    SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
    Mapping map = new Mapping(new Sequence("pep2", "MLAVSRG"),
            new MapList(new int[] { 1, 21 }, new int[] { 1, 7 }, 3, 1));
    DBRefEntry dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2", map);
    dna1.addDBRef(dbref);
    dna1.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
    SequenceI pep1 = new Sequence("Q9ZTS2", "MLAVSRGQ");
    pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
    AlignmentI al = new Alignment(new SequenceI[] { dna1, pep1 });

    List<SequenceI> result = new ArrayList<>();

    /*
     * first search for a dbref nowhere on the alignment:
     */
    dbref = new DBRefEntry("UNIPROT", "0", "P30419");
    CrossRef testee = new CrossRef(al.getSequencesArray(), al);
    AlignedCodonFrame acf = new AlignedCodonFrame();
    boolean found = testee.searchDataset(true, dna1, dbref, result, acf,
            true);
    assertFalse(found);
    assertTrue(result.isEmpty());
    assertTrue(acf.isEmpty());

    /*
     * search for a protein sequence with dbref UNIPROT:Q9ZTS2
     * (search dataset with a protein xref from a dna sequence to locate
     * the protein product)
     */
    acf = new AlignedCodonFrame();
    dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
    found = testee.searchDataset(!dna1.isProtein(), dna1, dbref, result,
            acf, false);
    assertTrue(found);
    assertEquals(1, result.size());
    assertSame(pep1, result.get(0));
    assertTrue(acf.isEmpty());

    /*
     * search for a nucleotide sequence with dbref UNIPROT:Q9ZTS2
     * (search dataset with a protein's direct dbref to locate dna
     * sequences with matching xref)
     */
    result.clear();
    acf = new AlignedCodonFrame();
    dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
    found = testee.searchDataset(!pep1.isProtein(), pep1, dbref, result,
            acf, false);
    assertTrue(found);
    assertEquals(1, result.size());
    assertSame(dna1, result.get(0));
    // should now have a mapping from dna to pep1
    List<SequenceToSequenceMapping> mappings = acf.getMappings();
    assertEquals(1, mappings.size());
    SequenceToSequenceMapping mapping = mappings.get(0);
    assertSame(dna1, mapping.getFromSeq());
    assertSame(pep1, mapping.getMapping().getTo());
    MapList mapList = mapping.getMapping().getMap();
    assertEquals(1, mapList.getToRatio());
    assertEquals(3, mapList.getFromRatio());
    assertEquals(1, mapList.getFromRanges().size());
    assertEquals(1, mapList.getFromRanges().get(0)[0]);
    assertEquals(21, mapList.getFromRanges().get(0)[1]);
    assertEquals(1, mapList.getToRanges().size());
    assertEquals(1, mapList.getToRanges().get(0)[0]);
    assertEquals(7, mapList.getToRanges().get(0)[1]);
  }

  /**
   * Test for finding 'product' sequences for the case where the selected
   * sequence has a dbref with a mapping to a sequence. This represents the case
   * where either
   * <ul>
   * <li>a fetched sequence is already decorated with its cross-reference (e.g.
   * EMBL + translation), or</li>
   * <li>Get Cross-References has been done once resulting in instantiated
   * cross-reference mappings</li>
   * </ul>
   */
  @Test(groups = { "Functional" })
  public void testFindXrefSequences_fromDbRefMap()
  {
    /*
     * scenario: nucleotide sequence AF039662
     *   with dbref + mapping to Q9ZTS2 and P30419
     *     which themselves each have a dbref and feature
     */
    SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
    SequenceI pep1 = new Sequence("Q9ZTS2", "MALFQRSV");
    SequenceI pep2 = new Sequence("P30419", "MTRRSQIF");
    dna1.createDatasetSequence();
    pep1.createDatasetSequence();
    pep2.createDatasetSequence();

    pep1.getDatasetSequence()
            .addDBRef(new DBRefEntry("Pfam", "0", "PF00111"));
    pep1.addSequenceFeature(
            new SequenceFeature("type", "desc", 12, 14, 1f, "group"));
    pep2.getDatasetSequence().addDBRef(new DBRefEntry("PDB", "0", "3JTK"));
    pep2.addSequenceFeature(
            new SequenceFeature("type2", "desc2", 13, 15, 12f, "group2"));

    MapList mapList = new MapList(new int[] { 1, 24 }, new int[] { 1, 3 },
            3, 1);
    Mapping map = new Mapping(pep1, mapList);
    DBRefEntry dbRef1 = new DBRefEntry("UNIPROT", "0", "Q9ZTS2", map);
    dna1.getDatasetSequence().addDBRef(dbRef1);
    mapList = new MapList(new int[] { 1, 24 }, new int[] { 1, 3 }, 3, 1);
    map = new Mapping(pep2, mapList);
    DBRefEntry dbRef2 = new DBRefEntry("UNIPROT", "0", "P30419", map);
    dna1.getDatasetSequence().addDBRef(dbRef2);

    /*
     * find UNIPROT xrefs for nucleotide sequence - it should pick up 
     * mapped sequences
     */
    AlignmentI al = new Alignment(new SequenceI[] { dna1 });
    Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
            .findXrefSequences("UNIPROT", true);
    assertEquals(2, xrefs.getHeight());

    /*
     * cross-refs alignment holds copies of the mapped sequences
     * including copies of their dbrefs and features
     */
    checkCopySequence(xrefs.getSequenceAt(0), pep1);
    checkCopySequence(xrefs.getSequenceAt(1), pep2);
  }

  /**
   * Helper method that verifies that 'copy' has the same name, start, end,
   * sequence and dataset sequence object as 'original' (but is not the same
   * object)
   * 
   * @param copy
   *          the sequence under test
   * @param original
   *          the sequence supplying the expected values
   */
  private void checkCopySequence(SequenceI copy, SequenceI original)
  {
    // 'original' is passed first so that it is reported as the expected
    // value in any assertion failure message
    assertNotSame(original, copy);
    assertSame(original.getDatasetSequence(), copy.getDatasetSequence());
    assertEquals(original.getName(), copy.getName());
    assertEquals(original.getStart(), copy.getStart());
    assertEquals(original.getEnd(), copy.getEnd());
    assertEquals(original.getSequenceAsString(),
            copy.getSequenceAsString());
  }

  /**
   * Test for finding 'product' sequences for the case where the selected
   * sequence has a dbref with no mapping, triggering a fetch from database
   */
  @Test(groups = { "Functional_Failing" })
  public void testFindXrefSequences_withFetch()
  {
    SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
    dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "Q9ZTS2"));
    dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "P30419"));
    dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "P00314"));
    final SequenceI pep1 = new Sequence("Q9ZTS2", "MYQLIRSSW");
    pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));

    final SequenceI pep2 = new Sequence("P00314", "MRKLLAASG");
    pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "P00314"));

    /*
     * stub fetcher: reports every source as fetchable and always 
     * 'fetches' the two peptides above
     * todo: define an interface type SequenceFetcherI and mock that
     */
    SequenceFetcher mockFetcher = new SequenceFetcher()
    {
      @Override
      public boolean isFetchable(String source)
      {
        return true;
      }

      @Override
      public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
      {
        return new SequenceI[] { pep1, pep2 };
      }
    };
    SequenceFetcherFactory.setSequenceFetcher(mockFetcher);

    /*
     * find UNIPROT xrefs for nucleotide sequence
     */
    AlignmentI al = new Alignment(new SequenceI[] { dna1 });
    Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
            .findXrefSequences("UNIPROT", true);
    assertEquals(2, xrefs.getHeight());
    assertSame(pep1, xrefs.getSequenceAt(0));
    assertSame(pep2, xrefs.getSequenceAt(1));
  }

  /**
   * Clears any stub sequence fetcher installed by the tests in this class by
   * passing null to the factory.
   */
  @AfterClass(alwaysRun = true)
  public void tearDown()
  {
    SequenceFetcherFactory.setSequenceFetcher(null);
  }

  /**
   * Test for finding 'product' sequences for the case where both gene and
   * transcript sequences have dbrefs to Uniprot.
   */
  @Test(groups = { "Functional_Failing" })
  public void testFindXrefSequences_forGeneAndTranscripts()
  {
    /*
     * 'gene' sequence
     */
    SequenceI gene = new Sequence("ENSG00000157764", "CGCCTCCCTTCCCC");
    gene.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
    gene.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));

    /*
     * 'transcript' with CDS feature (supports mapping to protein)
     */
    SequenceI braf001 = new Sequence("ENST00000288602",
            "taagATGGCGGCGCTGa");
    braf001.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
    braf001.addSequenceFeature(
            new SequenceFeature("CDS", "", 5, 16, 0f, null));

    /*
     * 'spliced transcript' with CDS ranges
     */
    SequenceI braf002 = new Sequence("ENST00000497784",
            "gCAGGCtaTCTGTTCaa");
    braf002.addDBRef(new DBRefEntry("UNIPROT", "ENSEMBL|0", "H7C5K3"));
    braf002.addSequenceFeature(
            new SequenceFeature("CDS", "", 2, 6, 0f, null));
    braf002.addSequenceFeature(
            new SequenceFeature("CDS", "", 9, 15, 0f, null));

    /*
     * TODO code is fragile - use of SequenceIdMatcher depends on fetched
     * sequences having a name starting Source|Accession
     * which happens to be true for Uniprot,PDB,EMBL but not Pfam,Rfam,Ensembl 
     */
    final SequenceI pep1 = new Sequence("UNIPROT|P15056", "MAAL");
    pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
    final SequenceI pep2 = new Sequence("UNIPROT|H7C5K3", "QALF");
    pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));

    /*
     * stub fetcher: reports every source as fetchable and always 
     * 'fetches' the two peptides above
     * todo: define an interface type SequenceFetcherI and mock that
     */
    SequenceFetcher mockFetcher = new SequenceFetcher()
    {
      @Override
      public boolean isFetchable(String source)
      {
        return true;
      }

      @Override
      public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
      {
        return new SequenceI[] { pep1, pep2 };
      }
    };
    SequenceFetcherFactory.setSequenceFetcher(mockFetcher);

    /*
     * find UNIPROT xrefs for gene and transcripts
     * intended behaviour:
     * - the two proteins are retrieved but not duplicated
     * - mappings are built from transcript (CDS) to proteins
     * - no mappings from gene to proteins
     * (only the first of these is asserted below)
     */
    SequenceI[] seqs = new SequenceI[] { gene, braf001, braf002 };
    AlignmentI al = new Alignment(seqs);
    Alignment xrefs = new CrossRef(seqs, al).findXrefSequences("UNIPROT",
            true);
    assertEquals(2, xrefs.getHeight());
    assertSame(pep1, xrefs.getSequenceAt(0));
    assertSame(pep2, xrefs.getSequenceAt(1));
  }

  /**
   * <pre>
   * Test that emulates this (real but simplified) case:
   * Alignment:          DBrefs
   *     UNIPROT|P0CE19  EMBL|J03321, EMBL|X06707, EMBL|M19487
   *     UNIPROT|P0CE20  EMBL|J03321, EMBL|X06707, EMBL|X07547
   * Find cross-references for EMBL. These are mocked here as
   *     EMBL|J03321     with mappings to P0CE18, P0CE19, P0CE20
   *     EMBL|X06707     with mappings to P0CE17, P0CE19, P0CE20
   *     EMBL|M19487     with mappings to P0CE19, Q46432
   *     EMBL|X07547     with mappings to P0CE20, B0BCM4
   * EMBL sequences are first 'fetched' (mocked here) for P0CE19.
   * The 3 EMBL sequences are added to the alignment dataset.
   * Their dbrefs to Uniprot products P0CE19 and P0CE20 should be matched in the
   * alignment dataset and updated to reference the original Uniprot sequences.
   * For the second Uniprot sequence, the J03321 and X06707 xrefs should be 
   * resolved from the dataset, and only the X07547 dbref fetched.
   * So the end state to verify is:
   * - 4 cross-ref sequences returned: J03321, X06707,  M19487, X07547
   * - P0CE19/20 dbrefs to EMBL sequences now have mappings
   * - J03321 dbrefs to P0CE19/20 mapped to original Uniprot sequences
   * - X06707 dbrefs to P0CE19/20 mapped to original Uniprot sequences
   * </pre>
   */
  @Test(groups = { "Functional_Failing" })
  public void testFindXrefSequences_uniprotEmblManyToMany()
  {
    /*
     * Uniprot sequences, both with xrefs to EMBL|J03321 
     * and EMBL|X06707; in addition P0CE19 has an xref to EMBL|M19487
     * and P0CE20 has an xref to EMBL|X07547
     */
    SequenceI p0ce19 = new Sequence("UNIPROT|P0CE19", "KPFG");
    p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
    p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
    p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "M19487"));
    SequenceI p0ce20 = new Sequence("UNIPROT|P0CE20", "PFGK");
    p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
    p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
    p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X07547"));

    /*
     * EMBL sequences to be 'fetched', complete with dbrefs and mappings
     * to their protein products (CDS location  and translations  are provided
     * in EMBL XML); these should be matched to, and replaced with,
     * the corresponding uniprot sequences after fetching
     */

    /*
     * J03321 with mappings to P0CE19 and P0CE20
     */
    final SequenceI j03321 = new Sequence("EMBL|J03321",
            "AAACCCTTTGGGAAAA");
    DBRefEntry dbref1 = new DBRefEntry("UNIPROT", "0", "P0CE19");
    MapList mapList = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 },
            3, 1);
    Mapping map = new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"),
            mapList);
    // add a dbref to the mapped to sequence - should get copied to p0ce19
    map.getTo().addDBRef(new DBRefEntry("PIR", "0", "S01875"));
    dbref1.setMap(map);
    j03321.addDBRef(dbref1);
    DBRefEntry dbref2 = new DBRefEntry("UNIPROT", "0", "P0CE20");
    mapList = new MapList(new int[] { 4, 15 }, new int[] { 2, 5 }, 3, 1);
    dbref2.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"),
            new MapList(mapList)));
    j03321.addDBRef(dbref2);

    /*
     * X06707 with mappings to P0CE19 and P0CE20
     */
    final SequenceI x06707 = new Sequence("EMBL|X06707",
            "atgAAACCCTTTGGG");
    DBRefEntry dbref3 = new DBRefEntry("UNIPROT", "0", "P0CE19");
    MapList map2 = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, 3,
            1);
    dbref3.setMap(
            new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), map2));
    x06707.addDBRef(dbref3);
    DBRefEntry dbref4 = new DBRefEntry("UNIPROT", "0", "P0CE20");
    MapList map3 = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, 3,
            1);
    dbref4.setMap(
            new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"), map3));
    x06707.addDBRef(dbref4);

    /*
     * M19487 with mapping to P0CE19 and Q46432
     */
    final SequenceI m19487 = new Sequence("EMBL|M19487", "AAACCCTTTGGG");
    DBRefEntry dbref5 = new DBRefEntry("UNIPROT", "0", "P0CE19");
    dbref5.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"),
            new MapList(mapList)));
    m19487.addDBRef(dbref5);
    DBRefEntry dbref6 = new DBRefEntry("UNIPROT", "0", "Q46432");
    dbref6.setMap(new Mapping(new Sequence("UNIPROT|Q46432", "KPFG"),
            new MapList(mapList)));
    m19487.addDBRef(dbref6);

    /*
     * X07547 with mapping to P0CE20 and B0BCM4
     */
    final SequenceI x07547 = new Sequence("EMBL|X07547",
            "cccAAACCCTTTGGG");
    DBRefEntry dbref7 = new DBRefEntry("UNIPROT", "0", "P0CE20");
    dbref7.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"),
            new MapList(map2)));
    x07547.addDBRef(dbref7);
    DBRefEntry dbref8 = new DBRefEntry("UNIPROT", "0", "B0BCM4");
    dbref8.setMap(new Mapping(new Sequence("UNIPROT|B0BCM4", "KPFG"),
            new MapList(map2)));
    x07547.addDBRef(dbref8);

    /*
     * mock sequence fetcher to 'return' the EMBL sequences
     * TODO: Mockito would allow .thenReturn().thenReturn() here, 
     * and also capture and verification of the parameters
     * passed in calls to getSequences() - important to verify that
     * duplicate sequence fetches are not requested
     */
    SequenceFetcher mockFetcher = new SequenceFetcher()
    {
      // counts invocations of getSequences so that the first and any
      // later fetch can return different results
      int call = 0;

      @Override
      public boolean isFetchable(String source)
      {
        return true;
      }

      @Override
      public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
      {
        call++;
        if (call == 1)
        {
          assertEquals("Expected 3 embl seqs in first fetch", 3,
                  refs.size());
          return new SequenceI[] { j03321, x06707, m19487 };
        }
        else
        {
          assertEquals("Expected 1 embl seq in second fetch", 1,
                  refs.size());
          return new SequenceI[] { x07547 };
        }
      }
    };
    SequenceFetcherFactory.setSequenceFetcher(mockFetcher);

    /*
     * find EMBL xrefs for Uniprot seqs and verify that
     * - the EMBL xref'd sequences are retrieved without duplicates
     * - mappings are added to the Uniprot dbrefs
     * - mappings in the EMBL-to-Uniprot dbrefs are updated to the 
     *   alignment sequences
     * - dbrefs on the EMBL sequences are added to the original dbrefs
     */
    SequenceI[] seqs = new SequenceI[] { p0ce19, p0ce20 };
    AlignmentI al = new Alignment(seqs);
    Alignment xrefs = new CrossRef(seqs, al).findXrefSequences("EMBL",
            false);

    /*
     * verify retrieved sequences
     */
    assertNotNull(xrefs);
    assertEquals(4, xrefs.getHeight());
    assertSame(j03321, xrefs.getSequenceAt(0));
    assertSame(x06707, xrefs.getSequenceAt(1));
    assertSame(m19487, xrefs.getSequenceAt(2));
    assertSame(x07547, xrefs.getSequenceAt(3));

    /*
     * verify mappings added to Uniprot-to-EMBL dbrefs
     */
    Mapping mapping = p0ce19.getDBRefs()[0].getMap();
    assertSame(j03321, mapping.getTo());
    mapping = p0ce19.getDBRefs()[1].getMap();
    assertSame(x06707, mapping.getTo());
    mapping = p0ce20.getDBRefs()[0].getMap();
    assertSame(j03321, mapping.getTo());
    mapping = p0ce20.getDBRefs()[1].getMap();
    assertSame(x06707, mapping.getTo());

    /*
     * verify dbrefs on EMBL are mapped to alignment seqs
     */
    assertSame(p0ce19, j03321.getDBRefs()[0].getMap().getTo());
    assertSame(p0ce20, j03321.getDBRefs()[1].getMap().getTo());
    assertSame(p0ce19, x06707.getDBRefs()[0].getMap().getTo());
    assertSame(p0ce20, x06707.getDBRefs()[1].getMap().getTo());

    /*
     * verify new dbref on EMBL dbref mapping is copied to the
     * original Uniprot sequence
     */
    assertEquals(4, p0ce19.getDBRefs().length);
    assertEquals("PIR", p0ce19.getDBRefs()[3].getSource());
    assertEquals("S01875", p0ce19.getDBRefs()[3].getAccessionId());
  }

  /**
   * Tests {@link CrossRef#sameSequence(SequenceI, SequenceI)} with null
   * arguments, identical residues, residues differing only in case, a gapped
   * sequence and a shorter sequence.
   */
  @Test(groups = "Functional")
  public void testSameSequence()
  {
    assertTrue(CrossRef.sameSequence(null, null));
    SequenceI seq1 = new Sequence("seq1", "ABCDEF");
    assertFalse(CrossRef.sameSequence(seq1, null));
    assertFalse(CrossRef.sameSequence(null, seq1));
    assertTrue(CrossRef.sameSequence(seq1, new Sequence("seq2", "ABCDEF")));
    assertTrue(CrossRef.sameSequence(seq1, new Sequence("seq2", "abcdef")));
    assertFalse(
            CrossRef.sameSequence(seq1, new Sequence("seq2", "ABCDE-F")));
    assertFalse(CrossRef.sameSequence(seq1, new Sequence("seq2", "BCDEF")));
  }
}