1 /*
2  * Jalview - A Sequence Alignment Editor and Viewer (2.11.1.4)
3  * Copyright (C) 2021 The Jalview Authors
4  *
5  * This file is part of Jalview.
6  *
7  * Jalview is free software: you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation, either version 3
10  * of the License, or (at your option) any later version.
11  *
12  * Jalview is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty
14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15  * PURPOSE.  See the GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
19  * The Jalview Authors are detailed in the 'AUTHORS' file.
20  */
21 package jalview.analysis;
22 
23 import static org.junit.Assert.assertNotEquals;
24 import static org.testng.AssertJUnit.assertEquals;
25 import static org.testng.AssertJUnit.assertFalse;
26 import static org.testng.AssertJUnit.assertNotNull;
27 import static org.testng.AssertJUnit.assertNull;
28 import static org.testng.AssertJUnit.assertSame;
29 import static org.testng.AssertJUnit.assertTrue;
30 
31 import jalview.analysis.AlignmentUtils.DnaVariant;
32 import jalview.datamodel.AlignedCodonFrame;
33 import jalview.datamodel.Alignment;
34 import jalview.datamodel.AlignmentAnnotation;
35 import jalview.datamodel.AlignmentI;
36 import jalview.datamodel.Annotation;
37 import jalview.datamodel.DBRefEntry;
38 import jalview.datamodel.GeneLociI;
39 import jalview.datamodel.Mapping;
40 import jalview.datamodel.SearchResultMatchI;
41 import jalview.datamodel.SearchResultsI;
42 import jalview.datamodel.Sequence;
43 import jalview.datamodel.SequenceFeature;
44 import jalview.datamodel.SequenceI;
45 import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
46 import jalview.datamodel.features.SequenceFeatures;
47 import jalview.gui.JvOptionPane;
48 import jalview.io.AppletFormatAdapter;
49 import jalview.io.DataSourceType;
50 import jalview.io.FileFormat;
51 import jalview.io.FileFormatI;
52 import jalview.io.FormatAdapter;
53 import jalview.io.gff.SequenceOntologyI;
54 import jalview.util.MapList;
55 import jalview.util.MappingUtils;
56 
57 import java.io.IOException;
58 import java.util.ArrayList;
59 import java.util.Arrays;
60 import java.util.Iterator;
61 import java.util.LinkedHashMap;
62 import java.util.List;
63 import java.util.Map;
64 import java.util.TreeMap;
65 
66 import org.testng.annotations.BeforeClass;
67 import org.testng.annotations.Test;
68 
69 public class AlignmentUtilsTests
70 {
71   private static Sequence ts = new Sequence("short",
72           "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm");
73 
74   @BeforeClass(alwaysRun = true)
setUpJvOptionPane()75   public void setUpJvOptionPane()
76   {
77     JvOptionPane.setInteractiveMode(false);
78     JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION);
79   }
80 
81   @Test(groups = { "Functional" })
testExpandContext()82   public void testExpandContext()
83   {
84     AlignmentI al = new Alignment(new Sequence[] {});
85     for (int i = 4; i < 14; i += 2)
86     {
87       SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7);
88       al.addSequence(s1);
89     }
90     System.out.println(new AppletFormatAdapter().formatSequences(
91             FileFormat.Clustal,
92             al, true));
93     for (int flnk = -1; flnk < 25; flnk++)
94     {
95       AlignmentI exp = AlignmentUtils.expandContext(al, flnk);
96       System.out.println("\nFlank size: " + flnk);
97       System.out.println(new AppletFormatAdapter().formatSequences(
98               FileFormat.Clustal, exp, true));
99       if (flnk == -1)
100       {
101         /*
102          * Full expansion to complete sequences
103          */
104         for (SequenceI sq : exp.getSequences())
105         {
106           String ung = sq.getSequenceAsString().replaceAll("-+", "");
107           final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n"
108                   + ung
109                   + "\n"
110                   + sq.getDatasetSequence().getSequenceAsString();
111           assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence()
112                   .getSequenceAsString()));
113         }
114       }
115       else if (flnk == 24)
116       {
117         /*
118          * Last sequence is fully expanded, others have leading gaps to match
119          */
120         assertTrue(exp.getSequenceAt(4).getSequenceAsString()
121                 .startsWith("abc"));
122         assertTrue(exp.getSequenceAt(3).getSequenceAsString()
123                 .startsWith("--abc"));
124         assertTrue(exp.getSequenceAt(2).getSequenceAsString()
125                 .startsWith("----abc"));
126         assertTrue(exp.getSequenceAt(1).getSequenceAsString()
127                 .startsWith("------abc"));
128         assertTrue(exp.getSequenceAt(0).getSequenceAsString()
129                 .startsWith("--------abc"));
130       }
131     }
132   }
133 
134   /**
135    * Test that annotations are correctly adjusted by expandContext
136    */
137   @Test(groups = { "Functional" })
testExpandContext_annotation()138   public void testExpandContext_annotation()
139   {
140     AlignmentI al = new Alignment(new Sequence[] {});
141     SequenceI ds = new Sequence("Seq1", "ABCDEFGHI");
142     // subsequence DEF:
143     SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6);
144     al.addSequence(seq1);
145 
146     /*
147      * Annotate DEF with 4/5/6 respectively
148      */
149     Annotation[] anns = new Annotation[] { new Annotation(4),
150         new Annotation(5), new Annotation(6) };
151     AlignmentAnnotation ann = new AlignmentAnnotation("SS",
152             "secondary structure", anns);
153     seq1.addAlignmentAnnotation(ann);
154 
155     /*
156      * The annotations array should match aligned positions
157      */
158     assertEquals(3, ann.annotations.length);
159     assertEquals(4, ann.annotations[0].value, 0.001);
160     assertEquals(5, ann.annotations[1].value, 0.001);
161     assertEquals(6, ann.annotations[2].value, 0.001);
162 
163     /*
164      * Check annotation to sequence position mappings before expanding the
165      * sequence; these are set up in Sequence.addAlignmentAnnotation ->
166      * Annotation.setSequenceRef -> createSequenceMappings
167      */
168     assertNull(ann.getAnnotationForPosition(1));
169     assertNull(ann.getAnnotationForPosition(2));
170     assertNull(ann.getAnnotationForPosition(3));
171     assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
172     assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
173     assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
174     assertNull(ann.getAnnotationForPosition(7));
175     assertNull(ann.getAnnotationForPosition(8));
176     assertNull(ann.getAnnotationForPosition(9));
177 
178     /*
179      * Expand the subsequence to the full sequence abcDEFghi
180      */
181     AlignmentI expanded = AlignmentUtils.expandContext(al, -1);
182     assertEquals("abcDEFghi", expanded.getSequenceAt(0)
183             .getSequenceAsString());
184 
185     /*
186      * Confirm the alignment and sequence have the same SS annotation,
187      * referencing the expanded sequence
188      */
189     ann = expanded.getSequenceAt(0).getAnnotation()[0];
190     assertSame(ann, expanded.getAlignmentAnnotation()[0]);
191     assertSame(expanded.getSequenceAt(0), ann.sequenceRef);
192 
193     /*
194      * The annotations array should have null values except for annotated
195      * positions
196      */
197     assertNull(ann.annotations[0]);
198     assertNull(ann.annotations[1]);
199     assertNull(ann.annotations[2]);
200     assertEquals(4, ann.annotations[3].value, 0.001);
201     assertEquals(5, ann.annotations[4].value, 0.001);
202     assertEquals(6, ann.annotations[5].value, 0.001);
203     assertNull(ann.annotations[6]);
204     assertNull(ann.annotations[7]);
205     assertNull(ann.annotations[8]);
206 
207     /*
208      * sequence position mappings should be unchanged
209      */
210     assertNull(ann.getAnnotationForPosition(1));
211     assertNull(ann.getAnnotationForPosition(2));
212     assertNull(ann.getAnnotationForPosition(3));
213     assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
214     assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
215     assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
216     assertNull(ann.getAnnotationForPosition(7));
217     assertNull(ann.getAnnotationForPosition(8));
218     assertNull(ann.getAnnotationForPosition(9));
219   }
220 
221   /**
222    * Test method that returns a map of lists of sequences by sequence name.
223    *
224    * @throws IOException
225    */
226   @Test(groups = { "Functional" })
testGetSequencesByName()227   public void testGetSequencesByName() throws IOException
228   {
229     final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
230             + ">Seq1Name\nABCD\n";
231     AlignmentI al = loadAlignment(data, FileFormat.Fasta);
232     Map<String, List<SequenceI>> map = AlignmentUtils
233             .getSequencesByName(al);
234     assertEquals(2, map.keySet().size());
235     assertEquals(2, map.get("Seq1Name").size());
236     assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
237     assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
238     assertEquals(1, map.get("Seq2Name").size());
239     assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
240   }
241 
242   /**
243    * Helper method to load an alignment and ensure dataset sequences are set up.
244    *
245    * @param data
246    * @param format
247    *          TODO
248    * @return
249    * @throws IOException
250    */
loadAlignment(final String data, FileFormatI format)251   protected AlignmentI loadAlignment(final String data, FileFormatI format)
252           throws IOException
253   {
254     AlignmentI a = new FormatAdapter().readFile(data,
255             DataSourceType.PASTE, format);
256     a.setDataset(null);
257     return a;
258   }
259 
260   /**
261    * Test mapping of protein to cDNA, for the case where we have no sequence
262    * cross-references, so mappings are made first-served 1-1 where sequences
263    * translate.
264    *
265    * @throws IOException
266    */
267   @Test(groups = { "Functional" })
testMapProteinAlignmentToCdna_noXrefs()268   public void testMapProteinAlignmentToCdna_noXrefs() throws IOException
269   {
270     List<SequenceI> protseqs = new ArrayList<>();
271     protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
272     protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
273     protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
274     AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
275     protein.setDataset(null);
276 
277     List<SequenceI> dnaseqs = new ArrayList<>();
278     dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
279     dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
280     dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
281     dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
282     AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
283     cdna.setDataset(null);
284 
285     assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
286 
287     // 3 mappings made, each from 1 to 1 sequence
288     assertEquals(3, protein.getCodonFrames().size());
289     assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
290     assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
291     assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
292 
293     // V12345 mapped to A22222
294     AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
295             .get(0);
296     assertEquals(1, acf.getdnaSeqs().length);
297     assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
298             acf.getdnaSeqs()[0]);
299     Mapping[] protMappings = acf.getProtMappings();
300     assertEquals(1, protMappings.length);
301     MapList mapList = protMappings[0].getMap();
302     assertEquals(3, mapList.getFromRatio());
303     assertEquals(1, mapList.getToRatio());
304     assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
305             .get(0)));
306     assertEquals(1, mapList.getFromRanges().size());
307     assertTrue(Arrays.equals(new int[] { 1, 3 },
308             mapList.getToRanges().get(0)));
309     assertEquals(1, mapList.getToRanges().size());
310 
311     // V12346 mapped to A33333
312     acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
313     assertEquals(1, acf.getdnaSeqs().length);
314     assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
315             acf.getdnaSeqs()[0]);
316 
317     // V12347 mapped to A11111
318     acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
319     assertEquals(1, acf.getdnaSeqs().length);
320     assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
321             acf.getdnaSeqs()[0]);
322 
323     // no mapping involving the 'extra' A44444
324     assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
325   }
326 
327   /**
328    * Test for the alignSequenceAs method that takes two sequences and a mapping.
329    */
330   @Test(groups = { "Functional" })
testAlignSequenceAs_withMapping_noIntrons()331   public void testAlignSequenceAs_withMapping_noIntrons()
332   {
333     MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1);
334 
335     /*
336      * No existing gaps in dna:
337      */
338     checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
339             "---GGG---AAA");
340 
341     /*
342      * Now introduce gaps in dna but ignore them when realigning.
343      */
344     checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
345             "---GGG---AAA");
346 
347     /*
348      * Now include gaps in dna when realigning. First retaining 'mapped' gaps
349      * only, i.e. those within the exon region.
350      */
351     checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
352             "---G-G--G---A--A-A");
353 
354     /*
355      * Include all gaps in dna when realigning (within and without the exon
356      * region). The leading gap, and the gaps between codons, are subsumed by
357      * the protein alignment gap.
358      */
359     checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map,
360             "---G-GG---AA-A---");
361 
362     /*
363      * Include only unmapped gaps in dna when realigning (outside the exon
364      * region). The leading gap, and the gaps between codons, are subsumed by
365      * the protein alignment gap.
366      */
367     checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
368             "---GGG---AAA---");
369   }
370 
371   /**
372    * Test for the alignSequenceAs method that takes two sequences and a mapping.
373    */
374   @Test(groups = { "Functional" })
testAlignSequenceAs_withMapping_withIntrons()375   public void testAlignSequenceAs_withMapping_withIntrons()
376   {
377     /*
378      * Exons at codon 2 (AAA) and 4 (TTT)
379      */
380     MapList map = new MapList(new int[] { 4, 6, 10, 12 },
381             new int[] { 1, 2 }, 3, 1);
382 
383     /*
384      * Simple case: no gaps in dna
385      */
386     checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
387             "GGG---AAACCCTTTGGG");
388 
389     /*
390      * Add gaps to dna - but ignore when realigning.
391      */
392     checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
393             false, false, map, "GGG---AAACCCTTTGGG");
394 
395     /*
396      * Add gaps to dna - include within exons only when realigning.
397      */
398     checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
399             true, false, map, "GGG---A--A---ACCCT-TTGGG");
400 
401     /*
402      * Include gaps outside exons only when realigning.
403      */
404     checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
405             false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
406 
407     /*
408      * Include gaps following first intron if we are 'preserving mapped gaps'
409      */
410     checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
411             true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
412 
413     /*
414      * Include all gaps in dna when realigning.
415      */
416     checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
417             true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
418   }
419 
420   /**
421    * Test for the case where not all of the protein sequence is mapped to cDNA.
422    */
423   @Test(groups = { "Functional" })
testAlignSequenceAs_withMapping_withUnmappedProtein()424   public void testAlignSequenceAs_withMapping_withUnmappedProtein()
425   {
426     /*
427      * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
428      */
429     final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
430         1, 1, 3, 3 }, 3, 1);
431 
432     /*
433      * -L- 'aligns' ccc------
434      */
435     checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map,
436             "gggAAAccc------TTTggg");
437   }
438 
439   /**
440    * Helper method that performs and verifies the method under test.
441    *
442    * @param alignee
443    *          the sequence to be realigned
444    * @param alignModel
445    *          the sequence whose alignment is to be copied
446    * @param preserveMappedGaps
447    * @param preserveUnmappedGaps
448    * @param map
449    * @param expected
450    */
checkAlignSequenceAs(final String alignee, final String alignModel, final boolean preserveMappedGaps, final boolean preserveUnmappedGaps, MapList map, final String expected)451   protected void checkAlignSequenceAs(final String alignee,
452           final String alignModel, final boolean preserveMappedGaps,
453           final boolean preserveUnmappedGaps, MapList map,
454           final String expected)
455   {
456     SequenceI alignMe = new Sequence("Seq1", alignee);
457     alignMe.createDatasetSequence();
458     SequenceI alignFrom = new Sequence("Seq2", alignModel);
459     alignFrom.createDatasetSequence();
460     AlignedCodonFrame acf = new AlignedCodonFrame();
461     acf.addMap(alignMe.getDatasetSequence(),
462             alignFrom.getDatasetSequence(), map);
463 
464     AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-',
465             preserveMappedGaps, preserveUnmappedGaps);
466     assertEquals(expected, alignMe.getSequenceAsString());
467   }
468 
469   /**
470    * Test for the alignSequenceAs method where we preserve gaps in introns only.
471    */
472   @Test(groups = { "Functional" })
testAlignSequenceAs_keepIntronGapsOnly()473   public void testAlignSequenceAs_keepIntronGapsOnly()
474   {
475 
476     /*
477      * Intron GGGAAA followed by exon CCCTTT
478      */
479     MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1);
480 
481     checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map,
482             "GG-G-AA-ACCCTTT");
483   }
484 
485   /**
486    * Test the method that realigns protein to match mapped codon alignment.
487    */
488   @Test(groups = { "Functional" })
testAlignProteinAsDna()489   public void testAlignProteinAsDna()
490   {
491     // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
492     SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
493     // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
494     SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
495     // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
496     SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
497     AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
498     dna.setDataset(null);
499 
500     // protein alignment will be realigned like dna
501     SequenceI prot1 = new Sequence("Seq1", "CHYQ");
502     SequenceI prot2 = new Sequence("Seq2", "CHYQ");
503     SequenceI prot3 = new Sequence("Seq3", "CHYQ");
504     SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged
505     AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
506         prot3, prot4 });
507     protein.setDataset(null);
508 
509     MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
510     AlignedCodonFrame acf = new AlignedCodonFrame();
511     acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
512     acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
513     acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
514     ArrayList<AlignedCodonFrame> acfs = new ArrayList<>();
515     acfs.add(acf);
516     protein.setCodonFrames(acfs);
517 
518     /*
519      * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
520      * [8,9,10] [10,11,12] [11,12,13]
521      */
522     AlignmentUtils.alignProteinAsDna(protein, dna);
523     assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
524     assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
525     assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
526     assertEquals("R-QSV", prot4.getSequenceAsString());
527   }
528 
529   /**
530    * Test the method that tests whether a CDNA sequence translates to a protein
531    * sequence
532    */
533   @Test(groups = { "Functional" })
testTranslatesAs()534   public void testTranslatesAs()
535   {
536     // null arguments check
537     assertFalse(AlignmentUtils.translatesAs(null, 0, null));
538     assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
539     assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
540 
541     // straight translation
542     assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
543             "FPKG".toCharArray()));
544     // with extra start codon (not in protein)
545     assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
546             3, "FPKG".toCharArray()));
547     // with stop codon1 (not in protein)
548     assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
549             0, "FPKG".toCharArray()));
550     // with stop codon1 (in protein as *)
551     assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
552             0, "FPKG*".toCharArray()));
553     // with stop codon2 (not in protein)
554     assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
555             0, "FPKG".toCharArray()));
556     // with stop codon3 (not in protein)
557     assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
558             0, "FPKG".toCharArray()));
559     // with start and stop codon1
560     assertTrue(AlignmentUtils.translatesAs(
561             "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray()));
562     // with start and stop codon1 (in protein as *)
563     assertTrue(AlignmentUtils.translatesAs(
564             "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray()));
565     // with start and stop codon2
566     assertTrue(AlignmentUtils.translatesAs(
567             "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray()));
568     // with start and stop codon3
569     assertTrue(AlignmentUtils.translatesAs(
570             "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
571 
572     // with embedded stop codons
573     assertTrue(AlignmentUtils.translatesAs(
574             "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
575             "F*PK*G".toCharArray()));
576 
577     // wrong protein
578     assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
579             0, "FPMG".toCharArray()));
580 
581     // truncated dna
582     assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
583             "FPKG".toCharArray()));
584 
585     // truncated protein
586     assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
587             0, "FPK".toCharArray()));
588 
589     // overlong dna (doesn't end in stop codon)
590     assertFalse(AlignmentUtils.translatesAs(
591             "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
592 
593     // dna + stop codon + more
594     assertFalse(AlignmentUtils.translatesAs(
595             "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
596 
597     // overlong protein
598     assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
599             0, "FPKGQ".toCharArray()));
600   }
601 
602   /**
603    * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
604    * stop codons in addition to the protein coding sequence.
605    *
606    * @throws IOException
607    */
608   @Test(groups = { "Functional" })
testMapProteinAlignmentToCdna_withStartAndStopCodons()609   public void testMapProteinAlignmentToCdna_withStartAndStopCodons()
610           throws IOException
611   {
612     List<SequenceI> protseqs = new ArrayList<>();
613     protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
614     protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
615     protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
616     AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
617     protein.setDataset(null);
618 
619     List<SequenceI> dnaseqs = new ArrayList<>();
620     // start + SAR:
621     dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
622     // = EIQ + stop
623     dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
624     // = start +EIQ + stop
625     dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
626     dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
627     AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
628     cdna.setDataset(null);
629 
630     assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
631 
632     // 3 mappings made, each from 1 to 1 sequence
633     assertEquals(3, protein.getCodonFrames().size());
634     assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
635     assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
636     assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
637 
638     // V12345 mapped from A22222
639     AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
640             .get(0);
641     assertEquals(1, acf.getdnaSeqs().length);
642     assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
643             acf.getdnaSeqs()[0]);
644     Mapping[] protMappings = acf.getProtMappings();
645     assertEquals(1, protMappings.length);
646     MapList mapList = protMappings[0].getMap();
647     assertEquals(3, mapList.getFromRatio());
648     assertEquals(1, mapList.getToRatio());
649     assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
650             .get(0)));
651     assertEquals(1, mapList.getFromRanges().size());
652     assertTrue(Arrays.equals(new int[] { 1, 3 },
653             mapList.getToRanges().get(0)));
654     assertEquals(1, mapList.getToRanges().size());
655 
656     // V12346 mapped from A33333 starting position 4
657     acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
658     assertEquals(1, acf.getdnaSeqs().length);
659     assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
660             acf.getdnaSeqs()[0]);
661     protMappings = acf.getProtMappings();
662     assertEquals(1, protMappings.length);
663     mapList = protMappings[0].getMap();
664     assertEquals(3, mapList.getFromRatio());
665     assertEquals(1, mapList.getToRatio());
666     assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
667             .get(0)));
668     assertEquals(1, mapList.getFromRanges().size());
669     assertTrue(Arrays.equals(new int[] { 1, 3 },
670             mapList.getToRanges().get(0)));
671     assertEquals(1, mapList.getToRanges().size());
672 
673     // V12347 mapped to A11111 starting position 4
674     acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
675     assertEquals(1, acf.getdnaSeqs().length);
676     assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
677             acf.getdnaSeqs()[0]);
678     protMappings = acf.getProtMappings();
679     assertEquals(1, protMappings.length);
680     mapList = protMappings[0].getMap();
681     assertEquals(3, mapList.getFromRatio());
682     assertEquals(1, mapList.getToRatio());
683     assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
684             .get(0)));
685     assertEquals(1, mapList.getFromRanges().size());
686     assertTrue(Arrays.equals(new int[] { 1, 3 },
687             mapList.getToRanges().get(0)));
688     assertEquals(1, mapList.getToRanges().size());
689 
690     // no mapping involving the 'extra' A44444
691     assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
692   }
693 
694   /**
695    * Test mapping of protein to cDNA, for the case where we have some sequence
696    * cross-references. Verify that 1-to-many mappings are made where
697    * cross-references exist and sequences are mappable.
698    *
699    * @throws IOException
700    */
701   @Test(groups = { "Functional" })
testMapProteinAlignmentToCdna_withXrefs()702   public void testMapProteinAlignmentToCdna_withXrefs() throws IOException
703   {
704     List<SequenceI> protseqs = new ArrayList<>();
705     protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
706     protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
707     protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
708     AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
709     protein.setDataset(null);
710 
711     List<SequenceI> dnaseqs = new ArrayList<>();
712     dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
713     dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
714     dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
715     dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
716     dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
717     AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
718     cdna.setDataset(null);
719 
720     // Xref A22222 to V12345 (should get mapped)
721     dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
722     // Xref V12345 to A44444 (should get mapped)
723     protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
724     // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
725     dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
726     // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
727     // it should get paired up with the unmapped A33333
728     // A11111 should be mapped to V12347
729     // A55555 is spare and has no xref so is not mapped
730 
731     assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
732 
733     // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
734     assertEquals(3, protein.getCodonFrames().size());
735     assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
736     assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
737     assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
738 
739     // one mapping for each of the first 4 cDNA sequences
740     assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
741     assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
742     assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
743     assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
744 
745     // V12345 mapped to A22222 and A44444
746     AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
747             .get(0);
748     assertEquals(2, acf.getdnaSeqs().length);
749     assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
750             acf.getdnaSeqs()[0]);
751     assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
752             acf.getdnaSeqs()[1]);
753 
754     // V12346 mapped to A33333
755     acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
756     assertEquals(1, acf.getdnaSeqs().length);
757     assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
758             acf.getdnaSeqs()[0]);
759 
760     // V12347 mapped to A11111
761     acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
762     assertEquals(1, acf.getdnaSeqs().length);
763     assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
764             acf.getdnaSeqs()[0]);
765 
766     // no mapping involving the 'extra' A55555
767     assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
768   }
769 
770   /**
771    * Test mapping of protein to cDNA, for the case where we have some sequence
772    * cross-references. Verify that once we have made an xref mapping we don't
773    * also map un-xrefd sequeces.
774    *
775    * @throws IOException
776    */
777   @Test(groups = { "Functional" })
testMapProteinAlignmentToCdna_prioritiseXrefs()778   public void testMapProteinAlignmentToCdna_prioritiseXrefs()
779           throws IOException
780   {
781     List<SequenceI> protseqs = new ArrayList<>();
782     protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
783     protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
784     AlignmentI protein = new Alignment(
785             protseqs.toArray(new SequenceI[protseqs.size()]));
786     protein.setDataset(null);
787 
788     List<SequenceI> dnaseqs = new ArrayList<>();
789     dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
790     dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
791     AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
792             .size()]));
793     cdna.setDataset(null);
794 
795     // Xref A22222 to V12345 (should get mapped)
796     // A11111 should then be mapped to the unmapped V12346
797     dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
798 
799     assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
800 
801     // 2 protein mappings made
802     assertEquals(2, protein.getCodonFrames().size());
803     assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
804     assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
805 
806     // one mapping for each of the cDNA sequences
807     assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
808     assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
809 
810     // V12345 mapped to A22222
811     AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
812             .get(0);
813     assertEquals(1, acf.getdnaSeqs().length);
814     assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
815             acf.getdnaSeqs()[0]);
816 
817     // V12346 mapped to A11111
818     acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
819     assertEquals(1, acf.getdnaSeqs().length);
820     assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
821             acf.getdnaSeqs()[0]);
822   }
823 
824   /**
825    * Test the method that shows or hides sequence annotations by type(s) and
826    * selection group.
827    */
828   @Test(groups = { "Functional" })
testShowOrHideSequenceAnnotations()829   public void testShowOrHideSequenceAnnotations()
830   {
831     SequenceI seq1 = new Sequence("Seq1", "AAA");
832     SequenceI seq2 = new Sequence("Seq2", "BBB");
833     SequenceI seq3 = new Sequence("Seq3", "CCC");
834     Annotation[] anns = new Annotation[] { new Annotation(2f) };
835     AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
836             anns);
837     ann1.setSequenceRef(seq1);
838     AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
839             anns);
840     ann2.setSequenceRef(seq2);
841     AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
842             anns);
843     AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
844     ann4.setSequenceRef(seq1);
845     AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
846     ann5.setSequenceRef(seq2);
847     AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
848     AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 });
849     al.addAnnotation(ann1); // Structure for Seq1
850     al.addAnnotation(ann2); // Structure for Seq2
851     al.addAnnotation(ann3); // Structure for no sequence
852     al.addAnnotation(ann4); // Temp for seq1
853     al.addAnnotation(ann5); // Temp for seq2
854     al.addAnnotation(ann6); // Temp for no sequence
855     List<String> types = new ArrayList<>();
856     List<SequenceI> scope = new ArrayList<>();
857 
858     /*
859      * Set all sequence related Structure to hidden (ann1, ann2)
860      */
861     types.add("Structure");
862     AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
863             false);
864     assertFalse(ann1.visible);
865     assertFalse(ann2.visible);
866     assertTrue(ann3.visible); // not sequence-related, not affected
867     assertTrue(ann4.visible); // not Structure, not affected
868     assertTrue(ann5.visible); // "
869     assertTrue(ann6.visible); // not sequence-related, not affected
870 
871     /*
872      * Set Temp in {seq1, seq3} to hidden
873      */
874     types.clear();
875     types.add("Temp");
876     scope.add(seq1);
877     scope.add(seq3);
878     AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
879             false);
880     assertFalse(ann1.visible); // unchanged
881     assertFalse(ann2.visible); // unchanged
882     assertTrue(ann3.visible); // not sequence-related, not affected
883     assertFalse(ann4.visible); // Temp for seq1 hidden
884     assertTrue(ann5.visible); // not in scope, not affected
885     assertTrue(ann6.visible); // not sequence-related, not affected
886 
887     /*
888      * Set Temp in all sequences to hidden
889      */
890     types.clear();
891     types.add("Temp");
892     scope.add(seq1);
893     scope.add(seq3);
894     AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
895             false);
896     assertFalse(ann1.visible); // unchanged
897     assertFalse(ann2.visible); // unchanged
898     assertTrue(ann3.visible); // not sequence-related, not affected
899     assertFalse(ann4.visible); // Temp for seq1 hidden
900     assertFalse(ann5.visible); // Temp for seq2 hidden
901     assertTrue(ann6.visible); // not sequence-related, not affected
902 
903     /*
904      * Set all types in {seq1, seq3} to visible
905      */
906     types.clear();
907     scope.clear();
908     scope.add(seq1);
909     scope.add(seq3);
910     AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
911             true);
912     assertTrue(ann1.visible); // Structure for seq1 set visible
913     assertFalse(ann2.visible); // not in scope, unchanged
914     assertTrue(ann3.visible); // not sequence-related, not affected
915     assertTrue(ann4.visible); // Temp for seq1 set visible
916     assertFalse(ann5.visible); // not in scope, unchanged
917     assertTrue(ann6.visible); // not sequence-related, not affected
918 
919     /*
920      * Set all types in all scope to hidden
921      */
922     AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
923             false);
924     assertFalse(ann1.visible);
925     assertFalse(ann2.visible);
926     assertTrue(ann3.visible); // not sequence-related, not affected
927     assertFalse(ann4.visible);
928     assertFalse(ann5.visible);
929     assertTrue(ann6.visible); // not sequence-related, not affected
930   }
931 
932   /**
933    * Tests for the method that checks if one sequence cross-references another
934    */
935   @Test(groups = { "Functional" })
testHasCrossRef()936   public void testHasCrossRef()
937   {
938     assertFalse(AlignmentUtils.hasCrossRef(null, null));
939     SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
940     assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
941     assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
942     SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
943     assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
944 
945     // different ref
946     seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
947     assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
948 
949     // case-insensitive; version number is ignored
950     seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
951     assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
952 
953     // right case!
954     seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
955     assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
956     // test is one-way only
957     assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
958   }
959 
960   /**
961    * Tests for the method that checks if either sequence cross-references the
962    * other
963    */
964   @Test(groups = { "Functional" })
testHaveCrossRef()965   public void testHaveCrossRef()
966   {
967     assertFalse(AlignmentUtils.hasCrossRef(null, null));
968     SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
969     assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
970     assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
971     SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
972     assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
973 
974     seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
975     assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
976     // next is true for haveCrossRef, false for hasCrossRef
977     assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
978 
979     // now the other way round
980     seq1.setDBRefs(null);
981     seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
982     assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
983     assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
984 
985     // now both ways
986     seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
987     assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
988     assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
989   }
990 
991   /**
992    * Test the method that extracts the cds-only part of a dna alignment.
993    */
994   @Test(groups = { "Functional" })
testMakeCdsAlignment()995   public void testMakeCdsAlignment()
996   {
997     /*
998      * scenario:
999      *     dna1 --> [4, 6] [10,12]        --> pep1
1000      *     dna2 --> [1, 3] [7, 9] [13,15] --> pep2
1001      */
1002     SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1003     SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
1004     SequenceI pep1 = new Sequence("pep1", "GF");
1005     SequenceI pep2 = new Sequence("pep2", "GFP");
1006     pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "pep1"));
1007     pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "pep2"));
1008     dna1.createDatasetSequence();
1009     dna2.createDatasetSequence();
1010     pep1.createDatasetSequence();
1011     pep2.createDatasetSequence();
1012     AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
1013     dna.setDataset(null);
1014 
1015     /*
1016      * put a variant feature on dna2 base 8
1017      * - should transfer to cds2 base 5
1018      */
1019     dna2.addSequenceFeature(new SequenceFeature("variant", "hgmd", 8, 8,
1020             0f, null));
1021 
1022     /*
1023      * need a sourceDbRef if we are to construct dbrefs to the CDS
1024      * sequence from the dna contig sequences
1025      */
1026     DBRefEntry dbref = new DBRefEntry("ENSEMBL", "0", "dna1");
1027     dna1.getDatasetSequence().addDBRef(dbref);
1028     org.testng.Assert.assertEquals(dbref, dna1.getPrimaryDBRefs().get(0));
1029     dbref = new DBRefEntry("ENSEMBL", "0", "dna2");
1030     dna2.getDatasetSequence().addDBRef(dbref);
1031     org.testng.Assert.assertEquals(dbref, dna2.getPrimaryDBRefs().get(0));
1032 
1033     /*
1034      * CDS sequences are 'discovered' from dna-to-protein mappings on the alignment
1035      * dataset (e.g. added from dbrefs by CrossRef.findXrefSequences)
1036      */
1037     MapList mapfordna1 = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
1038         1, 2 }, 3, 1);
1039     AlignedCodonFrame acf = new AlignedCodonFrame();
1040     acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(),
1041             mapfordna1);
1042     dna.addCodonFrame(acf);
1043     MapList mapfordna2 = new MapList(new int[] { 1, 3, 7, 9, 13, 15 },
1044             new int[] { 1, 3 }, 3, 1);
1045     acf = new AlignedCodonFrame();
1046     acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(),
1047             mapfordna2);
1048     dna.addCodonFrame(acf);
1049 
1050     /*
1051      * In this case, mappings originally came from matching Uniprot accessions
1052      * - so need an xref on dna involving those regions.
1053      * These are normally constructed from CDS annotation
1054      */
1055     DBRefEntry dna1xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep1",
1056             new Mapping(mapfordna1));
1057     dna1.addDBRef(dna1xref);
1058     assertEquals(2, dna1.getDBRefs().length); // to self and to pep1
1059     DBRefEntry dna2xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep2",
1060             new Mapping(mapfordna2));
1061     dna2.addDBRef(dna2xref);
1062     assertEquals(2, dna2.getDBRefs().length); // to self and to pep2
1063 
1064     /*
1065      * execute method under test:
1066      */
1067     AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1068         dna1, dna2 }, dna.getDataset(), null);
1069 
1070     /*
1071      * verify cds sequences
1072      */
1073     assertEquals(2, cds.getSequences().size());
1074     assertEquals("GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
1075     assertEquals("GGGTTTCCC", cds.getSequenceAt(1).getSequenceAsString());
1076 
1077     /*
1078      * verify shared, extended alignment dataset
1079      */
1080     assertSame(dna.getDataset(), cds.getDataset());
1081     SequenceI cds1Dss = cds.getSequenceAt(0).getDatasetSequence();
1082     SequenceI cds2Dss = cds.getSequenceAt(1).getDatasetSequence();
1083     assertTrue(dna.getDataset().getSequences().contains(cds1Dss));
1084     assertTrue(dna.getDataset().getSequences().contains(cds2Dss));
1085 
1086     /*
1087      * verify CDS has a dbref with mapping to peptide
1088      */
1089     assertNotNull(cds1Dss.getDBRefs());
1090     assertEquals(2, cds1Dss.getDBRefs().length);
1091     dbref = cds1Dss.getDBRefs()[0];
1092     assertEquals(dna1xref.getSource(), dbref.getSource());
1093     // version is via ensembl's primary ref
1094     assertEquals(dna1xref.getVersion(), dbref.getVersion());
1095     assertEquals(dna1xref.getAccessionId(), dbref.getAccessionId());
1096     assertNotNull(dbref.getMap());
1097     assertSame(pep1.getDatasetSequence(), dbref.getMap().getTo());
1098     MapList cdsMapping = new MapList(new int[] { 1, 6 },
1099             new int[] { 1, 2 }, 3, 1);
1100     assertEquals(cdsMapping, dbref.getMap().getMap());
1101 
1102     /*
1103      * verify peptide has added a dbref with reverse mapping to CDS
1104      */
1105     assertNotNull(pep1.getDBRefs());
1106     // FIXME pep1.getDBRefs() is 1 - is that the correct behaviour ?
1107     assertEquals(2, pep1.getDBRefs().length);
1108     dbref = pep1.getDBRefs()[1];
1109     assertEquals("ENSEMBL", dbref.getSource());
1110     assertEquals("0", dbref.getVersion());
1111     assertEquals("CDS|dna1", dbref.getAccessionId());
1112     assertNotNull(dbref.getMap());
1113     assertSame(cds1Dss, dbref.getMap().getTo());
1114     assertEquals(cdsMapping.getInverse(), dbref.getMap().getMap());
1115 
1116     /*
1117      * verify cDNA has added a dbref with mapping to CDS
1118      */
1119     assertEquals(3, dna1.getDBRefs().length);
1120     DBRefEntry dbRefEntry = dna1.getDBRefs()[2];
1121     assertSame(cds1Dss, dbRefEntry.getMap().getTo());
1122     MapList dnaToCdsMapping = new MapList(new int[] { 4, 6, 10, 12 },
1123             new int[] { 1, 6 }, 1, 1);
1124     assertEquals(dnaToCdsMapping, dbRefEntry.getMap().getMap());
1125     assertEquals(3, dna2.getDBRefs().length);
1126     dbRefEntry = dna2.getDBRefs()[2];
1127     assertSame(cds2Dss, dbRefEntry.getMap().getTo());
1128     dnaToCdsMapping = new MapList(new int[] { 1, 3, 7, 9, 13, 15 },
1129             new int[] { 1, 9 }, 1, 1);
1130     assertEquals(dnaToCdsMapping, dbRefEntry.getMap().getMap());
1131 
1132     /*
1133      * verify CDS has added a dbref with mapping to cDNA
1134      */
1135     assertEquals(2, cds1Dss.getDBRefs().length);
1136     dbRefEntry = cds1Dss.getDBRefs()[1];
1137     assertSame(dna1.getDatasetSequence(), dbRefEntry.getMap().getTo());
1138     MapList cdsToDnaMapping = new MapList(new int[] { 1, 6 }, new int[] {
1139         4, 6, 10, 12 }, 1, 1);
1140     assertEquals(cdsToDnaMapping, dbRefEntry.getMap().getMap());
1141     assertEquals(2, cds2Dss.getDBRefs().length);
1142     dbRefEntry = cds2Dss.getDBRefs()[1];
1143     assertSame(dna2.getDatasetSequence(), dbRefEntry.getMap().getTo());
1144     cdsToDnaMapping = new MapList(new int[] { 1, 9 }, new int[] { 1, 3, 7,
1145         9, 13, 15 }, 1, 1);
1146     assertEquals(cdsToDnaMapping, dbRefEntry.getMap().getMap());
1147 
1148     /*
1149      * Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide
1150      * the mappings are on the shared alignment dataset
1151      * 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep)
1152      */
1153     List<AlignedCodonFrame> cdsMappings = cds.getDataset().getCodonFrames();
1154     assertEquals(6, cdsMappings.size());
1155 
1156     /*
1157      * verify that mapping sets for dna and cds alignments are different
1158      * [not current behaviour - all mappings are on the alignment dataset]
1159      */
1160     // select -> subselect type to test.
1161     // Assert.assertNotSame(dna.getCodonFrames(), cds.getCodonFrames());
1162     // assertEquals(4, dna.getCodonFrames().size());
1163     // assertEquals(4, cds.getCodonFrames().size());
1164 
1165     /*
1166      * Two mappings involve pep1 (dna to pep1, cds to pep1)
1167      * Mapping from pep1 to GGGTTT in first new exon sequence
1168      */
1169     List<AlignedCodonFrame> pep1Mappings = MappingUtils
1170             .findMappingsForSequence(pep1, cdsMappings);
1171     assertEquals(2, pep1Mappings.size());
1172     List<AlignedCodonFrame> mappings = MappingUtils
1173             .findMappingsForSequence(cds.getSequenceAt(0), pep1Mappings);
1174     assertEquals(1, mappings.size());
1175 
1176     // map G to GGG
1177     SearchResultsI sr = MappingUtils.buildSearchResults(pep1, 1, mappings);
1178     assertEquals(1, sr.getResults().size());
1179     SearchResultMatchI m = sr.getResults().get(0);
1180     assertSame(cds1Dss, m.getSequence());
1181     assertEquals(1, m.getStart());
1182     assertEquals(3, m.getEnd());
1183     // map F to TTT
1184     sr = MappingUtils.buildSearchResults(pep1, 2, mappings);
1185     m = sr.getResults().get(0);
1186     assertSame(cds1Dss, m.getSequence());
1187     assertEquals(4, m.getStart());
1188     assertEquals(6, m.getEnd());
1189 
1190     /*
1191      * Two mappings involve pep2 (dna to pep2, cds to pep2)
1192      * Verify mapping from pep2 to GGGTTTCCC in second new exon sequence
1193      */
1194     List<AlignedCodonFrame> pep2Mappings = MappingUtils
1195             .findMappingsForSequence(pep2, cdsMappings);
1196     assertEquals(2, pep2Mappings.size());
1197     mappings = MappingUtils.findMappingsForSequence(cds.getSequenceAt(1),
1198             pep2Mappings);
1199     assertEquals(1, mappings.size());
1200     // map G to GGG
1201     sr = MappingUtils.buildSearchResults(pep2, 1, mappings);
1202     assertEquals(1, sr.getResults().size());
1203     m = sr.getResults().get(0);
1204     assertSame(cds2Dss, m.getSequence());
1205     assertEquals(1, m.getStart());
1206     assertEquals(3, m.getEnd());
1207     // map F to TTT
1208     sr = MappingUtils.buildSearchResults(pep2, 2, mappings);
1209     m = sr.getResults().get(0);
1210     assertSame(cds2Dss, m.getSequence());
1211     assertEquals(4, m.getStart());
1212     assertEquals(6, m.getEnd());
1213     // map P to CCC
1214     sr = MappingUtils.buildSearchResults(pep2, 3, mappings);
1215     m = sr.getResults().get(0);
1216     assertSame(cds2Dss, m.getSequence());
1217     assertEquals(7, m.getStart());
1218     assertEquals(9, m.getEnd());
1219 
1220     /*
1221      * check cds2 acquired a variant feature in position 5
1222      */
1223     List<SequenceFeature> sfs = cds2Dss.getSequenceFeatures();
1224     assertNotNull(sfs);
1225     assertEquals(1, sfs.size());
1226     assertEquals("variant", sfs.get(0).type);
1227     assertEquals(5, sfs.get(0).begin);
1228     assertEquals(5, sfs.get(0).end);
1229   }
1230 
1231   /**
1232    * Test the method that makes a cds-only alignment from a DNA sequence and its
1233    * product mappings, for the case where there are multiple exon mappings to
1234    * different protein products.
1235    */
1236   @Test(groups = { "Functional" })
testMakeCdsAlignment_multipleProteins()1237   public void testMakeCdsAlignment_multipleProteins()
1238   {
1239     SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1240     SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1241     SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1242     SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1243     dna1.createDatasetSequence();
1244     pep1.createDatasetSequence();
1245     pep2.createDatasetSequence();
1246     pep3.createDatasetSequence();
1247     pep1.getDatasetSequence().addDBRef(
1248             new DBRefEntry("EMBLCDS", "2", "A12345"));
1249     pep2.getDatasetSequence().addDBRef(
1250             new DBRefEntry("EMBLCDS", "3", "A12346"));
1251     pep3.getDatasetSequence().addDBRef(
1252             new DBRefEntry("EMBLCDS", "4", "A12347"));
1253 
1254     /*
1255      * Create the CDS alignment
1256      */
1257     AlignmentI dna = new Alignment(new SequenceI[] { dna1 });
1258     dna.setDataset(null);
1259 
1260     /*
1261      * Make the mappings from dna to protein
1262      */
1263     // map ...GGG...TTT to GF
1264     MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1265             new int[] { 1, 2 }, 3, 1);
1266     AlignedCodonFrame acf = new AlignedCodonFrame();
1267     acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1268     dna.addCodonFrame(acf);
1269 
1270     // map aaa...ccc to KP
1271     map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1);
1272     acf = new AlignedCodonFrame();
1273     acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1274     dna.addCodonFrame(acf);
1275 
1276     // map aaa......TTT to KF
1277     map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1);
1278     acf = new AlignedCodonFrame();
1279     acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
1280     dna.addCodonFrame(acf);
1281 
1282     /*
1283      * execute method under test
1284      */
1285     AlignmentI cdsal = AlignmentUtils.makeCdsAlignment(
1286             new SequenceI[] { dna1 }, dna.getDataset(), null);
1287 
1288     /*
1289      * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively
1290      */
1291     List<SequenceI> cds = cdsal.getSequences();
1292     assertEquals(3, cds.size());
1293 
1294     /*
1295      * verify shared, extended alignment dataset
1296      */
1297     assertSame(cdsal.getDataset(), dna.getDataset());
1298     assertTrue(dna.getDataset().getSequences()
1299             .contains(cds.get(0).getDatasetSequence()));
1300     assertTrue(dna.getDataset().getSequences()
1301             .contains(cds.get(1).getDatasetSequence()));
1302     assertTrue(dna.getDataset().getSequences()
1303             .contains(cds.get(2).getDatasetSequence()));
1304 
1305     /*
1306      * verify aligned cds sequences and their xrefs
1307      */
1308     SequenceI cdsSeq = cds.get(0);
1309     assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
1310     // assertEquals("dna1|A12345", cdsSeq.getName());
1311     assertEquals("CDS|dna1", cdsSeq.getName());
1312     // assertEquals(1, cdsSeq.getDBRefs().length);
1313     // DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
1314     // assertEquals("EMBLCDS", cdsRef.getSource());
1315     // assertEquals("2", cdsRef.getVersion());
1316     // assertEquals("A12345", cdsRef.getAccessionId());
1317 
1318     cdsSeq = cds.get(1);
1319     assertEquals("aaaccc", cdsSeq.getSequenceAsString());
1320     // assertEquals("dna1|A12346", cdsSeq.getName());
1321     assertEquals("CDS|dna1", cdsSeq.getName());
1322     // assertEquals(1, cdsSeq.getDBRefs().length);
1323     // cdsRef = cdsSeq.getDBRefs()[0];
1324     // assertEquals("EMBLCDS", cdsRef.getSource());
1325     // assertEquals("3", cdsRef.getVersion());
1326     // assertEquals("A12346", cdsRef.getAccessionId());
1327 
1328     cdsSeq = cds.get(2);
1329     assertEquals("aaaTTT", cdsSeq.getSequenceAsString());
1330     // assertEquals("dna1|A12347", cdsSeq.getName());
1331     assertEquals("CDS|dna1", cdsSeq.getName());
1332     // assertEquals(1, cdsSeq.getDBRefs().length);
1333     // cdsRef = cdsSeq.getDBRefs()[0];
1334     // assertEquals("EMBLCDS", cdsRef.getSource());
1335     // assertEquals("4", cdsRef.getVersion());
1336     // assertEquals("A12347", cdsRef.getAccessionId());
1337 
1338     /*
1339      * Verify there are mappings from each cds sequence to its protein product
1340      * and also to its dna source
1341      */
1342     List<AlignedCodonFrame> newMappings = cdsal.getCodonFrames();
1343 
1344     /*
1345      * 6 mappings involve dna1 (to pep1/2/3, cds1/2/3)
1346      */
1347     List<AlignedCodonFrame> dnaMappings = MappingUtils
1348             .findMappingsForSequence(dna1, newMappings);
1349     assertEquals(6, dnaMappings.size());
1350 
1351     /*
1352      * dna1 to pep1
1353      */
1354     List<AlignedCodonFrame> mappings = MappingUtils
1355             .findMappingsForSequence(pep1, dnaMappings);
1356     assertEquals(1, mappings.size());
1357     assertEquals(1, mappings.get(0).getMappings().size());
1358     assertSame(pep1.getDatasetSequence(), mappings.get(0).getMappings()
1359             .get(0).getMapping().getTo());
1360 
1361     /*
1362      * dna1 to cds1
1363      */
1364     List<AlignedCodonFrame> dnaToCds1Mappings = MappingUtils
1365             .findMappingsForSequence(cds.get(0), dnaMappings);
1366     Mapping mapping = dnaToCds1Mappings.get(0).getMappings().get(0)
1367             .getMapping();
1368     assertSame(cds.get(0).getDatasetSequence(), mapping.getTo());
1369     assertEquals("G(1) in CDS should map to G(4) in DNA", 4, mapping
1370             .getMap().getToPosition(1));
1371 
1372     /*
1373      * dna1 to pep2
1374      */
1375     mappings = MappingUtils.findMappingsForSequence(pep2, dnaMappings);
1376     assertEquals(1, mappings.size());
1377     assertEquals(1, mappings.get(0).getMappings().size());
1378     assertSame(pep2.getDatasetSequence(), mappings.get(0).getMappings()
1379             .get(0).getMapping().getTo());
1380 
1381     /*
1382      * dna1 to cds2
1383      */
1384     List<AlignedCodonFrame> dnaToCds2Mappings = MappingUtils
1385             .findMappingsForSequence(cds.get(1), dnaMappings);
1386     mapping = dnaToCds2Mappings.get(0).getMappings().get(0).getMapping();
1387     assertSame(cds.get(1).getDatasetSequence(), mapping.getTo());
1388     assertEquals("c(4) in CDS should map to c(7) in DNA", 7, mapping
1389             .getMap().getToPosition(4));
1390 
1391     /*
1392      * dna1 to pep3
1393      */
1394     mappings = MappingUtils.findMappingsForSequence(pep3, dnaMappings);
1395     assertEquals(1, mappings.size());
1396     assertEquals(1, mappings.get(0).getMappings().size());
1397     assertSame(pep3.getDatasetSequence(), mappings.get(0).getMappings()
1398             .get(0).getMapping().getTo());
1399 
1400     /*
1401      * dna1 to cds3
1402      */
1403     List<AlignedCodonFrame> dnaToCds3Mappings = MappingUtils
1404             .findMappingsForSequence(cds.get(2), dnaMappings);
1405     mapping = dnaToCds3Mappings.get(0).getMappings().get(0).getMapping();
1406     assertSame(cds.get(2).getDatasetSequence(), mapping.getTo());
1407     assertEquals("T(4) in CDS should map to T(10) in DNA", 10, mapping
1408             .getMap().getToPosition(4));
1409   }
1410 
1411   @Test(groups = { "Functional" })
testIsMappable()1412   public void testIsMappable()
1413   {
1414     SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT");
1415     SequenceI aa1 = new Sequence("aa1", "RSG");
1416     AlignmentI al1 = new Alignment(new SequenceI[] { dna1 });
1417     AlignmentI al2 = new Alignment(new SequenceI[] { aa1 });
1418 
1419     assertFalse(AlignmentUtils.isMappable(null, null));
1420     assertFalse(AlignmentUtils.isMappable(al1, null));
1421     assertFalse(AlignmentUtils.isMappable(null, al1));
1422     assertFalse(AlignmentUtils.isMappable(al1, al1));
1423     assertFalse(AlignmentUtils.isMappable(al2, al2));
1424 
1425     assertTrue(AlignmentUtils.isMappable(al1, al2));
1426     assertTrue(AlignmentUtils.isMappable(al2, al1));
1427   }
1428 
1429   /**
1430    * Test creating a mapping when the sequences involved do not start at residue
1431    * 1
1432    *
1433    * @throws IOException
1434    */
1435   @Test(groups = { "Functional" })
testMapCdnaToProtein_forSubsequence()1436   public void testMapCdnaToProtein_forSubsequence() throws IOException
1437   {
1438     SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
1439     prot.createDatasetSequence();
1440 
1441     SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
1442     dna.createDatasetSequence();
1443 
1444     MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
1445     assertEquals(10, map.getToLowest());
1446     assertEquals(12, map.getToHighest());
1447     assertEquals(40, map.getFromLowest());
1448     assertEquals(48, map.getFromHighest());
1449   }
1450 
1451   /**
1452    * Test for the alignSequenceAs method where we have protein mapped to protein
1453    */
1454   @Test(groups = { "Functional" })
testAlignSequenceAs_mappedProteinProtein()1455   public void testAlignSequenceAs_mappedProteinProtein()
1456   {
1457 
1458     SequenceI alignMe = new Sequence("Match", "MGAASEV");
1459     alignMe.createDatasetSequence();
1460     SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR");
1461     alignFrom.createDatasetSequence();
1462 
1463     AlignedCodonFrame acf = new AlignedCodonFrame();
1464     // this is like a domain or motif match of part of a peptide sequence
1465     MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1);
1466     acf.addMap(alignFrom.getDatasetSequence(),
1467             alignMe.getDatasetSequence(), map);
1468 
1469     AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true,
1470             true);
1471     assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString());
1472   }
1473 
1474   /**
1475    * Test for the alignSequenceAs method where there are trailing unmapped
1476    * residues in the model sequence
1477    */
1478   @Test(groups = { "Functional" })
testAlignSequenceAs_withTrailingPeptide()1479   public void testAlignSequenceAs_withTrailingPeptide()
1480   {
1481     // map first 3 codons to KPF; G is a trailing unmapped residue
1482     MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1);
1483 
1484     checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map,
1485             "AAA---CCCTTT---");
1486   }
1487 
1488   /**
1489    * Tests for transferring features between mapped sequences
1490    */
1491   @Test(groups = { "Functional" })
testTransferFeatures()1492   public void testTransferFeatures()
1493   {
1494     SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1495     SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1496 
1497     // no overlap
1498     dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f,
1499             null));
1500     // partial overlap - to [1, 1]
1501     dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f,
1502             null));
1503     // exact overlap - to [1, 3]
1504     dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f,
1505             null));
1506     // spanning overlap - to [2, 5]
1507     dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1508             null));
1509     // exactly overlaps whole mapped range [1, 6]
1510     dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1511             null));
1512     // no overlap (internal)
1513     dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f,
1514             null));
1515     // no overlap (3' end)
1516     dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15,
1517             7f, null));
1518     // overlap (3' end) - to [6, 6]
1519     dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1520             8f, null));
1521     // extended overlap - to [6, +]
1522     dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13,
1523             9f, null));
1524 
1525     MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1526             new int[] { 1, 6 }, 1, 1);
1527 
1528     /*
1529      * transferFeatures() will build 'partial overlap' for regions
1530      * that partially overlap 5' or 3' (start or end) of target sequence
1531      */
1532     AlignmentUtils.transferFeatures(dna, cds, map, null);
1533     List<SequenceFeature> sfs = cds.getSequenceFeatures();
1534     assertEquals(6, sfs.size());
1535 
1536     SequenceFeature sf = sfs.get(0);
1537     assertEquals("type2", sf.getType());
1538     assertEquals("desc2", sf.getDescription());
1539     assertEquals(2f, sf.getScore());
1540     assertEquals(1, sf.getBegin());
1541     assertEquals(1, sf.getEnd());
1542 
1543     sf = sfs.get(1);
1544     assertEquals("type3", sf.getType());
1545     assertEquals("desc3", sf.getDescription());
1546     assertEquals(3f, sf.getScore());
1547     assertEquals(1, sf.getBegin());
1548     assertEquals(3, sf.getEnd());
1549 
1550     sf = sfs.get(2);
1551     assertEquals("type4", sf.getType());
1552     assertEquals(2, sf.getBegin());
1553     assertEquals(5, sf.getEnd());
1554 
1555     sf = sfs.get(3);
1556     assertEquals("type5", sf.getType());
1557     assertEquals(1, sf.getBegin());
1558     assertEquals(6, sf.getEnd());
1559 
1560     sf = sfs.get(4);
1561     assertEquals("type8", sf.getType());
1562     assertEquals(6, sf.getBegin());
1563     assertEquals(6, sf.getEnd());
1564 
1565     sf = sfs.get(5);
1566     assertEquals("type9", sf.getType());
1567     assertEquals(6, sf.getBegin());
1568     assertEquals(6, sf.getEnd());
1569   }
1570 
1571   /**
1572    * Tests for transferring features between mapped sequences
1573    */
1574   @Test(groups = { "Functional" })
testTransferFeatures_withOmit()1575   public void testTransferFeatures_withOmit()
1576   {
1577     SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1578     SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1579 
1580     MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1581             new int[] { 1, 6 }, 1, 1);
1582 
1583     // [5, 11] maps to [2, 5]
1584     dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1585             null));
1586     // [4, 12] maps to [1, 6]
1587     dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1588             null));
1589     // [12, 12] maps to [6, 6]
1590     dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1591             8f, null));
1592 
1593     // desc4 and desc8 are the 'omit these' varargs
1594     AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8");
1595     List<SequenceFeature> sfs = cds.getSequenceFeatures();
1596     assertEquals(1, sfs.size());
1597 
1598     SequenceFeature sf = sfs.get(0);
1599     assertEquals("type5", sf.getType());
1600     assertEquals(1, sf.getBegin());
1601     assertEquals(6, sf.getEnd());
1602   }
1603 
1604   /**
1605    * Tests for transferring features between mapped sequences
1606    */
1607   @Test(groups = { "Functional" })
testTransferFeatures_withSelect()1608   public void testTransferFeatures_withSelect()
1609   {
1610     SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1611     SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1612 
1613     MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1614             new int[] { 1, 6 }, 1, 1);
1615 
1616     // [5, 11] maps to [2, 5]
1617     dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1618             null));
1619     // [4, 12] maps to [1, 6]
1620     dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1621             null));
1622     // [12, 12] maps to [6, 6]
1623     dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1624             8f, null));
1625 
1626     // "type5" is the 'select this type' argument
1627     AlignmentUtils.transferFeatures(dna, cds, map, "type5");
1628     List<SequenceFeature> sfs = cds.getSequenceFeatures();
1629     assertEquals(1, sfs.size());
1630 
1631     SequenceFeature sf = sfs.get(0);
1632     assertEquals("type5", sf.getType());
1633     assertEquals(1, sf.getBegin());
1634     assertEquals(6, sf.getEnd());
1635   }
1636 
1637   /**
1638    * Test the method that extracts the cds-only part of a dna alignment, for the
1639    * case where the cds should be aligned to match its nucleotide sequence.
1640    */
1641   @Test(groups = { "Functional" })
testMakeCdsAlignment_alternativeTranscripts()1642   public void testMakeCdsAlignment_alternativeTranscripts()
1643   {
1644     SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG");
1645     // alternative transcript of same dna skips CCC codon
1646     SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG");
1647     // dna3 has no mapping (protein product) so should be ignored here
1648     SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG");
1649     SequenceI pep1 = new Sequence("pep1", "GPFG");
1650     SequenceI pep2 = new Sequence("pep2", "GPG");
1651     dna1.createDatasetSequence();
1652     dna2.createDatasetSequence();
1653     dna3.createDatasetSequence();
1654     pep1.createDatasetSequence();
1655     pep2.createDatasetSequence();
1656 
1657     AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1658     dna.setDataset(null);
1659 
1660     MapList map = new MapList(new int[] { 4, 12, 16, 18 },
1661             new int[] { 1, 4 }, 3, 1);
1662     AlignedCodonFrame acf = new AlignedCodonFrame();
1663     acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1664     dna.addCodonFrame(acf);
1665     map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 },
1666             new int[] { 1, 3 }, 3, 1);
1667     acf = new AlignedCodonFrame();
1668     acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1669     dna.addCodonFrame(acf);
1670 
1671     AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1672         dna1, dna2, dna3 }, dna.getDataset(), null);
1673     List<SequenceI> cdsSeqs = cds.getSequences();
1674     assertEquals(2, cdsSeqs.size());
1675     assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString());
1676     assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString());
1677 
1678     /*
1679      * verify shared, extended alignment dataset
1680      */
1681     assertSame(dna.getDataset(), cds.getDataset());
1682     assertTrue(dna.getDataset().getSequences()
1683             .contains(cdsSeqs.get(0).getDatasetSequence()));
1684     assertTrue(dna.getDataset().getSequences()
1685             .contains(cdsSeqs.get(1).getDatasetSequence()));
1686 
1687     /*
1688      * Verify 6 mappings: dna1 to cds1, cds1 to pep1, dna1 to pep1
1689      * and the same for dna2/cds2/pep2
1690      */
1691     List<AlignedCodonFrame> mappings = cds.getCodonFrames();
1692     assertEquals(6, mappings.size());
1693 
1694     /*
1695      * 2 mappings involve pep1
1696      */
1697     List<AlignedCodonFrame> pep1Mappings = MappingUtils
1698             .findMappingsForSequence(pep1, mappings);
1699     assertEquals(2, pep1Mappings.size());
1700 
1701     /*
1702      * Get mapping of pep1 to cds1 and verify it
1703      * maps GPFG to 1-3,4-6,7-9,10-12
1704      */
1705     List<AlignedCodonFrame> pep1CdsMappings = MappingUtils
1706             .findMappingsForSequence(cds.getSequenceAt(0), pep1Mappings);
1707     assertEquals(1, pep1CdsMappings.size());
1708     SearchResultsI sr = MappingUtils.buildSearchResults(pep1, 1,
1709             pep1CdsMappings);
1710     assertEquals(1, sr.getResults().size());
1711     SearchResultMatchI m = sr.getResults().get(0);
1712     assertEquals(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence());
1713     assertEquals(1, m.getStart());
1714     assertEquals(3, m.getEnd());
1715     sr = MappingUtils.buildSearchResults(pep1, 2, pep1CdsMappings);
1716     m = sr.getResults().get(0);
1717     assertEquals(4, m.getStart());
1718     assertEquals(6, m.getEnd());
1719     sr = MappingUtils.buildSearchResults(pep1, 3, pep1CdsMappings);
1720     m = sr.getResults().get(0);
1721     assertEquals(7, m.getStart());
1722     assertEquals(9, m.getEnd());
1723     sr = MappingUtils.buildSearchResults(pep1, 4, pep1CdsMappings);
1724     m = sr.getResults().get(0);
1725     assertEquals(10, m.getStart());
1726     assertEquals(12, m.getEnd());
1727 
1728     /*
1729      * Get mapping of pep2 to cds2 and verify it
1730      * maps GPG in pep2 to 1-3,4-6,7-9 in second CDS sequence
1731      */
1732     List<AlignedCodonFrame> pep2Mappings = MappingUtils
1733             .findMappingsForSequence(pep2, mappings);
1734     assertEquals(2, pep2Mappings.size());
1735     List<AlignedCodonFrame> pep2CdsMappings = MappingUtils
1736             .findMappingsForSequence(cds.getSequenceAt(1), pep2Mappings);
1737     assertEquals(1, pep2CdsMappings.size());
1738     sr = MappingUtils.buildSearchResults(pep2, 1, pep2CdsMappings);
1739     assertEquals(1, sr.getResults().size());
1740     m = sr.getResults().get(0);
1741     assertEquals(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
1742     assertEquals(1, m.getStart());
1743     assertEquals(3, m.getEnd());
1744     sr = MappingUtils.buildSearchResults(pep2, 2, pep2CdsMappings);
1745     m = sr.getResults().get(0);
1746     assertEquals(4, m.getStart());
1747     assertEquals(6, m.getEnd());
1748     sr = MappingUtils.buildSearchResults(pep2, 3, pep2CdsMappings);
1749     m = sr.getResults().get(0);
1750     assertEquals(7, m.getStart());
1751     assertEquals(9, m.getEnd());
1752   }
1753 
1754   /**
1755    * Test the method that realigns protein to match mapped codon alignment.
1756    */
1757   @Test(groups = { "Functional" })
testAlignProteinAsDna_incompleteStartCodon()1758   public void testAlignProteinAsDna_incompleteStartCodon()
1759   {
1760     // seq1: incomplete start codon (not mapped), then [3, 11]
1761     SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-");
1762     // seq2 codons are [4, 5], [8, 11]
1763     SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-");
1764     // seq3 incomplete start codon at 'tt'
1765     SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-");
1766     AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1767     dna.setDataset(null);
1768 
1769     // prot1 has 'X' for incomplete start codon (not mapped)
1770     SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start
1771     SequenceI prot2 = new Sequence("Seq2", "NG");
1772     SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start
1773     AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
1774         prot3 });
1775     protein.setDataset(null);
1776 
1777     // map dna1 [3, 11] to prot1 [2, 4] KFG
1778     MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1);
1779     AlignedCodonFrame acf = new AlignedCodonFrame();
1780     acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
1781 
1782     // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG
1783     map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1);
1784     acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
1785 
1786     // map dna3 [9, 11] to prot3 [2, 2] G
1787     map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1);
1788     acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
1789 
1790     ArrayList<AlignedCodonFrame> acfs = new ArrayList<>();
1791     acfs.add(acf);
1792     protein.setCodonFrames(acfs);
1793     Iterator<SequenceI> protseq = protein.getSequences().iterator();
1794     for (SequenceI dnaseq:dna.getSequences()) {
1795       assertCanResolveProteinCDS(dnaseq,protseq.next(),protein);
1796     }
1797     /*
1798      * verify X is included in the aligned proteins, and placed just
1799      * before the first mapped residue
1800      * CCT is between CCC and TTT
1801      */
1802     AlignmentUtils.alignProteinAsDna(protein, dna);
1803     assertEquals("XK-FG", prot1.getSequenceAsString());
1804     assertEquals("--N-G", prot2.getSequenceAsString());
1805     assertEquals("---XG", prot3.getSequenceAsString());
1806   }
1807 
1808   /**
1809    * assert that we can resolve the protein product in the given alignment given a DNA sequence with CDS mapping
1810    * @param dnaseq
1811    * @param protein
1812    */
assertCanResolveProteinCDS(SequenceI dnaseq, SequenceI expProtein, AlignmentI protein)1813   private void assertCanResolveProteinCDS(SequenceI dnaseq, SequenceI expProtein, AlignmentI protein)
1814   {
1815     // try a few different methods to check all work
1816     SequenceI aprot=null;
1817     for (AlignedCodonFrame cf:protein.getCodonFrame(dnaseq))
1818     {
1819       aprot=cf.getAaForDnaSeq(dnaseq);
1820       if (aprot!=null)
1821       {
1822         assertTrue("getAaForDnaSeq didn't return expected protein sequence",aprot!=expProtein);
1823         break;
1824       }
1825     }
1826     assertNotNull("Didn't locate any proteins via AlignmentI.getCodonFrame .. AlignCodonFrame.getAaForDnaSeq", aprot);
1827     // try mapping utils -
1828     List<AlignedCodonFrame> mu_mappings=MappingUtils.findMappingsForSequence(dnaseq, protein.getCodonFrames());
1829     assertNotNull("No mappings found for dnaseq in protein alignment via MappingUtils.findMappingsForSequence",mu_mappings);
1830     assertNotEquals("No mappings found for dnaseq in protein alignment via MappingUtils.findMappingsForSequence",0,mu_mappings.size());
1831     SequenceI mu_alignedprot=null;
1832     List<SequenceToSequenceMapping> foundMap=null;
1833     for (AlignedCodonFrame cf:mu_mappings)
1834     {
1835       foundMap=new ArrayList<>();
1836       mu_alignedprot = cf.findAlignedSequence(dnaseq, protein,foundMap);
1837       if (mu_alignedprot!=null) {
1838         break;
1839       }
1840     }
1841     assertNotNull("Didn't locate proteins via MappingUtils.findMappingsForSequence",mu_alignedprot);
1842     assertTrue("findAlignedSequence didn't return expected protein sequence",mu_alignedprot==expProtein);
1843   }
1844 
1845   /**
1846    * Tests for the method that maps the subset of a dna sequence that has CDS
1847    * (or subtype) feature - case where the start codon is incomplete.
1848    */
1849   @Test(groups = "Functional")
testFindCdsPositions_fivePrimeIncomplete()1850   public void testFindCdsPositions_fivePrimeIncomplete()
1851   {
1852     SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1853     dnaSeq.createDatasetSequence();
1854     SequenceI ds = dnaSeq.getDatasetSequence();
1855 
1856     // CDS for dna 5-6 (incomplete codon), 7-9
1857     SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1858     sf.setPhase("2"); // skip 2 bases to start of next codon
1859     ds.addSequenceFeature(sf);
1860     // CDS for dna 13-15
1861     sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1862     ds.addSequenceFeature(sf);
1863 
1864     List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1865 
1866     /*
1867      * check the mapping starts with the first complete codon
1868      */
1869     assertEquals(6, MappingUtils.getLength(ranges));
1870     assertEquals(2, ranges.size());
1871     assertEquals(7, ranges.get(0)[0]);
1872     assertEquals(9, ranges.get(0)[1]);
1873     assertEquals(13, ranges.get(1)[0]);
1874     assertEquals(15, ranges.get(1)[1]);
1875   }
1876 
1877   /**
1878    * Tests for the method that maps the subset of a dna sequence that has CDS
1879    * (or subtype) feature.
1880    */
1881   @Test(groups = "Functional")
testFindCdsPositions()1882   public void testFindCdsPositions()
1883   {
1884     SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1885     dnaSeq.createDatasetSequence();
1886     SequenceI ds = dnaSeq.getDatasetSequence();
1887 
1888     // CDS for dna 10-12
1889     SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12,
1890             0f, null);
1891     sf.setStrand("+");
1892     ds.addSequenceFeature(sf);
1893     // CDS for dna 4-6
1894     sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1895     sf.setStrand("+");
1896     ds.addSequenceFeature(sf);
1897     // exon feature should be ignored here
1898     sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1899     ds.addSequenceFeature(sf);
1900 
1901     List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1902     /*
1903      * verify ranges { [4-6], [12-10] }
1904      * note CDS ranges are ordered ascending even if the CDS
1905      * features are not
1906      */
1907     assertEquals(6, MappingUtils.getLength(ranges));
1908     assertEquals(2, ranges.size());
1909     assertEquals(4, ranges.get(0)[0]);
1910     assertEquals(6, ranges.get(0)[1]);
1911     assertEquals(10, ranges.get(1)[0]);
1912     assertEquals(12, ranges.get(1)[1]);
1913   }
1914 
1915   /**
1916    * Tests for the method that maps the subset of a dna sequence that has CDS
1917    * (or subtype) feature, with CDS strand = '-' (reverse)
1918    */
1919   // test turned off as currently findCdsPositions is not strand-dependent
1920   // left in case it comes around again...
1921   @Test(groups = "Functional", enabled = false)
testFindCdsPositions_reverseStrand()1922   public void testFindCdsPositions_reverseStrand()
1923   {
1924     SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1925     dnaSeq.createDatasetSequence();
1926     SequenceI ds = dnaSeq.getDatasetSequence();
1927 
1928     // CDS for dna 4-6
1929     SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1930     sf.setStrand("-");
1931     ds.addSequenceFeature(sf);
1932     // exon feature should be ignored here
1933     sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1934     ds.addSequenceFeature(sf);
1935     // CDS for dna 10-12
1936     sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
1937     sf.setStrand("-");
1938     ds.addSequenceFeature(sf);
1939 
1940     List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1941     /*
1942      * verify ranges { [12-10], [6-4] }
1943      */
1944     assertEquals(6, MappingUtils.getLength(ranges));
1945     assertEquals(2, ranges.size());
1946     assertEquals(12, ranges.get(0)[0]);
1947     assertEquals(10, ranges.get(0)[1]);
1948     assertEquals(6, ranges.get(1)[0]);
1949     assertEquals(4, ranges.get(1)[1]);
1950   }
1951 
1952   /**
1953    * Tests for the method that maps the subset of a dna sequence that has CDS
1954    * (or subtype) feature - reverse strand case where the start codon is
1955    * incomplete.
1956    */
1957   @Test(groups = "Functional", enabled = false)
1958   // test turned off as currently findCdsPositions is not strand-dependent
1959   // left in case it comes around again...
testFindCdsPositions_reverseStrandThreePrimeIncomplete()1960   public void testFindCdsPositions_reverseStrandThreePrimeIncomplete()
1961   {
1962     SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1963     dnaSeq.createDatasetSequence();
1964     SequenceI ds = dnaSeq.getDatasetSequence();
1965 
1966     // CDS for dna 5-9
1967     SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1968     sf.setStrand("-");
1969     ds.addSequenceFeature(sf);
1970     // CDS for dna 13-15
1971     sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1972     sf.setStrand("-");
1973     sf.setPhase("2"); // skip 2 bases to start of next codon
1974     ds.addSequenceFeature(sf);
1975 
1976     List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1977 
1978     /*
1979      * check the mapping starts with the first complete codon
1980      * expect ranges [13, 13], [9, 5]
1981      */
1982     assertEquals(6, MappingUtils.getLength(ranges));
1983     assertEquals(2, ranges.size());
1984     assertEquals(13, ranges.get(0)[0]);
1985     assertEquals(13, ranges.get(0)[1]);
1986     assertEquals(9, ranges.get(1)[0]);
1987     assertEquals(5, ranges.get(1)[1]);
1988   }
1989 
1990   @Test(groups = "Functional")
testAlignAs_alternateTranscriptsUngapped()1991   public void testAlignAs_alternateTranscriptsUngapped()
1992   {
1993     SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
1994     SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
1995     AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
1996     ((Alignment) dna).createDatasetAlignment();
1997     SequenceI cds1 = new Sequence("cds1", "GGGTTT");
1998     SequenceI cds2 = new Sequence("cds2", "CCCAAA");
1999     AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 });
2000     ((Alignment) cds).createDatasetAlignment();
2001 
2002     AlignedCodonFrame acf = new AlignedCodonFrame();
2003     MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1);
2004     acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map);
2005     map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1);
2006     acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map);
2007 
2008     /*
2009      * verify CDS alignment is as:
2010      *   cccGGGTTTaaa (cdna)
2011      *   CCCgggtttAAA (cdna)
2012      *
2013      *   ---GGGTTT--- (cds)
2014      *   CCC------AAA (cds)
2015      */
2016     dna.addCodonFrame(acf);
2017     AlignmentUtils.alignAs(cds, dna);
2018     assertEquals("---GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
2019     assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString());
2020   }
2021 
2022   @Test(groups = { "Functional" })
testAddMappedPositions()2023   public void testAddMappedPositions()
2024   {
2025     SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2026     SequenceI seq1 = new Sequence("cds", "AAATTT");
2027     from.createDatasetSequence();
2028     seq1.createDatasetSequence();
2029     Mapping mapping = new Mapping(seq1, new MapList(
2030             new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1));
2031     Map<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2032     AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2033 
2034     /*
2035      * verify map has seq1 residues in columns 3,4,6,7,11,12
2036      */
2037     assertEquals(6, map.size());
2038     assertEquals('A', map.get(3).get(seq1).charValue());
2039     assertEquals('A', map.get(4).get(seq1).charValue());
2040     assertEquals('A', map.get(6).get(seq1).charValue());
2041     assertEquals('T', map.get(7).get(seq1).charValue());
2042     assertEquals('T', map.get(11).get(seq1).charValue());
2043     assertEquals('T', map.get(12).get(seq1).charValue());
2044 
2045     /*
2046      *
2047      */
2048   }
2049 
2050   /**
2051    * Test case where the mapping 'from' range includes a stop codon which is
2052    * absent in the 'to' range
2053    */
2054   @Test(groups = { "Functional" })
testAddMappedPositions_withStopCodon()2055   public void testAddMappedPositions_withStopCodon()
2056   {
2057     SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2058     SequenceI seq1 = new Sequence("cds", "AAATTT");
2059     from.createDatasetSequence();
2060     seq1.createDatasetSequence();
2061     Mapping mapping = new Mapping(seq1, new MapList(
2062             new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1));
2063     Map<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2064     AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2065 
2066     /*
2067      * verify map has seq1 residues in columns 3,4,6,7,11,12
2068      */
2069     assertEquals(6, map.size());
2070     assertEquals('A', map.get(3).get(seq1).charValue());
2071     assertEquals('A', map.get(4).get(seq1).charValue());
2072     assertEquals('A', map.get(6).get(seq1).charValue());
2073     assertEquals('T', map.get(7).get(seq1).charValue());
2074     assertEquals('T', map.get(11).get(seq1).charValue());
2075     assertEquals('T', map.get(12).get(seq1).charValue());
2076   }
2077 
2078   /**
2079    * Test for the case where the products for which we want CDS are specified.
2080    * This is to represent the case where EMBL has CDS mappings to both Uniprot
2081    * and EMBLCDSPROTEIN. makeCdsAlignment() should only return the mappings for
2082    * the protein sequences specified.
2083    */
2084   @Test(groups = { "Functional" })
testMakeCdsAlignment_filterProducts()2085   public void testMakeCdsAlignment_filterProducts()
2086   {
2087     SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
2088     SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
2089     SequenceI pep1 = new Sequence("Uniprot|pep1", "GF");
2090     SequenceI pep2 = new Sequence("Uniprot|pep2", "GFP");
2091     SequenceI pep3 = new Sequence("EMBL|pep3", "GF");
2092     SequenceI pep4 = new Sequence("EMBL|pep4", "GFP");
2093     dna1.createDatasetSequence();
2094     dna2.createDatasetSequence();
2095     pep1.createDatasetSequence();
2096     pep2.createDatasetSequence();
2097     pep3.createDatasetSequence();
2098     pep4.createDatasetSequence();
2099     AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
2100     dna.setDataset(null);
2101     AlignmentI emblPeptides = new Alignment(new SequenceI[] { pep3, pep4 });
2102     emblPeptides.setDataset(null);
2103 
2104     AlignedCodonFrame acf = new AlignedCodonFrame();
2105     MapList map = new MapList(new int[] { 4, 6, 10, 12 },
2106             new int[] { 1, 2 }, 3, 1);
2107     acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
2108     acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
2109     dna.addCodonFrame(acf);
2110 
2111     acf = new AlignedCodonFrame();
2112     map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
2113             3, 1);
2114     acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
2115     acf.addMap(dna2.getDatasetSequence(), pep4.getDatasetSequence(), map);
2116     dna.addCodonFrame(acf);
2117 
2118     /*
2119      * execute method under test to find CDS for EMBL peptides only
2120      */
2121     AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
2122         dna1, dna2 }, dna.getDataset(), emblPeptides.getSequencesArray());
2123 
2124     assertEquals(2, cds.getSequences().size());
2125     assertEquals("GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
2126     assertEquals("GGGTTTCCC", cds.getSequenceAt(1).getSequenceAsString());
2127 
2128     /*
2129      * verify shared, extended alignment dataset
2130      */
2131     assertSame(dna.getDataset(), cds.getDataset());
2132     assertTrue(dna.getDataset().getSequences()
2133             .contains(cds.getSequenceAt(0).getDatasetSequence()));
2134     assertTrue(dna.getDataset().getSequences()
2135             .contains(cds.getSequenceAt(1).getDatasetSequence()));
2136 
2137     /*
2138      * Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide
2139      * the mappings are on the shared alignment dataset
2140      */
2141     List<AlignedCodonFrame> cdsMappings = cds.getDataset().getCodonFrames();
2142     /*
2143      * 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep)
2144      */
2145     assertEquals(6, cdsMappings.size());
2146 
2147     /*
2148      * verify that mapping sets for dna and cds alignments are different
2149      * [not current behaviour - all mappings are on the alignment dataset]
2150      */
2151     // select -> subselect type to test.
2152     // Assert.assertNotSame(dna.getCodonFrames(), cds.getCodonFrames());
2153     // assertEquals(4, dna.getCodonFrames().size());
2154     // assertEquals(4, cds.getCodonFrames().size());
2155 
2156     /*
2157      * Two mappings involve pep3 (dna to pep3, cds to pep3)
2158      * Mapping from pep3 to GGGTTT in first new exon sequence
2159      */
2160     List<AlignedCodonFrame> pep3Mappings = MappingUtils
2161             .findMappingsForSequence(pep3, cdsMappings);
2162     assertEquals(2, pep3Mappings.size());
2163     List<AlignedCodonFrame> mappings = MappingUtils
2164             .findMappingsForSequence(cds.getSequenceAt(0), pep3Mappings);
2165     assertEquals(1, mappings.size());
2166 
2167     // map G to GGG
2168     SearchResultsI sr = MappingUtils.buildSearchResults(pep3, 1, mappings);
2169     assertEquals(1, sr.getResults().size());
2170     SearchResultMatchI m = sr.getResults().get(0);
2171     assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence());
2172     assertEquals(1, m.getStart());
2173     assertEquals(3, m.getEnd());
2174     // map F to TTT
2175     sr = MappingUtils.buildSearchResults(pep3, 2, mappings);
2176     m = sr.getResults().get(0);
2177     assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence());
2178     assertEquals(4, m.getStart());
2179     assertEquals(6, m.getEnd());
2180 
2181     /*
2182      * Two mappings involve pep4 (dna to pep4, cds to pep4)
2183      * Verify mapping from pep4 to GGGTTTCCC in second new exon sequence
2184      */
2185     List<AlignedCodonFrame> pep4Mappings = MappingUtils
2186             .findMappingsForSequence(pep4, cdsMappings);
2187     assertEquals(2, pep4Mappings.size());
2188     mappings = MappingUtils.findMappingsForSequence(cds.getSequenceAt(1),
2189             pep4Mappings);
2190     assertEquals(1, mappings.size());
2191     // map G to GGG
2192     sr = MappingUtils.buildSearchResults(pep4, 1, mappings);
2193     assertEquals(1, sr.getResults().size());
2194     m = sr.getResults().get(0);
2195     assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
2196     assertEquals(1, m.getStart());
2197     assertEquals(3, m.getEnd());
2198     // map F to TTT
2199     sr = MappingUtils.buildSearchResults(pep4, 2, mappings);
2200     m = sr.getResults().get(0);
2201     assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
2202     assertEquals(4, m.getStart());
2203     assertEquals(6, m.getEnd());
2204     // map P to CCC
2205     sr = MappingUtils.buildSearchResults(pep4, 3, mappings);
2206     m = sr.getResults().get(0);
2207     assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
2208     assertEquals(7, m.getStart());
2209     assertEquals(9, m.getEnd());
2210   }
2211 
2212   /**
2213    * Test the method that just copies aligned sequences, provided all sequences
2214    * to be aligned share the aligned sequence's dataset
2215    */
2216   @Test(groups = "Functional")
testAlignAsSameSequences()2217   public void testAlignAsSameSequences()
2218   {
2219     SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
2220     SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
2221     AlignmentI al1 = new Alignment(new SequenceI[] { dna1, dna2 });
2222     ((Alignment) al1).createDatasetAlignment();
2223 
2224     SequenceI dna3 = new Sequence(dna1);
2225     SequenceI dna4 = new Sequence(dna2);
2226     assertSame(dna3.getDatasetSequence(), dna1.getDatasetSequence());
2227     assertSame(dna4.getDatasetSequence(), dna2.getDatasetSequence());
2228     String seq1 = "-cc-GG-GT-TT--aaa";
2229     dna3.setSequence(seq1);
2230     String seq2 = "C--C-Cgg--gtt-tAA-A-";
2231     dna4.setSequence(seq2);
2232     AlignmentI al2 = new Alignment(new SequenceI[] { dna3, dna4 });
2233     ((Alignment) al2).createDatasetAlignment();
2234 
2235     /*
2236      * alignment removes gapped columns (two internal, two trailing)
2237      */
2238     assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2));
2239     String aligned1 = "-cc-GG-GTTT-aaa";
2240     assertEquals(aligned1,
2241             al1.getSequenceAt(0).getSequenceAsString());
2242     String aligned2 = "C--C-Cgg-gtttAAA";
2243     assertEquals(aligned2,
2244             al1.getSequenceAt(1).getSequenceAsString());
2245 
2246     /*
2247      * add another sequence to 'aligned' - should still succeed, since
2248      * unaligned sequences still share a dataset with aligned sequences
2249      */
2250     SequenceI dna5 = new Sequence("dna5", "CCCgggtttAAA");
2251     dna5.createDatasetSequence();
2252     al2.addSequence(dna5);
2253     assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2));
2254     assertEquals(aligned1, al1.getSequenceAt(0).getSequenceAsString());
2255     assertEquals(aligned2, al1.getSequenceAt(1).getSequenceAsString());
2256 
2257     /*
2258      * add another sequence to 'unaligned' - should fail, since now not
2259      * all unaligned sequences share a dataset with aligned sequences
2260      */
2261     SequenceI dna6 = new Sequence("dna6", "CCCgggtttAAA");
2262     dna6.createDatasetSequence();
2263     al1.addSequence(dna6);
2264     // JAL-2110 JBP Comment: what's the use case for this behaviour ?
2265     assertFalse(AlignmentUtils.alignAsSameSequences(al1, al2));
2266   }
2267 
2268   @Test(groups = "Functional")
testAlignAsSameSequencesMultipleSubSeq()2269   public void testAlignAsSameSequencesMultipleSubSeq()
2270   {
2271     SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
2272     SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
2273     SequenceI as1 = dna1.deriveSequence(); // cccGGGTTTaaa/1-12
2274     SequenceI as2 = dna1.deriveSequence().getSubSequence(3, 7); // GGGT/4-7
2275     SequenceI as3 = dna2.deriveSequence(); // CCCgggtttAAA/1-12
2276     as1.insertCharAt(6, 5, '-');
2277     assertEquals("cccGGG-----TTTaaa", as1.getSequenceAsString());
2278     as2.insertCharAt(6, 5, '-');
2279     assertEquals("GGGT-----", as2.getSequenceAsString());
2280     as3.insertCharAt(3, 5, '-');
2281     assertEquals("CCC-----gggtttAAA", as3.getSequenceAsString());
2282     AlignmentI aligned = new Alignment(new SequenceI[] { as1, as2, as3 });
2283 
2284     // why do we need to cast this still ?
2285     ((Alignment) aligned).createDatasetAlignment();
2286     SequenceI uas1 = dna1.deriveSequence();
2287     SequenceI uas2 = dna1.deriveSequence().getSubSequence(3, 7);
2288     SequenceI uas3 = dna2.deriveSequence();
2289     AlignmentI tobealigned = new Alignment(new SequenceI[] { uas1, uas2,
2290         uas3 });
2291     ((Alignment) tobealigned).createDatasetAlignment();
2292 
2293     /*
2294      * alignAs lines up dataset sequences and removes empty columns (two)
2295      */
2296     assertTrue(AlignmentUtils.alignAsSameSequences(tobealigned, aligned));
2297     assertEquals("cccGGG---TTTaaa", uas1.getSequenceAsString());
2298     assertEquals("GGGT", uas2.getSequenceAsString());
2299     assertEquals("CCC---gggtttAAA", uas3.getSequenceAsString());
2300   }
2301 
2302   @Test(groups = { "Functional" })
testTransferGeneLoci()2303   public void testTransferGeneLoci()
2304   {
2305     SequenceI from = new Sequence("transcript",
2306             "aaacccgggTTTAAACCCGGGtttaaacccgggttt");
2307     SequenceI to = new Sequence("CDS", "TTTAAACCCGGG");
2308     MapList map = new MapList(new int[] { 1, 12 }, new int[] { 10, 21 }, 1,
2309             1);
2310 
2311     /*
2312      * first with nothing to transfer
2313      */
2314     AlignmentUtils.transferGeneLoci(from, map, to);
2315     assertNull(to.getGeneLoci());
2316 
2317     /*
2318      * next with gene loci set on 'from' sequence
2319      */
2320     int[] exons = new int[] { 100, 105, 155, 164, 210, 229 };
2321     MapList geneMap = new MapList(new int[] { 1, 36 }, exons, 1, 1);
2322     from.setGeneLoci("human", "GRCh38", "7", geneMap);
2323     AlignmentUtils.transferGeneLoci(from, map, to);
2324 
2325     GeneLociI toLoci = to.getGeneLoci();
2326     assertNotNull(toLoci);
2327     // DBRefEntry constructor upper-cases 'source'
2328     assertEquals("HUMAN", toLoci.getSpeciesId());
2329     assertEquals("GRCh38", toLoci.getAssemblyId());
2330     assertEquals("7", toLoci.getChromosomeId());
2331 
2332     /*
2333      * transcript 'exons' are 1-6, 7-16, 17-36
2334      * CDS 1:12 is transcript 10-21
2335      * transcript 'CDS' is 10-16, 17-21
2336      * which is 'gene' 158-164, 210-214
2337      */
2338     MapList toMap = toLoci.getMapping();
2339     assertEquals(1, toMap.getFromRanges().size());
2340     assertEquals(2, toMap.getFromRanges().get(0).length);
2341     assertEquals(1, toMap.getFromRanges().get(0)[0]);
2342     assertEquals(12, toMap.getFromRanges().get(0)[1]);
2343     assertEquals(2, toMap.getToRanges().size());
2344     assertEquals(2, toMap.getToRanges().get(0).length);
2345     assertEquals(158, toMap.getToRanges().get(0)[0]);
2346     assertEquals(164, toMap.getToRanges().get(0)[1]);
2347     assertEquals(210, toMap.getToRanges().get(1)[0]);
2348     assertEquals(214, toMap.getToRanges().get(1)[1]);
2349     // or summarised as (but toString might change in future):
2350     assertEquals("[ [1, 12] ] 1:1 to [ [158, 164] [210, 214] ]",
2351             toMap.toString());
2352 
2353     /*
2354      * an existing value is not overridden
2355      */
2356     geneMap = new MapList(new int[] { 1, 36 }, new int[] { 36, 1 }, 1, 1);
2357     from.setGeneLoci("inhuman", "GRCh37", "6", geneMap);
2358     AlignmentUtils.transferGeneLoci(from, map, to);
2359     assertEquals("GRCh38", toLoci.getAssemblyId());
2360     assertEquals("7", toLoci.getChromosomeId());
2361     toMap = toLoci.getMapping();
2362     assertEquals("[ [1, 12] ] 1:1 to [ [158, 164] [210, 214] ]",
2363             toMap.toString());
2364   }
2365 
2366   /**
2367    * Tests for the method that maps nucleotide to protein based on CDS features
2368    */
2369   @Test(groups = "Functional")
testMapCdsToProtein()2370   public void testMapCdsToProtein()
2371   {
2372     SequenceI peptide = new Sequence("pep", "KLQ");
2373 
2374     /*
2375      * Case 1: CDS 3 times length of peptide
2376      * NB method only checks lengths match, not translation
2377      */
2378     SequenceI dna = new Sequence("dna", "AACGacgtCTCCT");
2379     dna.createDatasetSequence();
2380     dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null));
2381     dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 13, null));
2382     MapList ml = AlignmentUtils.mapCdsToProtein(dna, peptide);
2383     assertEquals(3, ml.getFromRatio());
2384     assertEquals(1, ml.getToRatio());
2385     assertEquals("[[1, 3]]",
2386             Arrays.deepToString(ml.getToRanges().toArray()));
2387     assertEquals("[[1, 4], [9, 13]]",
2388             Arrays.deepToString(ml.getFromRanges().toArray()));
2389 
2390     /*
2391      * Case 2: CDS 3 times length of peptide + stop codon
2392      * (note code does not currently check trailing codon is a stop codon)
2393      */
2394     dna = new Sequence("dna", "AACGacgtCTCCTCCC");
2395     dna.createDatasetSequence();
2396     dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null));
2397     dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 16, null));
2398     ml = AlignmentUtils.mapCdsToProtein(dna, peptide);
2399     assertEquals(3, ml.getFromRatio());
2400     assertEquals(1, ml.getToRatio());
2401     assertEquals("[[1, 3]]",
2402             Arrays.deepToString(ml.getToRanges().toArray()));
2403     assertEquals("[[1, 4], [9, 13]]",
2404             Arrays.deepToString(ml.getFromRanges().toArray()));
2405 
2406     /*
2407      * Case 3: CDS longer than 3 * peptide + stop codon - no mapping is made
2408      */
2409     dna = new Sequence("dna", "AACGacgtCTCCTTGATCA");
2410     dna.createDatasetSequence();
2411     dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null));
2412     dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 19, null));
2413     ml = AlignmentUtils.mapCdsToProtein(dna, peptide);
2414     assertNull(ml);
2415 
2416     /*
2417      * Case 4: CDS shorter than 3 * peptide - no mapping is made
2418      */
2419     dna = new Sequence("dna", "AACGacgtCTCC");
2420     dna.createDatasetSequence();
2421     dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null));
2422     dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 12, null));
2423     ml = AlignmentUtils.mapCdsToProtein(dna, peptide);
2424     assertNull(ml);
2425 
2426     /*
2427      * Case 5: CDS 3 times length of peptide + part codon - mapping is truncated
2428      */
2429     dna = new Sequence("dna", "AACGacgtCTCCTTG");
2430     dna.createDatasetSequence();
2431     dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null));
2432     dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, null));
2433     ml = AlignmentUtils.mapCdsToProtein(dna, peptide);
2434     assertEquals(3, ml.getFromRatio());
2435     assertEquals(1, ml.getToRatio());
2436     assertEquals("[[1, 3]]",
2437             Arrays.deepToString(ml.getToRanges().toArray()));
2438     assertEquals("[[1, 4], [9, 13]]",
2439             Arrays.deepToString(ml.getFromRanges().toArray()));
2440 
2441     /*
2442      * Case 6: incomplete start codon corresponding to X in peptide
2443      */
2444     dna = new Sequence("dna", "ACGacgtCTCCTTGG");
2445     dna.createDatasetSequence();
2446     SequenceFeature sf = new SequenceFeature("CDS", "", 1, 3, null);
2447     sf.setPhase("2"); // skip 2 positions (AC) to start of next codon (GCT)
2448     dna.addSequenceFeature(sf);
2449     dna.addSequenceFeature(new SequenceFeature("CDS", "", 8, 15, null));
2450     peptide = new Sequence("pep", "XLQ");
2451     ml = AlignmentUtils.mapCdsToProtein(dna, peptide);
2452     assertEquals("[[2, 3]]",
2453             Arrays.deepToString(ml.getToRanges().toArray()));
2454     assertEquals("[[3, 3], [8, 12]]",
2455             Arrays.deepToString(ml.getFromRanges().toArray()));
2456   }
2457 
2458   /**
2459    * Tests for the method that locates the CDS sequence that has a mapping to
2460    * the given protein. That is, given a transcript-to-peptide mapping, find the
2461    * cds-to-peptide mapping that relates to both, and return the CDS sequence.
2462    */
2463   @Test
testFindCdsForProtein()2464   public void testFindCdsForProtein()
2465   {
2466     List<AlignedCodonFrame> mappings = new ArrayList<>();
2467     AlignedCodonFrame acf1 = new AlignedCodonFrame();
2468     mappings.add(acf1);
2469 
2470     SequenceI dna1 = new Sequence("dna1", "cgatATcgGCTATCTATGacg");
2471     dna1.createDatasetSequence();
2472 
2473     // NB we currently exclude STOP codon from CDS sequences
2474     // the test would need to change if this changes in future
2475     SequenceI cds1 = new Sequence("cds1", "ATGCTATCT");
2476     cds1.createDatasetSequence();
2477 
2478     SequenceI pep1 = new Sequence("pep1", "MLS");
2479     pep1.createDatasetSequence();
2480     List<AlignedCodonFrame> seqMappings = new ArrayList<>();
2481     MapList mapList = new MapList(
2482             new int[]
2483             { 5, 6, 9, 15 }, new int[] { 1, 3 }, 3, 1);
2484     Mapping dnaToPeptide = new Mapping(pep1.getDatasetSequence(), mapList);
2485 
2486     // add dna to peptide mapping
2487     seqMappings.add(acf1);
2488     acf1.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(),
2489             mapList);
2490 
2491     /*
2492      * first case - no dna-to-CDS mapping exists - search fails
2493      */
2494     SequenceI seq = AlignmentUtils.findCdsForProtein(mappings, dna1,
2495             seqMappings, dnaToPeptide);
2496     assertNull(seq);
2497 
2498     /*
2499      * second case - CDS-to-peptide mapping exists but no dna-to-CDS
2500      * - search fails
2501      */
2502     // todo this test fails if the mapping is added to acf1, not acf2
2503     // need to tidy up use of lists of mappings in AlignedCodonFrame
2504     AlignedCodonFrame acf2 = new AlignedCodonFrame();
2505     mappings.add(acf2);
2506     MapList cdsToPeptideMapping = new MapList(new int[]
2507     { 1, 9 }, new int[] { 1, 3 }, 3, 1);
2508     acf2.addMap(cds1.getDatasetSequence(), pep1.getDatasetSequence(),
2509             cdsToPeptideMapping);
2510     assertNull(AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings,
2511             dnaToPeptide));
2512 
2513     /*
2514      * third case - add dna-to-CDS mapping - CDS is now found!
2515      */
2516     MapList dnaToCdsMapping = new MapList(new int[] { 5, 6, 9, 15 },
2517             new int[]
2518             { 1, 9 }, 1, 1);
2519     acf1.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(),
2520             dnaToCdsMapping);
2521     seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings,
2522             dnaToPeptide);
2523     assertSame(seq, cds1.getDatasetSequence());
2524   }
2525 
2526   /**
2527    * Tests for the method that locates the CDS sequence that has a mapping to
2528    * the given protein. That is, given a transcript-to-peptide mapping, find the
2529    * cds-to-peptide mapping that relates to both, and return the CDS sequence.
2530    * This test is for the case where transcript and CDS are the same length.
2531    */
2532   @Test
testFindCdsForProtein_noUTR()2533   public void testFindCdsForProtein_noUTR()
2534   {
2535     List<AlignedCodonFrame> mappings = new ArrayList<>();
2536     AlignedCodonFrame acf1 = new AlignedCodonFrame();
2537     mappings.add(acf1);
2538 
2539     SequenceI dna1 = new Sequence("dna1", "ATGCTATCTTAA");
2540     dna1.createDatasetSequence();
2541 
2542     // NB we currently exclude STOP codon from CDS sequences
2543     // the test would need to change if this changes in future
2544     SequenceI cds1 = new Sequence("cds1", "ATGCTATCT");
2545     cds1.createDatasetSequence();
2546 
2547     SequenceI pep1 = new Sequence("pep1", "MLS");
2548     pep1.createDatasetSequence();
2549     List<AlignedCodonFrame> seqMappings = new ArrayList<>();
2550     MapList mapList = new MapList(
2551             new int[]
2552             { 1, 9 }, new int[] { 1, 3 }, 3, 1);
2553     Mapping dnaToPeptide = new Mapping(pep1.getDatasetSequence(), mapList);
2554 
2555     // add dna to peptide mapping
2556     seqMappings.add(acf1);
2557     acf1.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(),
2558             mapList);
2559 
2560     /*
2561      * first case - transcript lacks CDS features - it appears to be
2562      * the CDS sequence and is returned
2563      */
2564     SequenceI seq = AlignmentUtils.findCdsForProtein(mappings, dna1,
2565             seqMappings, dnaToPeptide);
2566     assertSame(seq, dna1.getDatasetSequence());
2567 
2568     /*
2569      * second case - transcript has CDS feature - this means it is
2570      * not returned as a match for CDS (CDS sequences don't have CDS features)
2571      */
2572     dna1.addSequenceFeature(
2573             new SequenceFeature(SequenceOntologyI.CDS, "cds", 1, 12, null));
2574     seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings,
2575             dnaToPeptide);
2576     assertNull(seq);
2577 
2578     /*
2579      * third case - CDS-to-peptide mapping exists but no dna-to-CDS
2580      * - search fails
2581      */
2582     // todo this test fails if the mapping is added to acf1, not acf2
2583     // need to tidy up use of lists of mappings in AlignedCodonFrame
2584     AlignedCodonFrame acf2 = new AlignedCodonFrame();
2585     mappings.add(acf2);
2586     MapList cdsToPeptideMapping = new MapList(new int[]
2587     { 1, 9 }, new int[] { 1, 3 }, 3, 1);
2588     acf2.addMap(cds1.getDatasetSequence(), pep1.getDatasetSequence(),
2589             cdsToPeptideMapping);
2590     assertNull(AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings,
2591             dnaToPeptide));
2592 
2593     /*
2594      * fourth case - add dna-to-CDS mapping - CDS is now found!
2595      */
2596     MapList dnaToCdsMapping = new MapList(new int[] { 1, 9 },
2597             new int[]
2598             { 1, 9 }, 1, 1);
2599     acf1.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(),
2600             dnaToCdsMapping);
2601     seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings,
2602             dnaToPeptide);
2603     assertSame(seq, cds1.getDatasetSequence());
2604   }
2605 }
2606