1 package org.broadinstitute.hellbender.testutils;
2 
3 import com.google.common.collect.Sets;
4 import htsjdk.samtools.*;
5 import org.apache.commons.io.FilenameUtils;
6 import org.broadinstitute.hellbender.engine.GATKPath;
7 import org.broadinstitute.hellbender.utils.Utils;
8 import org.broadinstitute.hellbender.utils.read.ReadUtils;
9 import org.testng.Assert;
10 import picard.sam.SortSam;
11 import picard.sam.util.SamComparison;
12 
13 import java.io.File;
14 import java.io.IOException;
15 import java.io.PrintWriter;
16 import java.nio.file.Path;
17 import java.util.*;
18 
19 /**
20  * Collection of utilities for making common assertions about SAM files for unit testing purposes.
21  */
22 public final class SamAssertionUtils {
23 
getReader(final File sam, final ValidationStringency validationStringency, final File reference)24     private static SamReader getReader(final File sam, final ValidationStringency validationStringency, final File reference) {
25         return SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(reference).open(sam);
26     }
getReader(final Path sam, final ValidationStringency validationStringency, final Path reference)27     private static SamReader getReader(final Path sam, final ValidationStringency validationStringency, final Path reference) {
28         return SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(reference).open(sam);
29     }
30     /**
31      *  causes an exception if the given sam files aren't equal
32      *  @param actualSam the actual file
33      *  @param expectedSam the expected file
34      *  @param validationStringency how stringently do we validate the files
35      *  @param reference is allowed to be null
36      */
assertSamsEqual(final File actualSam, final File expectedSam, final ValidationStringency validationStringency, final File reference)37     public static void assertSamsEqual(final File actualSam, final File expectedSam, final ValidationStringency validationStringency, final File reference) throws IOException {
38         assertSamsEqual(actualSam.toPath(), expectedSam.toPath(), validationStringency,
39                 (null==reference?null:reference.toPath()));
40     }
assertSamsEqual(final Path actualSam, final Path expectedSam, final ValidationStringency validationStringency, final Path reference)41     public static void assertSamsEqual(final Path actualSam, final Path expectedSam, final ValidationStringency validationStringency, final Path reference) throws IOException {
42         final String equalStringent = samsEqualStringent(actualSam, expectedSam, validationStringency, reference);
43         Assert.assertNull(equalStringent, "SAM file " + actualSam.toUri().toString() + " differs from expected output:" + expectedSam.toUri().toString() + " " + equalStringent);
44     }
45 
46     /**
47      * causes an exception if the given sam files aren't equal
48      *  @param actualSam the actual file
49      *  @param expectedSam the expected file
50      *  @param validationStringency how stringently do we validate the files
51      */
assertSamsEqual(final File actualSam, final File expectedSam, final ValidationStringency validationStringency)52     public static void assertSamsEqual(final File actualSam, final File expectedSam, final ValidationStringency validationStringency) throws IOException {
53         assertSamsEqual(actualSam, expectedSam, validationStringency, null);
54     }
55 
56     /**
57      * causes an exception if the given sam files aren't equal
58      *  @param actualSam the actual file
59      *  @param expectedSam the expected file
60      *  @param reference is allowed to be null
61      */
assertSamsEqual(final File actualSam, final File expectedSam, final File reference)62     public static void assertSamsEqual(final File actualSam, final File expectedSam, final File reference) throws IOException {
63         assertSamsEqual(actualSam, expectedSam, ValidationStringency.DEFAULT_STRINGENCY, reference);
64     }
assertSamsEqual(final Path actualSam, final Path expectedSam, final Path reference)65     public static void assertSamsEqual(final Path actualSam, final Path expectedSam, final Path reference) throws IOException {
66         assertSamsEqual(actualSam, expectedSam, ValidationStringency.DEFAULT_STRINGENCY, reference);
67     }
68 
69     /**
70      * causes an exception if the given sam files aren't equal
71      *  @param actualSam the actual file
72      *  @param expectedSam the expected file
73      */
assertSamsEqual(final File actualSam, final File expectedSam)74     public static void assertSamsEqual(final File actualSam, final File expectedSam) throws IOException {
75         assertSamsEqual(actualSam, expectedSam, ValidationStringency.DEFAULT_STRINGENCY, null);
76     }
77 
78     /**
79      * causes an exception if the given sam isn't valid
80      * @param reference is allowed to be null
81      */
assertSamValid(final File sam, final ValidationStringency validationStringency, final File reference)82     public static void assertSamValid(final File sam, final ValidationStringency validationStringency, final File reference) throws IOException {
83         assertCRAMContentsIfCRAM(sam);
84         try (final SamReader samReader = getReader(sam, validationStringency, reference)) {
85             final SamFileValidator validator = new SamFileValidator(new PrintWriter(System.out), 8000);
86             validator.setIgnoreWarnings(true);
87             validator.setVerbose(true, 1000);
88             validator.setErrorsToIgnore(Collections.singletonList(SAMValidationError.Type.MISSING_READ_GROUP));
89             final boolean validated = validator.validateSamFileVerbose(samReader, null);
90             Assert.assertTrue(validated, "SAM file validation failed");
91         }
92     }
93 
94     /**
95      * causes an exception if the given sam isn't valid
96      */
assertSamValid(final File sam, final ValidationStringency validationStringency)97     public static void assertSamValid(final File sam, final ValidationStringency validationStringency) throws IOException {
98         assertSamValid(sam, validationStringency, null);
99     }
100 
101     /**
102      * causes an exception if the given sam isn't valid
103      * @param reference is allowed to be null
104      * the default ValidationStringency value for this method is LENIENT
105      */
assertSamValid(final File sam, final File reference)106     public static void assertSamValid(final File sam, final File reference) throws IOException {
107         assertSamValid(sam, ValidationStringency.LENIENT, reference);
108     }
109 
110     /**
111      * causes an exception if the given sam isn't valid
112      * the default ValidationStringency value for this method is LENIENT
113      */
assertSamValid(final File sam)114     public static void assertSamValid(final File sam) throws IOException {
115         assertSamValid(sam, ValidationStringency.LENIENT, null);
116     }
117 
118     /**
119      * Compares SAM/BAM files in a stringent way but not by byte identity (allow reorder of attributes).
120      * Returns null if the files are considered equals and returns a String describing the reason for comparison failure.
121      * The lenient comparison only checks headers and alignment info {@link SamComparison}. Compares headers, and if headers are compatible enough, compares SAMRecords,
122      * looking only at basic alignment info.
123      */
samsEqualLenient(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference)124     public static String samsEqualLenient(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference) throws IOException {
125         assertCRAMContentsIfCRAM(actualSam);
126         assertCRAMContentsIfCRAM(expectedSam);
127         try(final SamReader reader1 = getReader(actualSam, validation, reference);
128             final SamReader reader2 = getReader(expectedSam, validation, reference)) {
129 
130             final SamComparison comparison = new SamComparison(reader1, reader2);
131             return comparison.areEqual() ? null : "SamComparison fails";
132         }
133     }
134 
135     /**
136      * Compares SAM/BAM files in a stringent way but not by byte identity (allow reorder of attributes)
137      * Comparing by MD5s is too strict and comparing by SamComparison is too lenient. So we need this method.
138      *
139      * This differs from a byte-to-byte comparison:
140      * - @PG and @CO lines in headers are ignored in the comparison.
141      * - each read in the actual file are allowed to have a superset of the attributes of the corresponding read in the expected set
142      * @return null if equal or message string if not equal.
143      */
samsEqualStringent(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference)144     public static String samsEqualStringent(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference) throws IOException {
145         return samsEqualStringent(actualSam.toPath(), expectedSam.toPath(), validation, (null==reference?null:reference.toPath()));
146     }
147 
148     /**
149      * Compares SAM/BAM files in a stringent way but not by byte identity (allow reorder of attributes)
150      * Comparing by MD5s is too strict and comparing by SamComparison is too lenient. So we need this method.
151      *
152      * This differs from a byte-to-byte comparison:
153      * - @PG and @CO lines in headers are ignored in the comparison.
154      * - each read in the actual file are allowed to have a superset of the attributes of the corresponding read in the expected set
155      * @return null if equal or message string if not equal.
156      */
samsEqualStringent(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference)157     public static String samsEqualStringent(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException {
158         if (sameMD5s(actualSam, expectedSam)) {
159             return null;
160         }
161 
162         //  verify that CRAM files have CRAM contents
163         assertCRAMContentsIfCRAM(actualSam);
164         assertCRAMContentsIfCRAM(expectedSam);
165 
166         String msg = equalHeadersIgnoreCOandPG(actualSam, expectedSam, validation, reference);
167         if (msg != null) { return msg; }
168 
169         //At this point we know that the files are not byte-wise identical, but are equal according to SamComparison and their headers are equal
170         //So we iterate over reads and compare them one by one.
171         return compareReads(actualSam, expectedSam, validation, reference);
172     }
173 
174 
sameMD5s(final File actualSam, final File expectedSam)175     private static boolean sameMD5s(final File actualSam, final File expectedSam) throws IOException {
176         final String fileMD5_1 = Utils.calculateFileMD5(actualSam);
177         final String fileMD5_2 = Utils.calculateFileMD5(expectedSam);
178         return fileMD5_1.equals(fileMD5_2);
179     }
180 
181 
sameMD5s(final Path actualSam, final Path expectedSam)182     private static boolean sameMD5s(final Path actualSam, final Path expectedSam) throws IOException {
183         final String fileMD5_1 = Utils.calculatePathMD5(actualSam);
184         final String fileMD5_2 = Utils.calculatePathMD5(expectedSam);
185         return fileMD5_1.equals(fileMD5_2);
186     }
187 
compareReads(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference)188     private static String compareReads(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException {
189         try(final SamReader reader1 = getReader(actualSam, validation, reference);
190             final SamReader reader2 = getReader(expectedSam, validation, reference)) {
191             final SAMRecordIterator it1 = reader1.iterator();
192             final SAMRecordIterator it2 = reader2.iterator();
193             while (it1.hasNext() && it2.hasNext()) {
194                 final SAMRecord read1 = it1.next();
195                 final SAMRecord read2 = it2.next();
196                 final String eqMessage = readsEqualAllowAddingAttributes(read1, read2);
197                 if (eqMessage != null){
198                     return eqMessage;
199                 }
200             }
201             if (it1.hasNext() || it2.hasNext()) {
202                 //at least one has no more records (because the while loop is done) and at least one does have records. So we're not equal.
203                 return "Not the same number of reads";
204             }
205             return null;
206         }
207     }
208 
equalHeadersIgnoreCOandPG(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference)209     private static String equalHeadersIgnoreCOandPG(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException {
210         try(final SamReader reader1 = getReader(actualSam, validation, reference);
211                 final SamReader reader2 = getReader(expectedSam, validation, reference)){
212 
213             final SAMFileHeader h1 = reader1.getFileHeader();
214             final SAMFileHeader h2 = reader2.getFileHeader();
215             String msg;
216 
217             //Note: we allow the versions to differ
218 
219             msg = compareValues(h1.getCreator(), h2.getCreator(), "File creator");
220             if (msg != null) { return msg; }
221 
222             msg = compareValues(h1.getAttribute("SO"), h2.getAttribute("SO"), "Sort order");
223             if (msg != null) { return msg; }
224 
225             if (! Objects.equals(h1.getSequenceDictionary(), h2.getSequenceDictionary())){
226                 return "Different Sequence dictionaries";
227             }
228 
229             msg = compareReadGroups(h1, h2);
230             if (msg != null) { return msg; }
231 
232             return msg;
233         }
234     }
235 
compareReadGroups(final SAMFileHeader h1, final SAMFileHeader h2)236     private static String compareReadGroups(final SAMFileHeader h1, final SAMFileHeader h2) {
237         final List<SAMReadGroupRecord> l1 = h1.getReadGroups();
238         final List<SAMReadGroupRecord> l2 = h2.getReadGroups();
239         final String msg = compareValues(l1.size(), l2.size(), "Number of read groups");
240         if (msg != null){ return msg; }
241 
242         for (int i = 0; i < l1.size(); ++i) {
243             if (! Objects.equals(l1.get(i), l2.get(i))){
244                  return "Read group records different:" + l1.get(i) + " vs " + l2.get(i);
245             }
246         }
247         return null;
248     }
249 
compareValues(final T v1, final T v2, final String label)250     private static <T> String compareValues(final T v1, final T v2, final String label) {
251         boolean eq = Objects.equals(v1, v2);
252         if (eq) {
253             return null;
254         } else {
255             final String s1 = String.valueOf(v1);
256             final String s2 = String.valueOf(v2);
257             return label + " differs. File 1: " + s1 + " File 2: " + s2;
258         }
259     }
260 
261     /**
262      * Compares the reads but ignores order of attributes.
263      * Also allows actualRead to have a superset of attributes of expectedRead.
264      */
readsEqualAllowAddingAttributes(final SAMRecord actualRead, final SAMRecord expectedRead)265     private static String readsEqualAllowAddingAttributes(final SAMRecord actualRead, final SAMRecord expectedRead) {
266         final String actualName = actualRead.getReadName();
267         final String expectedName = expectedRead.getReadName();
268 
269         String msg;
270 
271         msg = compareValues(actualName, expectedName, "name");
272         if (msg != null){ return msg; }
273 
274         final String readNames = "actualName:" + actualName + " expectedName:" + expectedName;
275 
276         msg = compareValues(SAMFlag.getFlags(actualRead.getFlags()), SAMFlag.getFlags(expectedRead.getFlags()), readNames + " getFlags");
277         if (msg != null){ return msg; }
278 
279         msg = compareValues(actualRead.getInferredInsertSize(), expectedRead.getInferredInsertSize(), readNames + " getInferredInsertSize");
280         if (msg != null){ return msg; }
281 
282         msg = compareValues(actualRead.getMappingQuality(), expectedRead.getMappingQuality(), readNames + " getMappingQuality");
283         if (msg != null){ return msg; }
284 
285         msg = compareValues(actualRead.getMateReferenceIndex(), expectedRead.getMateReferenceIndex(), readNames + "getMateReferenceIndex");
286         if (msg != null){ return msg; }
287 
288         msg = compareValues(actualRead.getMateAlignmentStart(), expectedRead.getMateAlignmentStart(), readNames + "getMateAlignmentStart");
289         if (msg != null){ return msg; }
290 
291         msg = compareValues(actualRead.getReferenceIndex(), expectedRead.getReferenceIndex(), readNames + " getReferenceIndex");
292         if (msg != null){ return msg; }
293 
294         msg = compareValues(actualRead.getAlignmentStart(), expectedRead.getAlignmentStart(), readNames + " getAlignmentStart");
295         if (msg != null){ return msg; }
296 
297         msg = compareValues(actualRead.getCigar(), expectedRead.getCigar(), readNames + " getCigar");
298         if (msg != null){ return msg; }
299 
300         msg = compareValues(actualRead.getReferenceName(), expectedRead.getReferenceName(), readNames + " getReferenceName");
301         if (msg != null){ return msg; }
302 
303         msg = compareValues(actualRead.getMateReferenceName(), expectedRead.getMateReferenceName(), readNames + " getMateReferenceName");
304         if (msg != null){ return msg; }
305 
306         if (!Arrays.equals(actualRead.getReadBases(), expectedRead.getReadBases())){
307             return "getReadBases different actualRead:" + actualName + " expectedRead:" + expectedName + " (" + Arrays.toString(actualRead.getReadBases()) + " vs " + Arrays.toString(expectedRead.getReadBases()) + ")";
308         }
309         if (!Arrays.equals(actualRead.getBaseQualities(), expectedRead.getBaseQualities())){
310             return "getBaseQualities different actualRead:" + actualName + " expectedRead:" + expectedName + " (" + Arrays.toString(actualRead.getBaseQualities()) + " vs " + Arrays.toString(expectedRead.getBaseQualities()) + ")";
311         }
312         return compareReadAttributes(actualRead, expectedRead);
313     }
314 
compareReadAttributes(final SAMRecord actualRead, final SAMRecord expectedRead)315     private static String compareReadAttributes(final SAMRecord actualRead, final SAMRecord expectedRead) {
316         final String actualName = actualRead.getReadName();
317         final String expectedName = expectedRead.getReadName();
318         final String readNames = "actualName:" + actualName + " expectedName:" + expectedName;
319 
320         String msg;
321         final List<SAMRecord.SAMTagAndValue> actualAttributes = actualRead.getAttributes();
322         final List<SAMRecord.SAMTagAndValue> expectedAttributes = expectedRead.getAttributes();
323 
324         //We want to compare attributes regardless of order, so we put them in a map
325         final Map<String, Object> actualAttributesByName = new LinkedHashMap<>();
326         final Map<String, Object> expectedAttributesByName = new LinkedHashMap<>();
327 
328         for (final SAMRecord.SAMTagAndValue samTagAndValue : actualAttributes) {
329             actualAttributesByName.put(samTagAndValue.tag, samTagAndValue.value);
330         }
331         for (final SAMRecord.SAMTagAndValue samTagAndValue : expectedAttributes) {
332             expectedAttributesByName.put(samTagAndValue.tag, samTagAndValue.value);
333         }
334 
335         final Sets.SetView<String> attrDiff = Sets.difference(expectedAttributesByName.keySet(), actualAttributesByName.keySet());
336         if (!attrDiff.isEmpty()){
337             final StringBuilder sb= new StringBuilder();
338             sb.append("expected read contains attributes that actual read lacks: " + readNames + " " + attrDiff + "\n");
339             for (final String attr : attrDiff) {
340                 sb.append(attr + " " + expectedAttributesByName.get(attr) + "\n");
341             }
342             return sb.toString();
343         }
344 
345         for (int i = 0; i < expectedAttributesByName.size(); i++) {
346             final String expectedTag = expectedAttributes.get(i).tag;
347             final Object expectedValue = expectedAttributesByName.get(expectedTag);
348             final Object actualValue = actualAttributesByName.get(expectedTag);
349 
350             msg = compareValues(actualValue, expectedValue, readNames + " attribute " + expectedTag);
351             if (msg != null){ return msg; }
352         }
353         return null;
354     }
355 
356     /**
357      * Compares the two given bam files, optionally sorting them before comparison.
358      * The sorting is helpful to compare files that are different but equivalent (eg read pairs with same coordinates get reordered).
359      */
assertEqualBamFiles( final File resultFile, final File expectedFile, final boolean compareBamFilesSorted, final ValidationStringency stringency)360     public static void assertEqualBamFiles(
361             final File resultFile,
362             final File expectedFile,
363             final boolean compareBamFilesSorted,
364             final ValidationStringency stringency) throws IOException {
365         assertEqualBamFiles(resultFile, expectedFile, null, compareBamFilesSorted, stringency);
366     }
367 
368     /**
369      * Compares the two given bam files, optionally sorting them before comparison.
370      * The sorting is helpful to compare files that are different but equivalent (eg read pairs with same coordinates get reordered).
371      */
assertEqualBamFiles( final File resultFile, final File expectedFile, final File reference, final boolean compareBamFilesSorted, final ValidationStringency stringency)372     public static void assertEqualBamFiles(
373             final File resultFile,
374             final File expectedFile,
375             final File reference,
376             final boolean compareBamFilesSorted,
377             final ValidationStringency stringency) throws IOException {
378 
379         if (compareBamFilesSorted) {
380             final File resultFileSorted= BaseTest.createTempFile("resultsFileSorted", "." + FilenameUtils.getExtension(resultFile.getName()));
381             final File expectedFileSorted = BaseTest.createTempFile("expectedFileSorted", "." + FilenameUtils.getExtension(expectedFile.getName()));
382 
383             sortSam(resultFile, resultFileSorted, reference, stringency);
384             sortSam(expectedFile, expectedFileSorted, reference, stringency);
385 
386             assertSamsEqual(resultFileSorted, expectedFileSorted, stringency, reference);
387         } else {
388             assertSamsEqual(resultFile, expectedFile, stringency, reference);
389         }
390     }
391 
392     /**
393      * Validate/assert that the contents are CRAM if the extension is .cram
394      */
assertCRAMContentsIfCRAM(final File putativeCRAMFile)395     public static void assertCRAMContentsIfCRAM(final File putativeCRAMFile) {
396         Path path = (null==putativeCRAMFile?null:putativeCRAMFile.toPath());
397         assertCRAMContentsIfCRAM(path);
398     }
399 
400     /**
401      * Validate/assert that the contents are CRAM if the extension is .cram
402      */
assertCRAMContentsIfCRAM(final Path putativeCRAMPath)403     public static void assertCRAMContentsIfCRAM(final Path putativeCRAMPath) {
404         if (new GATKPath(putativeCRAMPath.toUri().toString()).isCram()) {
405             assertCRAMContents(putativeCRAMPath);
406         }
407     }
408 
409     /**
410      * Unconditionally validate/assert that the contents are CRAM
411      */
assertCRAMContents(final Path putativeCRAMPath)412     public static void assertCRAMContents(final Path putativeCRAMPath) {
413         Assert.assertTrue(ReadUtils.hasCRAMFileContents(putativeCRAMPath), "should have had CRAM contents: " + putativeCRAMPath.toUri().toString());
414     }
415 
sortSam(final File input, final File output, final File reference, final ValidationStringency stringency)416     private static void sortSam(final File input, final File output, final File reference, final ValidationStringency stringency) {
417         final SortSam sort = new SortSam();
418 
419         // We can't use ArgumentsBuilder since it assumes GATK argument names, but we're running a Picard
420         // tool, which uses upper case argument names.
421         final List<String> args = new ArrayList<>(6);
422         args.add("-I");
423         args.add(input.getAbsolutePath());
424         args.add("-O");
425         args.add(output.getAbsolutePath());
426         args.add("-SO");
427         args.add(SAMFileHeader.SortOrder.coordinate.name());
428         args.add("--VALIDATION_STRINGENCY");
429         args.add(stringency.name());
430         if (reference != null) {
431             args.add("--REFERENCE_SEQUENCE");
432             args.add(reference.getAbsolutePath());
433         }
434 
435         int returnCode  = sort.instanceMain(args.toArray(new String[0]));
436         if (returnCode != 0) {
437             throw new RuntimeException("Failure running SortSam on inputs");
438         }
439     }
440 
441     /**
442      * Get the program records (@PG) in the BAM file header
443      *
444      * @param bamFile   the BAM file
445      * @return  program records from the BAN file header
446      * @throws IOException if it cannot close the BAM file
447      */
getProgramRecords(final File bamFile)448     private static List<SAMProgramRecord> getProgramRecords(final File bamFile) throws IOException {
449         try(final SamReader bamInReader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(bamFile)) {
450             return bamInReader.getFileHeader().getProgramRecords();
451         }
452     }
453 
454     /**
455      * Assert the output BAM file header contains the input BAM file header Program Records (@PG)
456      * @param inputBam  input BAM file
457      * @param outputBam output BAM file
458      */
assertOutBamContainsInBamProgramRecords(final File inputBam, final File outputBam)459     public static void assertOutBamContainsInBamProgramRecords(final File inputBam, final File outputBam) throws IOException {
460         final List<SAMProgramRecord> bamInProgramRecords = getProgramRecords(inputBam);
461         final List<SAMProgramRecord> bamOutProgramRecords = getProgramRecords(outputBam);
462         Assert.assertTrue(bamOutProgramRecords.containsAll(bamInProgramRecords));
463     }
464 
465 }
466