1 package org.broadinstitute.hellbender.testutils; 2 3 import com.google.common.collect.Sets; 4 import htsjdk.samtools.*; 5 import org.apache.commons.io.FilenameUtils; 6 import org.broadinstitute.hellbender.engine.GATKPath; 7 import org.broadinstitute.hellbender.utils.Utils; 8 import org.broadinstitute.hellbender.utils.read.ReadUtils; 9 import org.testng.Assert; 10 import picard.sam.SortSam; 11 import picard.sam.util.SamComparison; 12 13 import java.io.File; 14 import java.io.IOException; 15 import java.io.PrintWriter; 16 import java.nio.file.Path; 17 import java.util.*; 18 19 /** 20 * Collection of utilities for making common assertions about SAM files for unit testing purposes. 21 */ 22 public final class SamAssertionUtils { 23 getReader(final File sam, final ValidationStringency validationStringency, final File reference)24 private static SamReader getReader(final File sam, final ValidationStringency validationStringency, final File reference) { 25 return SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(reference).open(sam); 26 } getReader(final Path sam, final ValidationStringency validationStringency, final Path reference)27 private static SamReader getReader(final Path sam, final ValidationStringency validationStringency, final Path reference) { 28 return SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(reference).open(sam); 29 } 30 /** 31 * causes an exception if the given sam files aren't equal 32 * @param actualSam the actual file 33 * @param expectedSam the expected file 34 * @param validationStringency how stringently do we validate the files 35 * @param reference is allowed to be null 36 */ assertSamsEqual(final File actualSam, final File expectedSam, final ValidationStringency validationStringency, final File reference)37 public static void assertSamsEqual(final File actualSam, final File expectedSam, final ValidationStringency validationStringency, final File reference) throws IOException { 38 assertSamsEqual(actualSam.toPath(), expectedSam.toPath(), validationStringency, 39 (null==reference?null:reference.toPath())); 40 } assertSamsEqual(final Path actualSam, final Path expectedSam, final ValidationStringency validationStringency, final Path reference)41 public static void assertSamsEqual(final Path actualSam, final Path expectedSam, final ValidationStringency validationStringency, final Path reference) throws IOException { 42 final String equalStringent = samsEqualStringent(actualSam, expectedSam, validationStringency, reference); 43 Assert.assertNull(equalStringent, "SAM file " + actualSam.toUri().toString() + " differs from expected output:" + expectedSam.toUri().toString() + " " + equalStringent); 44 } 45 46 /** 47 * causes an exception if the given sam files aren't equal 48 * @param actualSam the actual file 49 * @param expectedSam the expected file 50 * @param validationStringency how stringently do we validate the files 51 */ assertSamsEqual(final File actualSam, final File expectedSam, final ValidationStringency validationStringency)52 public static void assertSamsEqual(final File actualSam, final File expectedSam, final ValidationStringency validationStringency) throws IOException { 53 assertSamsEqual(actualSam, expectedSam, validationStringency, null); 54 } 55 56 /** 57 * causes an exception if the given sam files aren't equal 58 * @param actualSam the actual file 59 * @param expectedSam the expected file 60 * @param reference is allowed to be null 61 */ assertSamsEqual(final File actualSam, final File expectedSam, final File reference)62 public static void assertSamsEqual(final File actualSam, final File expectedSam, final File reference) throws IOException { 63 assertSamsEqual(actualSam, expectedSam, ValidationStringency.DEFAULT_STRINGENCY, reference); 64 } assertSamsEqual(final Path actualSam, final Path expectedSam, final Path reference)65 public static void assertSamsEqual(final Path actualSam, final Path expectedSam, final Path reference) throws IOException { 66 assertSamsEqual(actualSam, expectedSam, ValidationStringency.DEFAULT_STRINGENCY, reference); 67 } 68 69 /** 70 * causes an exception if the given sam files aren't equal 71 * @param actualSam the actual file 72 * @param expectedSam the expected file 73 */ assertSamsEqual(final File actualSam, final File expectedSam)74 public static void assertSamsEqual(final File actualSam, final File expectedSam) throws IOException { 75 assertSamsEqual(actualSam, expectedSam, ValidationStringency.DEFAULT_STRINGENCY, null); 76 } 77 78 /** 79 * causes an exception if the given sam isn't valid 80 * @param reference is allowed to be null 81 */ assertSamValid(final File sam, final ValidationStringency validationStringency, final File reference)82 public static void assertSamValid(final File sam, final ValidationStringency validationStringency, final File reference) throws IOException { 83 assertCRAMContentsIfCRAM(sam); 84 try (final SamReader samReader = getReader(sam, validationStringency, reference)) { 85 final SamFileValidator validator = new SamFileValidator(new PrintWriter(System.out), 8000); 86 validator.setIgnoreWarnings(true); 87 validator.setVerbose(true, 1000); 88 validator.setErrorsToIgnore(Collections.singletonList(SAMValidationError.Type.MISSING_READ_GROUP)); 89 final boolean validated = validator.validateSamFileVerbose(samReader, null); 90 Assert.assertTrue(validated, "SAM file validation failed"); 91 } 92 } 93 94 /** 95 * causes an exception if the given sam isn't valid 96 */ assertSamValid(final File sam, final ValidationStringency validationStringency)97 public static void assertSamValid(final File sam, final ValidationStringency validationStringency) throws IOException { 98 assertSamValid(sam, validationStringency, null); 99 } 100 101 /** 102 * causes an exception if the given sam isn't valid 103 * @param reference is allowed to be null 104 * the default ValidationStringency value for this method is LENIENT 105 */ assertSamValid(final File sam, final File reference)106 public static void assertSamValid(final File sam, final File reference) throws IOException { 107 assertSamValid(sam, ValidationStringency.LENIENT, reference); 108 } 109 110 /** 111 * causes an exception if the given sam isn't valid 112 * the default ValidationStringency value for this method is LENIENT 113 */ assertSamValid(final File sam)114 public static void assertSamValid(final File sam) throws IOException { 115 assertSamValid(sam, ValidationStringency.LENIENT, null); 116 } 117 118 /** 119 * Compares SAM/BAM files in a stringent way but not by byte identity (allow reorder of attributes). 120 * Returns null if the files are considered equals and returns a String describing the reason for comparison failure. 121 * The lenient comparison only checks headers and alignment info {@link SamComparison}. Compares headers, and if headers are compatible enough, compares SAMRecords, 122 * looking only at basic alignment info. 123 */ samsEqualLenient(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference)124 public static String samsEqualLenient(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference) throws IOException { 125 assertCRAMContentsIfCRAM(actualSam); 126 assertCRAMContentsIfCRAM(expectedSam); 127 try(final SamReader reader1 = getReader(actualSam, validation, reference); 128 final SamReader reader2 = getReader(expectedSam, validation, reference)) { 129 130 final SamComparison comparison = new SamComparison(reader1, reader2); 131 return comparison.areEqual() ? null : "SamComparison fails"; 132 } 133 } 134 135 /** 136 * Compares SAM/BAM files in a stringent way but not by byte identity (allow reorder of attributes) 137 * Comparing by MD5s is too strict and comparing by SamComparison is too lenient. So we need this method. 138 * 139 * This differs from a byte-to-byte comparison: 140 * - @PG and @CO lines in headers are ignored in the comparison. 141 * - each read in the actual file are allowed to have a superset of the attributes of the corresponding read in the expected set 142 * @return null if equal or message string if not equal. 143 */ samsEqualStringent(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference)144 public static String samsEqualStringent(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference) throws IOException { 145 return samsEqualStringent(actualSam.toPath(), expectedSam.toPath(), validation, (null==reference?null:reference.toPath())); 146 } 147 148 /** 149 * Compares SAM/BAM files in a stringent way but not by byte identity (allow reorder of attributes) 150 * Comparing by MD5s is too strict and comparing by SamComparison is too lenient. So we need this method. 151 * 152 * This differs from a byte-to-byte comparison: 153 * - @PG and @CO lines in headers are ignored in the comparison. 154 * - each read in the actual file are allowed to have a superset of the attributes of the corresponding read in the expected set 155 * @return null if equal or message string if not equal. 156 */ samsEqualStringent(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference)157 public static String samsEqualStringent(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException { 158 if (sameMD5s(actualSam, expectedSam)) { 159 return null; 160 } 161 162 // verify that CRAM files have CRAM contents 163 assertCRAMContentsIfCRAM(actualSam); 164 assertCRAMContentsIfCRAM(expectedSam); 165 166 String msg = equalHeadersIgnoreCOandPG(actualSam, expectedSam, validation, reference); 167 if (msg != null) { return msg; } 168 169 //At this point we know that the files are not byte-wise identical, but are equal according to SamComparison and their headers are equal 170 //So we iterate over reads and compare them one by one. 171 return compareReads(actualSam, expectedSam, validation, reference); 172 } 173 174 sameMD5s(final File actualSam, final File expectedSam)175 private static boolean sameMD5s(final File actualSam, final File expectedSam) throws IOException { 176 final String fileMD5_1 = Utils.calculateFileMD5(actualSam); 177 final String fileMD5_2 = Utils.calculateFileMD5(expectedSam); 178 return fileMD5_1.equals(fileMD5_2); 179 } 180 181 sameMD5s(final Path actualSam, final Path expectedSam)182 private static boolean sameMD5s(final Path actualSam, final Path expectedSam) throws IOException { 183 final String fileMD5_1 = Utils.calculatePathMD5(actualSam); 184 final String fileMD5_2 = Utils.calculatePathMD5(expectedSam); 185 return fileMD5_1.equals(fileMD5_2); 186 } 187 compareReads(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference)188 private static String compareReads(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException { 189 try(final SamReader reader1 = getReader(actualSam, validation, reference); 190 final SamReader reader2 = getReader(expectedSam, validation, reference)) { 191 final SAMRecordIterator it1 = reader1.iterator(); 192 final SAMRecordIterator it2 = reader2.iterator(); 193 while (it1.hasNext() && it2.hasNext()) { 194 final SAMRecord read1 = it1.next(); 195 final SAMRecord read2 = it2.next(); 196 final String eqMessage = readsEqualAllowAddingAttributes(read1, read2); 197 if (eqMessage != null){ 198 return eqMessage; 199 } 200 } 201 if (it1.hasNext() || it2.hasNext()) { 202 //at least one has no more records (because the while loop is done) and at least one does have records. So we're not equal. 203 return "Not the same number of reads"; 204 } 205 return null; 206 } 207 } 208 equalHeadersIgnoreCOandPG(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference)209 private static String equalHeadersIgnoreCOandPG(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException { 210 try(final SamReader reader1 = getReader(actualSam, validation, reference); 211 final SamReader reader2 = getReader(expectedSam, validation, reference)){ 212 213 final SAMFileHeader h1 = reader1.getFileHeader(); 214 final SAMFileHeader h2 = reader2.getFileHeader(); 215 String msg; 216 217 //Note: we allow the versions to differ 218 219 msg = compareValues(h1.getCreator(), h2.getCreator(), "File creator"); 220 if (msg != null) { return msg; } 221 222 msg = compareValues(h1.getAttribute("SO"), h2.getAttribute("SO"), "Sort order"); 223 if (msg != null) { return msg; } 224 225 if (! Objects.equals(h1.getSequenceDictionary(), h2.getSequenceDictionary())){ 226 return "Different Sequence dictionaries"; 227 } 228 229 msg = compareReadGroups(h1, h2); 230 if (msg != null) { return msg; } 231 232 return msg; 233 } 234 } 235 compareReadGroups(final SAMFileHeader h1, final SAMFileHeader h2)236 private static String compareReadGroups(final SAMFileHeader h1, final SAMFileHeader h2) { 237 final List<SAMReadGroupRecord> l1 = h1.getReadGroups(); 238 final List<SAMReadGroupRecord> l2 = h2.getReadGroups(); 239 final String msg = compareValues(l1.size(), l2.size(), "Number of read groups"); 240 if (msg != null){ return msg; } 241 242 for (int i = 0; i < l1.size(); ++i) { 243 if (! Objects.equals(l1.get(i), l2.get(i))){ 244 return "Read group records different:" + l1.get(i) + " vs " + l2.get(i); 245 } 246 } 247 return null; 248 } 249 compareValues(final T v1, final T v2, final String label)250 private static <T> String compareValues(final T v1, final T v2, final String label) { 251 boolean eq = Objects.equals(v1, v2); 252 if (eq) { 253 return null; 254 } else { 255 final String s1 = String.valueOf(v1); 256 final String s2 = String.valueOf(v2); 257 return label + " differs. File 1: " + s1 + " File 2: " + s2; 258 } 259 } 260 261 /** 262 * Compares the reads but ignores order of attributes. 263 * Also allows actualRead to have a superset of attributes of expectedRead. 264 */ readsEqualAllowAddingAttributes(final SAMRecord actualRead, final SAMRecord expectedRead)265 private static String readsEqualAllowAddingAttributes(final SAMRecord actualRead, final SAMRecord expectedRead) { 266 final String actualName = actualRead.getReadName(); 267 final String expectedName = expectedRead.getReadName(); 268 269 String msg; 270 271 msg = compareValues(actualName, expectedName, "name"); 272 if (msg != null){ return msg; } 273 274 final String readNames = "actualName:" + actualName + " expectedName:" + expectedName; 275 276 msg = compareValues(SAMFlag.getFlags(actualRead.getFlags()), SAMFlag.getFlags(expectedRead.getFlags()), readNames + " getFlags"); 277 if (msg != null){ return msg; } 278 279 msg = compareValues(actualRead.getInferredInsertSize(), expectedRead.getInferredInsertSize(), readNames + " getInferredInsertSize"); 280 if (msg != null){ return msg; } 281 282 msg = compareValues(actualRead.getMappingQuality(), expectedRead.getMappingQuality(), readNames + " getMappingQuality"); 283 if (msg != null){ return msg; } 284 285 msg = compareValues(actualRead.getMateReferenceIndex(), expectedRead.getMateReferenceIndex(), readNames + "getMateReferenceIndex"); 286 if (msg != null){ return msg; } 287 288 msg = compareValues(actualRead.getMateAlignmentStart(), expectedRead.getMateAlignmentStart(), readNames + "getMateAlignmentStart"); 289 if (msg != null){ return msg; } 290 291 msg = compareValues(actualRead.getReferenceIndex(), expectedRead.getReferenceIndex(), readNames + " getReferenceIndex"); 292 if (msg != null){ return msg; } 293 294 msg = compareValues(actualRead.getAlignmentStart(), expectedRead.getAlignmentStart(), readNames + " getAlignmentStart"); 295 if (msg != null){ return msg; } 296 297 msg = compareValues(actualRead.getCigar(), expectedRead.getCigar(), readNames + " getCigar"); 298 if (msg != null){ return msg; } 299 300 msg = compareValues(actualRead.getReferenceName(), expectedRead.getReferenceName(), readNames + " getReferenceName"); 301 if (msg != null){ return msg; } 302 303 msg = compareValues(actualRead.getMateReferenceName(), expectedRead.getMateReferenceName(), readNames + " getMateReferenceName"); 304 if (msg != null){ return msg; } 305 306 if (!Arrays.equals(actualRead.getReadBases(), expectedRead.getReadBases())){ 307 return "getReadBases different actualRead:" + actualName + " expectedRead:" + expectedName + " (" + Arrays.toString(actualRead.getReadBases()) + " vs " + Arrays.toString(expectedRead.getReadBases()) + ")"; 308 } 309 if (!Arrays.equals(actualRead.getBaseQualities(), expectedRead.getBaseQualities())){ 310 return "getBaseQualities different actualRead:" + actualName + " expectedRead:" + expectedName + " (" + Arrays.toString(actualRead.getBaseQualities()) + " vs " + Arrays.toString(expectedRead.getBaseQualities()) + ")"; 311 } 312 return compareReadAttributes(actualRead, expectedRead); 313 } 314 compareReadAttributes(final SAMRecord actualRead, final SAMRecord expectedRead)315 private static String compareReadAttributes(final SAMRecord actualRead, final SAMRecord expectedRead) { 316 final String actualName = actualRead.getReadName(); 317 final String expectedName = expectedRead.getReadName(); 318 final String readNames = "actualName:" + actualName + " expectedName:" + expectedName; 319 320 String msg; 321 final List<SAMRecord.SAMTagAndValue> actualAttributes = actualRead.getAttributes(); 322 final List<SAMRecord.SAMTagAndValue> expectedAttributes = expectedRead.getAttributes(); 323 324 //We want to compare attributes regardless of order, so we put them in a map 325 final Map<String, Object> actualAttributesByName = new LinkedHashMap<>(); 326 final Map<String, Object> expectedAttributesByName = new LinkedHashMap<>(); 327 328 for (final SAMRecord.SAMTagAndValue samTagAndValue : actualAttributes) { 329 actualAttributesByName.put(samTagAndValue.tag, samTagAndValue.value); 330 } 331 for (final SAMRecord.SAMTagAndValue samTagAndValue : expectedAttributes) { 332 expectedAttributesByName.put(samTagAndValue.tag, samTagAndValue.value); 333 } 334 335 final Sets.SetView<String> attrDiff = Sets.difference(expectedAttributesByName.keySet(), actualAttributesByName.keySet()); 336 if (!attrDiff.isEmpty()){ 337 final StringBuilder sb= new StringBuilder(); 338 sb.append("expected read contains attributes that actual read lacks: " + readNames + " " + attrDiff + "\n"); 339 for (final String attr : attrDiff) { 340 sb.append(attr + " " + expectedAttributesByName.get(attr) + "\n"); 341 } 342 return sb.toString(); 343 } 344 345 for (int i = 0; i < expectedAttributesByName.size(); i++) { 346 final String expectedTag = expectedAttributes.get(i).tag; 347 final Object expectedValue = expectedAttributesByName.get(expectedTag); 348 final Object actualValue = actualAttributesByName.get(expectedTag); 349 350 msg = compareValues(actualValue, expectedValue, readNames + " attribute " + expectedTag); 351 if (msg != null){ return msg; } 352 } 353 return null; 354 } 355 356 /** 357 * Compares the two given bam files, optionally sorting them before comparison. 358 * The sorting is helpful to compare files that are different but equivalent (eg read pairs with same coordinates get reordered). 359 */ assertEqualBamFiles( final File resultFile, final File expectedFile, final boolean compareBamFilesSorted, final ValidationStringency stringency)360 public static void assertEqualBamFiles( 361 final File resultFile, 362 final File expectedFile, 363 final boolean compareBamFilesSorted, 364 final ValidationStringency stringency) throws IOException { 365 assertEqualBamFiles(resultFile, expectedFile, null, compareBamFilesSorted, stringency); 366 } 367 368 /** 369 * Compares the two given bam files, optionally sorting them before comparison. 370 * The sorting is helpful to compare files that are different but equivalent (eg read pairs with same coordinates get reordered). 371 */ assertEqualBamFiles( final File resultFile, final File expectedFile, final File reference, final boolean compareBamFilesSorted, final ValidationStringency stringency)372 public static void assertEqualBamFiles( 373 final File resultFile, 374 final File expectedFile, 375 final File reference, 376 final boolean compareBamFilesSorted, 377 final ValidationStringency stringency) throws IOException { 378 379 if (compareBamFilesSorted) { 380 final File resultFileSorted= BaseTest.createTempFile("resultsFileSorted", "." + FilenameUtils.getExtension(resultFile.getName())); 381 final File expectedFileSorted = BaseTest.createTempFile("expectedFileSorted", "." + FilenameUtils.getExtension(expectedFile.getName())); 382 383 sortSam(resultFile, resultFileSorted, reference, stringency); 384 sortSam(expectedFile, expectedFileSorted, reference, stringency); 385 386 assertSamsEqual(resultFileSorted, expectedFileSorted, stringency, reference); 387 } else { 388 assertSamsEqual(resultFile, expectedFile, stringency, reference); 389 } 390 } 391 392 /** 393 * Validate/assert that the contents are CRAM if the extension is .cram 394 */ assertCRAMContentsIfCRAM(final File putativeCRAMFile)395 public static void assertCRAMContentsIfCRAM(final File putativeCRAMFile) { 396 Path path = (null==putativeCRAMFile?null:putativeCRAMFile.toPath()); 397 assertCRAMContentsIfCRAM(path); 398 } 399 400 /** 401 * Validate/assert that the contents are CRAM if the extension is .cram 402 */ assertCRAMContentsIfCRAM(final Path putativeCRAMPath)403 public static void assertCRAMContentsIfCRAM(final Path putativeCRAMPath) { 404 if (new GATKPath(putativeCRAMPath.toUri().toString()).isCram()) { 405 assertCRAMContents(putativeCRAMPath); 406 } 407 } 408 409 /** 410 * Unconditionally validate/assert that the contents are CRAM 411 */ assertCRAMContents(final Path putativeCRAMPath)412 public static void assertCRAMContents(final Path putativeCRAMPath) { 413 Assert.assertTrue(ReadUtils.hasCRAMFileContents(putativeCRAMPath), "should have had CRAM contents: " + putativeCRAMPath.toUri().toString()); 414 } 415 sortSam(final File input, final File output, final File reference, final ValidationStringency stringency)416 private static void sortSam(final File input, final File output, final File reference, final ValidationStringency stringency) { 417 final SortSam sort = new SortSam(); 418 419 // We can't use ArgumentsBuilder since it assumes GATK argument names, but we're running a Picard 420 // tool, which uses upper case argument names. 421 final List<String> args = new ArrayList<>(6); 422 args.add("-I"); 423 args.add(input.getAbsolutePath()); 424 args.add("-O"); 425 args.add(output.getAbsolutePath()); 426 args.add("-SO"); 427 args.add(SAMFileHeader.SortOrder.coordinate.name()); 428 args.add("--VALIDATION_STRINGENCY"); 429 args.add(stringency.name()); 430 if (reference != null) { 431 args.add("--REFERENCE_SEQUENCE"); 432 args.add(reference.getAbsolutePath()); 433 } 434 435 int returnCode = sort.instanceMain(args.toArray(new String[0])); 436 if (returnCode != 0) { 437 throw new RuntimeException("Failure running SortSam on inputs"); 438 } 439 } 440 441 /** 442 * Get the program records (@PG) in the BAM file header 443 * 444 * @param bamFile the BAM file 445 * @return program records from the BAN file header 446 * @throws IOException if it cannot close the BAM file 447 */ getProgramRecords(final File bamFile)448 private static List<SAMProgramRecord> getProgramRecords(final File bamFile) throws IOException { 449 try(final SamReader bamInReader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(bamFile)) { 450 return bamInReader.getFileHeader().getProgramRecords(); 451 } 452 } 453 454 /** 455 * Assert the output BAM file header contains the input BAM file header Program Records (@PG) 456 * @param inputBam input BAM file 457 * @param outputBam output BAM file 458 */ assertOutBamContainsInBamProgramRecords(final File inputBam, final File outputBam)459 public static void assertOutBamContainsInBamProgramRecords(final File inputBam, final File outputBam) throws IOException { 460 final List<SAMProgramRecord> bamInProgramRecords = getProgramRecords(inputBam); 461 final List<SAMProgramRecord> bamOutProgramRecords = getProgramRecords(outputBam); 462 Assert.assertTrue(bamOutProgramRecords.containsAll(bamInProgramRecords)); 463 } 464 465 } 466