1 package org.broadinstitute.hellbender.utils.read; 2 3 import htsjdk.samtools.*; 4 import htsjdk.samtools.util.FileExtensions; 5 import org.apache.commons.lang3.tuple.MutablePair; 6 import org.apache.commons.lang3.tuple.Pair; 7 import org.apache.logging.log4j.LogManager; 8 import org.apache.logging.log4j.Logger; 9 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; 10 import org.broadinstitute.hellbender.engine.ReadsContext; 11 import org.broadinstitute.hellbender.exceptions.GATKException; 12 import org.broadinstitute.hellbender.exceptions.UserException; 13 import org.broadinstitute.hellbender.utils.BaseUtils; 14 import org.broadinstitute.hellbender.utils.QualityUtils; 15 import org.broadinstitute.hellbender.utils.SimpleInterval; 16 import org.broadinstitute.hellbender.utils.Utils; 17 import org.broadinstitute.hellbender.utils.recalibration.EventType; 18 19 import java.io.BufferedInputStream; 20 import java.io.File; 21 import java.io.IOException; 22 import java.io.InputStream; 23 import java.nio.file.Files; 24 import java.nio.file.OpenOption; 25 import java.nio.file.Path; 26 import java.util.*; 27 28 /** 29 * A miscellaneous collection of utilities for working with reads, headers, etc. 30 * Static methods only, please. 31 */ 32 public final class ReadUtils { 33 ReadUtils()34 private ReadUtils() { 35 } 36 37 private static final Logger logger = LogManager.getLogger(); 38 39 /** 40 * The default quality score for an insertion or deletion, if 41 * none are provided for this read. 42 */ 43 public static final byte DEFAULT_INSERTION_DELETION_QUAL = (byte)45; 44 45 // Base Quality Score Recalibrator specific attribute tags 46 public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; // base qualities for insertions 47 public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // base qualities for deletions 48 49 public static final int READ_INDEX_NOT_FOUND = -1; 50 private static final int DEFAULT_ADAPTOR_SIZE = 100; 51 52 public static final String ORIGINAL_BASE_QUALITIES_TAG = SAMTag.OQ.name(); 53 54 /** 55 * BAM file magic value that starts every bam file 56 */ 57 public static final byte[] BAM_MAGIC = "BAM\1".getBytes(); 58 59 /** 60 * HACK: This is used to make a copy of a read. 61 * Really, SAMRecord should provide a copy constructor or a factory method. 62 */ cloneSAMRecord(final SAMRecord originalRead)63 public static SAMRecord cloneSAMRecord(final SAMRecord originalRead) { 64 if (originalRead == null) { 65 return null; 66 } 67 try { 68 return (SAMRecord)originalRead.clone(); 69 } catch (final CloneNotSupportedException e) { 70 throw new IllegalStateException(e); 71 } 72 } 73 74 /** 75 * HACK: This is used to make a copy of a header. 76 * Really, SAMFileHeader should provide a copy constructor or a factory method. 77 */ 78 cloneSAMFileHeader( final SAMFileHeader header )79 public static SAMFileHeader cloneSAMFileHeader( final SAMFileHeader header ) { 80 if (header == null) return null; 81 return header.clone(); 82 } 83 84 /** 85 * Checks whether read is a headerless SAMRecordToGATKReadAdapter, and if it is, sets its 86 * header to the provided header. 87 * 88 * @param read A potentially headerless GATKRead 89 * @param header header to store in the read, if it's a headerless SAMRecord-backed read 90 */ restoreHeaderIfNecessary( final GATKRead read, final SAMFileHeader header )91 public static void restoreHeaderIfNecessary( final GATKRead read, final SAMFileHeader header ) { 92 if ( read instanceof SAMRecordToGATKReadAdapter ) { 93 SAMRecordToGATKReadAdapter readAdapter = (SAMRecordToGATKReadAdapter)read; 94 if ( ! readAdapter.hasHeader() ) { 95 readAdapter.setHeader(header); 96 } 97 } 98 } 99 100 /** 101 * Retrieve the original base qualities of the given read, if present, 102 * as stored in the OQ attribute. 103 * 104 * @param read read to check 105 * @return original base qualities as stored in the OQ attribute, or null 106 * if the OQ attribute is not present 107 */ getOriginalBaseQualities( final GATKRead read )108 public static byte[] getOriginalBaseQualities( final GATKRead read ) { 109 if ( ! read.hasAttribute(ORIGINAL_BASE_QUALITIES_TAG) ) { 110 return null; 111 } 112 final String oqString = read.getAttributeAsString(ORIGINAL_BASE_QUALITIES_TAG); 113 return !oqString.isEmpty() ? SAMUtils.fastqToPhred(oqString) : null; 114 } 115 116 /** 117 * Returns the base qualities for the read as a string. 118 * 119 * @param read read whose base qualities should be returned 120 * @return Base qualities string as printable ASCII values (encoded as a FASTQ string). 121 */ getBaseQualityString( final GATKRead read )122 public static String getBaseQualityString( final GATKRead read ) { 123 Utils.nonNull(read); 124 if ( Arrays.equals(SAMRecord.NULL_QUALS, read.getBaseQualities()) ) { 125 return SAMRecord.NULL_QUALS_STRING; 126 } 127 return SAMUtils.phredToFastq(read.getBaseQualities()); 128 } 129 130 /** 131 * Set the base qualities from a string of ASCII encoded values 132 * @param read read whose base qualities should be set 133 * @param baseQualityString ASCII encoded (encoded as a FASTQ string) values of base qualities. 134 */ setBaseQualityString(final GATKRead read, final String baseQualityString)135 public static void setBaseQualityString(final GATKRead read, final String baseQualityString) { 136 Utils.nonNull(read); 137 Utils.nonNull(baseQualityString); 138 139 if ( SAMRecord.NULL_QUALS_STRING.equals(baseQualityString) ) { 140 read.setBaseQualities(SAMRecord.NULL_QUALS); 141 } else { 142 read.setBaseQualities(SAMUtils.fastqToPhred(baseQualityString)); 143 } 144 } 145 146 /** 147 * Returns the reference index in the given header of the read's contig, 148 * or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read is unmapped. 149 * 150 * @param read read whose reference index to look up 151 * @param header SAM header defining contig indices 152 * @return the reference index in the given header of the read's contig, 153 * or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read is unmapped. 154 */ getReferenceIndex( final GATKRead read, final SAMFileHeader header )155 public static int getReferenceIndex( final GATKRead read, final SAMFileHeader header ) { 156 if ( read.isUnmapped() ) { 157 return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; 158 } 159 160 return header.getSequenceIndex(read.getContig()); 161 } 162 163 /** 164 * Returns the reference index in the given header of the read's assigned contig. 165 * 166 * Unlike {@link #getReferenceIndex}, which returns {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} 167 * for all unmapped reads, this method will return a reference index for unmapped 168 * reads that are assigned a nominal position (eg., the position of their mates), 169 * which is useful for sorting. 170 * 171 * @param read read whose reference index to look up 172 * @param header SAM header defining contig indices 173 * @return the reference index in the given header of the read's contig, 174 * or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read's contig 175 * is not found in the header 176 */ getAssignedReferenceIndex( final GATKRead read, final SAMFileHeader header )177 public static int getAssignedReferenceIndex( final GATKRead read, final SAMFileHeader header ) { 178 return header.getSequenceIndex(read.getAssignedContig()); 179 } 180 181 /** 182 * Checks whether the provided read has an assigned position. This is different than checking 183 * unmapped status, since unmapped reads are often assigned a nominal position (eg., the position 184 * of their mapped mate). A read is considered to have no assigned position if its assigned contig 185 * is either {@code null} or {@link ReadConstants#UNSET_CONTIG}, or its assigned start position is 186 * {@link ReadConstants#UNSET_POSITION}, regardless of whether the read is actually marked as mapped 187 * or unmapped. 188 * 189 * @param read read to check 190 * @return true if the read has no assigned position, otherwise false 191 */ readHasNoAssignedPosition( final GATKRead read )192 public static boolean readHasNoAssignedPosition( final GATKRead read ) { 193 // Check actual assigned positions rather than unmapped status, so that unmapped reads with 194 // assigned positions will be considered to have a position 195 return read.getAssignedContig() == null || 196 read.getAssignedContig().equals(ReadConstants.UNSET_CONTIG) || 197 read.getAssignedStart() == ReadConstants.UNSET_POSITION; 198 } 199 200 /** 201 * Returns the reference index in the given header of the contig of the read's mate, 202 * or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read's mate is unmapped. 203 * 204 * @param read read whose mate's reference index to look up 205 * @param header SAM header defining contig indices 206 * @return the reference index in the given header of the contig of the read's mate, 207 * or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read's mate is unmapped. 208 */ getMateReferenceIndex( final GATKRead read, final SAMFileHeader header )209 public static int getMateReferenceIndex( final GATKRead read, final SAMFileHeader header ) { 210 if ( read.mateIsUnmapped() ) { 211 return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; 212 } 213 214 return header.getSequenceIndex(read.getMateContig()); 215 } 216 217 /** 218 * Returns a {@link SAMReadGroupRecord} object corresponding to the provided read's read group. 219 * 220 * @param read read whose read group to retrieve 221 * @param header SAM header containing read groups 222 * @return a {@link SAMReadGroupRecord} object corresponding to the provided read's read group, 223 * or null if the read has no read group 224 */ getSAMReadGroupRecord( final GATKRead read, final SAMFileHeader header )225 public static SAMReadGroupRecord getSAMReadGroupRecord( final GATKRead read, final SAMFileHeader header ) { 226 final String readGroupName = read.getReadGroup(); 227 return readGroupName != null ? header.getReadGroup(readGroupName) : null; 228 } 229 230 /** 231 * Returns the platform associated with the provided read's read group. 232 * 233 * @param read read whose platform information to retrieve 234 * @param header SAM header containing read groups 235 * @return the platform for the provided read's read group as a String, 236 * or null if the read has no read group. 237 */ getPlatform( final GATKRead read, final SAMFileHeader header )238 public static String getPlatform( final GATKRead read, final SAMFileHeader header ) { 239 final SAMReadGroupRecord readGroup = getSAMReadGroupRecord(read, header); 240 return readGroup != null ? readGroup.getPlatform() : null; 241 } 242 243 /** 244 * Returns the platform unit associated with the provided read's read group. 245 * 246 * @param read read whose platform unit to retrieve 247 * @param header SAM header containing read groups 248 * @return the platform unit for the provided read's read group as a String, 249 * or null if the read has no read group. 250 */ getPlatformUnit( final GATKRead read, final SAMFileHeader header )251 public static String getPlatformUnit( final GATKRead read, final SAMFileHeader header ) { 252 final SAMReadGroupRecord readGroup = getSAMReadGroupRecord(read, header); 253 return readGroup != null ? readGroup.getPlatformUnit() : null; 254 } 255 256 /** 257 * Returns the library associated with the provided read's read group. 258 * 259 * @param read read whose library to retrieve 260 * @param header SAM header containing read groups 261 * @return the library for the provided read's read group as a String, 262 * or null if the read has no read group. 263 */ getLibrary( final GATKRead read, final SAMFileHeader header )264 public static String getLibrary( final GATKRead read, final SAMFileHeader header ) { 265 final SAMReadGroupRecord readGroup = getSAMReadGroupRecord(read, header); 266 return readGroup != null ? readGroup.getLibrary() : null; 267 } 268 269 /** 270 * Returns the sample name associated with the provided read's read group. 271 * 272 * @param read read whose sample name to retrieve 273 * @param header SAM header containing read groups 274 * @return the sample name for the provided read's read group as a String, 275 * or null if the read has no read group. 276 */ getSampleName( final GATKRead read, final SAMFileHeader header )277 public static String getSampleName( final GATKRead read, final SAMFileHeader header ) { 278 final SAMReadGroupRecord readGroup = getSAMReadGroupRecord(read, header); 279 return readGroup != null ? readGroup.getSample() : null; 280 } 281 282 /** 283 * Returns the read's unclipped start if the read is on the forward strand, 284 * or the read's unclipped end if the read is on the reverse strand. 285 * 286 * @param read read whose stranded unclipped start to retrieve 287 * @return the read's unclipped start if the read is on the forward strand, 288 * or the read's unclipped end if the read is on the reverse strand. 289 */ getStrandedUnclippedStart( final GATKRead read )290 public static int getStrandedUnclippedStart( final GATKRead read ) { 291 return read.isReverseStrand() ? read.getUnclippedEnd() : read.getUnclippedStart(); 292 } 293 isEmpty(final SAMRecord read)294 public static boolean isEmpty(final SAMRecord read) { 295 return read.getReadBases() == null || read.getReadLength() == 0; 296 } 297 prettyPrintSequenceRecords( final SAMSequenceDictionary sequenceDictionary )298 public static String prettyPrintSequenceRecords( final SAMSequenceDictionary sequenceDictionary ) { 299 final String[] sequenceRecordNames = new String[sequenceDictionary.size()]; 300 int sequenceRecordIndex = 0; 301 for (final SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences()) { 302 sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); 303 } 304 return Arrays.deepToString(sequenceRecordNames); 305 } 306 307 /** 308 * @param read read to query 309 * @return true if the read has a mate and that mate is mapped, otherwise false 310 */ readHasMappedMate( final GATKRead read )311 public static boolean readHasMappedMate( final GATKRead read ) { 312 return read.isPaired() && ! read.mateIsUnmapped(); 313 } 314 315 /** 316 * @param read read to query 317 * @return true if the read has a mate and that mate is mapped, otherwise false 318 */ readHasMappedMate( final SAMRecord read )319 public static boolean readHasMappedMate( final SAMRecord read ) { 320 return read.getReadPairedFlag() && ! read.getMateUnmappedFlag(); 321 } 322 323 /** 324 * Check whether the given String represents a legal attribute name according to the SAM spec, 325 * and throw an exception if it doesn't. 326 * 327 * Legal attribute names are two characters long, start with a letter, and end with a letter or digit. 328 * 329 * @param attributeName name to check 330 * @throws IllegalArgumentException if the attribute name is illegal according to the SAM spec. 331 */ assertAttributeNameIsLegal( final String attributeName )332 public static void assertAttributeNameIsLegal( final String attributeName ) { 333 if ( attributeName == null || 334 attributeName.length() != 2 || 335 ! Character.isLetter(attributeName.charAt(0)) || 336 ! Character.isLetterOrDigit(attributeName.charAt(1)) ) { 337 338 throw new IllegalArgumentException("Read attribute " + attributeName + " invalid: attribute names must be non-null two-character Strings matching the pattern /[A-Za-z][A-Za-z0-9]/"); 339 } 340 } 341 342 /** 343 * Encapsulates a integer attribute into an {@link OptionalInt} instance. 344 * @param read the input read. 345 * @param tag the attribute tag name. 346 * @throws IllegalArgumentException if {@code read} or {@code tag} are {@code null}. 347 * @throws GATKException.ReadAttributeTypeMismatch if the value provided for that attribute is not an integer. 348 * @return never {@code null}, but perhaps empty indicating that no value was provided for this attribute. 349 */ getOptionalIntAttribute(final SAMRecord read, final String tag)350 public static OptionalInt getOptionalIntAttribute(final SAMRecord read, final String tag) { 351 Utils.nonNull(read); 352 Utils.nonNull(tag); 353 final Object obj = read.getAttribute(tag); 354 if (obj == null) { 355 return OptionalInt.empty(); 356 } else if (obj instanceof Integer || obj instanceof Short) { 357 final Number num = (Number) obj; 358 return OptionalInt.of(num.intValue()); 359 } else if (obj instanceof CharSequence) { 360 final String str = "" + obj; 361 try { 362 return OptionalInt.of(Integer.parseInt(str)); 363 } catch (final NumberFormatException ex) { 364 throw new GATKException.ReadAttributeTypeMismatch(read, tag, "integer", ex); 365 } 366 } else { 367 throw new GATKException.ReadAttributeTypeMismatch(read, tag, "integer", obj); 368 } 369 } 370 371 /** 372 * Encapsulates a integer attribute into an {@link OptionalInt} instance. 373 * @param read the input read. 374 * @param tag the attribute tag name. 375 * @throws IllegalArgumentException if {@code read} or {@code tag} are {@code null}. 376 * @throws GATKException.ReadAttributeTypeMismatch if the value provided for that attribute is not an integer. 377 * @return never {@code null}, but perhaps empty indicating that no value was provided for this attribute. 378 */ getOptionalIntAttribute(final GATKRead read, final String tag)379 public static OptionalInt getOptionalIntAttribute(final GATKRead read, final String tag) { 380 Utils.nonNull(read); 381 Utils.nonNull(tag); 382 final Integer obj = read.getAttributeAsInteger(tag); 383 return obj == null ? OptionalInt.empty() : OptionalInt.of(obj); 384 } 385 386 /** 387 * Helper method for interrogating if a read and its mate (if it exists) are unmapped 388 * @param read a read with mate information to interrogate 389 * @return true if this read and its are unmapped 390 */ readAndMateAreUnmapped(GATKRead read)391 public static boolean readAndMateAreUnmapped(GATKRead read) { 392 return read.isUnmapped() && (!read.isPaired() || read.mateIsUnmapped()); 393 } 394 395 /** 396 * Interrogates the header to determine if the bam is expected to be sorted such that reads with the same name appear in order. 397 * This can correspond to either a queryname sorted bam or a querygrouped bam (unordered readname groups) 398 * @param header header corresponding to the bam file in question 399 * @return true if the header has has the right readname group 400 */ isReadNameGroupedBam(SAMFileHeader header)401 public static boolean isReadNameGroupedBam(SAMFileHeader header) { 402 return SAMFileHeader.SortOrder.queryname.equals(header.getSortOrder()) || SAMFileHeader.GroupOrder.query.equals(header.getGroupOrder()); 403 } 404 405 /** 406 * Create a map of reads overlapping {@code interval} to their mates by looking for all possible mates within some 407 * maximum fragment size. This is not guaranteed to find all mates, in particular near structural variant breakpoints 408 * where mates may align far away. 409 * 410 * The algorithm is: 411 * 1) make two maps of read name --> read for reads overlapping {@code interval}, one for first-of-pair reads and one 412 * for second-of-pair reads. 413 * 2) For all reads in an expanded interval padded by {@code fragmentSize} on both sides look for a read of the same name 414 * that is second-of-pair if this read is first-of-pair or vice-versa. If such a read is found then this is that read's mate. 415 * 416 * @param readsContext 417 * @param fragmentSize the maximum distance on either side of {@code interval} to look for mates. 418 * @return a map of reads ot their mates for all reads for which a mate could be found. 419 */ getReadToMateMap(final ReadsContext readsContext, final int fragmentSize)420 public static Map<GATKRead, GATKRead> getReadToMateMap(final ReadsContext readsContext, final int fragmentSize) { 421 final Map<String, GATKRead> readOnes = new HashMap<>(); 422 final Map<String, GATKRead> readTwos = new HashMap<>(); 423 Utils.stream(readsContext.iterator()).forEach(read -> (read.isFirstOfPair() ? readOnes : readTwos).put(read.getName(), read)); 424 425 final Map<GATKRead, GATKRead> result = new HashMap<>(); 426 final SimpleInterval originalInterval = readsContext.getInterval(); 427 final SimpleInterval expandedInterval = new SimpleInterval(originalInterval.getContig(), Math.max(1, originalInterval.getStart() - fragmentSize), originalInterval.getEnd() + fragmentSize); 428 Utils.stream(readsContext.iterator(expandedInterval)).forEach(mate -> { 429 final GATKRead read = (mate.isFirstOfPair() ? readTwos : readOnes).get(mate.getName()); 430 if (read != null) { 431 result.put(read, mate); 432 } 433 }); 434 435 return result; 436 } 437 438 public static final int SAM_READ_PAIRED_FLAG = 0x1; 439 public static final int SAM_PROPER_PAIR_FLAG = 0x2; 440 public static final int SAM_READ_UNMAPPED_FLAG = 0x4; 441 public static final int SAM_MATE_UNMAPPED_FLAG = 0x8; 442 public static final int SAM_READ_STRAND_FLAG = 0x10; 443 public static final int SAM_MATE_STRAND_FLAG = 0x20; 444 public static final int SAM_FIRST_OF_PAIR_FLAG = 0x40; 445 public static final int SAM_SECOND_OF_PAIR_FLAG = 0x80; 446 public static final int SAM_NOT_PRIMARY_ALIGNMENT_FLAG = 0x100; 447 public static final int SAM_READ_FAILS_VENDOR_QUALITY_CHECK_FLAG = 0x200; 448 public static final int SAM_DUPLICATE_READ_FLAG = 0x400; 449 public static final int SAM_SUPPLEMENTARY_ALIGNMENT_FLAG = 0x800; 450 451 /** 452 * Construct a set of SAM bitwise flags from a GATKRead 453 * 454 * @param read read from which to construct the flags 455 * @return SAM-compliant set of bitwise flags reflecting the properties in the given read 456 */ getSAMFlagsForRead( final GATKRead read )457 public static int getSAMFlagsForRead( final GATKRead read ) { 458 int samFlags = 0; 459 460 if ( read.isPaired() ) { 461 samFlags |= SAM_READ_PAIRED_FLAG; 462 } 463 if ( read.isProperlyPaired() ) { 464 samFlags |= SAM_PROPER_PAIR_FLAG; 465 } 466 if ( read.isUnmapped() ) { 467 samFlags |= SAM_READ_UNMAPPED_FLAG; 468 } 469 if ( read.isPaired() && read.mateIsUnmapped() ) { 470 samFlags |= SAM_MATE_UNMAPPED_FLAG; 471 } 472 if ( !read.isUnmapped() && read.isReverseStrand() ) { 473 samFlags |= SAM_READ_STRAND_FLAG; 474 } 475 if ( read.isPaired() && ! read.mateIsUnmapped() && read.mateIsReverseStrand() ) { 476 samFlags |= SAM_MATE_STRAND_FLAG; 477 } 478 if ( read.isFirstOfPair() ) { 479 samFlags |= SAM_FIRST_OF_PAIR_FLAG; 480 } 481 if ( read.isSecondOfPair() ) { 482 samFlags |= SAM_SECOND_OF_PAIR_FLAG; 483 } 484 if ( read.isSecondaryAlignment() ) { 485 samFlags |= SAM_NOT_PRIMARY_ALIGNMENT_FLAG; 486 } 487 if ( read.failsVendorQualityCheck() ) { 488 samFlags |= SAM_READ_FAILS_VENDOR_QUALITY_CHECK_FLAG; 489 } 490 if ( read.isDuplicate() ) { 491 samFlags |= SAM_DUPLICATE_READ_FLAG; 492 } 493 if ( read.isSupplementaryAlignment() ) { 494 samFlags |= SAM_SUPPLEMENTARY_ALIGNMENT_FLAG; 495 } 496 497 return samFlags; 498 } 499 500 /** 501 * Finds the adaptor boundary around the read and returns the first base inside the adaptor that is closest to 502 * the read boundary. If the read is in the positive strand, this is the first base after the end of the 503 * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the 504 * beginning of the fragment. 505 * 506 * There are two cases we need to treat here: 507 * 508 * 1) Our read is in the reverse strand : 509 * 510 * <----------------------| * 511 * |---------------------> 512 * 513 * in these cases, the adaptor boundary is at the mate start (minus one) 514 * 515 * 2) Our read is in the forward strand : 516 * 517 * |----------------------> * 518 * <----------------------| 519 * 520 * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) 521 * 522 * @param read the read being tested for the adaptor boundary 523 * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. 524 * CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig. 525 */ getAdaptorBoundary(final GATKRead read)526 public static int getAdaptorBoundary(final GATKRead read) { 527 if ( ! hasWellDefinedFragmentSize(read) ) { 528 return CANNOT_COMPUTE_ADAPTOR_BOUNDARY; 529 } else if ( read.isReverseStrand() ) { 530 return read.getMateStart() - 1; // case 1 (see header) 531 } else { 532 final int insertSize = Math.abs(read.getFragmentLength()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) 533 return read.getStart() + insertSize; // case 2 (see header) 534 } 535 } 536 537 public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE; 538 539 /** 540 * Can the adaptor sequence of read be reliably removed from the read based on the alignment of 541 * read and its mate? 542 * 543 * @param read the read to check 544 * @return true if it can, false otherwise 545 */ hasWellDefinedFragmentSize(final GATKRead read)546 public static boolean hasWellDefinedFragmentSize(final GATKRead read) { 547 if ( read.getFragmentLength() == 0 ) 548 // no adaptors in reads with mates in another chromosome or unmapped pairs 549 { 550 return false; 551 } 552 if ( ! read.isPaired() ) 553 // only reads that are paired can be adaptor trimmed 554 { 555 return false; 556 } 557 if ( read.isUnmapped() || read.mateIsUnmapped() ) 558 // only reads when both reads are mapped can be trimmed 559 { 560 return false; 561 } 562 // if ( ! read.isProperlyPaired() ) 563 // // note this flag isn't always set properly in BAMs, can will stop us from eliminating some proper pairs 564 // // reads that aren't part of a proper pair (i.e., have strange alignments) can't be trimmed 565 // return false; 566 if ( read.isReverseStrand() == read.mateIsReverseStrand() ) 567 // sanity check on isProperlyPaired to ensure that read1 and read2 aren't on the same strand 568 { 569 return false; 570 } 571 572 if ( read.isReverseStrand() ) { 573 // we're on the negative strand, so our read runs right to left 574 return read.getEnd() > read.getMateStart(); 575 } else { 576 // we're on the positive strand, so our mate should be to our right (his start + insert size should be past our start) 577 return read.getStart() <= read.getMateStart() + read.getFragmentLength(); 578 } 579 } 580 581 /** 582 * If a read starts in INSERTION, returns the first element length. 583 * 584 * Warning: If the read has Hard or Soft clips before the insertion this function will return 0. 585 * 586 * @param read 587 * @return the length of the first insertion, or 0 if there is none (see warning). 588 */ getFirstInsertionOffset(final GATKRead read)589 public static int getFirstInsertionOffset(final GATKRead read) { 590 final CigarElement e = read.getCigarElement(0); 591 if ( e.getOperator() == CigarOperator.I ) { 592 return e.getLength(); 593 } else { 594 return 0; 595 } 596 } 597 598 /** 599 * If a read ends in INSERTION, returns the last element length. 600 * 601 * Warning: If the read has Hard or Soft clips after the insertion this function will return 0. 602 * 603 * @param read 604 * @return the length of the last insertion, or 0 if there is none (see warning). 605 */ getLastInsertionOffset(final GATKRead read)606 public static int getLastInsertionOffset(final GATKRead read) { 607 final List<CigarElement> cigarElements = read.getCigarElements(); 608 final CigarElement e = cigarElements.get(cigarElements.size() - 1); 609 if ( e.getOperator() == CigarOperator.I ) { 610 return e.getLength(); 611 } else { 612 return 0; 613 } 614 } 615 616 /** 617 * Calculates the reference coordinate for the beginning of the read taking into account soft clips but not hard clips. 618 * 619 * Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips. 620 * 621 * @return the unclipped start of the read taking soft clips (but not hard clips) into account 622 */ getSoftStart(final GATKRead read)623 public static int getSoftStart(final GATKRead read) { 624 Utils.nonNull(read, "read"); 625 626 int softStart = read.getStart(); 627 for (final CigarElement cig : read.getCigarElements()) { 628 final CigarOperator op = cig.getOperator(); 629 630 if (op == CigarOperator.SOFT_CLIP) { 631 softStart -= cig.getLength(); 632 } else if (op != CigarOperator.HARD_CLIP) { 633 break; 634 } 635 } 636 return softStart; 637 } 638 639 /** 640 * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. 641 * 642 * Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips. 643 * 644 * @return the unclipped end of the read taking soft clips (but not hard clips) into account 645 */ getSoftEnd(final GATKRead read)646 public static int getSoftEnd(final GATKRead read) { 647 Utils.nonNull(read, "read"); 648 649 boolean foundAlignedBase = false; 650 int softEnd = read.getEnd(); 651 final List<CigarElement> cigs = read.getCigarElements(); 652 for (int i = cigs.size() - 1; i >= 0; --i) { 653 final CigarElement cig = cigs.get(i); 654 final CigarOperator op = cig.getOperator(); 655 656 if (op == CigarOperator.SOFT_CLIP){ // assumes the soft clip that we found is at the end of the aligned read 657 softEnd += cig.getLength(); 658 } else if (op != CigarOperator.HARD_CLIP) { 659 foundAlignedBase = true; 660 break; 661 } 662 } 663 if( !foundAlignedBase ) { // for example 64H14S, the soft end is actually the same as the alignment end 664 softEnd = read.getEnd(); 665 } 666 return softEnd; 667 } 668 669 /** 670 * Find the 0-based index within a read base array corresponding to a given 1-based position in the reference, along with the cigar operator of 671 * the element containing that base. If the reference coordinate occurs within a deletion, the first index after the deletion is returned. 672 * Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips. 673 * 674 * @param alignmentStart The soft start of the read on the reference 675 * @param cigar The read's cigar 676 * @param refCoord The target reference coordinate 677 * @return If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED}; 678 * if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read coordinate; 679 * if the reference coordinate falls within a deletion, the first read coordinate after the deletion. Note: if the last cigar element is 680 * a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}. 681 */ getReadIndexForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord)682 public static Pair<Integer, CigarOperator> getReadIndexForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord) { 683 if (refCoord < alignmentStart) { 684 return new MutablePair<>(READ_INDEX_NOT_FOUND, null); 685 } 686 int firstReadPosOfElement = 0; //inclusive 687 int firstRefPosOfElement = alignmentStart; //inclusive 688 int lastReadPosOfElement = 0; //exclusive 689 int lastRefPosOfElement = alignmentStart; //exclusive 690 691 // advance forward through all the cigar elements until we bracket the reference coordinate 692 for (final CigarElement element : cigar) { 693 final CigarOperator operator = element.getOperator(); 694 firstReadPosOfElement = lastReadPosOfElement; 695 firstRefPosOfElement = lastRefPosOfElement; 696 lastReadPosOfElement += operator.consumesReadBases() ? element.getLength() : 0; 697 lastRefPosOfElement += operator.consumesReferenceBases() || operator == CigarOperator.S ? element.getLength() : 0; 698 699 if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) { // refCoord falls within this cigar element 700 final int readPosAtRefCoord = firstReadPosOfElement + (operator.consumesReadBases() ? ( refCoord - firstRefPosOfElement) : 0); 701 return Pair.of(readPosAtRefCoord, operator); 702 } 703 } 704 return new MutablePair<>(READ_INDEX_NOT_FOUND, null); 705 } 706 707 708 /** 709 * Returns the index within the read's bases array corresponding to the requested reference coordinate -- or the read coordinate immediately preceding 710 * a deletion in which the reference coordinate falls -- along with the cigar operator in which the reference coordinate occurs. 711 */ getReadIndexForReferenceCoordinate(final GATKRead read, final int refCoord)712 public static Pair<Integer, CigarOperator> getReadIndexForReferenceCoordinate(final GATKRead read, final int refCoord) { 713 return getReadIndexForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord); 714 } 715 getReadBaseAtReferenceCoordinate(final GATKRead read, final int refCoord)716 public static Optional<Byte> getReadBaseAtReferenceCoordinate(final GATKRead read, final int refCoord) { 717 if (refCoord < read.getStart() || read.getEnd() < refCoord) { 718 return Optional.empty(); 719 } 720 final Pair<Integer, CigarOperator> offsetAndOperator = getReadIndexForReferenceCoordinate(read, refCoord); 721 return (offsetAndOperator.getLeft() != READ_INDEX_NOT_FOUND && offsetAndOperator.getRight().consumesReadBases()) ? 722 Optional.of(read.getBase(offsetAndOperator.getLeft())) : Optional.empty(); 723 } 724 getReadBaseQualityAtReferenceCoordinate(final GATKRead read, final int refCoord)725 public static Optional<Byte> getReadBaseQualityAtReferenceCoordinate(final GATKRead read, final int refCoord) { 726 if (refCoord < read.getStart() || read.getEnd() < refCoord) { 727 return Optional.empty(); 728 } 729 final Pair<Integer, CigarOperator> offsetAndOperator = getReadIndexForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord); 730 return (offsetAndOperator.getRight() != null && offsetAndOperator.getRight().consumesReadBases()) ? 731 Optional.of(read.getBaseQuality(offsetAndOperator.getLeft())) : Optional.empty(); 732 } 733 734 735 /** 736 * Is a base inside a read? 737 * 738 * @param read the read to evaluate 739 * @param referenceCoordinate the reference coordinate of the base to test 740 * @return true if it is inside the read, false otherwise. 741 */ isInsideRead(final GATKRead read, final int referenceCoordinate)742 public static boolean isInsideRead(final GATKRead read, final int referenceCoordinate) { 743 return referenceCoordinate >= read.getStart() && referenceCoordinate <= read.getEnd(); 744 } 745 746 /** 747 * Returns the reverse complement of the read bases 748 * 749 * @param bases the read bases 750 * @return the reverse complement of the read bases 751 */ getBasesReverseComplement(final byte[] bases)752 public static String getBasesReverseComplement(final byte[] bases) { 753 String reverse = ""; 754 for (int i = bases.length-1; i >=0; i--) { 755 reverse += (char) BaseUtils.getComplement(bases[i]); 756 } 757 return reverse; 758 } 759 760 /** 761 * Returns the reverse complement of the read bases 762 * 763 * @param read the read 764 * @return the reverse complement of the read bases 765 */ getBasesReverseComplement(final GATKRead read)766 public static String getBasesReverseComplement(final GATKRead read) { 767 return getBasesReverseComplement(read.getBases()); 768 } 769 770 /** 771 * Creates an "empty", unmapped read with the provided read's read group and mate 772 * information, but empty (not-null) fields: 773 * - Cigar String 774 * - Read Bases 775 * - Base Qualities 776 * 777 * Use this method if you want to create a new empty read based on 778 * another read 779 * 780 * @param read a read to copy fields from 781 * @return a read with no bases but safe for the GATK 782 */ emptyRead( final GATKRead read )783 public static GATKRead emptyRead( final GATKRead read ) { 784 final GATKRead emptyRead = read.copy(); 785 786 emptyRead.setIsUnmapped(); 787 emptyRead.setMappingQuality(0); 788 emptyRead.setCigar(""); 789 emptyRead.setBases(new byte[0]); 790 emptyRead.setBaseQualities(new byte[0]); 791 792 emptyRead.clearAttributes(); 793 String readGroup = read.getReadGroup(); 794 if (readGroup != null) { 795 emptyRead.setAttribute(SAMTag.RG.name(), readGroup); 796 } 797 798 return emptyRead; 799 } 800 setInsertionBaseQualities( final GATKRead read, final byte[] quals)801 public static void setInsertionBaseQualities( final GATKRead read, final byte[] quals) { 802 read.setAttribute(BQSR_BASE_INSERTION_QUALITIES, quals == null ? null : SAMUtils.phredToFastq(quals)); 803 } 804 setDeletionBaseQualities( final GATKRead read, final byte[] quals)805 public static void setDeletionBaseQualities( final GATKRead read, final byte[] quals) { 806 read.setAttribute(BQSR_BASE_DELETION_QUALITIES, quals == null ? null : SAMUtils.phredToFastq(quals)); 807 } 808 809 /** 810 * @return whether or not this read has base insertion or deletion qualities (one of the two is sufficient to return true) 811 */ hasBaseIndelQualities(final GATKRead read)812 public static boolean hasBaseIndelQualities(final GATKRead read) { 813 return read.hasAttribute(BQSR_BASE_INSERTION_QUALITIES) || read.hasAttribute(BQSR_BASE_DELETION_QUALITIES); 814 } 815 816 /** 817 * @return the base deletion quality or null if read doesn't have one 818 */ getExistingBaseInsertionQualities(final GATKRead read)819 public static byte[] getExistingBaseInsertionQualities(final GATKRead read) { 820 return SAMUtils.fastqToPhred(read.getAttributeAsString(BQSR_BASE_INSERTION_QUALITIES)); 821 } 822 823 /** 824 * @return the base deletion quality or null if read doesn't have one 825 */ getExistingBaseDeletionQualities(final GATKRead read)826 public static byte[] getExistingBaseDeletionQualities(final GATKRead read) { 827 return SAMUtils.fastqToPhred( read.getAttributeAsString(BQSR_BASE_DELETION_QUALITIES)); 828 } 829 830 /** 831 * Default utility to query the base insertion quality of a read. If the read doesn't have one, it creates an array of default qualities (currently Q45) 832 * and assigns it to the read. 833 * 834 * @return the base insertion quality array 835 */ getBaseInsertionQualities(final GATKRead read)836 public static byte[] getBaseInsertionQualities(final GATKRead read) { 837 byte [] quals = getExistingBaseInsertionQualities(read); 838 if( quals == null ) { 839 quals = new byte[read.getBaseQualityCount()]; 840 Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL); // Some day in the future when base insertion and base deletion quals exist the samtools API will 841 // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 842 } 843 return quals; 844 } 845 846 /** 847 * Default utility to query the base deletion quality of a read. If the read doesn't have one, it creates an array of default qualities (currently Q45) 848 * and assigns it to the read. 849 * 850 * @return the base deletion quality array 851 */ getBaseDeletionQualities(final GATKRead read)852 public static byte[] getBaseDeletionQualities(final GATKRead read) { 853 byte[] quals = getExistingBaseDeletionQualities(read); 854 if( quals == null ) { 855 quals = new byte[read.getBaseQualityCount()]; 856 Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL); // Some day in the future when base insertion and base deletion quals exist the samtools API will 857 // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 858 } 859 return quals; 860 } 861 getBaseQualities( final GATKRead read, final EventType errorModel )862 public static byte[] getBaseQualities( final GATKRead read, final EventType errorModel ) { 863 switch( errorModel ) { 864 case BASE_SUBSTITUTION: 865 return read.getBaseQualities(); 866 case BASE_INSERTION: 867 return getBaseInsertionQualities(read); 868 case BASE_DELETION: 869 return getBaseDeletionQualities(read); 870 default: 871 throw new GATKException("Unrecognized Base Recalibration type: " + errorModel ); 872 } 873 } 874 875 /** 876 * Resets the quality scores of the reads to the orginal (pre-BQSR) ones. 877 */ resetOriginalBaseQualities(final GATKRead read)878 public static GATKRead resetOriginalBaseQualities(final GATKRead read) { 879 final byte[] originalQuals = ReadUtils.getOriginalBaseQualities(read); 880 if ( originalQuals != null ){ 881 read.setBaseQualities(originalQuals); 882 } 883 return read; 884 } 885 886 /** 887 * Check to ensure that the alignment makes sense based on the contents of the header. 888 * @param header The SAM file header. 889 * @param read The read to verify. 890 * @return true if alignment agrees with header, false otherwise. 891 */ alignmentAgreesWithHeader(final SAMFileHeader header, final GATKRead read)892 public static boolean alignmentAgreesWithHeader(final SAMFileHeader header, final GATKRead read) { 893 final int referenceIndex = getReferenceIndex(read, header); 894 // Read is aligned to nonexistent contig 895 if( ! read.isUnmapped() && referenceIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ) { 896 return false; 897 } 898 final SAMSequenceRecord contigHeader = header.getSequence(referenceIndex); 899 // Read is aligned to a point after the end of the contig 900 return read.isUnmapped() || read.getStart() <= contigHeader.getSequenceLength(); 901 } 902 903 /** 904 * Create a common SAMFileWriter for use with GATK tools. 905 * 906 * @param outputFile - if this file has a .cram extension then a reference is required. Can not be null. 907 * @param referenceFile - the reference source to use. Can not be null if a output file has a .cram extension. 908 * @param header - header to be used for the output writer 909 * @param preSorted - if true then the records must already be sorted to match the header sort order 910 * @param createOutputBamIndex - if true an index will be created for .BAM and .CRAM files 911 * @param createMD5 - if true an MD5 file will be created 912 * 913 * @return SAMFileWriter 914 */ createCommonSAMWriter( final File outputFile, final File referenceFile, final SAMFileHeader header, final boolean preSorted, boolean createOutputBamIndex, final boolean createMD5)915 public static SAMFileWriter createCommonSAMWriter( 916 final File outputFile, 917 final File referenceFile, 918 final SAMFileHeader header, 919 final boolean preSorted, 920 boolean createOutputBamIndex, 921 final boolean createMD5) 922 { 923 return createCommonSAMWriter( 924 (null == outputFile ? null : outputFile.toPath()), 925 null == referenceFile ? null : referenceFile.toPath(), 926 header, 927 preSorted, 928 createOutputBamIndex, 929 createMD5); 930 } 931 932 /** 933 * Create a common SAMFileWriter for use with GATK tools. 934 * 935 * @param outputPath - if this file has a .cram extension then a reference is required. Can not be null. 936 * @param referenceFile - the reference source to use. Can not be null if a output file has a .cram extension. 937 * @param header - header to be used for the output writer 938 * @param preSorted - if true then the records must already be sorted to match the header sort order 939 * @param createOutputBamIndex - if true an index will be created for .BAM and .CRAM files 940 * @param createMD5 - if true an MD5 file will be created 941 * 942 * @return SAMFileWriter 943 */ createCommonSAMWriter( final Path outputPath, final Path referenceFile, final SAMFileHeader header, final boolean preSorted, boolean createOutputBamIndex, final boolean createMD5)944 public static SAMFileWriter createCommonSAMWriter( 945 final Path outputPath, 946 final Path referenceFile, 947 final SAMFileHeader header, 948 final boolean preSorted, 949 boolean createOutputBamIndex, 950 final boolean createMD5) 951 { 952 Utils.nonNull(outputPath); 953 Utils.nonNull(header); 954 955 if (createOutputBamIndex && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) { 956 logger.warn("Skipping index file creation for: " + 957 outputPath + ". Index file creation requires reads in coordinate sorted order."); 958 createOutputBamIndex = false; 959 } 960 961 final SAMFileWriterFactory factory = new SAMFileWriterFactory().setCreateIndex(createOutputBamIndex).setCreateMd5File(createMD5); 962 return ReadUtils.createCommonSAMWriterFromFactory(factory, outputPath, referenceFile, header, preSorted); 963 } 964 965 /** 966 * Create a common SAMFileWriter from a factory for use with GATK tools. Assumes that if the factory has been set 967 * to create an index, the header must be set to coordinate sorted. 968 * 969 * @param outputFile if this file has a .cram extension then a reference is required. Can not be null. 970 * @param referenceFile the reference source to use. Can not be null if a output file has a .cram extension. 971 * @param header header to be used for the output writer 972 * @param preSorted if true then records must already be sorted to match the header sort order 973 * @param factory SAMFileWriterFactory factory to use 974 * @return SAMFileWriter 975 */ createCommonSAMWriterFromFactory( final SAMFileWriterFactory factory, final File outputFile, final File referenceFile, final SAMFileHeader header, final boolean preSorted)976 public static SAMFileWriter createCommonSAMWriterFromFactory( 977 final SAMFileWriterFactory factory, 978 final File outputFile, 979 final File referenceFile, 980 final SAMFileHeader header, 981 final boolean preSorted) 982 { 983 return createCommonSAMWriterFromFactory(factory, 984 Utils.nonNull(outputFile).toPath(), referenceFile == null ? null : referenceFile.toPath(), header, preSorted); 985 } 986 987 /** 988 * Create a common SAMFileWriter from a factory for use with GATK tools. Assumes that if the factory has been set 989 * to create an index, the header must be set to coordinate sorted. 990 * 991 * @param outputPath if this file has a .cram extension then a reference is required. Can not be null. 992 * @param referenceFile the reference source to use. Can not be null if a output file has a .cram extension. 993 * @param header header to be used for the output writer 994 * @param preSorted if true then records must already be sorted to match the header sort order 995 * @param factory SAMFileWriterFactory factory to use 996 * @param openOptions (optional) NIO options specifying how to open the file 997 * @return SAMFileWriter 998 */ createCommonSAMWriterFromFactory( final SAMFileWriterFactory factory, final Path outputPath, final Path referenceFile, final SAMFileHeader header, final boolean preSorted, OpenOption... openOptions)999 public static SAMFileWriter createCommonSAMWriterFromFactory( 1000 final SAMFileWriterFactory factory, 1001 final Path outputPath, 1002 final Path referenceFile, 1003 final SAMFileHeader header, 1004 final boolean preSorted, 1005 OpenOption... openOptions) 1006 { 1007 Utils.nonNull(outputPath); 1008 Utils.nonNull(header); 1009 1010 if (null == referenceFile && outputPath.toString().endsWith(FileExtensions.CRAM)) { 1011 throw new UserException.MissingReference("A reference file is required for writing CRAM files"); 1012 } 1013 1014 return factory.makeWriter(header.clone(), preSorted, outputPath, referenceFile); 1015 } 1016 1017 /** 1018 * Validate that a file has CRAM contents by checking that it has a valid CRAM file header 1019 * (no matter what the extension). 1020 * 1021 * @param putativeCRAMPath File to check. 1022 * @return true if the file has a valid CRAM file header, otherwise false 1023 */ hasCRAMFileContents(final Path putativeCRAMPath)1024 public static boolean hasCRAMFileContents(final Path putativeCRAMPath) { 1025 try (final InputStream fileStream = Files.newInputStream(putativeCRAMPath)) { 1026 try (final BufferedInputStream bis = new BufferedInputStream(fileStream)) { 1027 return SamStreams.isCRAMFile(bis); 1028 } 1029 } 1030 catch (IOException e) { 1031 throw new UserException.CouldNotReadInputFile(e.getMessage()); 1032 } 1033 } 1034 1035 /** 1036 * Validate that a file has CRAM contents by checking that it has a valid CRAM file header 1037 * (no matter what the extension). 1038 * 1039 * @param putativeCRAMFile File to check. 1040 * @return true if the file has a valid CRAM file header, otherwise false 1041 */ hasCRAMFileContents(final File putativeCRAMFile)1042 public static boolean hasCRAMFileContents(final File putativeCRAMFile) { 1043 return hasCRAMFileContents(putativeCRAMFile.toPath()); 1044 } 1045 isNonPrimary(GATKRead read)1046 public static boolean isNonPrimary(GATKRead read) { 1047 return read.isSecondaryAlignment() || read.isSupplementaryAlignment() || read.isUnmapped(); 1048 } 1049 1050 /** 1051 * is this base inside the adaptor of the read? 1052 * 1053 * There are two cases to treat here: 1054 * 1055 * 1) Read is in the negative strand => Adaptor boundary is on the left tail 1056 * 2) Read is in the positive strand => Adaptor boundary is on the right tail 1057 * 1058 * Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event) 1059 * 1060 * @param read the read to test 1061 * @param basePos base position in REFERENCE coordinates (not read coordinates) 1062 * @return whether or not the base is in the adaptor 1063 */ isBaseInsideAdaptor(final GATKRead read, long basePos)1064 public static boolean isBaseInsideAdaptor(final GATKRead read, long basePos) { 1065 final int adaptorBoundary = read.getAdaptorBoundary(); 1066 if (adaptorBoundary == CANNOT_COMPUTE_ADAPTOR_BOUNDARY || read.getFragmentLength() > DEFAULT_ADAPTOR_SIZE) 1067 return false; 1068 1069 return read.isReverseStrand() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary; 1070 } 1071 1072 /** 1073 * Pull out the sample names from a SAMFileHeader 1074 * 1075 * note that we use a TreeSet so that they are sorted 1076 * 1077 * @param header the sam file header 1078 * @return list of strings representing the sample names 1079 */ getSamplesFromHeader( final SAMFileHeader header )1080 public static Set<String> getSamplesFromHeader( final SAMFileHeader header ) { 1081 // get all of the unique sample names 1082 final Set<String> samples = new TreeSet<>(); 1083 final List<SAMReadGroupRecord> readGroups = header.getReadGroups(); 1084 1085 for ( SAMReadGroupRecord readGroup : readGroups ) { 1086 final String sample = readGroup.getSample(); 1087 if ( sample != null ) { 1088 samples.add(sample); 1089 } 1090 } 1091 1092 return samples; 1093 } 1094 1095 /** 1096 * Validate that the expected input sort order is either "unsorted", or that it 1097 * matches the actualSortOrder. If validation fails a UserException is thrown 1098 * unless assumeSorted is true. 1099 * 1100 * @param actualSortOrder the actual sort order of the input 1101 * @param expectedSortOrder the sort order expected for this context 1102 * @param sourceName the name of the read source for inclusion in error messages 1103 * @param assumeSorted if true, no exception is thrown when the actualSortOrder 1104 * doesn't match the expectedSortOrder. An error messsage is 1105 * logged instead 1106 * @return boolean indicating if the validation passed 1107 * @throws UserException if the expectedSortOrder is anything other than "unsorted" 1108 * and the actualSortOrder doesn't match expectedSortOrder and assumeSorted is false 1109 */ validateExpectedSortOrder( final SAMFileHeader.SortOrder actualSortOrder, final SAMFileHeader.SortOrder expectedSortOrder, final boolean assumeSorted, final String sourceName)1110 public static boolean validateExpectedSortOrder( 1111 final SAMFileHeader.SortOrder actualSortOrder, 1112 final SAMFileHeader.SortOrder expectedSortOrder, 1113 final boolean assumeSorted, 1114 final String sourceName) 1115 { 1116 boolean isValid = true; 1117 if (expectedSortOrder != SAMFileHeader.SortOrder.unsorted && 1118 actualSortOrder != expectedSortOrder) { 1119 final String message = String.format("Input \"%s\" has sort order \"%s\" but \"%s\" is required.", 1120 sourceName, 1121 actualSortOrder.name(), 1122 expectedSortOrder.name()); 1123 isValid = false; 1124 if (assumeSorted) { 1125 logger.warn(message + " Assuming it's properly sorted anyway."); 1126 } 1127 else { 1128 throw new UserException( 1129 message + 1130 "If you believe the file to be sorted correctly, use " + 1131 StandardArgumentDefinitions.ASSUME_SORTED_LONG_NAME + 1132 "=true" 1133 ); 1134 } 1135 } 1136 1137 return isValid; 1138 } 1139 1140 /** 1141 * Returns the offset (0-based index) of the first base in the read that is aligned against the reference. 1142 * <p> 1143 * In most cases for mapped reads, this is typically equal to the sum of the size of soft-clipping at the 1144 * beginning of the alignment. 1145 * </p> 1146 * <p> 1147 * Notice that this index makes reference to the offset of that first base in the array returned by {@link GATKRead#getBases()}, If you 1148 * are after the first base in the original unclipped and not reverse-complemented read, you must use 1149 * {@link #getFirstAlignedReadPosition} instead. 1150 * </p> 1151 * 1152 * @throws IllegalArgumentException if the input {@code read} is {@code null} or does not have any base aligned 1153 * against the reference (e.g. is unmapped). 1154 * 1155 * @return a number between 0 and the read length-1. 1156 */ getFirstAlignedBaseOffset(final GATKRead read)1157 public static int getFirstAlignedBaseOffset(final GATKRead read) { 1158 Utils.nonNull(read, "the input read cannot be null"); 1159 if (read.isUnmapped()) { 1160 throw new IllegalArgumentException("the input read is unmapped and therefore does not have any base aligned"); 1161 } else { 1162 final List<CigarElement> cigarElements = read.getCigarElements(); 1163 if (cigarElements.isEmpty()) { 1164 throw new IllegalArgumentException("the input read is mapped yet contains no cigar-elements: " + read.commonToString()); 1165 } 1166 int result = 0; 1167 for (final CigarElement ce : cigarElements) { 1168 final int length = ce.getLength(); 1169 final CigarOperator co = ce.getOperator(); 1170 if (length > 0 && co.isAlignment()) { 1171 return result; 1172 } else if (co.consumesReadBases()) { 1173 result += length; 1174 } 1175 } 1176 throw new IllegalArgumentException("the input read cigar does not contain any alignment element"); 1177 } 1178 } 1179 1180 /** 1181 * @param read a GATK read 1182 * @return true if the read is F2R1, false otherwise 1183 */ isF2R1(final GATKRead read)1184 public static boolean isF2R1(final GATKRead read) { 1185 return read.isReverseStrand() == read.isFirstOfPair(); 1186 } 1187 1188 /** 1189 * @param read a GATK read 1190 * @return true if the read is F1R2, false otherwise 1191 */ isF1R2(final GATKRead read)1192 public static boolean isF1R2(final GATKRead read) { 1193 return read.isReverseStrand() != read.isFirstOfPair(); 1194 } 1195 1196 /** 1197 * Used to be called isUsableRead() 1198 **/ readHasReasonableMQ(final GATKRead read)1199 public static boolean readHasReasonableMQ(final GATKRead read){ 1200 return read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE; 1201 } 1202 } 1203