1 package org.broadinstitute.hellbender.utils.read;
2 
3 import htsjdk.samtools.*;
4 import htsjdk.samtools.util.FileExtensions;
5 import org.apache.commons.lang3.tuple.MutablePair;
6 import org.apache.commons.lang3.tuple.Pair;
7 import org.apache.logging.log4j.LogManager;
8 import org.apache.logging.log4j.Logger;
9 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
10 import org.broadinstitute.hellbender.engine.ReadsContext;
11 import org.broadinstitute.hellbender.exceptions.GATKException;
12 import org.broadinstitute.hellbender.exceptions.UserException;
13 import org.broadinstitute.hellbender.utils.BaseUtils;
14 import org.broadinstitute.hellbender.utils.QualityUtils;
15 import org.broadinstitute.hellbender.utils.SimpleInterval;
16 import org.broadinstitute.hellbender.utils.Utils;
17 import org.broadinstitute.hellbender.utils.recalibration.EventType;
18 
19 import java.io.BufferedInputStream;
20 import java.io.File;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.nio.file.Files;
24 import java.nio.file.OpenOption;
25 import java.nio.file.Path;
26 import java.util.*;
27 
28 /**
29  * A miscellaneous collection of utilities for working with reads, headers, etc.
30  * Static methods only, please.
31  */
32 public final class ReadUtils {
33 
ReadUtils()34     private ReadUtils() {
35     }
36 
37     private static final Logger logger = LogManager.getLogger();
38 
39     /**
40      * The default quality score for an insertion or deletion, if
41      * none are provided for this read.
42      */
43     public static final byte DEFAULT_INSERTION_DELETION_QUAL = (byte)45;
44 
45     // Base Quality Score Recalibrator specific attribute tags
46     public static final String BQSR_BASE_INSERTION_QUALITIES = "BI";                // base qualities for insertions
47     public static final String BQSR_BASE_DELETION_QUALITIES = "BD";                 // base qualities for deletions
48 
49     public static final int READ_INDEX_NOT_FOUND = -1;
50     private static final int DEFAULT_ADAPTOR_SIZE = 100;
51 
52     public static final String ORIGINAL_BASE_QUALITIES_TAG = SAMTag.OQ.name();
53 
54     /**
55      * BAM file magic value that starts every bam file
56      */
57     public static final byte[] BAM_MAGIC = "BAM\1".getBytes();
58 
59     /**
60      * HACK: This is used to make a copy of a read.
61      * Really, SAMRecord should provide a copy constructor or a factory method.
62      */
cloneSAMRecord(final SAMRecord originalRead)63     public static SAMRecord cloneSAMRecord(final SAMRecord originalRead) {
64         if (originalRead == null) {
65             return null;
66         }
67         try {
68             return (SAMRecord)originalRead.clone();
69         } catch (final CloneNotSupportedException e) {
70             throw new IllegalStateException(e);
71         }
72     }
73 
74     /**
75      * HACK: This is used to make a copy of a header.
76      * Really, SAMFileHeader should provide a copy constructor or a factory method.
77      */
78 
cloneSAMFileHeader( final SAMFileHeader header )79     public static SAMFileHeader cloneSAMFileHeader( final SAMFileHeader header ) {
80         if (header == null) return null;
81         return header.clone();
82     }
83 
84     /**
85      * Checks whether read is a headerless SAMRecordToGATKReadAdapter, and if it is, sets its
86      * header to the provided header.
87      *
88      * @param read A potentially headerless GATKRead
89      * @param header header to store in the read, if it's a headerless SAMRecord-backed read
90      */
restoreHeaderIfNecessary( final GATKRead read, final SAMFileHeader header )91     public static void restoreHeaderIfNecessary( final GATKRead read, final SAMFileHeader header ) {
92         if ( read instanceof SAMRecordToGATKReadAdapter ) {
93             SAMRecordToGATKReadAdapter readAdapter = (SAMRecordToGATKReadAdapter)read;
94             if ( ! readAdapter.hasHeader() ) {
95                 readAdapter.setHeader(header);
96             }
97         }
98     }
99 
100     /**
101      * Retrieve the original base qualities of the given read, if present,
102      * as stored in the OQ attribute.
103      *
104      * @param read read to check
105      * @return original base qualities as stored in the OQ attribute, or null
106      *         if the OQ attribute is not present
107      */
getOriginalBaseQualities( final GATKRead read )108     public static byte[] getOriginalBaseQualities( final GATKRead read ) {
109         if ( ! read.hasAttribute(ORIGINAL_BASE_QUALITIES_TAG) ) {
110             return null;
111         }
112         final String oqString = read.getAttributeAsString(ORIGINAL_BASE_QUALITIES_TAG);
113         return !oqString.isEmpty() ? SAMUtils.fastqToPhred(oqString) : null;
114     }
115 
116     /**
117      * Returns the base qualities for the read as a string.
118      *
119      * @param read read whose base qualities should be returned
120      * @return Base qualities string as printable ASCII values (encoded as a FASTQ string).
121      */
getBaseQualityString( final GATKRead read )122     public static String getBaseQualityString( final GATKRead read ) {
123         Utils.nonNull(read);
124         if ( Arrays.equals(SAMRecord.NULL_QUALS, read.getBaseQualities()) ) {
125             return SAMRecord.NULL_QUALS_STRING;
126         }
127         return SAMUtils.phredToFastq(read.getBaseQualities());
128     }
129 
130     /**
131      * Set the base qualities from a string of ASCII encoded values
132      * @param read read whose base qualities should be set
133      * @param baseQualityString ASCII encoded (encoded as a FASTQ string) values of base qualities.
134      */
setBaseQualityString(final GATKRead read, final String baseQualityString)135     public static void setBaseQualityString(final GATKRead read, final String baseQualityString) {
136         Utils.nonNull(read);
137         Utils.nonNull(baseQualityString);
138 
139         if ( SAMRecord.NULL_QUALS_STRING.equals(baseQualityString) ) {
140             read.setBaseQualities(SAMRecord.NULL_QUALS);
141         } else {
142             read.setBaseQualities(SAMUtils.fastqToPhred(baseQualityString));
143         }
144     }
145 
146     /**
147      * Returns the reference index in the given header of the read's contig,
148      * or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read is unmapped.
149      *
150      * @param read read whose reference index to look up
151      * @param header SAM header defining contig indices
152      * @return the reference index in the given header of the read's contig,
153      *         or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read is unmapped.
154      */
getReferenceIndex( final GATKRead read, final SAMFileHeader header )155     public static int getReferenceIndex( final GATKRead read, final SAMFileHeader header ) {
156         if ( read.isUnmapped() ) {
157             return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
158         }
159 
160         return header.getSequenceIndex(read.getContig());
161     }
162 
163     /**
164      * Returns the reference index in the given header of the read's assigned contig.
165      *
166      * Unlike {@link #getReferenceIndex}, which returns {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX}
167      * for all unmapped reads, this method will return a reference index for unmapped
168      * reads that are assigned a nominal position (eg., the position of their mates),
169      * which is useful for sorting.
170      *
171      * @param read read whose reference index to look up
172      * @param header SAM header defining contig indices
173      * @return the reference index in the given header of the read's contig,
174      *         or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read's contig
175      *         is not found in the header
176      */
getAssignedReferenceIndex( final GATKRead read, final SAMFileHeader header )177     public static int getAssignedReferenceIndex( final GATKRead read, final SAMFileHeader header ) {
178         return header.getSequenceIndex(read.getAssignedContig());
179     }
180 
181     /**
182      * Checks whether the provided read has an assigned position. This is different than checking
183      * unmapped status, since unmapped reads are often assigned a nominal position (eg., the position
184      * of their mapped mate). A read is considered to have no assigned position if its assigned contig
185      * is either {@code null} or {@link ReadConstants#UNSET_CONTIG}, or its assigned start position is
186      * {@link ReadConstants#UNSET_POSITION}, regardless of whether the read is actually marked as mapped
187      * or unmapped.
188      *
189      * @param read read to check
190      * @return true if the read has no assigned position, otherwise false
191      */
readHasNoAssignedPosition( final GATKRead read )192     public static boolean readHasNoAssignedPosition( final GATKRead read ) {
193         // Check actual assigned positions rather than unmapped status, so that unmapped reads with
194         // assigned positions will be considered to have a position
195         return read.getAssignedContig() == null ||
196                read.getAssignedContig().equals(ReadConstants.UNSET_CONTIG) ||
197                read.getAssignedStart() == ReadConstants.UNSET_POSITION;
198     }
199 
200     /**
201      * Returns the reference index in the given header of the contig of the read's mate,
202      * or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read's mate is unmapped.
203      *
204      * @param read read whose mate's reference index to look up
205      * @param header SAM header defining contig indices
206      * @return the reference index in the given header of the contig of the read's mate,
207      *         or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read's mate is unmapped.
208      */
getMateReferenceIndex( final GATKRead read, final SAMFileHeader header )209     public static int getMateReferenceIndex( final GATKRead read, final SAMFileHeader header ) {
210         if ( read.mateIsUnmapped() ) {
211             return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
212         }
213 
214         return header.getSequenceIndex(read.getMateContig());
215     }
216 
217     /**
218      * Returns a {@link SAMReadGroupRecord} object corresponding to the provided read's read group.
219      *
220      * @param read read whose read group to retrieve
221      * @param header SAM header containing read groups
222      * @return a {@link SAMReadGroupRecord} object corresponding to the provided read's read group,
223      *         or null if the read has no read group
224      */
getSAMReadGroupRecord( final GATKRead read, final SAMFileHeader header )225     public static SAMReadGroupRecord getSAMReadGroupRecord( final GATKRead read, final SAMFileHeader header ) {
226         final String readGroupName = read.getReadGroup();
227         return readGroupName != null ? header.getReadGroup(readGroupName) : null;
228     }
229 
230     /**
231      * Returns the platform associated with the provided read's read group.
232      *
233      * @param read read whose platform information to retrieve
234      * @param header SAM header containing read groups
235      * @return the platform for the provided read's read group as a String,
236      *         or null if the read has no read group.
237      */
getPlatform( final GATKRead read, final SAMFileHeader header )238     public static String getPlatform( final GATKRead read, final SAMFileHeader header ) {
239         final SAMReadGroupRecord readGroup = getSAMReadGroupRecord(read, header);
240         return readGroup != null ? readGroup.getPlatform() : null;
241     }
242 
243     /**
244      * Returns the platform unit associated with the provided read's read group.
245      *
246      * @param read read whose platform unit to retrieve
247      * @param header SAM header containing read groups
248      * @return the platform unit for the provided read's read group as a String,
249      *         or null if the read has no read group.
250      */
getPlatformUnit( final GATKRead read, final SAMFileHeader header )251     public static String getPlatformUnit( final GATKRead read, final SAMFileHeader header ) {
252         final SAMReadGroupRecord readGroup = getSAMReadGroupRecord(read, header);
253         return readGroup != null ? readGroup.getPlatformUnit() : null;
254     }
255 
256     /**
257      * Returns the library associated with the provided read's read group.
258      *
259      * @param read read whose library to retrieve
260      * @param header SAM header containing read groups
261      * @return the library for the provided read's read group as a String,
262      *         or null if the read has no read group.
263      */
getLibrary( final GATKRead read, final SAMFileHeader header )264     public static String getLibrary( final GATKRead read, final SAMFileHeader header ) {
265         final SAMReadGroupRecord readGroup = getSAMReadGroupRecord(read, header);
266         return readGroup != null ? readGroup.getLibrary() : null;
267     }
268 
269     /**
270      * Returns the sample name associated with the provided read's read group.
271      *
272      * @param read read whose sample name to retrieve
273      * @param header SAM header containing read groups
274      * @return the sample name for the provided read's read group as a String,
275      *         or null if the read has no read group.
276      */
getSampleName( final GATKRead read, final SAMFileHeader header )277     public static String getSampleName( final GATKRead read, final SAMFileHeader header ) {
278         final SAMReadGroupRecord readGroup = getSAMReadGroupRecord(read, header);
279         return readGroup != null ? readGroup.getSample() : null;
280     }
281 
282     /**
283      * Returns the read's unclipped start if the read is on the forward strand,
284      * or the read's unclipped end if the read is on the reverse strand.
285      *
286      * @param read read whose stranded unclipped start to retrieve
287      * @return the read's unclipped start if the read is on the forward strand,
288      *         or the read's unclipped end if the read is on the reverse strand.
289      */
getStrandedUnclippedStart( final GATKRead read )290     public static int getStrandedUnclippedStart( final GATKRead read ) {
291         return read.isReverseStrand() ? read.getUnclippedEnd() : read.getUnclippedStart();
292     }
293 
isEmpty(final SAMRecord read)294     public static boolean isEmpty(final SAMRecord read) {
295         return read.getReadBases() == null || read.getReadLength() == 0;
296     }
297 
prettyPrintSequenceRecords( final SAMSequenceDictionary sequenceDictionary )298     public static String prettyPrintSequenceRecords( final SAMSequenceDictionary sequenceDictionary ) {
299         final String[] sequenceRecordNames = new String[sequenceDictionary.size()];
300         int sequenceRecordIndex = 0;
301         for (final SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences()) {
302             sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
303         }
304         return Arrays.deepToString(sequenceRecordNames);
305     }
306 
307     /**
308      * @param read read to query
309      * @return true if the read has a mate and that mate is mapped, otherwise false
310      */
readHasMappedMate( final GATKRead read )311     public static boolean readHasMappedMate( final GATKRead read ) {
312         return read.isPaired() && ! read.mateIsUnmapped();
313     }
314 
315     /**
316      * @param read read to query
317      * @return true if the read has a mate and that mate is mapped, otherwise false
318      */
readHasMappedMate( final SAMRecord read )319     public static boolean readHasMappedMate( final SAMRecord read ) {
320         return read.getReadPairedFlag() && ! read.getMateUnmappedFlag();
321     }
322 
323     /**
324      * Check whether the given String represents a legal attribute name according to the SAM spec,
325      * and throw an exception if it doesn't.
326      *
327      * Legal attribute names are two characters long, start with a letter, and end with a letter or digit.
328      *
329      * @param attributeName name to check
330      * @throws IllegalArgumentException if the attribute name is illegal according to the SAM spec.
331      */
assertAttributeNameIsLegal( final String attributeName )332     public static void assertAttributeNameIsLegal( final String attributeName ) {
333         if ( attributeName == null ||
334              attributeName.length() != 2 ||
335              ! Character.isLetter(attributeName.charAt(0)) ||
336              ! Character.isLetterOrDigit(attributeName.charAt(1)) ) {
337 
338             throw new IllegalArgumentException("Read attribute " + attributeName + " invalid: attribute names must be non-null two-character Strings matching the pattern /[A-Za-z][A-Za-z0-9]/");
339         }
340     }
341 
342     /**
343      * Encapsulates a integer attribute into an {@link OptionalInt} instance.
344      * @param read the input read.
345      * @param tag the attribute tag name.
346      * @throws IllegalArgumentException if {@code read} or {@code tag} are {@code null}.
347      * @throws GATKException.ReadAttributeTypeMismatch if the value provided for that attribute is not an integer.
348      * @return never {@code null}, but perhaps empty indicating that no value was provided for this attribute.
349      */
getOptionalIntAttribute(final SAMRecord read, final String tag)350     public static OptionalInt getOptionalIntAttribute(final SAMRecord read, final String tag) {
351         Utils.nonNull(read);
352         Utils.nonNull(tag);
353         final Object obj = read.getAttribute(tag);
354         if (obj == null) {
355             return OptionalInt.empty();
356         } else if (obj instanceof Integer || obj instanceof Short) {
357             final Number num = (Number) obj;
358             return OptionalInt.of(num.intValue());
359         } else if (obj instanceof CharSequence) {
360             final String str = "" + obj;
361             try {
362                 return OptionalInt.of(Integer.parseInt(str));
363             } catch (final NumberFormatException ex) {
364                 throw new GATKException.ReadAttributeTypeMismatch(read, tag, "integer", ex);
365             }
366         } else {
367             throw new GATKException.ReadAttributeTypeMismatch(read, tag, "integer", obj);
368         }
369     }
370 
371     /**
372      * Encapsulates a integer attribute into an {@link OptionalInt} instance.
373      * @param read the input read.
374      * @param tag the attribute tag name.
375      * @throws IllegalArgumentException if {@code read} or {@code tag} are {@code null}.
376      * @throws GATKException.ReadAttributeTypeMismatch if the value provided for that attribute is not an integer.
377      * @return never {@code null}, but perhaps empty indicating that no value was provided for this attribute.
378      */
getOptionalIntAttribute(final GATKRead read, final String tag)379     public static OptionalInt getOptionalIntAttribute(final GATKRead read, final String tag) {
380         Utils.nonNull(read);
381         Utils.nonNull(tag);
382         final Integer obj = read.getAttributeAsInteger(tag);
383         return obj == null ? OptionalInt.empty() : OptionalInt.of(obj);
384     }
385 
386     /**
387      * Helper method for interrogating if a read and its mate (if it exists) are unmapped
388      * @param read a read with mate information to interrogate
389      * @return true if this read and its are unmapped
390      */
readAndMateAreUnmapped(GATKRead read)391     public static boolean readAndMateAreUnmapped(GATKRead read) {
392         return read.isUnmapped() && (!read.isPaired() || read.mateIsUnmapped());
393     }
394 
395     /**
396      * Interrogates the header to determine if the bam is expected to be sorted such that reads with the same name appear in order.
397      * This can correspond to either a queryname sorted bam or a querygrouped bam (unordered readname groups)
398      * @param header header corresponding to the bam file in question
399      * @return true if the header has has the right readname group
400      */
isReadNameGroupedBam(SAMFileHeader header)401     public static boolean isReadNameGroupedBam(SAMFileHeader header) {
402         return SAMFileHeader.SortOrder.queryname.equals(header.getSortOrder()) || SAMFileHeader.GroupOrder.query.equals(header.getGroupOrder());
403     }
404 
405     /**
406      * Create a map of reads overlapping {@code interval} to their mates by looking for all possible mates within some
407      * maximum fragment size.  This is not guaranteed to find all mates, in particular near structural variant breakpoints
408      * where mates may align far away.
409      *
410      * The algorithm is:
411      * 1) make two maps of read name --> read for reads overlapping {@code interval}, one for first-of-pair reads and one
412      *    for second-of-pair reads.
413      * 2) For all reads in an expanded interval padded by {@code fragmentSize} on both sides look for a read of the same name
414      *    that is second-of-pair if this read is first-of-pair or vice-versa.  If such a read is found then this is that read's mate.
415      *
416      * @param readsContext
417      * @param fragmentSize the maximum distance on either side of {@code interval} to look for mates.
418      * @return a map of reads ot their mates for all reads for which a mate could be found.
419      */
getReadToMateMap(final ReadsContext readsContext, final int fragmentSize)420     public static Map<GATKRead, GATKRead> getReadToMateMap(final ReadsContext readsContext, final int fragmentSize) {
421         final Map<String, GATKRead> readOnes = new HashMap<>();
422         final Map<String, GATKRead> readTwos = new HashMap<>();
423         Utils.stream(readsContext.iterator()).forEach(read -> (read.isFirstOfPair() ? readOnes : readTwos).put(read.getName(), read));
424 
425         final Map<GATKRead, GATKRead> result = new HashMap<>();
426         final SimpleInterval originalInterval = readsContext.getInterval();
427         final SimpleInterval expandedInterval = new SimpleInterval(originalInterval.getContig(), Math.max(1, originalInterval.getStart() - fragmentSize), originalInterval.getEnd() + fragmentSize);
428         Utils.stream(readsContext.iterator(expandedInterval)).forEach(mate -> {
429             final GATKRead read = (mate.isFirstOfPair() ? readTwos : readOnes).get(mate.getName());
430             if (read != null) {
431                 result.put(read, mate);
432             }
433         });
434 
435         return result;
436     }
437 
438     public static final int SAM_READ_PAIRED_FLAG = 0x1;
439     public static final int SAM_PROPER_PAIR_FLAG = 0x2;
440     public static final int SAM_READ_UNMAPPED_FLAG = 0x4;
441     public static final int SAM_MATE_UNMAPPED_FLAG = 0x8;
442     public static final int SAM_READ_STRAND_FLAG = 0x10;
443     public static final int SAM_MATE_STRAND_FLAG = 0x20;
444     public static final int SAM_FIRST_OF_PAIR_FLAG = 0x40;
445     public static final int SAM_SECOND_OF_PAIR_FLAG = 0x80;
446     public static final int SAM_NOT_PRIMARY_ALIGNMENT_FLAG = 0x100;
447     public static final int SAM_READ_FAILS_VENDOR_QUALITY_CHECK_FLAG = 0x200;
448     public static final int SAM_DUPLICATE_READ_FLAG = 0x400;
449     public static final int SAM_SUPPLEMENTARY_ALIGNMENT_FLAG = 0x800;
450 
451     /**
452      * Construct a set of SAM bitwise flags from a GATKRead
453      *
454      * @param read read from which to construct the flags
455      * @return SAM-compliant set of bitwise flags reflecting the properties in the given read
456      */
getSAMFlagsForRead( final GATKRead read )457     public static int getSAMFlagsForRead( final GATKRead read ) {
458         int samFlags = 0;
459 
460         if ( read.isPaired() ) {
461             samFlags |= SAM_READ_PAIRED_FLAG;
462         }
463         if ( read.isProperlyPaired() ) {
464             samFlags |= SAM_PROPER_PAIR_FLAG;
465         }
466         if ( read.isUnmapped() ) {
467             samFlags |= SAM_READ_UNMAPPED_FLAG;
468         }
469         if ( read.isPaired() && read.mateIsUnmapped() ) {
470             samFlags |= SAM_MATE_UNMAPPED_FLAG;
471         }
472         if ( !read.isUnmapped() && read.isReverseStrand() ) {
473             samFlags |= SAM_READ_STRAND_FLAG;
474         }
475         if ( read.isPaired() && ! read.mateIsUnmapped() && read.mateIsReverseStrand() ) {
476             samFlags |= SAM_MATE_STRAND_FLAG;
477         }
478         if ( read.isFirstOfPair() ) {
479             samFlags |= SAM_FIRST_OF_PAIR_FLAG;
480         }
481         if ( read.isSecondOfPair() ) {
482             samFlags |= SAM_SECOND_OF_PAIR_FLAG;
483         }
484         if ( read.isSecondaryAlignment() ) {
485             samFlags |= SAM_NOT_PRIMARY_ALIGNMENT_FLAG;
486         }
487         if ( read.failsVendorQualityCheck() ) {
488             samFlags |= SAM_READ_FAILS_VENDOR_QUALITY_CHECK_FLAG;
489         }
490         if ( read.isDuplicate() ) {
491             samFlags |= SAM_DUPLICATE_READ_FLAG;
492         }
493         if ( read.isSupplementaryAlignment() ) {
494             samFlags |= SAM_SUPPLEMENTARY_ALIGNMENT_FLAG;
495         }
496 
497         return samFlags;
498     }
499 
500     /**
501      * Finds the adaptor boundary around the read and returns the first base inside the adaptor that is closest to
502      * the read boundary. If the read is in the positive strand, this is the first base after the end of the
503      * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the
504      * beginning of the fragment.
505      *
506      * There are two cases we need to treat here:
507      *
508      * 1) Our read is in the reverse strand :
509      *
510      *     <----------------------| *
511      *   |--------------------->
512      *
513      *   in these cases, the adaptor boundary is at the mate start (minus one)
514      *
515      * 2) Our read is in the forward strand :
516      *
517      *   |---------------------->   *
518      *     <----------------------|
519      *
520      *   in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one)
521      *
522      * @param read the read being tested for the adaptor boundary
523      * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read.
524      * CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig.
525      */
getAdaptorBoundary(final GATKRead read)526     public static int getAdaptorBoundary(final GATKRead read) {
527         if ( ! hasWellDefinedFragmentSize(read) ) {
528             return CANNOT_COMPUTE_ADAPTOR_BOUNDARY;
529         } else if ( read.isReverseStrand() ) {
530             return read.getMateStart() - 1;           // case 1 (see header)
531         } else {
532             final int insertSize = Math.abs(read.getFragmentLength());    // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value)
533             return read.getStart() + insertSize;  // case 2 (see header)
534         }
535     }
536 
537     public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE;
538 
539     /**
540      * Can the adaptor sequence of read be reliably removed from the read based on the alignment of
541      * read and its mate?
542      *
543      * @param read the read to check
544      * @return true if it can, false otherwise
545      */
hasWellDefinedFragmentSize(final GATKRead read)546     public static boolean hasWellDefinedFragmentSize(final GATKRead read) {
547         if ( read.getFragmentLength() == 0 )
548             // no adaptors in reads with mates in another chromosome or unmapped pairs
549         {
550             return false;
551 	    }
552         if ( ! read.isPaired() )
553             // only reads that are paired can be adaptor trimmed
554         {
555             return false;
556 	}
557         if ( read.isUnmapped() || read.mateIsUnmapped() )
558             // only reads when both reads are mapped can be trimmed
559         {
560             return false;
561 	}
562 //        if ( ! read.isProperlyPaired() )
563 //            // note this flag isn't always set properly in BAMs, can will stop us from eliminating some proper pairs
564 //            // reads that aren't part of a proper pair (i.e., have strange alignments) can't be trimmed
565 //            return false;
566         if ( read.isReverseStrand() == read.mateIsReverseStrand() )
567             // sanity check on isProperlyPaired to ensure that read1 and read2 aren't on the same strand
568 	    {
569             return false;
570         }
571 
572         if ( read.isReverseStrand() ) {
573             // we're on the negative strand, so our read runs right to left
574             return read.getEnd() > read.getMateStart();
575         } else {
576             // we're on the positive strand, so our mate should be to our right (his start + insert size should be past our start)
577             return read.getStart() <= read.getMateStart() + read.getFragmentLength();
578         }
579     }
580 
581     /**
582      * If a read starts in INSERTION, returns the first element length.
583      *
584      * Warning: If the read has Hard or Soft clips before the insertion this function will return 0.
585      *
586      * @param read
587      * @return the length of the first insertion, or 0 if there is none (see warning).
588      */
getFirstInsertionOffset(final GATKRead read)589     public static int getFirstInsertionOffset(final GATKRead read) {
590         final CigarElement e = read.getCigarElement(0);
591         if ( e.getOperator() == CigarOperator.I ) {
592             return e.getLength();
593         } else {
594             return 0;
595         }
596     }
597 
598     /**
599      * If a read ends in INSERTION, returns the last element length.
600      *
601      * Warning: If the read has Hard or Soft clips after the insertion this function will return 0.
602      *
603      * @param read
604      * @return the length of the last insertion, or 0 if there is none (see warning).
605      */
getLastInsertionOffset(final GATKRead read)606     public static int getLastInsertionOffset(final GATKRead read) {
607         final List<CigarElement> cigarElements = read.getCigarElements();
608         final CigarElement e = cigarElements.get(cigarElements.size() - 1);
609         if ( e.getOperator() == CigarOperator.I ) {
610             return e.getLength();
611         } else {
612             return 0;
613         }
614     }
615 
616     /**
617      * Calculates the reference coordinate for the beginning of the read taking into account soft clips but not hard clips.
618      *
619      * Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips.
620      *
621      * @return the unclipped start of the read taking soft clips (but not hard clips) into account
622      */
getSoftStart(final GATKRead read)623     public static int getSoftStart(final GATKRead read) {
624         Utils.nonNull(read, "read");
625 
626         int softStart = read.getStart();
627         for (final CigarElement cig : read.getCigarElements()) {
628             final CigarOperator op = cig.getOperator();
629 
630             if (op == CigarOperator.SOFT_CLIP) {
631                 softStart -= cig.getLength();
632             } else if (op != CigarOperator.HARD_CLIP) {
633                 break;
634             }
635         }
636         return softStart;
637     }
638 
639     /**
640      * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips.
641      *
642      * Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips.
643      *
644      * @return the unclipped end of the read taking soft clips (but not hard clips) into account
645      */
getSoftEnd(final GATKRead read)646     public static int getSoftEnd(final GATKRead read) {
647         Utils.nonNull(read, "read");
648 
649         boolean foundAlignedBase = false;
650         int softEnd = read.getEnd();
651         final List<CigarElement> cigs = read.getCigarElements();
652         for (int i = cigs.size() - 1; i >= 0; --i) {
653             final CigarElement cig = cigs.get(i);
654             final CigarOperator op = cig.getOperator();
655 
656             if (op == CigarOperator.SOFT_CLIP){ // assumes the soft clip that we found is at the end of the aligned read
657                 softEnd += cig.getLength();
658             } else if (op != CigarOperator.HARD_CLIP) {
659                 foundAlignedBase = true;
660                 break;
661             }
662         }
663         if( !foundAlignedBase ) { // for example 64H14S, the soft end is actually the same as the alignment end
664             softEnd = read.getEnd();
665         }
666         return softEnd;
667     }
668 
669     /**
670      * Find the 0-based index within a read base array corresponding to a given 1-based position in the reference, along with the cigar operator of
671      * the element containing that base.  If the reference coordinate occurs within a deletion, the first index after the deletion is returned.
672      * Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips.
673      *
674      * @param alignmentStart        The soft start of the read on the reference
675      * @param cigar                 The read's cigar
676      * @param refCoord              The target reference coordinate
677      * @return                      If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED};
678      *                              if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read coordinate;
679      *                              if the reference coordinate falls within a deletion, the first read coordinate after the deletion.  Note: if the last cigar element is
680      *                              a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}.
681      */
getReadIndexForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord)682     public static Pair<Integer, CigarOperator> getReadIndexForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord) {
683         if (refCoord < alignmentStart) {
684             return new MutablePair<>(READ_INDEX_NOT_FOUND, null);
685         }
686         int firstReadPosOfElement = 0;              //inclusive
687         int firstRefPosOfElement = alignmentStart;  //inclusive
688         int lastReadPosOfElement = 0;               //exclusive
689         int lastRefPosOfElement = alignmentStart;   //exclusive
690 
691         // advance forward through all the cigar elements until we bracket the reference coordinate
692         for (final CigarElement element : cigar) {
693             final CigarOperator operator = element.getOperator();
694             firstReadPosOfElement = lastReadPosOfElement;
695             firstRefPosOfElement = lastRefPosOfElement;
696             lastReadPosOfElement += operator.consumesReadBases() ? element.getLength() : 0;
697             lastRefPosOfElement += operator.consumesReferenceBases() || operator == CigarOperator.S ? element.getLength() : 0;
698 
699             if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) {   // refCoord falls within this cigar element
700                 final int readPosAtRefCoord = firstReadPosOfElement + (operator.consumesReadBases() ? ( refCoord - firstRefPosOfElement) : 0);
701                 return Pair.of(readPosAtRefCoord, operator);
702             }
703         }
704         return new MutablePair<>(READ_INDEX_NOT_FOUND, null);
705     }
706 
707 
708     /**
709      * Returns the index within the read's bases array corresponding to the requested reference coordinate -- or the read coordinate immediately preceding
710      * a deletion in which the reference coordinate falls -- along with the cigar operator in which the reference coordinate occurs.
711      */
getReadIndexForReferenceCoordinate(final GATKRead read, final int refCoord)712     public static Pair<Integer, CigarOperator> getReadIndexForReferenceCoordinate(final GATKRead read, final int refCoord) {
713         return getReadIndexForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord);
714     }
715 
getReadBaseAtReferenceCoordinate(final GATKRead read, final int refCoord)716     public static Optional<Byte> getReadBaseAtReferenceCoordinate(final GATKRead read, final int refCoord) {
717         if (refCoord < read.getStart() || read.getEnd() < refCoord) {
718             return Optional.empty();
719         }
720         final Pair<Integer, CigarOperator> offsetAndOperator = getReadIndexForReferenceCoordinate(read, refCoord);
721         return (offsetAndOperator.getLeft() != READ_INDEX_NOT_FOUND && offsetAndOperator.getRight().consumesReadBases()) ?
722                 Optional.of(read.getBase(offsetAndOperator.getLeft())) : Optional.empty();
723     }
724 
getReadBaseQualityAtReferenceCoordinate(final GATKRead read, final int refCoord)725     public static Optional<Byte> getReadBaseQualityAtReferenceCoordinate(final GATKRead read, final int refCoord) {
726         if (refCoord < read.getStart() || read.getEnd() < refCoord) {
727             return Optional.empty();
728         }
729         final Pair<Integer, CigarOperator> offsetAndOperator = getReadIndexForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord);
730         return (offsetAndOperator.getRight() != null && offsetAndOperator.getRight().consumesReadBases()) ?
731                 Optional.of(read.getBaseQuality(offsetAndOperator.getLeft())) : Optional.empty();
732     }
733 
734 
735     /**
736      * Is a base inside a read?
737      *
738      * @param read                the read to evaluate
739      * @param referenceCoordinate the reference coordinate of the base to test
740      * @return true if it is inside the read, false otherwise.
741      */
isInsideRead(final GATKRead read, final int referenceCoordinate)742     public static boolean isInsideRead(final GATKRead read, final int referenceCoordinate) {
743         return referenceCoordinate >= read.getStart() && referenceCoordinate <= read.getEnd();
744     }
745 
746     /**
747      * Returns the reverse complement of the read bases
748      *
749      * @param bases the read bases
750      * @return the reverse complement of the read bases
751      */
getBasesReverseComplement(final byte[] bases)752     public static String getBasesReverseComplement(final byte[] bases) {
753         String reverse = "";
754         for (int i = bases.length-1; i >=0; i--) {
755             reverse += (char) BaseUtils.getComplement(bases[i]);
756         }
757         return reverse;
758     }
759 
760     /**
761      * Returns the reverse complement of the read bases
762      *
763      * @param read the read
764      * @return the reverse complement of the read bases
765      */
getBasesReverseComplement(final GATKRead read)766     public static String getBasesReverseComplement(final GATKRead read) {
767         return getBasesReverseComplement(read.getBases());
768     }
769 
770     /**
771      * Creates an "empty", unmapped read with the provided read's read group and mate
772      * information, but empty (not-null) fields:
773      *  - Cigar String
774      *  - Read Bases
775      *  - Base Qualities
776      *
777      * Use this method if you want to create a new empty read based on
778      * another read
779      *
780      * @param read a read to copy fields from
781      * @return a read with no bases but safe for the GATK
782      */
emptyRead( final GATKRead read )783     public static GATKRead emptyRead( final GATKRead read ) {
784         final GATKRead emptyRead = read.copy();
785 
786         emptyRead.setIsUnmapped();
787         emptyRead.setMappingQuality(0);
788         emptyRead.setCigar("");
789         emptyRead.setBases(new byte[0]);
790         emptyRead.setBaseQualities(new byte[0]);
791 
792         emptyRead.clearAttributes();
793         String readGroup = read.getReadGroup();
794         if (readGroup != null) {
795             emptyRead.setAttribute(SAMTag.RG.name(), readGroup);
796         }
797 
798         return emptyRead;
799     }
800 
setInsertionBaseQualities( final GATKRead read, final byte[] quals)801     public static void setInsertionBaseQualities( final GATKRead read, final byte[] quals) {
802         read.setAttribute(BQSR_BASE_INSERTION_QUALITIES, quals == null ? null : SAMUtils.phredToFastq(quals));
803     }
804 
setDeletionBaseQualities( final GATKRead read, final byte[] quals)805     public static void setDeletionBaseQualities( final GATKRead read, final byte[] quals) {
806         read.setAttribute(BQSR_BASE_DELETION_QUALITIES, quals == null ? null : SAMUtils.phredToFastq(quals));
807     }
808 
809     /**
810      * @return whether or not this read has base insertion or deletion qualities (one of the two is sufficient to return true)
811      */
hasBaseIndelQualities(final GATKRead read)812     public static boolean hasBaseIndelQualities(final GATKRead read) {
813         return read.hasAttribute(BQSR_BASE_INSERTION_QUALITIES) || read.hasAttribute(BQSR_BASE_DELETION_QUALITIES);
814     }
815 
816     /**
817      * @return the base deletion quality or null if read doesn't have one
818      */
getExistingBaseInsertionQualities(final GATKRead read)819     public static byte[] getExistingBaseInsertionQualities(final GATKRead read) {
820         return SAMUtils.fastqToPhred(read.getAttributeAsString(BQSR_BASE_INSERTION_QUALITIES));
821     }
822 
823     /**
824      * @return the base deletion quality or null if read doesn't have one
825      */
getExistingBaseDeletionQualities(final GATKRead read)826     public static byte[] getExistingBaseDeletionQualities(final GATKRead read) {
827         return SAMUtils.fastqToPhred( read.getAttributeAsString(BQSR_BASE_DELETION_QUALITIES));
828     }
829 
830     /**
831      * Default utility to query the base insertion quality of a read. If the read doesn't have one, it creates an array of default qualities (currently Q45)
832      * and assigns it to the read.
833      *
834      * @return the base insertion quality array
835      */
getBaseInsertionQualities(final GATKRead read)836     public static byte[] getBaseInsertionQualities(final GATKRead read) {
837         byte [] quals = getExistingBaseInsertionQualities(read);
838         if( quals == null ) {
839             quals = new byte[read.getBaseQualityCount()];
840             Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL); // Some day in the future when base insertion and base deletion quals exist the samtools API will
841             // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
842         }
843         return quals;
844     }
845 
846     /**
847      * Default utility to query the base deletion quality of a read. If the read doesn't have one, it creates an array of default qualities (currently Q45)
848      * and assigns it to the read.
849      *
850      * @return the base deletion quality array
851      */
getBaseDeletionQualities(final GATKRead read)852     public static byte[] getBaseDeletionQualities(final GATKRead read) {
853         byte[] quals = getExistingBaseDeletionQualities(read);
854         if( quals == null ) {
855             quals = new byte[read.getBaseQualityCount()];
856             Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL);  // Some day in the future when base insertion and base deletion quals exist the samtools API will
857             // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
858         }
859         return quals;
860     }
861 
getBaseQualities( final GATKRead read, final EventType errorModel )862     public static byte[] getBaseQualities( final GATKRead read, final EventType errorModel ) {
863         switch( errorModel ) {
864             case BASE_SUBSTITUTION:
865                 return read.getBaseQualities();
866             case BASE_INSERTION:
867                 return getBaseInsertionQualities(read);
868             case BASE_DELETION:
869                 return getBaseDeletionQualities(read);
870             default:
871                 throw new GATKException("Unrecognized Base Recalibration type: " + errorModel );
872         }
873     }
874 
875     /**
876      * Resets the quality scores of the reads to the orginal (pre-BQSR) ones.
877      */
resetOriginalBaseQualities(final GATKRead read)878     public static GATKRead resetOriginalBaseQualities(final GATKRead read) {
879         final byte[] originalQuals = ReadUtils.getOriginalBaseQualities(read);
880         if ( originalQuals != null ){
881             read.setBaseQualities(originalQuals);
882         }
883         return read;
884     }
885 
886     /**
887      * Check to ensure that the alignment makes sense based on the contents of the header.
888      * @param header The SAM file header.
889      * @param read The read to verify.
890      * @return true if alignment agrees with header, false otherwise.
891      */
alignmentAgreesWithHeader(final SAMFileHeader header, final GATKRead read)892     public static boolean alignmentAgreesWithHeader(final SAMFileHeader header, final GATKRead read) {
893         final int referenceIndex = getReferenceIndex(read, header);
894         // Read is aligned to nonexistent contig
895         if( ! read.isUnmapped() && referenceIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ) {
896             return false;
897         }
898         final SAMSequenceRecord contigHeader = header.getSequence(referenceIndex);
899         // Read is aligned to a point after the end of the contig
900         return read.isUnmapped() || read.getStart() <= contigHeader.getSequenceLength();
901     }
902 
903     /**
904      * Create a common SAMFileWriter for use with GATK tools.
905      *
906      * @param outputFile - if this file has a .cram extension then a reference is required. Can not be null.
907      * @param referenceFile - the reference source to use. Can not be null if a output file has a .cram extension.
908      * @param header - header to be used for the output writer
909      * @param preSorted - if true then the records must already be sorted to match the header sort order
910      * @param createOutputBamIndex - if true an index will be created for .BAM and .CRAM files
911      * @param createMD5 - if true an MD5 file will be created
912      *
913      * @return SAMFileWriter
914      */
createCommonSAMWriter( final File outputFile, final File referenceFile, final SAMFileHeader header, final boolean preSorted, boolean createOutputBamIndex, final boolean createMD5)915     public static SAMFileWriter createCommonSAMWriter(
916             final File outputFile,
917             final File referenceFile,
918             final SAMFileHeader header,
919             final boolean preSorted,
920             boolean createOutputBamIndex,
921             final boolean createMD5)
922     {
923         return createCommonSAMWriter(
924             (null == outputFile ? null : outputFile.toPath()),
925             null == referenceFile ? null : referenceFile.toPath(),
926             header,
927             preSorted,
928             createOutputBamIndex,
929             createMD5);
930     }
931 
932     /**
933      * Create a common SAMFileWriter for use with GATK tools.
934      *
935      * @param outputPath - if this file has a .cram extension then a reference is required. Can not be null.
936      * @param referenceFile - the reference source to use. Can not be null if a output file has a .cram extension.
937      * @param header - header to be used for the output writer
938      * @param preSorted - if true then the records must already be sorted to match the header sort order
939      * @param createOutputBamIndex - if true an index will be created for .BAM and .CRAM files
940      * @param createMD5 - if true an MD5 file will be created
941      *
942      * @return SAMFileWriter
943      */
createCommonSAMWriter( final Path outputPath, final Path referenceFile, final SAMFileHeader header, final boolean preSorted, boolean createOutputBamIndex, final boolean createMD5)944     public static SAMFileWriter createCommonSAMWriter(
945         final Path outputPath,
946         final Path referenceFile,
947         final SAMFileHeader header,
948         final boolean preSorted,
949         boolean createOutputBamIndex,
950         final boolean createMD5)
951     {
952         Utils.nonNull(outputPath);
953         Utils.nonNull(header);
954 
955         if (createOutputBamIndex && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
956             logger.warn("Skipping index file creation for: " +
957                 outputPath +  ". Index file creation requires reads in coordinate sorted order.");
958             createOutputBamIndex = false;
959         }
960 
961         final SAMFileWriterFactory factory = new SAMFileWriterFactory().setCreateIndex(createOutputBamIndex).setCreateMd5File(createMD5);
962         return ReadUtils.createCommonSAMWriterFromFactory(factory, outputPath, referenceFile, header, preSorted);
963     }
964 
965     /**
966      * Create a common SAMFileWriter from a factory for use with GATK tools. Assumes that if the factory has been set
967      * to create an index, the header must be set to coordinate sorted.
968      *
969      * @param outputFile if this file has a .cram extension then a reference is required. Can not be null.
970      * @param referenceFile the reference source to use. Can not be null if a output file has a .cram extension.
971      * @param header header to be used for the output writer
972      * @param preSorted if true then records must already be sorted to match the header sort order
973      * @param factory SAMFileWriterFactory factory to use
974      * @return SAMFileWriter
975      */
createCommonSAMWriterFromFactory( final SAMFileWriterFactory factory, final File outputFile, final File referenceFile, final SAMFileHeader header, final boolean preSorted)976     public static SAMFileWriter createCommonSAMWriterFromFactory(
977             final SAMFileWriterFactory factory,
978             final File outputFile,
979             final File referenceFile,
980             final SAMFileHeader header,
981             final boolean preSorted)
982     {
983         return createCommonSAMWriterFromFactory(factory,
984             Utils.nonNull(outputFile).toPath(), referenceFile == null ? null : referenceFile.toPath(), header, preSorted);
985     }
986 
987     /**
988      * Create a common SAMFileWriter from a factory for use with GATK tools. Assumes that if the factory has been set
989      * to create an index, the header must be set to coordinate sorted.
990      *
991      * @param outputPath if this file has a .cram extension then a reference is required. Can not be null.
992      * @param referenceFile the reference source to use. Can not be null if a output file has a .cram extension.
993      * @param header header to be used for the output writer
994      * @param preSorted if true then records must already be sorted to match the header sort order
995      * @param factory SAMFileWriterFactory factory to use
996      * @param openOptions (optional) NIO options specifying how to open the file
997      * @return SAMFileWriter
998      */
createCommonSAMWriterFromFactory( final SAMFileWriterFactory factory, final Path outputPath, final Path referenceFile, final SAMFileHeader header, final boolean preSorted, OpenOption... openOptions)999     public static SAMFileWriter createCommonSAMWriterFromFactory(
1000         final SAMFileWriterFactory factory,
1001         final Path outputPath,
1002         final Path referenceFile,
1003         final SAMFileHeader header,
1004         final boolean preSorted,
1005         OpenOption... openOptions)
1006     {
1007         Utils.nonNull(outputPath);
1008         Utils.nonNull(header);
1009 
1010         if (null == referenceFile && outputPath.toString().endsWith(FileExtensions.CRAM)) {
1011             throw new UserException.MissingReference("A reference file is required for writing CRAM files");
1012         }
1013 
1014         return factory.makeWriter(header.clone(), preSorted, outputPath, referenceFile);
1015     }
1016 
1017     /**
1018      * Validate that a file has CRAM contents by checking that it has a valid CRAM file header
1019      * (no matter what the extension).
1020      *
1021      * @param putativeCRAMPath File to check.
1022      * @return true if the file has a valid CRAM file header, otherwise false
1023      */
hasCRAMFileContents(final Path putativeCRAMPath)1024     public static boolean hasCRAMFileContents(final Path putativeCRAMPath) {
1025         try (final InputStream fileStream = Files.newInputStream(putativeCRAMPath)) {
1026             try (final BufferedInputStream bis = new BufferedInputStream(fileStream)) {
1027                 return SamStreams.isCRAMFile(bis);
1028             }
1029         }
1030         catch (IOException e) {
1031             throw new UserException.CouldNotReadInputFile(e.getMessage());
1032         }
1033     }
1034 
1035     /**
1036      * Validate that a file has CRAM contents by checking that it has a valid CRAM file header
1037      * (no matter what the extension).
1038      *
1039      * @param putativeCRAMFile File to check.
1040      * @return true if the file has a valid CRAM file header, otherwise false
1041      */
hasCRAMFileContents(final File putativeCRAMFile)1042     public static boolean hasCRAMFileContents(final File putativeCRAMFile) {
1043         return hasCRAMFileContents(putativeCRAMFile.toPath());
1044     }
1045 
isNonPrimary(GATKRead read)1046     public static boolean isNonPrimary(GATKRead read) {
1047         return read.isSecondaryAlignment() || read.isSupplementaryAlignment() || read.isUnmapped();
1048     }
1049 
1050     /**
1051      * is this base inside the adaptor of the read?
1052      *
1053      * There are two cases to treat here:
1054      *
1055      * 1) Read is in the negative strand => Adaptor boundary is on the left tail
1056      * 2) Read is in the positive strand => Adaptor boundary is on the right tail
1057      *
1058      * Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event)
1059      *
1060      * @param read the read to test
1061      * @param basePos base position in REFERENCE coordinates (not read coordinates)
1062      * @return whether or not the base is in the adaptor
1063      */
isBaseInsideAdaptor(final GATKRead read, long basePos)1064     public static boolean isBaseInsideAdaptor(final GATKRead read, long basePos) {
1065         final int adaptorBoundary = read.getAdaptorBoundary();
1066         if (adaptorBoundary == CANNOT_COMPUTE_ADAPTOR_BOUNDARY || read.getFragmentLength() > DEFAULT_ADAPTOR_SIZE)
1067             return false;
1068 
1069         return read.isReverseStrand() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary;
1070     }
1071 
1072     /**
1073      * Pull out the sample names from a SAMFileHeader
1074      *
1075      * note that we use a TreeSet so that they are sorted
1076      *
1077      * @param header  the sam file header
1078      * @return list of strings representing the sample names
1079      */
getSamplesFromHeader( final SAMFileHeader header )1080     public static Set<String> getSamplesFromHeader( final SAMFileHeader header ) {
1081         // get all of the unique sample names
1082         final Set<String> samples = new TreeSet<>();
1083         final List<SAMReadGroupRecord> readGroups = header.getReadGroups();
1084 
1085         for ( SAMReadGroupRecord readGroup : readGroups ) {
1086             final String sample = readGroup.getSample();
1087             if ( sample != null ) {
1088                 samples.add(sample);
1089             }
1090         }
1091 
1092         return samples;
1093     }
1094 
1095     /**
1096      * Validate that the expected input sort order is either "unsorted", or that it
1097      * matches the actualSortOrder. If validation fails a UserException is thrown
1098      * unless assumeSorted is true.
1099      *
1100      * @param actualSortOrder the actual sort order of the input
1101      * @param expectedSortOrder the sort order expected for this context
1102      * @param sourceName the name of the read source for inclusion in error messages
1103      * @param assumeSorted if true, no exception is thrown when the actualSortOrder
1104      *                     doesn't match the expectedSortOrder. An error messsage is
1105      *                     logged instead
1106      * @return boolean indicating if the validation passed
1107      * @throws UserException if the expectedSortOrder is anything other than "unsorted"
1108      * and the actualSortOrder doesn't match expectedSortOrder and assumeSorted is false
1109      */
validateExpectedSortOrder( final SAMFileHeader.SortOrder actualSortOrder, final SAMFileHeader.SortOrder expectedSortOrder, final boolean assumeSorted, final String sourceName)1110     public static boolean validateExpectedSortOrder(
1111             final SAMFileHeader.SortOrder actualSortOrder,
1112             final SAMFileHeader.SortOrder expectedSortOrder,
1113             final boolean assumeSorted,
1114             final String sourceName)
1115     {
1116         boolean isValid = true;
1117         if (expectedSortOrder != SAMFileHeader.SortOrder.unsorted &&
1118                 actualSortOrder != expectedSortOrder) {
1119             final String message = String.format("Input \"%s\" has sort order \"%s\" but \"%s\" is required.",
1120                     sourceName,
1121                     actualSortOrder.name(),
1122                     expectedSortOrder.name());
1123             isValid = false;
1124             if (assumeSorted) {
1125                 logger.warn(message + " Assuming it's properly sorted anyway.");
1126             }
1127             else {
1128                 throw new UserException(
1129                         message +
1130                                 "If you believe the file to be sorted correctly, use " +
1131                                 StandardArgumentDefinitions.ASSUME_SORTED_LONG_NAME +
1132                                 "=true"
1133                 );
1134             }
1135         }
1136 
1137         return isValid;
1138     }
1139 
1140     /**
1141      * Returns the offset (0-based index) of the first base in the read that is aligned against the reference.
1142      * <p>
1143      *     In most cases for mapped reads, this is typically equal to the sum of the size of soft-clipping at the
1144      *     beginning of the alignment.
1145      * </p>
1146      * <p>
1147      *     Notice that this index makes reference to the offset of that first base in the array returned by {@link GATKRead#getBases()}, If you
1148      *     are after the first base in the original unclipped and not reverse-complemented read, you must use
1149      *     {@link #getFirstAlignedReadPosition} instead.
1150      * </p>
1151      *
1152      * @throws IllegalArgumentException if the input {@code read} is {@code null} or does not have any base aligned
1153      *  against the reference (e.g. is unmapped).
1154      *
1155      * @return a number between 0 and the read length-1.
1156      */
getFirstAlignedBaseOffset(final GATKRead read)1157     public static int getFirstAlignedBaseOffset(final GATKRead read) {
1158         Utils.nonNull(read, "the input read cannot be null");
1159         if (read.isUnmapped()) {
1160             throw new IllegalArgumentException("the input read is unmapped and therefore does not have any base aligned");
1161         } else {
1162             final List<CigarElement> cigarElements = read.getCigarElements();
1163             if (cigarElements.isEmpty()) {
1164                 throw new IllegalArgumentException("the input read is mapped yet contains no cigar-elements: " + read.commonToString());
1165             }
1166             int result = 0;
1167             for (final CigarElement ce : cigarElements) {
1168                 final int length = ce.getLength();
1169                 final CigarOperator co = ce.getOperator();
1170                 if (length > 0 && co.isAlignment()) {
1171                     return result;
1172                 } else if (co.consumesReadBases()) {
1173                     result += length;
1174                 }
1175             }
1176             throw new IllegalArgumentException("the input read cigar does not contain any alignment element");
1177         }
1178     }
1179 
1180     /**
1181      * @param read a GATK read
1182      * @return true if the read is F2R1, false otherwise
1183      */
isF2R1(final GATKRead read)1184     public static boolean isF2R1(final GATKRead read) {
1185         return read.isReverseStrand() == read.isFirstOfPair();
1186     }
1187 
1188     /**
1189      * @param read a GATK read
1190      * @return true if the read is F1R2, false otherwise
1191      */
isF1R2(final GATKRead read)1192     public static boolean isF1R2(final GATKRead read) {
1193         return read.isReverseStrand() != read.isFirstOfPair();
1194     }
1195 
1196     /**
1197      * Used to be called isUsableRead()
1198      **/
readHasReasonableMQ(final GATKRead read)1199     public static boolean readHasReasonableMQ(final GATKRead read){
1200         return read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE;
1201     }
1202 }
1203