1 package org.broadinstitute.hellbender.utils.pileup;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.samtools.SAMFileHeader;
5 import htsjdk.samtools.util.Locatable;
6 import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
7 import org.broadinstitute.hellbender.exceptions.UserException;
8 import org.broadinstitute.hellbender.tools.walkers.qc.Pileup;
9 import org.broadinstitute.hellbender.utils.BaseUtils;
10 import org.broadinstitute.hellbender.utils.QualityUtils;
11 import org.broadinstitute.hellbender.utils.Utils;
12 import org.broadinstitute.hellbender.utils.fragments.FragmentCollection;
13 import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
14 import org.broadinstitute.hellbender.utils.read.GATKRead;
15 import org.broadinstitute.hellbender.utils.read.ReadUtils;
16 
17 import java.util.*;
18 import java.util.function.Predicate;
19 import java.util.function.ToIntFunction;
20 import java.util.stream.Collectors;
21 import java.util.stream.Stream;
22 import java.util.stream.StreamSupport;
23 
24 /**
25  * Represents a pileup of reads at a given position.
26  */
27 public class ReadPileup implements Iterable<PileupElement> {
28     private final Locatable loc;
29     private final List<PileupElement> pileupElements;
30 
31     /** Constant used by samtools to downgrade a quality for overlapping reads that disagrees in their base. */
32     public static final double SAMTOOLS_OVERLAP_LOW_CONFIDENCE = 0.8;
33 
34     /**
35      * Create a new pileup at loc, using the reads and their corresponding
36      * offsets.
37      * Note: This constructor keeps an alias to the given list.
38      */
ReadPileup(final Locatable loc, final List<PileupElement> pileup)39     public ReadPileup(final Locatable loc, final List<PileupElement> pileup) {
40         this.loc = loc;
41         this.pileupElements = pileup;
42     }
43 
44     /**
45      * Create a new pileup at loc, using an stratified pileup
46      * Note: the current implementation of ReadPileup does not efficiently retrieve the stratified pileup
47      */
ReadPileup(final Locatable loc, final Map<String, ReadPileup> stratifiedPileup)48     public ReadPileup(final Locatable loc, final Map<String, ReadPileup> stratifiedPileup) {
49         // NOTE: this bit of code is a potential performance hotspot for the HaplotypeCaller.
50         // This straightforward loop outperforms the equivalent streaming expression by over 2x.
51         List<PileupElement> allElements = new ArrayList<>(stratifiedPileup.size() * 1000);
52         for ( final Map.Entry<String, ReadPileup> pileupEntry : stratifiedPileup.entrySet() ) {
53             allElements.addAll(pileupEntry.getValue().pileupElements);
54         }
55 
56         this.loc = loc;
57         this.pileupElements = allElements;
58     }
59 
60     /**
61      * Create a new pileup without any aligned reads
62      */
ReadPileup(final Locatable loc)63     public ReadPileup(final Locatable loc) {
64         this(loc, new ArrayList<PileupElement>());
65     }
66 
67     /**
68      * Create a new pileup with the given reads.
69      */
ReadPileup(final Locatable loc, final List<GATKRead> reads, final int offset)70     public ReadPileup(final Locatable loc, final List<GATKRead> reads, final int offset) {
71         this(loc, readsOffsetsToPileup(reads, offset));
72     }
73 
74     /**
75      * Create a new pileup with the given reads.
76      */
ReadPileup(final Locatable loc, final List<GATKRead> reads, final List<Integer> offsets)77     public ReadPileup(final Locatable loc, final List<GATKRead> reads, final List<Integer> offsets) {
78         this(loc, readsOffsetsToPileup(reads, offsets));
79     }
80 
81     /**
82      * Get the pileup of reads covering a locus.  This is useful, for example, in VariantWalkers, which work on
83      * ReadsContexts and not AlignmentContexts.
84      */
ReadPileup(final Locatable loc, final Iterable<GATKRead> reads)85     public ReadPileup(final Locatable loc, final Iterable<GATKRead> reads) {
86         final List<PileupElement> pile = StreamSupport.stream(reads.spliterator(), false)
87                 .filter(ReadFilterLibrary.PASSES_VENDOR_QUALITY_CHECK.and(ReadFilterLibrary.NOT_DUPLICATE))
88                 .map(AlignmentStateMachine::new)
89                 .map(asm -> {
90                     while ( asm.stepForwardOnGenome() != null && asm.getGenomePosition() < loc.getStart()) { }
91                     return asm.getGenomePosition() == loc.getStart() ? asm.makePileupElement() : null;
92                 }).filter(Objects::nonNull).collect(Collectors.toList());
93 
94         this.loc = loc;
95         pileupElements = pile;
96     }
97 
98     /**
99      * Returns the first element corresponding to the given read or null there is no such element.
100      *
101      * @param read or null if elements with no reads are to be retrieved from this pileup.
102      * @return the first element corresponding to the given read or null there is no such element.
103      */
104     @VisibleForTesting
getElementForRead(final GATKRead read)105     PileupElement getElementForRead(final GATKRead read) {
106         return getElementStream().filter(el -> Objects.equals(el.getRead(), read)).findAny().orElse(null);
107     }
108 
109     /**
110      * Helper routine for converting reads and offset lists to a PileupElement list.
111      */
readsOffsetsToPileup(final List<GATKRead> reads, final List<Integer> offsets)112     private static List<PileupElement> readsOffsetsToPileup(final List<GATKRead> reads, final List<Integer> offsets) {
113         if (reads.size() != offsets.size()) {
114             throw new IllegalArgumentException("Reads and offset lists have different sizes!");
115         }
116 
117         final List<PileupElement> pileup = new ArrayList<>(reads.size());
118         for (int i = 0; i < reads.size(); i++) {
119             pileup.add(PileupElement.createPileupForReadAndOffset(reads.get(i), offsets.get(i)));
120         }
121 
122         return pileup;
123     }
124 
125     /**
126      * Helper routine for converting reads and a single offset to a PileupElement list.
127      */
readsOffsetsToPileup(final List<GATKRead> reads, final int offset)128     private static List<PileupElement> readsOffsetsToPileup(final List<GATKRead> reads, final int offset) {
129         return reads.stream().map(r -> PileupElement.createPileupForReadAndOffset(r, offset)).collect(Collectors.toList());
130     }
131 
132     /**
133      * Make a new pileup consisting of elements of this pileup that satisfy the predicate.
134      * NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed).
135      */
makeFilteredPileup(final Predicate<PileupElement> filter)136     public ReadPileup makeFilteredPileup(final Predicate<PileupElement> filter) {
137         return new ReadPileup(loc, getElementStream().filter(filter).collect(Collectors.toList()));
138     }
139 
140     /**
141      * Make a new pileup from elements whose reads have read groups that agree with the given lane ID.
142      * (if they have a name equal to the ID or starting with ID followed by a period ".").
143      * Also, if both laneID and read group ID are {@code null}, the read will be included.
144      * Returns empty pileup if no suitable elements are found.
145      * NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed).
146      */
getPileupForLane(final String laneID)147     public ReadPileup getPileupForLane(final String laneID) {
148         return makeFilteredPileup(p -> {
149             final GATKRead read = p.getRead();
150             final String readGroupID = read.getReadGroup();
151             if (laneID == null && readGroupID == null) {
152                 return true;
153             }
154             if (laneID != null && readGroupID != null) {
155                 final boolean laneSame = readGroupID.startsWith(laneID + "."); // lane is the same, but sample identifier is different
156                 final boolean exactlySame = readGroupID.equals(laneID);        // in case there is no sample identifier, they have to be exactly the same
157                 if (laneSame || exactlySame) {
158                     return true;
159                 }
160             }
161             return false;
162         });
163     }
164 
165     /**
166      * Make a new pileup from elements whose reads belong to the given sample.
167      * Passing null sample as an argument retrieves reads whose read group or sample name is {@code null}
168      * NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed).
169      */
getPileupForSample(final String sample, final SAMFileHeader header)170     public ReadPileup getPileupForSample(final String sample, final SAMFileHeader header) {
171         return makeFilteredPileup(pe -> Objects.equals(ReadUtils.getSampleName(pe.getRead(), header), sample));
172     }
173 
174     /**
175      * Gets a set of the read groups represented in this pileup.
176      * Note: contains null if a read has a null read group
177      */
getReadGroupIDs()178     public Set<String> getReadGroupIDs() {
179         return getElementStream().map(pe -> pe.getRead().getReadGroup()).collect(Collectors.toSet());
180     }
181 
182     /**
183      * Gets a set of the samples represented in this pileup.
184      * Note: contains null if a read has a null read group or a null sample name.
185      */
getSamples(final SAMFileHeader header)186     public Set<String> getSamples(final SAMFileHeader header) {
187         return getElementStream().map(pe -> pe.getRead()).map(r -> ReadUtils.getSampleName(r, header)).collect(Collectors.toSet());
188     }
189 
190     /**
191      * Splits the ReadPileup by sample
192      *
193      * @param header            the header to retrieve the samples from
194      * @param unknownSampleName the sample name if there is no read group/sample name; {@code null} if lack of RG is not expected
195      * @return a Map of sample name to ReadPileup (including empty pileups for a sample)
196      * @throws org.broadinstitute.hellbender.exceptions.UserException.ReadMissingReadGroup if unknownSampleName is {@code null} and there are reads without RG/sample name
197      */
splitBySample(final SAMFileHeader header, final String unknownSampleName)198     public Map<String, ReadPileup> splitBySample(final SAMFileHeader header, final String unknownSampleName) {
199         final Map<String, ReadPileup> toReturn = new HashMap<>();
200         for (final String sample : getSamples(header)) {
201             final ReadPileup pileupBySample = getPileupForSample(sample, header);
202             if (sample != null) {
203                 toReturn.put(sample, pileupBySample);
204             } else {
205                 if (unknownSampleName == null) {
206                     throw new UserException.ReadMissingReadGroup(pileupBySample.iterator().next().getRead());
207                 }
208                 toReturn.put(unknownSampleName, pileupBySample);
209             }
210         }
211         return toReturn;
212     }
213 
214     /**
215      * The best way to access PileupElements where you only care about the bases and quals in the pileup.
216      * <p>
217      * for (PileupElement p : this) { doSomething(p); }
218      * <p>
219      * Provides efficient iteration of the data.
220      */
221     @Override
iterator()222     public Iterator<PileupElement> iterator() {
223         // Profiling has determined that returning a custom unmodifiable iterator is faster than
224         // Collections.unmodifiableList(pileupElements).iterator()
225         return new Iterator<PileupElement>() {
226             private final int len = pileupElements.size();
227             private int i = 0;
228 
229             @Override
230             public boolean hasNext() {
231                 return i < len;
232             }
233 
234             @Override
235             public PileupElement next() {
236                 return pileupElements.get(i++);
237             }
238 
239             @Override
240             public void remove() {
241                 throw new UnsupportedOperationException("Cannot remove from a pileup element iterator");
242             }
243         };
244     }
245 
246     /**
247      * Iterator over sorted by read start PileupElements.
248      */
249     public Iterator<PileupElement> sortedIterator() {
250         return getElementStream()
251                 .sorted((l, r) -> Integer.compare(l.getRead().getStart(), r.getRead().getStart()))
252                 .iterator();
253     }
254 
255     /**
256      * The number of elements in this pileup.
257      */
258     public int size() {
259         return pileupElements.size();
260     }
261 
262     /**
263      * @return true if there are 0 elements in the pileup, false otherwise
264      */
265     public boolean isEmpty() {
266         return size() == 0;
267     }
268 
269     /**
270      * @return the location of this pileup.
271      */
272     public Locatable getLocation() {
273         return loc;
274     }
275 
276     /**
277      * Get counts of A, C, G, T in order, which returns a int[4] vector with counts according
278      * to BaseUtils.simpleBaseToBaseIndex for each base.
279      * Deletions are not counted.
280      */
281     public int[] getBaseCounts() {
282         final int[] counts = new int[4];
283 
284         for (final PileupElement pile : this) {
285             // skip deletion sites
286             if (!pile.isDeletion()) {
287                 final int index = BaseUtils.simpleBaseToBaseIndex(pile.getBase());
288                 if (index != -1) {
289                     counts[index]++;
290                 }
291             }
292         }
293 
294         return counts;
295     }
296 
297     /**
298      * Fixes the quality of all the elements that come from an overlapping pair in the same way as
299      * samtools does {@see tweak_overlap_quality function in
300      * <a href="https://github.com/samtools/htslib/blob/master/sam.c">samtools</a>}.
301      * <p>
302      * Setting the quality of one of the bases to 0 effectively removes the redundant base for
303      * calling. In addition, if the bases overlap we have increased confidence if they agree (or
304      * reduced if they don't). Thus, the algorithm proceeds as following:
305      * <p>
306      * 1. If the bases are the same, the quality of the first element is the sum of both qualities
307      * and the quality of the second is reduced to 0.
308      * 2. If the bases are different, the base with the highest quality is reduced with a factor of
309      * 0.8, and the quality of the lowest is reduced to 0.
310      * <p>
311      * Note: Resulting qualities higher than {@link QualityUtils#MAX_SAM_QUAL_SCORE} are capped.
312      */
313     public void fixOverlaps() {
314         final FragmentCollection<PileupElement> fragments = FragmentCollection.create(this);
315         fragments.getOverlappingPairs().stream()
316                 .forEach(
317                         elements -> fixPairOverlappingQualities(elements.getLeft(), elements.getRight())
318                 );
319     }
320 
321     /**
322      * Fixes the quality of two elements that come from an overlapping pair in the same way as
323      * samtools does {@see tweak_overlap_quality function in
324      * <a href="https://github.com/samtools/htslib/blob/master/sam.c">samtools</a>}.
325      * The only difference with the samtools API is the cap for high values ({@link QualityUtils#MAX_SAM_QUAL_SCORE}).
326      */
327     @VisibleForTesting
328     static void fixPairOverlappingQualities(final PileupElement firstElement,
329                                             final PileupElement secondElement) {
330         // only if they do not represent deletions
331         if (!secondElement.isDeletion() && !firstElement.isDeletion()) {
332             final byte[] firstQuals = firstElement.getRead().getBaseQualities();
333             final byte[] secondQuals = secondElement.getRead().getBaseQualities();
334             if (firstElement.getBase() == secondElement.getBase()) {
335                 // if both have the same base, extra confidence in the firts of them
336                 firstQuals[firstElement.getOffset()] =
337                         (byte) (firstQuals[firstElement.getOffset()] + secondQuals[secondElement
338                                 .getOffset()]);
339                 // cap to maximum byte value
340                 if (firstQuals[firstElement.getOffset()] < 0
341                         || firstQuals[firstElement.getOffset()] > QualityUtils.MAX_SAM_QUAL_SCORE) {
342                     firstQuals[firstElement.getOffset()] = QualityUtils.MAX_SAM_QUAL_SCORE;
343                 }
344                 secondQuals[secondElement.getOffset()] = 0;
345             } else {
346                 // if not, we lost confidence in the one with higher quality
347                 if (firstElement.getQual() >= secondElement.getQual()) {
348                     firstQuals[firstElement.getOffset()] =
349                             (byte) (SAMTOOLS_OVERLAP_LOW_CONFIDENCE * firstQuals[firstElement.getOffset()]);
350                     secondQuals[secondElement.getOffset()] = 0;
351                 } else {
352                     secondQuals[secondElement.getOffset()] =
353                             (byte) (SAMTOOLS_OVERLAP_LOW_CONFIDENCE * secondQuals[secondElement.getOffset()]);
354                     firstQuals[firstElement.getOffset()] = 0;
355                 }
356             }
357             firstElement.getRead().setBaseQualities(firstQuals);
358             secondElement.getRead().setBaseQualities(secondQuals);
359         }
360     }
361 
362     public final static Comparator<PileupElement> baseQualTieBreaker = new Comparator<PileupElement>() {
363         @Override
364         public int compare(PileupElement o1, PileupElement o2) {
365             return Byte.compare(o1.getQual(), o2.getQual());
366         }
367     };
368 
369     public final static Comparator<PileupElement> mapQualTieBreaker = new Comparator<PileupElement>() {
370         @Override
371         public int compare(PileupElement o1, PileupElement o2) {
372             return Integer.compare(o1.getMappingQual(), o2.getMappingQual());
373         }
374     };
375 
376     /**
377      * Returns a new ReadPileup where only one read from an overlapping read
378      * pair is retained.  If the two reads in question disagree to their basecall,
379      * neither read is retained.  If they agree on the base, the read with the higher
380      * base quality observation is retained
381      *
382      * @return the newly filtered pileup
383      */
384     public ReadPileup getOverlappingFragmentFilteredPileup(SAMFileHeader header) {
385         return getOverlappingFragmentFilteredPileup(true, baseQualTieBreaker, header);
386     }
387 
388     /**
389      * Returns a new ReadPileup where only one read from an overlapping read
390      * pair is retained.  If discardDiscordant and the two reads in question disagree to their basecall,
391      * neither read is retained.  Otherwise, the read with the higher
392      * quality (base or mapping, depending on baseQualNotMapQual) observation is retained
393      *
394      * @return the newly filtered pileup
395      */
396     public ReadPileup getOverlappingFragmentFilteredPileup(boolean discardDiscordant, Comparator<PileupElement> tieBreaker, SAMFileHeader header) {
397         List<PileupElement> filteredPileupList = new ArrayList<PileupElement>();
398 
399         for (ReadPileup pileup : this.splitBySample(header, null).values()) {
400             Collection<PileupElement> elements = filterSingleSampleForOverlaps(pileup, tieBreaker, discardDiscordant);
401             filteredPileupList.addAll(elements);
402         }
403 
404         return new ReadPileup(loc, filteredPileupList);
405     }
406 
407     private Collection<PileupElement> filterSingleSampleForOverlaps(ReadPileup pileup, Comparator<PileupElement> tieBreaker, boolean discardDiscordant) {
408         Map<String, PileupElement> filteredPileup = new HashMap<String, PileupElement>();
409         Set<String> readNamesDeleted = new HashSet<>();
410 
411         for (PileupElement p : pileup) {
412             String readName = p.getRead().getName();
413 
414             // if we've never seen this read before, life is good
415             if (!filteredPileup.containsKey(readName)) {
416                 if(!readNamesDeleted.contains(readName)) {
417                     filteredPileup.put(readName, p);
418                 }
419             } else {
420                 PileupElement existing = filteredPileup.get(readName);
421 
422                 // if the reads disagree at this position, throw them all out.  Otherwise
423                 // keep the element with the highest quality score
424                 if (discardDiscordant && existing.getBase() != p.getBase()) {
425                     filteredPileup.remove(readName);
426                     readNamesDeleted.add(readName);
427                 } else if (tieBreaker.compare(existing, p) < 0) {
428                     filteredPileup.put(readName, p);
429                 }
430             }
431         }
432         return(filteredPileup.values());
433     }
434 
435     @Override
436     public String toString() {
437         return String.format("%s %s %s %s",
438                 loc.getContig(),
439                 loc.getStart(),
440                 new String(getBases()),
441                 getQualsString());
442     }
443 
444     /**
445      * Format, assuming a single-sample, in a samtools-like string.
446      * Each line represents a genomic position, consisting of chromosome name, coordinate,
447      * reference base, read bases and read qualities
448      *
449      * @param ref the reference base
450      * @return pileup line
451      */
452     public String getPileupString(final char ref) {
453         // In the pileup format,
454         return String.format("%s %s %c %s %s",
455                 getLocation().getContig(), getLocation().getStart(),    // chromosome name and coordinate
456                 ref,                                                     // reference base
457                 new String(getBases()),
458                 getQualsString());
459     }
460 
461     /**
462      * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time
463      */
464     public List<GATKRead> getReads() {
465         return getElementStream().map(pe -> pe.getRead()).collect(Collectors.toList());
466     }
467 
468     private Stream<PileupElement> getElementStream() {
469         return pileupElements.stream();
470     }
471 
472     /**
473      * Returns the number of elements that satisfy the predicate.
474      */
475     public int getNumberOfElements(final Predicate<PileupElement> peFilter) {
476         Utils.nonNull(peFilter);
477         //Note: pileups are small so int is fine.
478         return (int) getElementStream().filter(peFilter).count();
479     }
480 
481     /**
482      * Returns a list of the offsets in this pileup.
483      * Note: this call costs O(n) and allocates fresh lists each time
484      */
485     public List<Integer> getOffsets() {
486         return getElementStream().map(pe -> pe.getOffset()).collect(Collectors.toList());
487     }
488 
489     //Extracts an int array by mapping each element in the pileup to int.
490     private int[] extractIntArray(final ToIntFunction<PileupElement> map) {
491         return getElementStream().mapToInt(map).toArray();
492     }
493 
494     /**
495      * Returns an array of the bases in this pileup.
496      * Note: this call costs O(n) and allocates fresh array each time
497      */
498     public byte[] getBases() {
499         return toByteArray(extractIntArray(pe -> pe.getBase()));
500     }
501 
502     /**
503      * Returns an array of the quals in this pileup.
504      * Note: this call costs O(n) and allocates fresh array each time
505      */
506     public byte[] getBaseQuals() {
507         return toByteArray(extractIntArray(pe -> pe.getQual()));
508     }
509 
510     //Converts array of ints to array of bytes by hard casting (loses precision if ints are large).
511     private byte[] toByteArray(final int[] ints) {
512         final byte[] bytes = new byte[ints.length];
513         for (int i = 0; i < ints.length; i++) {
514             bytes[i] = (byte) ints[i];
515         }
516         return bytes;
517     }
518 
519     /**
520      * Get an array of the mapping qualities.
521      */
522     public int[] getMappingQuals() {
523         return extractIntArray(pe -> pe.getMappingQual());
524     }
525 
526     private String getQualsString() {
527         final byte[] quals = getBaseQuals();
528         for (int i = 0; i < quals.length; i++) {
529             quals[i] = (byte) (33 + quals[i]);  //as per SAM spec
530         }
531         return new String(quals);
532     }
533 }
534