1 package org.broadinstitute.hellbender.utils.pileup; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import htsjdk.samtools.SAMFileHeader; 5 import htsjdk.samtools.util.Locatable; 6 import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; 7 import org.broadinstitute.hellbender.exceptions.UserException; 8 import org.broadinstitute.hellbender.tools.walkers.qc.Pileup; 9 import org.broadinstitute.hellbender.utils.BaseUtils; 10 import org.broadinstitute.hellbender.utils.QualityUtils; 11 import org.broadinstitute.hellbender.utils.Utils; 12 import org.broadinstitute.hellbender.utils.fragments.FragmentCollection; 13 import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine; 14 import org.broadinstitute.hellbender.utils.read.GATKRead; 15 import org.broadinstitute.hellbender.utils.read.ReadUtils; 16 17 import java.util.*; 18 import java.util.function.Predicate; 19 import java.util.function.ToIntFunction; 20 import java.util.stream.Collectors; 21 import java.util.stream.Stream; 22 import java.util.stream.StreamSupport; 23 24 /** 25 * Represents a pileup of reads at a given position. 26 */ 27 public class ReadPileup implements Iterable<PileupElement> { 28 private final Locatable loc; 29 private final List<PileupElement> pileupElements; 30 31 /** Constant used by samtools to downgrade a quality for overlapping reads that disagrees in their base. */ 32 public static final double SAMTOOLS_OVERLAP_LOW_CONFIDENCE = 0.8; 33 34 /** 35 * Create a new pileup at loc, using the reads and their corresponding 36 * offsets. 37 * Note: This constructor keeps an alias to the given list. 38 */ ReadPileup(final Locatable loc, final List<PileupElement> pileup)39 public ReadPileup(final Locatable loc, final List<PileupElement> pileup) { 40 this.loc = loc; 41 this.pileupElements = pileup; 42 } 43 44 /** 45 * Create a new pileup at loc, using an stratified pileup 46 * Note: the current implementation of ReadPileup does not efficiently retrieve the stratified pileup 47 */ ReadPileup(final Locatable loc, final Map<String, ReadPileup> stratifiedPileup)48 public ReadPileup(final Locatable loc, final Map<String, ReadPileup> stratifiedPileup) { 49 // NOTE: this bit of code is a potential performance hotspot for the HaplotypeCaller. 50 // This straightforward loop outperforms the equivalent streaming expression by over 2x. 51 List<PileupElement> allElements = new ArrayList<>(stratifiedPileup.size() * 1000); 52 for ( final Map.Entry<String, ReadPileup> pileupEntry : stratifiedPileup.entrySet() ) { 53 allElements.addAll(pileupEntry.getValue().pileupElements); 54 } 55 56 this.loc = loc; 57 this.pileupElements = allElements; 58 } 59 60 /** 61 * Create a new pileup without any aligned reads 62 */ ReadPileup(final Locatable loc)63 public ReadPileup(final Locatable loc) { 64 this(loc, new ArrayList<PileupElement>()); 65 } 66 67 /** 68 * Create a new pileup with the given reads. 69 */ ReadPileup(final Locatable loc, final List<GATKRead> reads, final int offset)70 public ReadPileup(final Locatable loc, final List<GATKRead> reads, final int offset) { 71 this(loc, readsOffsetsToPileup(reads, offset)); 72 } 73 74 /** 75 * Create a new pileup with the given reads. 76 */ ReadPileup(final Locatable loc, final List<GATKRead> reads, final List<Integer> offsets)77 public ReadPileup(final Locatable loc, final List<GATKRead> reads, final List<Integer> offsets) { 78 this(loc, readsOffsetsToPileup(reads, offsets)); 79 } 80 81 /** 82 * Get the pileup of reads covering a locus. This is useful, for example, in VariantWalkers, which work on 83 * ReadsContexts and not AlignmentContexts. 84 */ ReadPileup(final Locatable loc, final Iterable<GATKRead> reads)85 public ReadPileup(final Locatable loc, final Iterable<GATKRead> reads) { 86 final List<PileupElement> pile = StreamSupport.stream(reads.spliterator(), false) 87 .filter(ReadFilterLibrary.PASSES_VENDOR_QUALITY_CHECK.and(ReadFilterLibrary.NOT_DUPLICATE)) 88 .map(AlignmentStateMachine::new) 89 .map(asm -> { 90 while ( asm.stepForwardOnGenome() != null && asm.getGenomePosition() < loc.getStart()) { } 91 return asm.getGenomePosition() == loc.getStart() ? asm.makePileupElement() : null; 92 }).filter(Objects::nonNull).collect(Collectors.toList()); 93 94 this.loc = loc; 95 pileupElements = pile; 96 } 97 98 /** 99 * Returns the first element corresponding to the given read or null there is no such element. 100 * 101 * @param read or null if elements with no reads are to be retrieved from this pileup. 102 * @return the first element corresponding to the given read or null there is no such element. 103 */ 104 @VisibleForTesting getElementForRead(final GATKRead read)105 PileupElement getElementForRead(final GATKRead read) { 106 return getElementStream().filter(el -> Objects.equals(el.getRead(), read)).findAny().orElse(null); 107 } 108 109 /** 110 * Helper routine for converting reads and offset lists to a PileupElement list. 111 */ readsOffsetsToPileup(final List<GATKRead> reads, final List<Integer> offsets)112 private static List<PileupElement> readsOffsetsToPileup(final List<GATKRead> reads, final List<Integer> offsets) { 113 if (reads.size() != offsets.size()) { 114 throw new IllegalArgumentException("Reads and offset lists have different sizes!"); 115 } 116 117 final List<PileupElement> pileup = new ArrayList<>(reads.size()); 118 for (int i = 0; i < reads.size(); i++) { 119 pileup.add(PileupElement.createPileupForReadAndOffset(reads.get(i), offsets.get(i))); 120 } 121 122 return pileup; 123 } 124 125 /** 126 * Helper routine for converting reads and a single offset to a PileupElement list. 127 */ readsOffsetsToPileup(final List<GATKRead> reads, final int offset)128 private static List<PileupElement> readsOffsetsToPileup(final List<GATKRead> reads, final int offset) { 129 return reads.stream().map(r -> PileupElement.createPileupForReadAndOffset(r, offset)).collect(Collectors.toList()); 130 } 131 132 /** 133 * Make a new pileup consisting of elements of this pileup that satisfy the predicate. 134 * NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed). 135 */ makeFilteredPileup(final Predicate<PileupElement> filter)136 public ReadPileup makeFilteredPileup(final Predicate<PileupElement> filter) { 137 return new ReadPileup(loc, getElementStream().filter(filter).collect(Collectors.toList())); 138 } 139 140 /** 141 * Make a new pileup from elements whose reads have read groups that agree with the given lane ID. 142 * (if they have a name equal to the ID or starting with ID followed by a period "."). 143 * Also, if both laneID and read group ID are {@code null}, the read will be included. 144 * Returns empty pileup if no suitable elements are found. 145 * NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed). 146 */ getPileupForLane(final String laneID)147 public ReadPileup getPileupForLane(final String laneID) { 148 return makeFilteredPileup(p -> { 149 final GATKRead read = p.getRead(); 150 final String readGroupID = read.getReadGroup(); 151 if (laneID == null && readGroupID == null) { 152 return true; 153 } 154 if (laneID != null && readGroupID != null) { 155 final boolean laneSame = readGroupID.startsWith(laneID + "."); // lane is the same, but sample identifier is different 156 final boolean exactlySame = readGroupID.equals(laneID); // in case there is no sample identifier, they have to be exactly the same 157 if (laneSame || exactlySame) { 158 return true; 159 } 160 } 161 return false; 162 }); 163 } 164 165 /** 166 * Make a new pileup from elements whose reads belong to the given sample. 167 * Passing null sample as an argument retrieves reads whose read group or sample name is {@code null} 168 * NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed). 169 */ getPileupForSample(final String sample, final SAMFileHeader header)170 public ReadPileup getPileupForSample(final String sample, final SAMFileHeader header) { 171 return makeFilteredPileup(pe -> Objects.equals(ReadUtils.getSampleName(pe.getRead(), header), sample)); 172 } 173 174 /** 175 * Gets a set of the read groups represented in this pileup. 176 * Note: contains null if a read has a null read group 177 */ getReadGroupIDs()178 public Set<String> getReadGroupIDs() { 179 return getElementStream().map(pe -> pe.getRead().getReadGroup()).collect(Collectors.toSet()); 180 } 181 182 /** 183 * Gets a set of the samples represented in this pileup. 184 * Note: contains null if a read has a null read group or a null sample name. 185 */ getSamples(final SAMFileHeader header)186 public Set<String> getSamples(final SAMFileHeader header) { 187 return getElementStream().map(pe -> pe.getRead()).map(r -> ReadUtils.getSampleName(r, header)).collect(Collectors.toSet()); 188 } 189 190 /** 191 * Splits the ReadPileup by sample 192 * 193 * @param header the header to retrieve the samples from 194 * @param unknownSampleName the sample name if there is no read group/sample name; {@code null} if lack of RG is not expected 195 * @return a Map of sample name to ReadPileup (including empty pileups for a sample) 196 * @throws org.broadinstitute.hellbender.exceptions.UserException.ReadMissingReadGroup if unknownSampleName is {@code null} and there are reads without RG/sample name 197 */ splitBySample(final SAMFileHeader header, final String unknownSampleName)198 public Map<String, ReadPileup> splitBySample(final SAMFileHeader header, final String unknownSampleName) { 199 final Map<String, ReadPileup> toReturn = new HashMap<>(); 200 for (final String sample : getSamples(header)) { 201 final ReadPileup pileupBySample = getPileupForSample(sample, header); 202 if (sample != null) { 203 toReturn.put(sample, pileupBySample); 204 } else { 205 if (unknownSampleName == null) { 206 throw new UserException.ReadMissingReadGroup(pileupBySample.iterator().next().getRead()); 207 } 208 toReturn.put(unknownSampleName, pileupBySample); 209 } 210 } 211 return toReturn; 212 } 213 214 /** 215 * The best way to access PileupElements where you only care about the bases and quals in the pileup. 216 * <p> 217 * for (PileupElement p : this) { doSomething(p); } 218 * <p> 219 * Provides efficient iteration of the data. 220 */ 221 @Override iterator()222 public Iterator<PileupElement> iterator() { 223 // Profiling has determined that returning a custom unmodifiable iterator is faster than 224 // Collections.unmodifiableList(pileupElements).iterator() 225 return new Iterator<PileupElement>() { 226 private final int len = pileupElements.size(); 227 private int i = 0; 228 229 @Override 230 public boolean hasNext() { 231 return i < len; 232 } 233 234 @Override 235 public PileupElement next() { 236 return pileupElements.get(i++); 237 } 238 239 @Override 240 public void remove() { 241 throw new UnsupportedOperationException("Cannot remove from a pileup element iterator"); 242 } 243 }; 244 } 245 246 /** 247 * Iterator over sorted by read start PileupElements. 248 */ 249 public Iterator<PileupElement> sortedIterator() { 250 return getElementStream() 251 .sorted((l, r) -> Integer.compare(l.getRead().getStart(), r.getRead().getStart())) 252 .iterator(); 253 } 254 255 /** 256 * The number of elements in this pileup. 257 */ 258 public int size() { 259 return pileupElements.size(); 260 } 261 262 /** 263 * @return true if there are 0 elements in the pileup, false otherwise 264 */ 265 public boolean isEmpty() { 266 return size() == 0; 267 } 268 269 /** 270 * @return the location of this pileup. 271 */ 272 public Locatable getLocation() { 273 return loc; 274 } 275 276 /** 277 * Get counts of A, C, G, T in order, which returns a int[4] vector with counts according 278 * to BaseUtils.simpleBaseToBaseIndex for each base. 279 * Deletions are not counted. 280 */ 281 public int[] getBaseCounts() { 282 final int[] counts = new int[4]; 283 284 for (final PileupElement pile : this) { 285 // skip deletion sites 286 if (!pile.isDeletion()) { 287 final int index = BaseUtils.simpleBaseToBaseIndex(pile.getBase()); 288 if (index != -1) { 289 counts[index]++; 290 } 291 } 292 } 293 294 return counts; 295 } 296 297 /** 298 * Fixes the quality of all the elements that come from an overlapping pair in the same way as 299 * samtools does {@see tweak_overlap_quality function in 300 * <a href="https://github.com/samtools/htslib/blob/master/sam.c">samtools</a>}. 301 * <p> 302 * Setting the quality of one of the bases to 0 effectively removes the redundant base for 303 * calling. In addition, if the bases overlap we have increased confidence if they agree (or 304 * reduced if they don't). Thus, the algorithm proceeds as following: 305 * <p> 306 * 1. If the bases are the same, the quality of the first element is the sum of both qualities 307 * and the quality of the second is reduced to 0. 308 * 2. If the bases are different, the base with the highest quality is reduced with a factor of 309 * 0.8, and the quality of the lowest is reduced to 0. 310 * <p> 311 * Note: Resulting qualities higher than {@link QualityUtils#MAX_SAM_QUAL_SCORE} are capped. 312 */ 313 public void fixOverlaps() { 314 final FragmentCollection<PileupElement> fragments = FragmentCollection.create(this); 315 fragments.getOverlappingPairs().stream() 316 .forEach( 317 elements -> fixPairOverlappingQualities(elements.getLeft(), elements.getRight()) 318 ); 319 } 320 321 /** 322 * Fixes the quality of two elements that come from an overlapping pair in the same way as 323 * samtools does {@see tweak_overlap_quality function in 324 * <a href="https://github.com/samtools/htslib/blob/master/sam.c">samtools</a>}. 325 * The only difference with the samtools API is the cap for high values ({@link QualityUtils#MAX_SAM_QUAL_SCORE}). 326 */ 327 @VisibleForTesting 328 static void fixPairOverlappingQualities(final PileupElement firstElement, 329 final PileupElement secondElement) { 330 // only if they do not represent deletions 331 if (!secondElement.isDeletion() && !firstElement.isDeletion()) { 332 final byte[] firstQuals = firstElement.getRead().getBaseQualities(); 333 final byte[] secondQuals = secondElement.getRead().getBaseQualities(); 334 if (firstElement.getBase() == secondElement.getBase()) { 335 // if both have the same base, extra confidence in the firts of them 336 firstQuals[firstElement.getOffset()] = 337 (byte) (firstQuals[firstElement.getOffset()] + secondQuals[secondElement 338 .getOffset()]); 339 // cap to maximum byte value 340 if (firstQuals[firstElement.getOffset()] < 0 341 || firstQuals[firstElement.getOffset()] > QualityUtils.MAX_SAM_QUAL_SCORE) { 342 firstQuals[firstElement.getOffset()] = QualityUtils.MAX_SAM_QUAL_SCORE; 343 } 344 secondQuals[secondElement.getOffset()] = 0; 345 } else { 346 // if not, we lost confidence in the one with higher quality 347 if (firstElement.getQual() >= secondElement.getQual()) { 348 firstQuals[firstElement.getOffset()] = 349 (byte) (SAMTOOLS_OVERLAP_LOW_CONFIDENCE * firstQuals[firstElement.getOffset()]); 350 secondQuals[secondElement.getOffset()] = 0; 351 } else { 352 secondQuals[secondElement.getOffset()] = 353 (byte) (SAMTOOLS_OVERLAP_LOW_CONFIDENCE * secondQuals[secondElement.getOffset()]); 354 firstQuals[firstElement.getOffset()] = 0; 355 } 356 } 357 firstElement.getRead().setBaseQualities(firstQuals); 358 secondElement.getRead().setBaseQualities(secondQuals); 359 } 360 } 361 362 public final static Comparator<PileupElement> baseQualTieBreaker = new Comparator<PileupElement>() { 363 @Override 364 public int compare(PileupElement o1, PileupElement o2) { 365 return Byte.compare(o1.getQual(), o2.getQual()); 366 } 367 }; 368 369 public final static Comparator<PileupElement> mapQualTieBreaker = new Comparator<PileupElement>() { 370 @Override 371 public int compare(PileupElement o1, PileupElement o2) { 372 return Integer.compare(o1.getMappingQual(), o2.getMappingQual()); 373 } 374 }; 375 376 /** 377 * Returns a new ReadPileup where only one read from an overlapping read 378 * pair is retained. If the two reads in question disagree to their basecall, 379 * neither read is retained. If they agree on the base, the read with the higher 380 * base quality observation is retained 381 * 382 * @return the newly filtered pileup 383 */ 384 public ReadPileup getOverlappingFragmentFilteredPileup(SAMFileHeader header) { 385 return getOverlappingFragmentFilteredPileup(true, baseQualTieBreaker, header); 386 } 387 388 /** 389 * Returns a new ReadPileup where only one read from an overlapping read 390 * pair is retained. If discardDiscordant and the two reads in question disagree to their basecall, 391 * neither read is retained. Otherwise, the read with the higher 392 * quality (base or mapping, depending on baseQualNotMapQual) observation is retained 393 * 394 * @return the newly filtered pileup 395 */ 396 public ReadPileup getOverlappingFragmentFilteredPileup(boolean discardDiscordant, Comparator<PileupElement> tieBreaker, SAMFileHeader header) { 397 List<PileupElement> filteredPileupList = new ArrayList<PileupElement>(); 398 399 for (ReadPileup pileup : this.splitBySample(header, null).values()) { 400 Collection<PileupElement> elements = filterSingleSampleForOverlaps(pileup, tieBreaker, discardDiscordant); 401 filteredPileupList.addAll(elements); 402 } 403 404 return new ReadPileup(loc, filteredPileupList); 405 } 406 407 private Collection<PileupElement> filterSingleSampleForOverlaps(ReadPileup pileup, Comparator<PileupElement> tieBreaker, boolean discardDiscordant) { 408 Map<String, PileupElement> filteredPileup = new HashMap<String, PileupElement>(); 409 Set<String> readNamesDeleted = new HashSet<>(); 410 411 for (PileupElement p : pileup) { 412 String readName = p.getRead().getName(); 413 414 // if we've never seen this read before, life is good 415 if (!filteredPileup.containsKey(readName)) { 416 if(!readNamesDeleted.contains(readName)) { 417 filteredPileup.put(readName, p); 418 } 419 } else { 420 PileupElement existing = filteredPileup.get(readName); 421 422 // if the reads disagree at this position, throw them all out. Otherwise 423 // keep the element with the highest quality score 424 if (discardDiscordant && existing.getBase() != p.getBase()) { 425 filteredPileup.remove(readName); 426 readNamesDeleted.add(readName); 427 } else if (tieBreaker.compare(existing, p) < 0) { 428 filteredPileup.put(readName, p); 429 } 430 } 431 } 432 return(filteredPileup.values()); 433 } 434 435 @Override 436 public String toString() { 437 return String.format("%s %s %s %s", 438 loc.getContig(), 439 loc.getStart(), 440 new String(getBases()), 441 getQualsString()); 442 } 443 444 /** 445 * Format, assuming a single-sample, in a samtools-like string. 446 * Each line represents a genomic position, consisting of chromosome name, coordinate, 447 * reference base, read bases and read qualities 448 * 449 * @param ref the reference base 450 * @return pileup line 451 */ 452 public String getPileupString(final char ref) { 453 // In the pileup format, 454 return String.format("%s %s %c %s %s", 455 getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate 456 ref, // reference base 457 new String(getBases()), 458 getQualsString()); 459 } 460 461 /** 462 * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time 463 */ 464 public List<GATKRead> getReads() { 465 return getElementStream().map(pe -> pe.getRead()).collect(Collectors.toList()); 466 } 467 468 private Stream<PileupElement> getElementStream() { 469 return pileupElements.stream(); 470 } 471 472 /** 473 * Returns the number of elements that satisfy the predicate. 474 */ 475 public int getNumberOfElements(final Predicate<PileupElement> peFilter) { 476 Utils.nonNull(peFilter); 477 //Note: pileups are small so int is fine. 478 return (int) getElementStream().filter(peFilter).count(); 479 } 480 481 /** 482 * Returns a list of the offsets in this pileup. 483 * Note: this call costs O(n) and allocates fresh lists each time 484 */ 485 public List<Integer> getOffsets() { 486 return getElementStream().map(pe -> pe.getOffset()).collect(Collectors.toList()); 487 } 488 489 //Extracts an int array by mapping each element in the pileup to int. 490 private int[] extractIntArray(final ToIntFunction<PileupElement> map) { 491 return getElementStream().mapToInt(map).toArray(); 492 } 493 494 /** 495 * Returns an array of the bases in this pileup. 496 * Note: this call costs O(n) and allocates fresh array each time 497 */ 498 public byte[] getBases() { 499 return toByteArray(extractIntArray(pe -> pe.getBase())); 500 } 501 502 /** 503 * Returns an array of the quals in this pileup. 504 * Note: this call costs O(n) and allocates fresh array each time 505 */ 506 public byte[] getBaseQuals() { 507 return toByteArray(extractIntArray(pe -> pe.getQual())); 508 } 509 510 //Converts array of ints to array of bytes by hard casting (loses precision if ints are large). 511 private byte[] toByteArray(final int[] ints) { 512 final byte[] bytes = new byte[ints.length]; 513 for (int i = 0; i < ints.length; i++) { 514 bytes[i] = (byte) ints[i]; 515 } 516 return bytes; 517 } 518 519 /** 520 * Get an array of the mapping qualities. 521 */ 522 public int[] getMappingQuals() { 523 return extractIntArray(pe -> pe.getMappingQual()); 524 } 525 526 private String getQualsString() { 527 final byte[] quals = getBaseQuals(); 528 for (int i = 0; i < quals.length; i++) { 529 quals[i] = (byte) (33 + quals[i]); //as per SAM spec 530 } 531 return new String(quals); 532 } 533 } 534