1 package org.broadinstitute.hellbender.tools.walkers.annotator; 2 3 import com.google.common.primitives.Ints; 4 import htsjdk.samtools.Cigar; 5 import htsjdk.samtools.CigarElement; 6 import htsjdk.samtools.CigarOperator; 7 import htsjdk.variant.variantcontext.VariantContext; 8 import org.broadinstitute.barclay.help.DocumentedFeature; 9 import org.broadinstitute.hellbender.utils.MathUtils; 10 import org.broadinstitute.hellbender.utils.Utils; 11 import org.broadinstitute.hellbender.utils.help.HelpConstants; 12 import org.broadinstitute.hellbender.utils.read.AlignmentUtils; 13 import org.broadinstitute.hellbender.utils.read.GATKRead; 14 import org.broadinstitute.hellbender.utils.read.ReadUtils; 15 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 16 17 import java.util.List; 18 import java.util.OptionalDouble; 19 import java.util.OptionalInt; 20 21 /** 22 * Median distance of variant starts from ends of reads supporting each alt allele. 23 * 24 * </p>The output is an array containing, for each alt allele, the median distance of the variant start from the closest read end over all reads that best match that allele.</p> 25 * </p>For example, for variant context with ref allele A and alt allele C the read position for alt-supporting read GGGGCTT is 2 because the A to C 26 * substitution is 2 bases from the right end of the read, which is less than its distance from the left end. 27 * For variant context with ref allele AG and alt allele A (deletion) the read position of alt-supporting read ATTTTT is 0. 28 * For variant context with ref allele A and alt allele AG (insertion) the read position of alt-supporting read TTTTAG is 1.</p> 29 * <p>The annotation considers only the read's bases themselves and not the position they map to with respect to the reference. For example, 30 * suppose a substitution is preceded by 80 matching bases and followed by 10 matching bases, a 10-base deletion, and 10 more matching bases. Its distance from the end of the read 31 * is 20 bases, not 30 bases, because the deleted bases belong to the reference, not the read. Similarly soft-clipped bases are counted in the distance.</p> 32 * <p>This annotation is useful for filtering alignment artifacts.</p> 33 */ 34 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Median distance of variant starts from ends of reads supporting each allele (MPOS)") 35 public class ReadPosition extends PerAlleleAnnotation implements StandardMutectAnnotation { 36 37 // we don't want a GGA mode allele with no reads to prejudice us against a site so we assign a non-suspicious value 38 private static final int VALUE_FOR_NO_READS = 50; 39 40 @Override aggregate(final List<Integer> values)41 protected int aggregate(final List<Integer> values) { 42 return values.isEmpty() ? VALUE_FOR_NO_READS : MathUtils.median(Ints.toArray(values)); 43 } 44 45 @Override getVcfKey()46 protected String getVcfKey() { return GATKVCFConstants.MEDIAN_READ_POSITON_KEY; } 47 48 @Override getDescription()49 protected String getDescription() { return "median distance from end of read"; } 50 51 @Override getValueForRead(final GATKRead read, final VariantContext vc)52 protected OptionalInt getValueForRead(final GATKRead read, final VariantContext vc) { 53 if (vc.getStart() < read.getStart() || read.getEnd() < vc.getStart()) { 54 return OptionalInt.empty(); 55 } 56 final OptionalDouble valueAsDouble = ReadPosRankSumTest.getReadPosition(read, vc); 57 return valueAsDouble.isPresent() ? OptionalInt.of((int) valueAsDouble.getAsDouble()) : OptionalInt.empty(); 58 } 59 } 60