1 package org.broadinstitute.hellbender.tools.walkers.variantutils; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import htsjdk.variant.variantcontext.*; 5 import htsjdk.variant.variantcontext.writer.VariantContextWriter; 6 import htsjdk.variant.vcf.*; 7 import org.apache.commons.lang3.tuple.Pair; 8 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; 9 import org.broadinstitute.barclay.argparser.Hidden; 10 import org.broadinstitute.barclay.help.DocumentedFeature; 11 import org.broadinstitute.barclay.argparser.Argument; 12 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; 13 import org.broadinstitute.hellbender.engine.FeatureContext; 14 import org.broadinstitute.hellbender.engine.GATKPath; 15 import org.broadinstitute.hellbender.engine.ReadsContext; 16 import org.broadinstitute.hellbender.engine.ReferenceContext; 17 import org.broadinstitute.hellbender.engine.VariantWalker; 18 import org.broadinstitute.hellbender.utils.IndexRange; 19 import org.broadinstitute.hellbender.utils.SimpleInterval; 20 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 21 import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; 22 import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; 23 import org.broadinstitute.hellbender.utils.variant.VcfUtils; 24 import picard.cmdline.programgroups.VariantManipulationProgramGroup; 25 import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod; 26 import org.broadinstitute.hellbender.utils.read.AlignmentUtils; 27 28 import java.nio.file.Path; 29 import java.util.*; 30 import java.util.stream.Collectors; 31 import java.util.stream.IntStream; 32 33 /** 34 * Left-align indels in a variant callset 35 * 36 * <p> 37 * This tool takes a VCF file, left-aligns the indels and trims common bases from indels, 38 * leaving them with a minimum representation. The same indel can often be placed at multiple positions and still 39 * represent the same haplotype. While the standard convention with VCF is to place an indel at the left-most position 40 * this isn't always done, so this tool can be used to left-align them. This tool optionally splits multiallelic 41 * sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from indels. 42 * </p> 43 * 44 * <h3>Input</h3> 45 * <p> 46 * A variant call set to left-align and trim. 47 * </p> 48 * 49 * <h3>Output</h3> 50 * <p> 51 * A left-aligned VCF. 52 * </p> 53 * 54 * <h3>Usage examples</h3> 55 * 56 * <h4>Left align and trim alleles</h4> 57 * <pre> 58 * gatk LeftAlignAndTrimVariants \ 59 * -R reference.fasta \ 60 * -V input.vcf \ 61 * -O output.vcf 62 * </pre> 63 * 64 * <h4>Left align and don't trim alleles</h4> 65 * <pre> 66 * gatk LeftAlignAndTrimVariants \ 67 * -R reference.fasta \ 68 * -V input.vcf \ 69 * -O output.vcf \ 70 * --dont-trim-alleles 71 * </pre> 72 * 73 * <h4>Left align and trim alleles, process alleles <= 208 bases</h4> 74 * <pre> 75 * gatk LeftAlignAndTrimVariants \ 76 * -R reference.fasta \ 77 * -V input.vcf \ 78 * -O output.vcf \ 79 * --max-indel-length 208 80 * </pre> 81 * <h4>Split multiallics into biallelics, left align and trim alleles</h4> 82 * <pre> 83 * gatk LeftAlignAndTrimVariants \ 84 * -R reference.fasta \ 85 * -V input.vcf \ 86 * -O output.vcf \ 87 * --split-multi-allelics 88 * </pre> 89 * 90 * <h4>Split multiallelics into biallics, left align but don't trim alleles, and store the original AC, AF, and AN values</h4> 91 * <pre> 92 * gatk LeftAlignAndTrimVariants \ 93 * -R reference.fasta \ 94 * -V input.vcf \ 95 * -O output.vcf \ 96 * --split-multi-allelics \ 97 * --dont-trim-alleles 98 * --keep-original-ac 99 * </pre> 100 * <h4> Left align variants up to 2000 bases to the left (default is at most left aligning 1000 bases to left)</h4> 101 * <pre> 102 * gatk LeftAlignAndTrimVariants \ 103 * -R reference.fasta \ 104 * -V input.vcf \ 105 * -O output.vcf \ 106 * --max-leading-bases 2000 107 * </pre> 108 */ 109 @CommandLineProgramProperties( 110 summary = "This tool takes a VCF file, left-aligns the indels and trims common bases from indels," + 111 "leaving them with a minimum representation. The same indel can often be placed at multiple positions and still" + 112 "represent the same haplotype. While the standard convention with VCF is to place an indel at the left-most position" + 113 "this isn't always done, so this tool can be used to left-align them. This tool optionally splits multiallelic" + 114 "sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from indels.", 115 oneLineSummary = "Left align and trim vairants", 116 programGroup = VariantManipulationProgramGroup.class 117 ) 118 @DocumentedFeature 119 public class LeftAlignAndTrimVariants extends VariantWalker { 120 121 public static final String DONT_TRIM_ALLELES_LONG_NAME = "dont-trim-alleles"; 122 public static final String DONT_TRIM_ALLELES_SHORT_NAME = "no-trim"; 123 public static final String SPLIT_MULTIALLELEICS_LONG_NAME = "split-multi-allelics"; 124 public static final String KEEP_ORIGINAL_AC_LONG_NAME = "keep-original-ac"; 125 public static final String MAX_INDEL_LENGTH_LONG_NAME = "max-indel-length"; 126 127 public static final int DEFAULT_MAX_LEADING_BASES= 1000; 128 129 @VisibleForTesting 130 static final int DEFAULT_MAX_INDEL_SIZE = 200; 131 132 public static final String MAX_LEADING_BASES_LONG_NAME = "max-leading-bases"; 133 /** 134 * Output file to which to write left aligned variants 135 */ 136 @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, 137 shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, 138 doc = "File to which variants should be written") 139 public GATKPath outFile = null; 140 /** 141 * If this argument is set, bases common to all alleles will not be removed and will not leave their minimal representation. 142 */ 143 @Argument(fullName = DONT_TRIM_ALLELES_LONG_NAME, shortName = DONT_TRIM_ALLELES_SHORT_NAME, doc = "Do not Trim alleles to remove bases common to all of them", optional = true) 144 protected boolean dontTrimAlleles = false; 145 146 /** 147 * If this argument is set, split multiallelic records and left-align individual alleles. 148 * If this argument is not set, multiallelic records are not attempted to left-align and will be copied as is. 149 */ 150 @Argument(fullName = SPLIT_MULTIALLELEICS_LONG_NAME, doc = "Split multiallelic records and left-align individual alleles", optional = true) 151 protected boolean splitMultiallelics = false; 152 153 /** 154 * When subsetting a callset, this tool recalculates the AC, AF, and AN values corresponding to the contents of the 155 * subset. If this flag is enabled, the original values of those annotations will be stored in new annotations called 156 * AC_Orig, AF_Orig, and AN_Orig. 157 */ 158 @Argument(fullName = KEEP_ORIGINAL_AC_LONG_NAME, doc = "Store the original AC, AF, and AN values after subsetting", optional = true) 159 private boolean keepOriginalChrCounts = false; 160 161 /** 162 * Maximum indel size to realign. Indels larger than this will be left unadjusted. 163 */ 164 @Argument(fullName = MAX_INDEL_LENGTH_LONG_NAME, doc = "Set maximum indel size to realign", optional = true) 165 protected int maxIndelSize = DEFAULT_MAX_INDEL_SIZE; 166 167 /** 168 * Distance in reference to look back before allele 169 */ 170 @Argument(fullName = MAX_LEADING_BASES_LONG_NAME, doc = "Set max reference window size to look back before allele", optional = true) 171 protected int maxLeadingBases = DEFAULT_MAX_LEADING_BASES; 172 173 @Hidden 174 @Argument(fullName = "suppress-reference-path", optional = true, 175 doc = "Suppress reference path in output for test result differencing") 176 private boolean suppressReferencePath = false; 177 178 private VariantContextWriter vcfWriter = null; 179 private VCFHeader vcfHeader = null; 180 181 VariantContext lastVariant; 182 183 @Override onTraversalStart()184 public void onTraversalStart() { 185 final Map<String, VCFHeader> vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants()); 186 final SortedSet<String> vcfSamples = VcfUtils.getSortedSampleSet(vcfHeaders, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); 187 188 // Initialize VCF header lines 189 final Path refPath = referenceArguments.getReferencePath(); 190 final Set<VCFHeaderLine> actualLines = VcfUtils.updateHeaderContigLines(createVCFHeaderLineList(vcfHeaders), refPath, getReferenceDictionary(), suppressReferencePath); 191 192 vcfWriter = createVCFWriter(outFile); 193 vcfHeader = new VCFHeader(actualLines, vcfSamples); 194 vcfWriter.writeHeader(vcfHeader); 195 } 196 197 /** 198 * Prepare the VCF header lines 199 */ createVCFHeaderLineList(Map<String, VCFHeader> vcfHeaders)200 private Set<VCFHeaderLine> createVCFHeaderLineList(Map<String, VCFHeader> vcfHeaders) { 201 final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfHeaders.values(), true); 202 headerLines.addAll(getDefaultToolVCFHeaderLines()); 203 204 if (splitMultiallelics) { 205 GATKVariantContextUtils.addChromosomeCountsToHeader(headerLines); 206 } 207 208 if (keepOriginalChrCounts) { 209 headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AC_KEY)); 210 headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AF_KEY)); 211 headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AN_KEY)); 212 } 213 return headerLines; 214 } 215 216 217 /** 218 * Left aligns variants in repetitive regions. Also trims alleles and splits multiallelics to biallelics, if desired 219 */ 220 @Override apply(VariantContext vc, ReadsContext readsContext, ReferenceContext ref, FeatureContext featureContext)221 public void apply(VariantContext vc, ReadsContext readsContext, ReferenceContext ref, FeatureContext featureContext) { 222 final List<VariantContext> vcList; 223 if (splitMultiallelics) { 224 if (vc.getGenotypes().stream().anyMatch(g -> g.hasAnyAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY))) { 225 vcList = GATKVariantContextUtils.splitSomaticVariantContextToBiallelics(vc, false, vcfHeader); 226 } else { 227 vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc, false, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, keepOriginalChrCounts); 228 } 229 } else { 230 vcList = Collections.singletonList(vc); 231 } 232 233 for (final VariantContext splitVariant : vcList) { 234 final List<Integer> indelLengths = splitVariant.getIndelLengths(); 235 final int indelLength = indelLengths == null ? 0 : indelLengths.stream().map(Math::abs).max(Integer::compareTo).orElse(0); 236 237 if (indelLength > maxIndelSize) { 238 logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --max-indel-length >= %d", 239 "Indel is too long", indelLength, splitVariant.getContig(), splitVariant.getStart(), indelLength)); 240 lastVariant = splitVariant; 241 vcfWriter.add(splitVariant); 242 } else { 243 final int distanceToLastVariant = (lastVariant != null && splitVariant.contigsMatch(lastVariant)) ? splitVariant.getStart() - lastVariant.getEnd() : Integer.MAX_VALUE; 244 lastVariant = leftAlignAndTrim(splitVariant, ref, Math.min(maxLeadingBases, distanceToLastVariant - 1), !dontTrimAlleles); 245 vcfWriter.add(lastVariant); 246 } 247 } 248 } 249 250 /** 251 * Reference is required for left alignment 252 */ 253 @Override requiresReference()254 public boolean requiresReference() { 255 return true; 256 } 257 258 /** 259 * Print out message of how many variants were realigned 260 * 261 * @return 262 */ 263 @Override onTraversalSuccess()264 public Object onTraversalSuccess() { 265 return "SUCCESS"; 266 } 267 268 /** 269 * Close out the new variants file. 270 */ 271 @Override closeTool()272 public void closeTool() { 273 if (vcfWriter != null) { 274 vcfWriter.close(); 275 } 276 } 277 278 /** 279 * Main routine workhorse. By definition, it will only take biallelic vc's. Splitting into multiple alleles has to be 280 * handled by calling routine. 281 * 282 * @param vc Input VC with variants to left align 283 * @param ref Reference context 284 * @return new VC. 285 */ 286 @VisibleForTesting leftAlignAndTrim(final VariantContext vc, final ReferenceContext ref, final int maxLeadingBases, final boolean trim)287 static VariantContext leftAlignAndTrim(final VariantContext vc, final ReferenceContext ref, final int maxLeadingBases, final boolean trim) { 288 if (!vc.isIndel() || maxLeadingBases <= 0) { 289 return vc; 290 } 291 292 293 for(int leadingBases = Math.min(maxLeadingBases, 10); leadingBases <= maxLeadingBases; leadingBases = Math.min(2*leadingBases, maxLeadingBases)) { 294 final int refStart = Math.max(vc.getStart() - leadingBases, 1); 295 296 // reference sequence starting before the variant (to give space for left-alignment) and ending at the variant end 297 final byte[] refSeq = ref.getBases(new SimpleInterval(vc.getContig(), refStart, vc.getEnd())); 298 299 final int variantOffsetInRef = vc.getStart() - refStart; 300 301 final List<byte[]> sequences = vc.getAlleles().stream().map(a -> { 302 final byte[] result = new byte[variantOffsetInRef + a.length()]; 303 System.arraycopy(refSeq, 0, result, 0, variantOffsetInRef); 304 System.arraycopy(a.getBases(), 0, result, variantOffsetInRef, a.length()); 305 return result; 306 }).collect(Collectors.toList()); 307 308 final List<IndexRange> alleleRanges = vc.getAlleles().stream() 309 .map(a -> new IndexRange(variantOffsetInRef + 1, variantOffsetInRef + a.length())) // +1 to ignore the shared base in front 310 .collect(Collectors.toList()); 311 312 // note that this also shifts the index ranges as a side effect, so below they can be used to output allele bases 313 final Pair<Integer, Integer> shifts = AlignmentUtils.normalizeAlleles(sequences, alleleRanges, variantOffsetInRef, trim); 314 315 if (shifts.getLeft() == 0 && shifts.getRight() == 0) { 316 return vc; 317 } else if (shifts.getLeft() == variantOffsetInRef && leadingBases < maxLeadingBases) { 318 continue; 319 } 320 321 final Map<Allele, Allele> alleleMap = IntStream.range(0, alleleRanges.size()).boxed() 322 .collect(Collectors.toMap( 323 n -> vc.getAlleles().get(n), 324 n -> Allele.create(Arrays.copyOfRange(sequences.get(n), variantOffsetInRef - shifts.getLeft(), variantOffsetInRef - shifts.getRight() + vc.getAlleles().get(n).length()), n == 0))); 325 326 final GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); 327 for (final Genotype genotype : vc.getGenotypes()) { 328 final List<Allele> newAlleles = genotype.getAlleles().stream().map(a -> alleleMap.getOrDefault(a, Allele.NO_CALL)).collect(Collectors.toList()); 329 newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); 330 } 331 332 return new VariantContextBuilder(vc).start(vc.getStart() - shifts.getLeft()).stop(vc.getEnd() - shifts.getRight()) 333 .alleles(alleleMap.values()).genotypes(newGenotypes).make(); 334 } 335 336 return vc; 337 } 338 } 339