1 package org.broadinstitute.hellbender.tools.walkers.variantutils;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.variant.variantcontext.*;
5 import htsjdk.variant.variantcontext.writer.VariantContextWriter;
6 import htsjdk.variant.vcf.*;
7 import org.apache.commons.lang3.tuple.Pair;
8 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
9 import org.broadinstitute.barclay.argparser.Hidden;
10 import org.broadinstitute.barclay.help.DocumentedFeature;
11 import org.broadinstitute.barclay.argparser.Argument;
12 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
13 import org.broadinstitute.hellbender.engine.FeatureContext;
14 import org.broadinstitute.hellbender.engine.GATKPath;
15 import org.broadinstitute.hellbender.engine.ReadsContext;
16 import org.broadinstitute.hellbender.engine.ReferenceContext;
17 import org.broadinstitute.hellbender.engine.VariantWalker;
18 import org.broadinstitute.hellbender.utils.IndexRange;
19 import org.broadinstitute.hellbender.utils.SimpleInterval;
20 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
21 import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
22 import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
23 import org.broadinstitute.hellbender.utils.variant.VcfUtils;
24 import picard.cmdline.programgroups.VariantManipulationProgramGroup;
25 import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
26 import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
27 
28 import java.nio.file.Path;
29 import java.util.*;
30 import java.util.stream.Collectors;
31 import java.util.stream.IntStream;
32 
33 /**
34  * Left-align indels in a variant callset
35  *
36  * <p>
37  * This tool takes a VCF file, left-aligns the indels and trims common bases from indels,
38  * leaving them with a minimum representation. The same indel can often be placed at multiple positions and still
39  * represent the same haplotype. While the standard convention with VCF is to place an indel at the left-most position
40  * this isn't always done, so this tool can be used to left-align them. This tool optionally splits multiallelic
41  * sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from indels.
42  * </p>
43  *
44  * <h3>Input</h3>
45  * <p>
46  * A variant call set to left-align and trim.
47  * </p>
48  *
49  * <h3>Output</h3>
50  * <p>
51  * A left-aligned VCF.
52  * </p>
53  *
54  * <h3>Usage examples</h3>
55  *
56  * <h4>Left align and trim alleles</h4>
57  * <pre>
58  * gatk LeftAlignAndTrimVariants \
59  *   -R reference.fasta \
60  *   -V input.vcf \
61  *   -O output.vcf
62  * </pre>
63  *
64  * <h4>Left align and don't trim alleles</h4>
65  * <pre>
66  * gatk LeftAlignAndTrimVariants \
67  *   -R reference.fasta \
68  *   -V input.vcf \
69  *   -O output.vcf \
70  *   --dont-trim-alleles
71  * </pre>
72  *
73  * <h4>Left align and trim alleles, process alleles <= 208 bases</h4>
74  * <pre>
75  * gatk LeftAlignAndTrimVariants \
76  *   -R reference.fasta \
77  *   -V input.vcf \
78  *   -O output.vcf \
79  *   --max-indel-length 208
80  * </pre>
81  * <h4>Split multiallics into biallelics, left align and trim alleles</h4>
82  * <pre>
83  * gatk LeftAlignAndTrimVariants \
84  *   -R reference.fasta \
85  *   -V input.vcf \
86  *   -O output.vcf \
87  *   --split-multi-allelics
88  * </pre>
89  *
90  * <h4>Split multiallelics into biallics, left align but don't trim alleles, and store the original AC, AF, and AN values</h4>
91  * <pre>
92  * gatk LeftAlignAndTrimVariants \
93  *   -R reference.fasta \
94  *   -V input.vcf \
95  *   -O output.vcf \
96  *   --split-multi-allelics \
97  *   --dont-trim-alleles
98  *   --keep-original-ac
99  * </pre>
100  * <h4> Left align variants up to 2000 bases to the left (default is at most left aligning 1000 bases to left)</h4>
101  * <pre>
102  * gatk LeftAlignAndTrimVariants \
103  *   -R reference.fasta \
104  *   -V input.vcf \
105  *   -O output.vcf \
106  *   --max-leading-bases 2000
107  * </pre>
108  */
109 @CommandLineProgramProperties(
110         summary = "This tool takes a VCF file, left-aligns the indels and trims common bases from indels," +
111                 "leaving them with a minimum representation. The same indel can often be placed at multiple positions and still" +
112                 "represent the same haplotype. While the standard convention with VCF is to place an indel at the left-most position" +
113                 "this isn't always done, so this tool can be used to left-align them. This tool optionally splits multiallelic" +
114                 "sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from indels.",
115         oneLineSummary = "Left align and trim vairants",
116         programGroup = VariantManipulationProgramGroup.class
117 )
118 @DocumentedFeature
119 public class LeftAlignAndTrimVariants extends VariantWalker {
120 
121     public static final String DONT_TRIM_ALLELES_LONG_NAME = "dont-trim-alleles";
122     public static final String DONT_TRIM_ALLELES_SHORT_NAME = "no-trim";
123     public static final String SPLIT_MULTIALLELEICS_LONG_NAME = "split-multi-allelics";
124     public static final String KEEP_ORIGINAL_AC_LONG_NAME = "keep-original-ac";
125     public static final String MAX_INDEL_LENGTH_LONG_NAME = "max-indel-length";
126 
127     public static final int DEFAULT_MAX_LEADING_BASES= 1000;
128 
129     @VisibleForTesting
130     static final int DEFAULT_MAX_INDEL_SIZE = 200;
131 
132     public static final String MAX_LEADING_BASES_LONG_NAME = "max-leading-bases";
133     /**
134      * Output file to which to write left aligned variants
135      */
136     @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
137             shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
138             doc = "File to which variants should be written")
139     public GATKPath outFile = null;
140     /**
141      * If this argument is set, bases common to all alleles will not be removed and will not leave their minimal representation.
142      */
143     @Argument(fullName = DONT_TRIM_ALLELES_LONG_NAME, shortName = DONT_TRIM_ALLELES_SHORT_NAME, doc = "Do not Trim alleles to remove bases common to all of them", optional = true)
144     protected boolean dontTrimAlleles = false;
145 
146     /**
147      * If this argument is set, split multiallelic records and left-align individual alleles.
148      * If this argument is not set, multiallelic records are not attempted to left-align and will be copied as is.
149      */
150     @Argument(fullName = SPLIT_MULTIALLELEICS_LONG_NAME, doc = "Split multiallelic records and left-align individual alleles", optional = true)
151     protected boolean splitMultiallelics = false;
152 
153     /**
154      * When subsetting a callset, this tool recalculates the AC, AF, and AN values corresponding to the contents of the
155      * subset. If this flag is enabled, the original values of those annotations will be stored in new annotations called
156      * AC_Orig, AF_Orig, and AN_Orig.
157      */
158     @Argument(fullName = KEEP_ORIGINAL_AC_LONG_NAME, doc = "Store the original AC, AF, and AN values after subsetting", optional = true)
159     private boolean keepOriginalChrCounts = false;
160 
161     /**
162      * Maximum indel size to realign.  Indels larger than this will be left unadjusted.
163      */
164     @Argument(fullName = MAX_INDEL_LENGTH_LONG_NAME, doc = "Set maximum indel size to realign", optional = true)
165     protected int maxIndelSize = DEFAULT_MAX_INDEL_SIZE;
166 
167     /**
168      * Distance in reference to look back before allele
169      */
170     @Argument(fullName = MAX_LEADING_BASES_LONG_NAME, doc = "Set max reference window size to look back before allele", optional = true)
171     protected int maxLeadingBases = DEFAULT_MAX_LEADING_BASES;
172 
173     @Hidden
174     @Argument(fullName = "suppress-reference-path", optional = true,
175             doc = "Suppress reference path in output for test result differencing")
176     private boolean suppressReferencePath = false;
177 
178     private VariantContextWriter vcfWriter = null;
179     private VCFHeader vcfHeader = null;
180 
181     VariantContext lastVariant;
182 
183     @Override
onTraversalStart()184     public void onTraversalStart() {
185         final Map<String, VCFHeader> vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants());
186         final SortedSet<String> vcfSamples = VcfUtils.getSortedSampleSet(vcfHeaders, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
187 
188         // Initialize VCF header lines
189         final Path refPath = referenceArguments.getReferencePath();
190         final Set<VCFHeaderLine> actualLines = VcfUtils.updateHeaderContigLines(createVCFHeaderLineList(vcfHeaders), refPath, getReferenceDictionary(), suppressReferencePath);
191 
192         vcfWriter = createVCFWriter(outFile);
193         vcfHeader = new VCFHeader(actualLines, vcfSamples);
194         vcfWriter.writeHeader(vcfHeader);
195     }
196 
197     /**
198      * Prepare the VCF header lines
199      */
createVCFHeaderLineList(Map<String, VCFHeader> vcfHeaders)200     private Set<VCFHeaderLine> createVCFHeaderLineList(Map<String, VCFHeader> vcfHeaders) {
201         final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfHeaders.values(), true);
202         headerLines.addAll(getDefaultToolVCFHeaderLines());
203 
204         if (splitMultiallelics) {
205             GATKVariantContextUtils.addChromosomeCountsToHeader(headerLines);
206         }
207 
208         if (keepOriginalChrCounts) {
209             headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AC_KEY));
210             headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AF_KEY));
211             headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AN_KEY));
212         }
213         return headerLines;
214     }
215 
216 
217     /**
218      * Left aligns variants in repetitive regions.  Also trims alleles and splits multiallelics to biallelics, if desired
219      */
220     @Override
apply(VariantContext vc, ReadsContext readsContext, ReferenceContext ref, FeatureContext featureContext)221     public void apply(VariantContext vc, ReadsContext readsContext, ReferenceContext ref, FeatureContext featureContext) {
222         final List<VariantContext> vcList;
223         if (splitMultiallelics) {
224             if (vc.getGenotypes().stream().anyMatch(g -> g.hasAnyAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY))) {
225                 vcList = GATKVariantContextUtils.splitSomaticVariantContextToBiallelics(vc, false, vcfHeader);
226             } else {
227                 vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc, false, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, keepOriginalChrCounts);
228             }
229         } else {
230             vcList = Collections.singletonList(vc);
231         }
232 
233         for (final VariantContext splitVariant : vcList) {
234             final List<Integer> indelLengths = splitVariant.getIndelLengths();
235             final int indelLength = indelLengths == null ? 0 : indelLengths.stream().map(Math::abs).max(Integer::compareTo).orElse(0);
236 
237             if (indelLength > maxIndelSize) {
238                 logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --max-indel-length >= %d",
239                         "Indel is too long", indelLength, splitVariant.getContig(), splitVariant.getStart(), indelLength));
240                 lastVariant = splitVariant;
241                 vcfWriter.add(splitVariant);
242             } else {
243                 final int distanceToLastVariant = (lastVariant != null && splitVariant.contigsMatch(lastVariant)) ? splitVariant.getStart() - lastVariant.getEnd() : Integer.MAX_VALUE;
244                 lastVariant = leftAlignAndTrim(splitVariant, ref, Math.min(maxLeadingBases, distanceToLastVariant - 1), !dontTrimAlleles);
245                 vcfWriter.add(lastVariant);
246             }
247         }
248     }
249 
250     /**
251      * Reference is required for left alignment
252      */
253     @Override
requiresReference()254     public boolean requiresReference() {
255         return true;
256     }
257 
258     /**
259      * Print out message of how many variants were realigned
260      *
261      * @return
262      */
263     @Override
onTraversalSuccess()264     public Object onTraversalSuccess() {
265         return "SUCCESS";
266     }
267 
268     /**
269      * Close out the new variants file.
270      */
271     @Override
closeTool()272     public void closeTool() {
273         if (vcfWriter != null) {
274             vcfWriter.close();
275         }
276     }
277 
278     /**
279      * Main routine workhorse. By definition, it will only take biallelic vc's. Splitting into multiple alleles has to be
280      * handled by calling routine.
281      *
282      * @param vc  Input VC with variants to left align
283      * @param ref Reference context
284      * @return new VC.
285      */
286     @VisibleForTesting
leftAlignAndTrim(final VariantContext vc, final ReferenceContext ref, final int maxLeadingBases, final boolean trim)287     static VariantContext leftAlignAndTrim(final VariantContext vc, final ReferenceContext ref, final int maxLeadingBases, final boolean trim) {
288         if (!vc.isIndel() || maxLeadingBases <= 0) {
289             return vc;
290         }
291 
292 
293         for(int leadingBases = Math.min(maxLeadingBases, 10); leadingBases <= maxLeadingBases; leadingBases = Math.min(2*leadingBases, maxLeadingBases)) {
294             final int refStart = Math.max(vc.getStart() - leadingBases, 1);
295 
296             // reference sequence starting before the variant (to give space for left-alignment) and ending at the variant end
297             final byte[] refSeq = ref.getBases(new SimpleInterval(vc.getContig(), refStart, vc.getEnd()));
298 
299             final int variantOffsetInRef = vc.getStart() - refStart;
300 
301             final List<byte[]> sequences = vc.getAlleles().stream().map(a -> {
302                 final byte[] result = new byte[variantOffsetInRef + a.length()];
303                 System.arraycopy(refSeq, 0, result, 0, variantOffsetInRef);
304                 System.arraycopy(a.getBases(), 0, result, variantOffsetInRef, a.length());
305                 return result;
306             }).collect(Collectors.toList());
307 
308             final List<IndexRange> alleleRanges = vc.getAlleles().stream()
309                     .map(a -> new IndexRange(variantOffsetInRef + 1, variantOffsetInRef + a.length()))  // +1 to ignore the shared base in front
310                     .collect(Collectors.toList());
311 
312             // note that this also shifts the index ranges as a side effect, so below they can be used to output allele bases
313             final Pair<Integer, Integer> shifts = AlignmentUtils.normalizeAlleles(sequences, alleleRanges, variantOffsetInRef, trim);
314 
315             if (shifts.getLeft() == 0 && shifts.getRight() == 0) {
316                 return vc;
317             } else if (shifts.getLeft() == variantOffsetInRef && leadingBases < maxLeadingBases) {
318                 continue;
319             }
320 
321             final Map<Allele, Allele> alleleMap = IntStream.range(0, alleleRanges.size()).boxed()
322                     .collect(Collectors.toMap(
323                             n -> vc.getAlleles().get(n),
324                             n -> Allele.create(Arrays.copyOfRange(sequences.get(n), variantOffsetInRef - shifts.getLeft(), variantOffsetInRef - shifts.getRight() + vc.getAlleles().get(n).length()), n == 0)));
325 
326             final GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples());
327             for (final Genotype genotype : vc.getGenotypes()) {
328                 final List<Allele> newAlleles = genotype.getAlleles().stream().map(a -> alleleMap.getOrDefault(a, Allele.NO_CALL)).collect(Collectors.toList());
329                 newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make());
330             }
331 
332             return new VariantContextBuilder(vc).start(vc.getStart() - shifts.getLeft()).stop(vc.getEnd() - shifts.getRight())
333                     .alleles(alleleMap.values()).genotypes(newGenotypes).make();
334         }
335 
336         return vc;
337     }
338 }
339