1 package org.broadinstitute.hellbender.utils.variant;
2 
3 import htsjdk.samtools.SAMSequenceDictionary;
4 import htsjdk.samtools.util.Locatable;
5 import htsjdk.tribble.TribbleException;
6 import htsjdk.utils.ValidationUtils;
7 import htsjdk.variant.variantcontext.*;
8 import htsjdk.variant.variantcontext.writer.Options;
9 import htsjdk.variant.variantcontext.writer.VariantContextWriter;
10 import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
11 import htsjdk.variant.vcf.*;
12 import org.apache.commons.io.FilenameUtils;
13 import org.apache.commons.lang3.ArrayUtils;
14 import org.apache.commons.lang3.tuple.MutablePair;
15 import org.apache.commons.lang3.tuple.Pair;
16 import org.apache.logging.log4j.LogManager;
17 import org.apache.logging.log4j.Logger;
18 import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
19 import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.StrandBiasUtils;
20 import org.broadinstitute.hellbender.exceptions.GATKException;
21 import org.broadinstitute.hellbender.utils.genotyper.GenotypePriorCalculator;
22 import org.broadinstitute.hellbender.tools.walkers.genotyper.*;
23 import org.broadinstitute.hellbender.utils.*;
24 import org.broadinstitute.hellbender.utils.param.ParamUtils;
25 import org.broadinstitute.hellbender.utils.pileup.PileupElement;
26 import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
27 
28 import java.io.Serializable;
29 import java.nio.file.Path;
30 import java.util.*;
31 import java.util.function.BiFunction;
32 import java.util.stream.Collectors;
33 import java.util.stream.IntStream;
34 
35 public final class GATKVariantContextUtils {
36 
    private static final Logger logger = LogManager.getLogger(GATKVariantContextUtils.class);

    // NOTE(review): names suggest these are filter/attribute markers written by VC merging code
    // elsewhere in this class — confirm against the merge utilities before relying on the semantics.
    public static final String MERGE_FILTER_PREFIX = "filterIn";
    public static final String MERGE_REF_IN_ALL = "ReferenceInAlln";
    public static final String MERGE_FILTER_IN_ALL = "FilteredInAll";
    public static final String MERGE_INTERSECTION = "Intersection";

    // Default ploidy delegated to the human constant (2).
    public static final int DEFAULT_PLOIDY = HomoSapiensConstants.DEFAULT_PLOIDY;

    // Provides GenotypeLikelihoodCalculator instances for a given (ploidy, allele count);
    // see the GL_CALCS.getInstance(...) call sites in makeGenotypeCall below.
    private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();

    public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
49 
isInformative(final double[] gls)50     public static boolean isInformative(final double[] gls) {
51         return MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL;
52     }
53 
54     /**
55      * A method that constructs a mutable set of VCF header lines containing the tool name, version, date and command line,
56      * and return to the requesting tool.
57      * @param toolkitShortName  short name for the tool kit, e.g. "gatk"
58      * @param toolName          name of the tool
59      * @param versionString     version of the tool
60      * @param dateTime          date and time at invocation of the tool
61      * @param cmdLine           command line (e.g. options, flags) when the tool is invoked.
62      * @return A mutable set of VCF header lines containing the tool name, version, date and command line.
63      */
getDefaultVCFHeaderLines(final String toolkitShortName, final String toolName, final String versionString, final String dateTime, final String cmdLine)64     public static Set<VCFHeaderLine> getDefaultVCFHeaderLines(final String toolkitShortName, final String toolName,
65                                                               final String versionString, final String dateTime,
66                                                               final String cmdLine) {
67         final Set<VCFHeaderLine> defaultVCFHeaderLines = new HashSet<>();
68         final Map<String, String> simpleHeaderLineMap = new HashMap<>(4);
69         simpleHeaderLineMap.put("ID", toolName);
70         simpleHeaderLineMap.put("Version", versionString);
71         simpleHeaderLineMap.put("Date", dateTime);
72         simpleHeaderLineMap.put("CommandLine", cmdLine);
73         defaultVCFHeaderLines.add(new VCFHeaderLine("source", toolName));
74         defaultVCFHeaderLines.add(new VCFSimpleHeaderLine(String.format("%sCommandLine", toolkitShortName), simpleHeaderLineMap));
75         return defaultVCFHeaderLines;
76     }
77 
78     /**
79      * Creates a VariantContextWriter whose outputFile type is based on the extension of the output file name.
80      * The default options set by VariantContextWriter are cleared before applying ALLOW_MISSING_FIELDS_IN_HEADER (if
81      * <code>lenientProcessing</code> is set), followed by the set of options specified by any <code>options</code> args.
82      *
83      * @param outPath output Path for this writer. May not be null.
84      * @param referenceDictionary required if on the fly indexing is set, otherwise can be null
85      * @param createMD5 true if an md5 file should be created
86      * @param options variable length list of additional Options to be set for this writer
87      * @returns VariantContextWriter must be closed by the caller
88      */
createVCFWriter( final Path outPath, final SAMSequenceDictionary referenceDictionary, final boolean createMD5, final Options... options)89     public static VariantContextWriter createVCFWriter(
90             final Path outPath,
91             final SAMSequenceDictionary referenceDictionary,
92             final boolean createMD5,
93             final Options... options)
94     {
95         Utils.nonNull(outPath);
96 
97         VariantContextWriterBuilder vcWriterBuilder =
98                 new VariantContextWriterBuilder().clearOptions().setOutputPath(outPath);
99 
100         if (VariantContextWriterBuilder.OutputType.UNSPECIFIED == VariantContextWriterBuilder.determineOutputTypeFromFile(outPath)) {
101             // the only way the user has to specify an output type is by file extension, and htsjdk
102             // throws if it can't map the file extension to a known vcf type, so fallback to a default
103             // of VCF
104             logger.warn(String.format(
105                     "Can't determine output variant file format from output file extension \"%s\". Defaulting to VCF.",
106                     FilenameUtils.getExtension(outPath.getFileName().toString())));
107             vcWriterBuilder = vcWriterBuilder.setOutputFileType(VariantContextWriterBuilder.OutputType.VCF);
108         }
109 
110         if (createMD5) {
111             vcWriterBuilder.setCreateMD5();
112         }
113 
114         if (null != referenceDictionary) {
115             vcWriterBuilder = vcWriterBuilder.setReferenceDictionary(referenceDictionary);
116         }
117 
118         for (Options opt : options) {
119             vcWriterBuilder = vcWriterBuilder.setOption(opt);
120         }
121 
122         return vcWriterBuilder.build();
123     }
124 
    /**
     * Diploid NO_CALL allele list...
     *
     * Hardcodes ploidy 2; retained only for legacy call sites.
     *
     * @deprecated you should use {@link #noCallAlleles(int)} instead. It indicates the presence of a hardcoded diploid assumption which is bad.
     */
    @Deprecated
    public final static List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
132 
hasPLIncompatibleAlleles(final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2)133     private static boolean hasPLIncompatibleAlleles(final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2) {
134         final Iterator<Allele> it1 = alleleSet1.iterator();
135         final Iterator<Allele> it2 = alleleSet2.iterator();
136 
137         while ( it1.hasNext() && it2.hasNext() ) {
138             final Allele a1 = it1.next();
139             final Allele a2 = it2.next();
140             if ( ! a1.equals(a2) )
141                 return true;
142         }
143 
144         // by this point, at least one of the iterators is empty.  All of the elements
145         // we've compared are equal up until this point.  But it's possible that the
146         // sets aren't the same size, which is indicated by the test below.  If they
147         // are of the same size, though, the sets are compatible
148         return it1.hasNext() || it2.hasNext();
149     }
150 
151     /**
152      *  Does an allele match any in a list of alleles?
153      *  We don't assume that ref1/alt1 are in their minimal representation
154      * @param ref1
155      * @param alt1
156      * @param ref2
157      * @param altList
158      * @return
159      */
isAlleleInList(final Allele ref1, final Allele alt1, final Allele ref2, final List<Allele> altList)160     public static boolean isAlleleInList(final Allele ref1, final Allele alt1, final Allele ref2, final List<Allele> altList) {
161         final Allele commonRef;
162         if (ref1.equals(ref2)) {
163             return altList.contains(alt1);
164         } else {
165                 commonRef = determineReferenceAllele(ref1, ref2);
166         }
167         final Map<Allele, Allele> alleleMap;
168         if (ref1.equals(commonRef)) {
169             alleleMap = GATKVariantContextUtils.createAlleleMapping(commonRef, ref2, altList);
170             return alleleMap.values().contains(alt1);
171         } else if (ref2.equals(commonRef)) {
172             alleleMap = GATKVariantContextUtils.createAlleleMapping(commonRef, ref1, Arrays.asList(alt1));
173             return altList.contains(alleleMap.get(alt1));
174         } else {
175             throw new IllegalStateException("Reference alleles " + ref1 + " and " + ref2 + " have common reference allele " + commonRef + " which is equal to neither.");
176         }
177     }
178 
179     /**
180      * Determines the common reference allele
181      *
182      * @param VCs    the list of VariantContexts
183      * @param loc    if not null, ignore records that do not begin at this start location
184      * @return possibly null Allele
185      */
determineReferenceAllele(final List<VariantContext> VCs, final Locatable loc)186     public static Allele determineReferenceAllele(final List<VariantContext> VCs, final Locatable loc) {
187         Allele ref = null;
188 
189         for ( final VariantContext vc : VCs ) {
190             if ( contextMatchesLoc(vc, loc) ) {
191                 final Allele myRef = vc.getReference();
192                 try {
193                     ref = determineReferenceAllele(ref, myRef);
194                 } catch (IllegalStateException e) {
195                     throw new IllegalStateException(String.format("The provided variant file(s) have inconsistent references " +
196                             "for the same position(s) at %s:%d, %s vs. %s", vc.getContig(), vc.getStart(), ref, myRef));
197                 }
198             }
199         }
200         return ref;
201     }
202 
determineReferenceAllele(final Allele ref1, final Allele ref2)203     public static Allele determineReferenceAllele(final Allele ref1, final Allele ref2) {
204         if ( ref1 == null || ref1.length() < ref2.length() ) {
205             return ref2;
206         } else if ( ref2 == null || ref2.length() < ref1.length()) {
207             return ref1;
208         }
209         else if ( ref1.length() == ref2.length() && ! ref1.equals(ref2) ) {
210             throw new IllegalStateException(String.format("The provided reference alleles do not appear to represent the same position, %s vs. %s", ref1, ref2));
211         } else {  //the lengths are the same and they're equal, so we could return ref1 or ref2
212             return ref1;
213         }
214     }
215 
216     /**
217      * Calculates the total ploidy of a variant context as the sum of all plodies across genotypes.
218      * @param vc the target variant context.
219      * @param defaultPloidy the default ploidy to be assume when there is no ploidy information for a genotype.
220      * @return never {@code null}.
221      */
totalPloidy(final VariantContext vc, final int defaultPloidy)222     public static int totalPloidy(final VariantContext vc, final int defaultPloidy) {
223         Utils.nonNull(vc, "the vc provided cannot be null");
224         Utils.validateArg(defaultPloidy >= 0, "the default ploidy must 0 or greater");
225         return vc.getGenotypes().stream().mapToInt(Genotype::getPloidy)
226                 .map(p -> p <= 0 ? defaultPloidy : p).sum();
227     }
228 
229     /**
230      * Returns the type of the substitution (transition vs transversion) represented by this variant. Only applicable to bi allelic SNPs.
231      */
getSNPSubstitutionType(final VariantContext context)232     public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(final VariantContext context) {
233         Utils.nonNull(context);
234         if (!context.isSNP() || !context.isBiallelic()) {
235             throw new IllegalArgumentException("Requested SNP substitution type for bialleic non-SNP " + context);
236         }
237         return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]);
238     }
239 
240     /**
241      * If this is a BiAllelic SNP, is it a transition?
242      */
isTransition(final VariantContext context)243     public static boolean isTransition(final VariantContext context) {
244         Utils.nonNull(context);
245         return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION;
246     }
247 
248 
249     /**
250      * Returns a homozygous call allele list given the only allele and the ploidy.
251      *
252      * @param allele the only allele in the allele list.
253      * @param ploidy the ploidy of the resulting allele list.
254      *
255      * @throws IllegalArgumentException if {@code allele} is {@code null} or ploidy is negative.
256      *
257      * @return never {@code null}.
258      */
homozygousAlleleList(final Allele allele, final int ploidy)259     public static List<Allele> homozygousAlleleList(final Allele allele, final int ploidy) {
260         Utils.nonNull(allele);
261         Utils.validateArg(ploidy >= 0, "ploidy cannot be negative");
262 
263         // Use a tailored inner class to implement the list:
264         return Collections.nCopies(ploidy,allele);
265     }
266 
267     /**
268      * Add the genotype call (GT) field to GenotypeBuilder using the requested {@link GenotypeAssignmentMethod}
269      *
270      * @param gb the builder where we should put our newly called alleles, cannot be null
271      * @param assignmentMethod the method to use to do the assignment, cannot be null
272      * @param genotypeLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null
273      * @param allelesToUse the alleles with respect to which the likelihoods are defined
274      */
makeGenotypeCall(final int ploidy, final GenotypeBuilder gb, final GenotypeAssignmentMethod assignmentMethod, final double[] genotypeLikelihoods, final List<Allele> allelesToUse, final List<Allele> originalGT, final GenotypePriorCalculator gpc)275     public static void makeGenotypeCall(final int ploidy,
276                                         final GenotypeBuilder gb,
277                                         final GenotypeAssignmentMethod assignmentMethod,
278                                         final double[] genotypeLikelihoods,
279                                         final List<Allele> allelesToUse,
280                                         final List<Allele> originalGT,
281                                         final GenotypePriorCalculator gpc) {
282         if(originalGT == null && assignmentMethod == GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL) {
283             throw new IllegalArgumentException("original GT cannot be null if assignmentMethod is BEST_MATCH_TO_ORIGINAL");
284         }
285         if (assignmentMethod == GenotypeAssignmentMethod.SET_TO_NO_CALL) {
286             gb.alleles(noCallAlleles(ploidy)).noGQ();
287         } else if (assignmentMethod == GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN) {
288             if ( genotypeLikelihoods == null || !isInformative(genotypeLikelihoods) ) {
289                 gb.alleles(noCallAlleles(ploidy)).noGQ();
290             } else {
291                 final int maxLikelihoodIndex = MathUtils.maxElementIndex(genotypeLikelihoods);
292                 final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, allelesToUse.size());
293                 final GenotypeAlleleCounts alleleCounts = glCalc.genotypeAlleleCountsAt(maxLikelihoodIndex);
294 
295                 final List<Allele> finalAlleles = alleleCounts.asAlleleList(allelesToUse);
296                 if (finalAlleles.contains(Allele.NON_REF_ALLELE)) {
297                     gb.alleles(GATKVariantContextUtils.noCallAlleles(ploidy));
298                     gb.PL(new int[genotypeLikelihoods.length]);
299                 } else {
300                     gb.alleles(finalAlleles);
301                 }
302                 final int numAltAlleles = allelesToUse.size() - 1;
303                 if ( numAltAlleles > 0 ) {
304                     gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(maxLikelihoodIndex, genotypeLikelihoods));
305                 }
306             }
307         } else if (assignmentMethod == GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS) {
308             gb.alleles(noCallAlleles(ploidy)).noGQ().noAD().noPL().noAttributes();
309         } else if (assignmentMethod == GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL) {
310             final List<Allele> best = new LinkedList<>();
311             final Allele ref = allelesToUse.get(0);
312             for (final Allele originalAllele : originalGT) {
313                 best.add((allelesToUse.contains(originalAllele) || originalAllele.isNoCall()) ? originalAllele : ref);
314             }
315             gb.alleles(best);
316         } else if (assignmentMethod == GenotypeAssignmentMethod.USE_POSTERIOR_PROBABILITIES) {
317             if (gpc == null) {
318                 throw new GATKException("cannot uses posteriors without an genotype prior calculator present");
319             } else {
320                 // Calculate posteriors.
321                 final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, allelesToUse.size());
322                 final double[] log10Priors = gpc.getLog10Priors(glCalc, allelesToUse);
323                 final double[] log10Posteriors = MathUtils.ebeAdd(log10Priors, genotypeLikelihoods);
324                 final double[] normalizedLog10Posteriors = MathUtils.scaleLogSpaceArrayForNumericalStability(log10Posteriors);
325                 // Update GP and PG annotations:
326                 gb.attribute(VCFConstants.GENOTYPE_POSTERIORS_KEY, Arrays.stream(normalizedLog10Posteriors)
327                         .map(v -> v == 0.0 ? 0.0 : v * -10) // the reason for the == 0.0 is to avoid a signed 0 output "-0.0"
328                         .mapToObj(GATKVariantContextUtils::formatGP).toArray());
329                 gb.attribute(GATKVCFConstants.GENOTYPE_PRIOR_KEY, Arrays.stream(log10Priors)
330                         .map(v -> v == 0.0 ? 0.0 : v * -10)
331                         .mapToObj(GATKVariantContextUtils::formatGP).toArray());
332                 // Set the GQ accordingly
333                 final int maxPosteriorIndex = MathUtils.maxElementIndex(log10Posteriors);
334                 if ( allelesToUse.size() > 0 ) {
335                     gb.log10PError(getGQLog10FromPosteriors(maxPosteriorIndex, normalizedLog10Posteriors));
336                 }
337                 // Finally we update the genotype alleles.
338                 gb.alleles(glCalc.genotypeAlleleCountsAt(maxPosteriorIndex).asAlleleList(allelesToUse));
339             }
340         }
341     }
342 
getGQLog10FromPosteriors(final int bestGenotypeIndex, final double[] log10Posteriors)343     private static double getGQLog10FromPosteriors(final int bestGenotypeIndex, final double[] /**/log10Posteriors) {
344         if (bestGenotypeIndex < 0) {
345             return CommonInfo.NO_LOG10_PERROR;
346         } else {
347             switch (log10Posteriors.length) {
348                 case 0:
349                 case 1: return CommonInfo.NO_LOG10_PERROR;
350                 case 2: return bestGenotypeIndex == 0 ? log10Posteriors[1] : log10Posteriors[0];
351                 case 3: return Math.min(0, MathUtils.log10SumLog10(
352                                  log10Posteriors[ bestGenotypeIndex == 0 ? 2 : bestGenotypeIndex - 1],
353                                  log10Posteriors[ bestGenotypeIndex == 2 ? 0 : bestGenotypeIndex + 1]));
354                 default:
355                     if (bestGenotypeIndex == 0) {
356                         return MathUtils.log10SumLog10(log10Posteriors, 1, log10Posteriors.length);
357                     } else if (bestGenotypeIndex == log10Posteriors.length - 1) {
358                         return MathUtils.log10SumLog10(log10Posteriors, 0, bestGenotypeIndex);
359                     } else {
360                         return Math.min(0.0, MathUtils.log10SumLog10(
361                                 MathUtils.log10sumLog10(log10Posteriors, 0, bestGenotypeIndex),
362                                 MathUtils.log10sumLog10(log10Posteriors, bestGenotypeIndex + 1, log10Posteriors.length)
363                         ));
364                     }
365             }
366         }
367     }
368 
formatGP(final double gp)369     private static String formatGP(final double gp) {
370         final String formatted = String.format("%.2f", gp);
371         int last = formatted.length() - 1;
372         if (formatted.charAt(last) == '0') {
373             if (formatted.charAt(--last) == '0') {
374                 return formatted.substring(0, --last); // exclude the '.' as-well.
375             } else {
376                 return formatted.substring(0, ++last);
377             }
378         } else {
379             return formatted;
380         }
381     }
382 
383     /**
384      *
385      * @param ploidy
386      * @param allele
387      * @return (return a single no-call allele for ploidy 0, as on Y for females)
388      */
makePloidyLengthAlleleList(final int ploidy, final Allele allele)389     public static List<Allele> makePloidyLengthAlleleList(final int ploidy, final Allele allele) {
390         if (ploidy == 0) {
391             return Arrays.asList(Allele.NO_CALL);
392         }
393         final List<Allele> repeatedList = new ArrayList<>();
394         for (int i = 0; i < ploidy; i++) {
395             repeatedList.add(allele);
396         }
397         return repeatedList;
398     }
399 
makeGenotypeCall(final int ploidy, final GenotypeBuilder gb, final GenotypeAssignmentMethod assignmentMethod, final double[] genotypeLikelihoods, final List<Allele> allelesToUse, final GenotypePriorCalculator gpc)400     public static void makeGenotypeCall(final int ploidy,
401                                         final GenotypeBuilder gb,
402                                         final GenotypeAssignmentMethod assignmentMethod,
403                                         final double[] genotypeLikelihoods,
404                                         final List<Allele> allelesToUse,
405                                         final GenotypePriorCalculator gpc){
406         makeGenotypeCall(ploidy,gb,assignmentMethod,genotypeLikelihoods,allelesToUse,null, gpc);
407     }
408 
409     /**
410      * Return the rightmost variant context in maybeOverlapping that overlaps curPos
411      *
412      * @param curPos non-null genome loc
413      * @param maybeOverlapping a collection of variant contexts that might overlap curPos
414      * @return the rightmost VariantContext, or null if none overlaps
415      */
getOverlappingVariantContext(final Locatable curPos, final Collection<VariantContext> maybeOverlapping)416     public static VariantContext getOverlappingVariantContext(final Locatable curPos, final Collection<VariantContext> maybeOverlapping) {
417         VariantContext overlaps = null;
418         for ( final VariantContext vc : maybeOverlapping ) {
419             if ( curPos.overlaps(vc) ) {
420                 if ( overlaps == null || vc.getStart() > overlaps.getStart() ) {
421                     overlaps = vc;
422                 }
423             }
424         }
425         return overlaps;
426     }
427 
428     /**
429      * Determines whether the provided VariantContext has real alternate alleles.
430      *
431      * @param vc  the VariantContext to evaluate
432      * @return true if it has proper alternate alleles, false otherwise
433      */
isProperlyPolymorphic(final VariantContext vc)434     public static boolean isProperlyPolymorphic(final VariantContext vc) {
435         //obvious cases
436         if (vc == null || vc.getAlternateAlleles().isEmpty()) {
437             return false;
438         } else if (vc.isBiallelic()) {
439             return !(GATKVCFConstants.isSpanningDeletion(vc.getAlternateAllele(0)) || vc.isSymbolic());
440         } else if (GATKVCFConstants.isSpanningDeletion(vc.getAlternateAllele(0)) && vc.getAlternateAllele(1).equals(Allele.NON_REF_ALLELE)){
441             return false;
442         } else {
443             return true;
444         }
445     }
446 
447     /** This is lifted directly from htsjdk with some minor modifications!  However, it is a private method there.
448      *
449      * This method cannot return {@link VariantContext.Type} MIXED
450      *
451      * Please see https://github.com/samtools/htsjdk/issues/999
452      *
453      * <p>Here are some cases that will not work properly, though this may not be an issue in practice:  </p>
454      *  <ul>
455      *      <li>"CGT" --> "GGA" this will be a MNP, but really it is two SNPs.</li>
456      *      <li>Spanning deletions for alternate will show as {@link VariantContext.Type} NO_VARIATION</li>
457      *      <li>Spanning deletions for reference will throw exception. </li>
458      *      <li>Reference that is symbolic will throw an exception.</li>
459      *  </ul>
460      *
461      * @param ref reference allele. Never {@code null}
462      * @param allele alternate allele to compare. Never {@code null}
463      * @return
464      */
typeOfVariant(final Allele ref, final Allele allele)465     public static VariantContext.Type typeOfVariant(final Allele ref, final Allele allele) {
466         Utils.nonNull(ref);
467         Utils.nonNull(allele);
468 
469         if ( ref.isSymbolic() )
470             throw new IllegalStateException("Unexpected error: encountered a record with a symbolic reference allele");
471 
472         if ( allele.isSymbolic() )
473             return VariantContext.Type.SYMBOLIC;
474 
475         if (allele.equals(Allele.SPAN_DEL)) {
476             return VariantContext.Type.NO_VARIATION;
477         }
478 
479         if ( ref.equals(Allele.SPAN_DEL) )
480             throw new IllegalStateException("Unexpected error: encountered a record with a spanning deletion reference allele");
481 
482         if ( ref.length() == allele.length() ) {
483             if (ref.basesMatch(allele)) {
484                 return VariantContext.Type.NO_VARIATION;
485             } else if ( allele.length() == 1 )
486                 return VariantContext.Type.SNP;
487 
488             // If the two alleles are the same length and only differ by one base, then still a SNP.
489             else if (IntStream.range(0, ref.length()).filter(i -> ref.getBases()[i] != allele.getBases()[i]).count() == 1) {
490                 return VariantContext.Type.SNP;
491             } else
492                 return VariantContext.Type.MNP;
493         }
494 
495         // Important note: previously we were checking that one allele is the prefix of the other.  However, that's not an
496         // appropriate check as can be seen from the following example:
497         // REF = CTTA and ALT = C,CT,CA
498         // This should be assigned the INDEL type but was being marked as a MIXED type because of the prefix check.
499         // In truth, it should be absolutely impossible to return a MIXED type from this method because it simply
500         // performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type
501         // is reserved for cases of multiple alternate alleles of different types).  Therefore, if we've reached this point
502         // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL.
503 
504         return VariantContext.Type.INDEL;
505 
506         // old incorrect logic:
507         // if (oneIsPrefixOfOther(ref, allele))
508         //     return Type.INDEL;
509         // else
510         //     return Type.MIXED;
511     }
512 
513     /**
514      *  This method should only be run on variants that are known to be indels.  See {@code typeOfVariant}
515      *
516      *<p>Here are some cases that will not work properly, though this may not be an issue in practice:  </p>
517      *  <ul>
518      *      <li>"CT" --> "CATT" this is really just a simple AT insertion, but this will show up as complex.</li>
519      *  </ul>
520      * @param ref reference allele. Never {@code null}
521      * @param allele alternate allele to compare. Never {@code null}
522      * @return true if the indel is complex (for example, also includes a SNP), false if simple indel.  If the input alleles define a variant that is not
523      *  an indel, then the behavior of this method is undefined (though will probably just return false).
524      *
525      */
isComplexIndel(final Allele ref, final Allele allele)526     public static boolean isComplexIndel(final Allele ref, final Allele allele) {
527 
528         Utils.nonNull(ref);
529         Utils.nonNull(allele);
530 
531         // Symbolic --> false
532         if (ref.isSymbolic() || (ref.length() == 0)) {
533             return false;
534         }
535         if (allele.isSymbolic() || (allele.length() == 0)) {
536             return false;
537         }
538 
539         // SNP, MNP, or no variation --> false
540         if ( ref.length() == allele.length() ) {
541             return false;
542         }
543 
544         // obvious simple del or simple indel
545         if ((allele.length() == 1) || (ref.length() == 1)) {
546             return false;
547         }
548 
549         // If the ref starts with the alt or vice versa, this is still simple.
550         if (allele.length() > ref.length()) {
551             final boolean isAltStartsWithRef = IntStream.range(0, ref.length()).allMatch(i -> ref.getBases()[i] == allele.getBases()[i]);
552             return !isAltStartsWithRef;
553         } else {
554             final boolean isRefStartsWithAlt = IntStream.range(0, allele.length()).allMatch(i -> ref.getBases()[i] == allele.getBases()[i]);
555             return !isRefStartsWithAlt;
556         }
557     }
558 
    /**
     * Given a set of alleles (reference and alternate), choose the allele that is the best match for the given read (and offset)
     * TODO: This method cannot recognize equivalent alleles (See https://github.com/broadinstitute/gatk/issues/5061)
     *
     * Note that when several alternate alleles match, the last matching entry of {@code altAlleles} wins.
     *
     * @param pileupElement read and offset.  Never {@code null}
     * @param referenceAllele Reference allele.  Never {@code null}
     * @param altAlleles List of candidate alternate alleles.  Never {@code null}
     * @param minBaseQualityCutoff minimum base quality for the bases that match the allele in order to be counted.
     *                             Must be positive or zero.  If you do not want any filtering, specify 0.
     * @return The allele (reference or from the altAlleles) that matches.  {@code null} if none are a match or the base qualities
     *      corresponding to the allele don't all exceed the minimum.
     */
    public static Allele chooseAlleleForRead(final PileupElement pileupElement, final Allele referenceAllele, final List<Allele> altAlleles, int minBaseQualityCutoff) {
        Utils.nonNull(pileupElement);
        Utils.nonNull(referenceAllele);
        Utils.nonNull(altAlleles);
        ParamUtils.isPositiveOrZero(minBaseQualityCutoff, "Minimum base quality must be positive or zero.");

        // The read supports the reference only if its bases match the reference allele AND the element
        // does not immediately precede an insertion or deletion start.
        final boolean isRef = referenceAllele.basesMatch(getBasesForAlleleInRead(pileupElement, referenceAllele))
                && !pileupElement.isBeforeDeletionStart() && !pileupElement.isBeforeInsertion();

        Allele pileupAllele = null;
        if (!isRef) {

            // Test each alternate; a later match overwrites an earlier one (last match wins).
            for (Allele altAllele : altAlleles) {
                final VariantContext.Type variantType = typeOfVariant(referenceAllele, altAllele);

                if (variantType == VariantContext.Type.INDEL) {
                    if (isIndelInThePileupElement(pileupElement, referenceAllele, altAllele)) {
                        pileupAllele = altAllele;
                    }

                } else if (variantType == VariantContext.Type.MNP || variantType == VariantContext.Type.SNP) {
                    if (doesReadContainAllele(pileupElement, altAllele) == Trilean.TRUE) {
                        pileupAllele = altAllele;
                    }
                }
                // SYMBOLIC / NO_VARIATION types fall through without matching.
            }
        } else {
            pileupAllele = referenceAllele;
        }

        // Discard the match if the supporting bases fall below the quality cutoff.
        if ((pileupAllele != null) && (getMinBaseQualityForAlleleInRead(pileupElement, pileupAllele) < minBaseQualityCutoff)) {
            pileupAllele = null;
        }

        return pileupAllele;
    }
606 
isIndelInThePileupElement(final PileupElement pileupElement, final Allele referenceAllele, final Allele altAllele)607     private static boolean isIndelInThePileupElement(final PileupElement pileupElement, final Allele referenceAllele, final Allele altAllele) {
608         boolean isAltAlleleInThePileup = false;
609 
610         // Check insertion
611         if (pileupElement.isBeforeInsertion()) {
612             final String insertionBases = pileupElement.getBasesOfImmediatelyFollowingInsertion();
613             // edge case: ignore a deletion immediately preceding an insertion as p.getBasesOfImmediatelyFollowingInsertion() returns null [EB]
614             if ((insertionBases != null) && (Allele.extend(referenceAllele, insertionBases.getBytes()).basesMatch(altAllele))) {
615                 isAltAlleleInThePileup = true;
616             }
617         } else if (pileupElement.isBeforeDeletionStart()) {
618             final int deletionLength = pileupElement.getLengthOfImmediatelyFollowingIndel();
619             if ((referenceAllele.getBases().length - altAllele.getBases().length) == deletionLength) {
620                 isAltAlleleInThePileup = true;
621             }
622         }
623         return isAltAlleleInThePileup;
624     }
625 
626     /**
627      * @param pileupElement pileup element representing the read.  Never {@code null}
628      * @param allele allele to get overlapping bases in read.  Never {@code null}
629      * @return array of the bytes that correspond to the allele in the pileup element.  Note that, if the read ends, this
630      * list can be smaller than the length of the allele.
631      */
getBasesForAlleleInRead(final PileupElement pileupElement, final Allele allele)632     private static byte[] getBasesForAlleleInRead(final PileupElement pileupElement, final Allele allele) {
633         Utils.nonNull(pileupElement);
634         Utils.nonNull(allele);
635         return ArrayUtils.subarray(pileupElement.getRead().getBases(), pileupElement.getOffset(), pileupElement.getOffset() + allele.getBases().length);
636     }
637 
638     /**
639      * TODO: Test.  And make sure to test with reference alleles to make sure "*" is not included.
640      * @param pileupElement pileup element representing the read.  Never {@code null}
641      * @param allele query allele.  Never {@code null}
642      * @return Whether the read contains the allele.  Note that unknown can occur as well.
643      */
doesReadContainAllele(final PileupElement pileupElement, final Allele allele)644     public static Trilean doesReadContainAllele(final PileupElement pileupElement, final Allele allele) {
645         Utils.nonNull(pileupElement);
646         Utils.nonNull(allele);
647 
648         final byte[] readBases = ArrayUtils.subarray(pileupElement.getRead().getBases(), pileupElement.getOffset(), pileupElement.getOffset() + allele.getBases().length);
649 
650         if (readBases.length < allele.getBases().length) {
651             return Trilean.UNKNOWN;
652         }
653 
654         if (allele.basesMatch(readBases)) {
655             return Trilean.TRUE;
656         } else {
657             return Trilean.FALSE;
658         }
659     }
660 
661     /**
662      * Find the minimum base quality for all bases in a read that correspond to a given allele.
663      *
664      * @param pileupElement pileup element representing the read.  Never {@code null}
665      * @param allele query allele.  Never {@code null}
666      * @return lowest base quality seen in the corresponding bases of the read
667      */
getMinBaseQualityForAlleleInRead(final PileupElement pileupElement, final Allele allele)668     private static int getMinBaseQualityForAlleleInRead(final PileupElement pileupElement, final Allele allele) {
669         Utils.nonNull(pileupElement);
670         Utils.nonNull(allele);
671         final byte[] alleleBases = allele.getBases();
672         final byte[] pileupBaseQualities = ArrayUtils.subarray(pileupElement.getRead().getBaseQualities(), pileupElement.getOffset(), pileupElement.getOffset() + alleleBases.length);
673         final OptionalInt minQuality = IntStream.range(0, pileupBaseQualities.length).map(i -> Byte.toUnsignedInt(pileupBaseQualities[i])).min();
674         return minQuality.orElse(-1);
675     }
676 
containsInlineIndel(final VariantContext vc)677     public static boolean containsInlineIndel(final VariantContext vc) {
678         final List<Allele> alleles = vc.getAlleles();
679         final int refLength = alleles.get(0).length();
680         for (int i = 1; i < alleles.size(); i++) {
681             final Allele alt = alleles.get(i);
682             if (!alt.isSymbolic() && alt != Allele.SPAN_DEL && alt.length() != refLength) {
683                 return true;
684             }
685         }
686         return false;
687     }
688 
689     public enum GenotypeMergeType {
690         /**
691          * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD.
692          */
693         UNIQUIFY,
694         /**
695          * Take genotypes in priority order (see the priority argument).
696          */
697         PRIORITIZE,
698         /**
699          * Take the genotypes in any order.
700          */
701         UNSORTED,
702         /**
703          * Require that all samples/genotypes be unique between all inputs.
704          */
705         REQUIRE_UNIQUE
706     }
707 
708     public enum FilteredRecordMergeType {
709         /**
710          * Union - leaves the record if any record is unfiltered.
711          */
712         KEEP_IF_ANY_UNFILTERED,
713         /**
714          * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this.
715          */
716         KEEP_IF_ALL_UNFILTERED,
717         /**
718          * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset.
719          */
720         KEEP_UNCONDITIONAL
721     }
722 
723     /**
724      * Returns true iff VC is an non-complex indel where every allele represents an expansion or
725      * contraction of a series of identical bases in the reference.
726      *
727      * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT
728      *
729      * If VC = -/CT, then this function returns true because the CT insertion matches exactly the
730      * upcoming reference.
731      * If VC = -/CTA then this function returns false because the CTA isn't a perfect match
732      *
733      * Now consider deletions:
734      *
735      * If VC = CT/- then again the same logic applies and this returns true
736      * The case of CTA/- makes no sense because it doesn't actually match the reference bases.
737      *
738      * The logic of this function is pretty simple.  Take all of the non-null alleles in VC.  For
739      * each insertion allele of n bases, check if that allele matches the next n reference bases.
     * For each deletion allele of n bases, check if this matches the reference bases from n to 2n,
741      * as it must necessarily match the first n bases.  If this test returns true for all
742      * alleles you are a tandem repeat, otherwise you are not.
743      *
744      * @param vc
     * @param refBasesStartingAtVCWithPad note: this is assumed to include the PADDED reference
746      * @return
747      */
isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad)748     public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) {
749         final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1);
750         if ( ! vc.isIndel() ) // only indels are tandem repeats
751             return false;
752 
753         final Allele ref = vc.getReference();
754 
755         for ( final Allele allele : vc.getAlternateAlleles() ) {
756             if ( ! isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) )
757                 return false;
758         }
759 
760         // we've passed all of the tests, so we are a repeat
761         return true;
762     }
763 
764     /**
765      *
766      * @param vc
767      * @param refBasesStartingAtVCWithoutPad    Ref bases excluding the initial base of the variant context where the alt matches the ref.
768      *                                          For example, if the reference sequence is GATCCACCACCAGTCGA and we have a deletion
769      *                                          of one STR unit CCA, it is represented as a variant context TCCA -> T, where the 'T' is
770      *                                          the padding base.  In this case, {@code refBasesStartingAtVCWithoutPad} is CCACCACCAGTCGA.
771      * @return
772      */
getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithoutPad)773     public static Pair<List<Integer>, byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithoutPad) {
774         Utils.nonNull(vc);
775         Utils.nonNull(refBasesStartingAtVCWithoutPad);
776 
777         if ( ! vc.isIndel() ){ // only indels are tandem repeats
778             return null;
779         }
780 
781         final Allele refAllele = vc.getReference();
782         final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length());
783 
784         byte[] repeatUnit = null;
785         final List<Integer> lengths = new ArrayList<>();
786 
787         for ( final Allele allele : vc.getAlternateAlleles() ) {
788             Pair<int[],byte[]> result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad);
789 
790             final int[] repetitionCount = result.getLeft();
791             // repetition count = 0 means allele is not a tandem expansion of context
792             if (repetitionCount[0] == 0 || repetitionCount[1] == 0)
793                 return null;
794 
795             if (lengths.isEmpty()) {
796                 lengths.add(repetitionCount[0]); // add ref allele length only once
797             }
798             lengths.add(repetitionCount[1]);  // add this alt allele's length
799 
800             repeatUnit = result.getRight();
801         }
802 
803         return new MutablePair<>(lengths,repeatUnit);
804     }
805 
getNumTandemRepeatUnits(final byte[] refBases, final byte[] altBases, final byte[] remainingRefContext)806     public static Pair<int[],byte[]> getNumTandemRepeatUnits(final byte[] refBases, final byte[] altBases, final byte[] remainingRefContext) {
807          /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units.
808            Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2.
809          */
810 
811         byte[] longB;
812         // find first repeat unit based on either ref or alt, whichever is longer
813         if (altBases.length > refBases.length)
814             longB = altBases;
815         else
816             longB = refBases;
817 
818         // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units
819         // for example, -*,CACA needs to first be decomposed into (CA)2
820         final int repeatUnitLength = findRepeatedSubstring(longB);
821         final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength);
822 
823         final int[] repetitionCount = new int[2];
824         // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases)
825         int repetitionsInRef = findNumberOfRepetitions(repeatUnit, refBases, true);
826         repetitionCount[0] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef;
827         repetitionCount[1] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef;
828 
829         return new MutablePair<>(repetitionCount, repeatUnit);
830 
831     }
832 
833     /**
834      * Find out if a string can be represented as a tandem number of substrings.
835      * For example ACTACT is a 2-tandem of ACT,
836      * but ACTACA is not.
837      *
838      * @param bases                 String to be tested
839      * @return                      Length of repeat unit, if string can be represented as tandem of substring (if it can't
840      *                              be represented as one, it will be just the length of the input string)
841      */
findRepeatedSubstring(byte[] bases)842     public static int findRepeatedSubstring(byte[] bases) {
843 
844         int repLength;
845         for (repLength=1; repLength <=bases.length; repLength++) {
846             final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength);
847             boolean allBasesMatch = true;
848             for (int start = repLength; start < bases.length; start += repLength ) {
849                 // check that remaining of string is exactly equal to repeat unit
850                 final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length);
851                 if (!Arrays.equals(candidateRepeatUnit, basePiece)) {
852                     allBasesMatch = false;
853                     break;
854                 }
855             }
856             if (allBasesMatch)
857                 return repLength;
858         }
859 
860         return repLength;
861     }
862 
863     /**
864      * Finds number of repetitions a string consists of.
865      * For example, for string ATAT and repeat unit AT, number of repetitions = 2
866      * @param repeatUnit             Non-empty substring represented by byte array
867      * @param testString             String to test (represented by byte array), may be empty
868      * @param leadingRepeats         Look for leading (at the beginning of string) or trailing (at end of string) repetitions
869      * For example:
870      *    GATAT has 0 leading repeats of AT but 2 trailing repeats of AT
871      *    ATATG has 1 leading repeat of A but 2 leading repeats of AT
872      *    CCCCCCCC has 2 leading and 2 trailing repeats of CCC
873      *
874      * @return  Number of repetitions (0 if testString is not a concatenation of n repeatUnit's, including the case of empty testString)
875      */
findNumberOfRepetitions(byte[] repeatUnit, byte[] testString, boolean leadingRepeats)876     public static int findNumberOfRepetitions(byte[] repeatUnit, byte[] testString, boolean leadingRepeats) {
877         Utils.nonNull(repeatUnit, "repeatUnit");
878         Utils.nonNull(testString, "testString");
879         Utils.validateArg(repeatUnit.length != 0, "empty repeatUnit");
880         if (testString.length == 0){
881             return 0;
882         }
883         return findNumberOfRepetitions(repeatUnit, 0, repeatUnit.length, testString, 0, testString.length, leadingRepeats);
884     }
885 
886     /**
887      * Finds number of repetitions a string consists of.
888      * Same as {@link #findNumberOfRepetitions} but operates on subarrays of a bigger array to save on copying.
889      * For example, for string ATAT and repeat unit AT, number of repetitions = 2
890      * @param repeatUnitFull             Non-empty substring represented by byte array
891      * @param offsetInRepeatUnitFull     the offset in repeatUnitFull from which to read the repeat unit
892      * @param repeatUnitLength           length of the repeat unit
893      * @param testStringFull             string to test (represented by byte array), may be empty
     * @param offsetInTestStringFull     the offset in testStringFull from which to read the test string
895      * @param testStringLength           length of the test string
896      * @param leadingRepeats         Look for leading (at the beginning of string) or trailing (at end of string) repetitions
897      * For example:
898      *    GATAT has 0 leading repeats of AT but 2 trailing repeats of AT
899      *    ATATG has 1 leading repeat of A but 2 leading repeats of AT
900      *    CCCCCCCC has 2 leading and 2 trailing repeats of CCC
901      * @return  Number of repetitions (0 if testString is not a concatenation of n repeatUnit's, including the case of empty testString)
902      */
findNumberOfRepetitions(final byte[] repeatUnitFull, final int offsetInRepeatUnitFull, final int repeatUnitLength, final byte[] testStringFull, final int offsetInTestStringFull, final int testStringLength, final boolean leadingRepeats)903     public static int findNumberOfRepetitions(final byte[] repeatUnitFull, final int offsetInRepeatUnitFull, final int repeatUnitLength, final byte[] testStringFull, final int offsetInTestStringFull, final int testStringLength, final boolean leadingRepeats) {
904         Utils.nonNull(repeatUnitFull, "repeatUnit");
905         Utils.nonNull(testStringFull, "testString");
906         Utils.validIndex(offsetInRepeatUnitFull, repeatUnitFull.length);
907         Utils.validateArg(repeatUnitLength >= 0 && repeatUnitLength <= repeatUnitFull.length, "repeatUnitLength");
908         if (testStringLength == 0){
909             return 0;
910         }
911         Utils.validIndex(offsetInTestStringFull, testStringFull.length);
912         Utils.validateArg(testStringLength >= 0 && testStringLength <= testStringFull.length, "testStringLength");
913         final int lengthDifference = testStringLength - repeatUnitLength;
914 
915         if (leadingRepeats) {
916             int numRepeats = 0;
917             // look forward on the test string
918             for (int start = 0; start <= lengthDifference; start += repeatUnitLength) {
919                 if(Utils.equalRange(testStringFull, start + offsetInTestStringFull, repeatUnitFull, offsetInRepeatUnitFull, repeatUnitLength)) {
920                     numRepeats++;
921                 } else {
922                     return numRepeats;
923                 }
924             }
925             return numRepeats;
926         } else {
927             // look backward. For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2
928             int numRepeats = 0;
929             // look backward on the test string
930             for (int start = lengthDifference; start >= 0; start -= repeatUnitLength) {
931                 if (Utils.equalRange(testStringFull, start + offsetInTestStringFull, repeatUnitFull, offsetInRepeatUnitFull, repeatUnitLength)) {
932                     numRepeats++;
933                 } else {
934                     return numRepeats;
935                 }
936             }
937             return numRepeats;
938         }
939     }
940 
941     /**
942      * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference
943      */
isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad)944     protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) {
945         if ( ! Allele.oneIsPrefixOfOther(ref, alt) )
946             return false; // we require one allele be a prefix of another
947 
948         if ( ref.length() > alt.length() ) { // we are a deletion
949             return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2);
950         } else { // we are an insertion
951             return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1);
952         }
953     }
954 
basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches)955     protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) {
956         final String potentialRepeat = l.substring(s.length()); // skip s bases
957 
958         for ( int i = 0; i < minNumberOfMatches; i++) {
959             final int start = i * potentialRepeat.length();
960             final int end = (i+1) * potentialRepeat.length();
961             if ( ref.length() < end )
962                 return false; // we ran out of bases to test
963             final String refSub = ref.substring(start, end);
964             if ( ! refSub.equals(potentialRepeat) )
965                 return false; // repeat didn't match, fail
966         }
967 
968         return true; // we passed all tests, we matched
969     }
970 
971     /**
972      * Subset the samples in VC to reference only information with ref call alleles
973      *
974      * Preserves DP if present
975      *
976      * @param vc the variant context to subset down to
977      * @param defaultPloidy defaultPloidy to use if a genotype doesn't have any alleles
978      * @return a GenotypesContext
979      */
subsetToRefOnly(final VariantContext vc, final int defaultPloidy)980     public static GenotypesContext subsetToRefOnly(final VariantContext vc, final int defaultPloidy) {
981         Utils.nonNull(vc == null, "vc cannot be null");
982         Utils.validateArg(defaultPloidy >= 1, () -> "defaultPloidy must be >= 1 but got " + defaultPloidy);
983 
984         final GenotypesContext oldGTs = vc.getGenotypes();
985         if (oldGTs.isEmpty()) return oldGTs;
986         final GenotypesContext newGTs = GenotypesContext.create(oldGTs.size());
987 
988         final Allele ref = vc.getReference();
989         final List<Allele> diploidRefAlleles = Arrays.asList(ref, ref);
990 
991         // create the new genotypes
992         for ( final Genotype g : vc.getGenotypes() ) {
993             final int gPloidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy();
994             final List<Allele> refAlleles = gPloidy == 2 ? diploidRefAlleles : Collections.nCopies(gPloidy, ref);
995             final GenotypeBuilder gb = new GenotypeBuilder(g.getSampleName(), refAlleles);
996             if ( g.hasDP() ) gb.DP(g.getDP());
997             if ( g.hasGQ() ) gb.GQ(g.getGQ());
998             newGTs.add(gb.make());
999         }
1000 
1001         return newGTs;
1002     }
1003 
removePLsAndAD(final Genotype g)1004     public static Genotype removePLsAndAD(final Genotype g) {
1005         return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g;
1006     }
1007 
1008     //TODO consider refactor variant-context merging code so that we share as much as possible between
1009     //TODO simpleMerge and referenceConfidenceMerge
1010     //TODO likely using a separate helper class or hierarchy.
1011     /**
1012      * Merges VariantContexts into a single hybrid.  Takes genotypes for common samples in priority order, if provided.
1013      * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
1014      * the sample name
1015      *
1016      * @param unsortedVCs               collection of unsorted VCs
1017      * @param priorityListOfVCs         priority list detailing the order in which we should grab the VCs
1018      * @param filteredRecordMergeType   merge type for filtered records
1019      * @param genotypeMergeOptions      merge option for genotypes
1020      * @param filteredAreUncalled       are filtered records uncalled?
1021      * @return new VariantContext       representing the merge of unsortedVCs
1022      */
simpleMerge(final Collection<VariantContext> unsortedVCs, final List<String> priorityListOfVCs, final FilteredRecordMergeType filteredRecordMergeType, final GenotypeMergeType genotypeMergeOptions, final boolean filteredAreUncalled)1023     public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
1024                                              final List<String> priorityListOfVCs,
1025                                              final FilteredRecordMergeType filteredRecordMergeType,
1026                                              final GenotypeMergeType genotypeMergeOptions,
1027                                              final boolean filteredAreUncalled) {
1028         int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
1029         return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled);
1030     }
1031 
1032     /**
1033      * Merges VariantContexts into a single hybrid.  Takes genotypes for common samples in priority order, if provided.
1034      * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
1035      * the sample name.
1036      * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
1037      * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge.
1038      *
1039      * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
1040      *
1041      * @param unsortedVCs               collection of unsorted VCs
1042      * @param priorityListOfVCs         priority list detailing the order in which we should grab the VCs
1043      * @param filteredRecordMergeType   merge type for filtered records
1044      * @param genotypeMergeOptions      merge option for genotypes
1045      * @param filteredAreUncalled       are filtered records uncalled?
1046      * @return new VariantContext       representing the merge of unsortedVCs
1047      */
simpleMerge(final Collection<VariantContext> unsortedVCs, final List<String> priorityListOfVCs, final int originalNumOfVCs, final FilteredRecordMergeType filteredRecordMergeType, final GenotypeMergeType genotypeMergeOptions, final boolean filteredAreUncalled)1048     public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
1049                                              final List<String> priorityListOfVCs,
1050                                              final int originalNumOfVCs,
1051                                              final FilteredRecordMergeType filteredRecordMergeType,
1052                                              final GenotypeMergeType genotypeMergeOptions,
1053                                              final boolean filteredAreUncalled) {
1054         if ( unsortedVCs == null || unsortedVCs.isEmpty() )
1055             return null;
1056 
1057         if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size())
1058             throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list");
1059 
1060         final List<VariantContext> preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
1061         // Make sure all variant contexts are padded with reference base in case of indels if necessary
1062         List<VariantContext> VCs = preFilteredVCs.stream()
1063                 .filter(vc -> !filteredAreUncalled || vc.isNotFiltered())
1064                 .collect(Collectors.toList());
1065 
1066         if ( VCs.isEmpty() ) // everything is filtered out and we're filteredAreUncalled
1067             return null;
1068 
1069         // establish the baseline info from the first VC
1070         final VariantContext first = VCs.get(0);
1071         final String name = first.getSource();
1072         final Allele refAllele = determineReferenceAllele(VCs);
1073 
1074         final LinkedHashSet<Allele> alleles = new LinkedHashSet<>();
1075         final Set<String> filters = new LinkedHashSet<>();
1076         final Map<String, Object> attributes = new LinkedHashMap<>();
1077         final Set<String> inconsistentAttributes = new LinkedHashSet<>();
1078         final Set<String> variantSources = new LinkedHashSet<>(); // contains the set of sources we found in our set of VCs that are variant
1079         final Set<String> rsIDs = new LinkedHashSet<>(1); // most of the time there's one id
1080 
1081         VariantContext longestVC = first;
1082         int depth = 0;
1083         double log10PError = CommonInfo.NO_LOG10_PERROR;
1084         boolean anyVCHadFiltersApplied = false;
1085         GenotypesContext genotypes = GenotypesContext.create();
1086 
1087         // counting the number of filtered and variant VCs
1088         int nFiltered = 0;
1089 
1090         // cycle through and add info from the other VCs, making sure the loc/reference matches
1091         for ( final VariantContext vc : VCs ) {
1092             Utils.validate(longestVC.getStart() == vc.getStart(), () -> "BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString());
1093 
1094             if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) )
1095                 longestVC = vc; // get the longest location
1096 
1097             nFiltered += vc.isFiltered() ? 1 : 0;
1098             if ( vc.isVariant() ) variantSources.add(vc.getSource());
1099 
1100             AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc);
1101 
1102             alleles.addAll(alleleMapping.values());
1103 
1104             mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY);
1105 
1106             // We always take the QUAL of the first VC with a non-MISSING qual for the combined value
1107             if ( log10PError == CommonInfo.NO_LOG10_PERROR )
1108                 log10PError =  vc.getLog10PError();
1109 
1110             filters.addAll(vc.getFilters());
1111             anyVCHadFiltersApplied |= vc.filtersWereApplied();
1112 
1113             //
1114             // add attributes
1115             //
1116             // special case DP (add it up) and ID (just preserve it)
1117             //
1118             if (vc.hasAttribute(VCFConstants.DEPTH_KEY))
1119                 depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
1120             if ( vc.hasID() ) rsIDs.add(vc.getID());
1121 
1122             for (final Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
1123                 final String key = p.getKey();
1124                 final Object value = p.getValue();
1125                 // only output annotations that have the same value in every input VC
1126                 // if we don't like the key already, don't go anywhere
1127                 if ( ! inconsistentAttributes.contains(key) ) {
1128                     final boolean alreadyFound = attributes.containsKey(key);
1129                     final Object boundValue = attributes.get(key);
1130                     final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);
1131 
1132                     if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) {
1133                         // we found the value but we're inconsistent, put it in the exclude list
1134                         inconsistentAttributes.add(key);
1135                         attributes.remove(key);
1136                     } else if ( ! alreadyFound || boundIsMissingValue )  { // no value
1137                         attributes.put(key, value);
1138                     }
1139                 }
1140             }
1141         }
1142 
1143         // if we have more alternate alleles in the merged VC than in one or more of the
1144         // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD
1145         for ( final VariantContext vc : VCs ) {
1146             if (vc.getAlleles().size() == 1)
1147                 continue;
1148             if ( hasPLIncompatibleAlleles(alleles, vc.getAlleles())) {
1149                 if ( ! genotypes.isEmpty() ) {
1150                     logger.debug(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. single=%s",
1151                             vc.getContig(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles()));
1152                 }
1153                 genotypes = stripPLsAndAD(genotypes);
1154                 // this will remove stale AC,AF attributed from vc
1155                 VariantContextUtils.calculateChromosomeCounts(vc, attributes, true);
1156                 break;
1157             }
1158         }
1159 
1160         // if at least one record was unfiltered and we want a union, clear all of the filters
1161         if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL )
1162             filters.clear();
1163 
1164         if ( depth > 0 )
1165             attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth));
1166 
1167         final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);
1168 
1169         final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID);
1170         builder.loc(longestVC.getContig(), longestVC.getStart(), longestVC.getEnd());
1171         builder.alleles(alleles);
1172         builder.genotypes(genotypes);
1173         builder.log10PError(log10PError);
1174         if ( anyVCHadFiltersApplied ) {
1175             builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters));
1176         }
1177         builder.attributes(new TreeMap<>(attributes));
1178 
1179         // Trim the padded bases of all alleles if necessary
1180         final VariantContext merged = builder.make();
1181         return merged;
1182     }
1183 
1184 
1185     //TODO as part of a larger refactoring effort remapAlleles can be merged with createAlleleMapping.
1186 
stripPLsAndAD(final GenotypesContext genotypes)1187     public static GenotypesContext stripPLsAndAD(final GenotypesContext genotypes) {
1188         final GenotypesContext newGs = GenotypesContext.create(genotypes.size());
1189         for ( final Genotype g : genotypes ) {
1190             newGs.add(removePLsAndAD(g));
1191         }
1192         return newGs;
1193     }
1194 
determineReferenceAllele(final List<VariantContext> VCs)1195     private static Allele determineReferenceAllele(final List<VariantContext> VCs) {
1196         return determineReferenceAllele(VCs, null);
1197     }
1198 
contextMatchesLoc(final VariantContext vc, final Locatable loc)1199     public static boolean contextMatchesLoc(final VariantContext vc, final Locatable loc) {
1200         return loc == null || loc.getStart() == vc.getStart();
1201     }
1202 
resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc)1203     public static AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc) {
1204         if ( refAllele.equals(vc.getReference()) )
1205             return new AlleleMapper(vc);
1206         else {
1207             final Map<Allele, Allele> map = createAlleleMapping(refAllele, vc);
1208             map.put(vc.getReference(), refAllele);
1209             return new AlleleMapper(map);
1210         }
1211     }
1212 
createAlleleMapping(final Allele refAllele, final VariantContext oneVC)1213     public static Map<Allele, Allele> createAlleleMapping(final Allele refAllele,
1214                                                           final VariantContext oneVC) {
1215         return createAlleleMapping(refAllele, oneVC.getReference(), oneVC.getAlternateAlleles());
1216     }
1217 
1218     //TODO as part of a larger refactoring effort {@link #createAlleleMapping} can be merged with {@link ReferenceConfidenceVariantContextMerger#remapAlleles}.
1219     /**
1220      * Create an allele mapping for the given context where its reference allele must (potentially) be extended to the given allele
1221      *
1222      * The refAllele is the longest reference allele seen at this start site.
1223      * So imagine it is:
1224      * refAllele: ACGTGA
1225      * myRef:     ACGT
1226      * myAlt:     A
1227      *
1228      * We need to remap all of the alleles in vc to include the extra GA so that
1229      * myRef => refAllele and myAlt => AGA
1230      *
1231      * @param refAllele          the new (extended) reference allele
1232      * @param inputRef           the reference allele that may need to be extended
1233      * @param inputAlts          the alternate alleles that may need to be extended
1234      * @return a non-null mapping of original alleles to new (extended) ones
1235      */
createAlleleMapping(final Allele refAllele, final Allele inputRef, final List<Allele> inputAlts)1236     public static Map<Allele, Allele> createAlleleMapping(final Allele refAllele,
1237                                                            final Allele inputRef, final List<Allele> inputAlts) {
1238         Utils.validate( refAllele.length() > inputRef.length(), () -> "BUG: inputRef="+inputRef+" is longer than refAllele="+refAllele);
1239         final byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), inputRef.length(), refAllele.length());
1240 
1241         final Map<Allele, Allele> map = new LinkedHashMap<>();
1242         for ( final Allele a : inputAlts ) {
1243             if ( isNonSymbolicExtendableAllele(a) ) {
1244                 Allele extended = Allele.extend(a, extraBases);
1245                 map.put(a, extended);
1246             } else if (a.equals(Allele.SPAN_DEL)) {
1247                 map.put(a, a);
1248             }
1249         }
1250 
1251         return map;
1252     }
1253 
isNonSymbolicExtendableAllele(final Allele allele)1254     private static boolean isNonSymbolicExtendableAllele(final Allele allele) {
1255         return ! (allele.isReference() || allele.isSymbolic() || allele.equals(Allele.SPAN_DEL));
1256     }
1257 
sortVariantContextsByPriority(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, GenotypeMergeType mergeOption )1258     public static List<VariantContext> sortVariantContextsByPriority(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, GenotypeMergeType mergeOption ) {
1259         if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null )
1260             throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list");
1261 
1262         if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED )
1263             return new ArrayList<>(unsortedVCs);
1264         else {
1265             ArrayList<VariantContext> sorted = new ArrayList<>(unsortedVCs);
1266             Collections.sort(sorted, new CompareByPriority(priorityListOfVCs));
1267             return sorted;
1268         }
1269     }
1270 
mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples)1271     private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples) {
1272         //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE
1273         for ( final Genotype g : oneVC.getGenotypes() ) {
1274             final String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniquifySamples);
1275             if ( ! mergedGenotypes.containsSample(name) ) {
1276                 // only add if the name is new
1277                 Genotype newG = g;
1278 
1279                 if ( uniquifySamples || alleleMapping.needsRemapping() ) {
1280                     final List<Allele> alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles();
1281                     newG = new GenotypeBuilder(g).name(name).alleles(alleles).make();
1282                 }
1283 
1284                 mergedGenotypes.add(newG);
1285             }
1286         }
1287     }
1288 
1289     /**
1290      * Cached NO_CALL immutable lists where the position ith contains the list with i elements.
1291      */
1292     @SuppressWarnings({"rawtypes", "unchecked"})
1293     private static List<Allele>[] NOCALL_LISTS = new List[] {
1294             Collections.emptyList(),
1295             Collections.singletonList(Allele.NO_CALL),
1296             Collections.nCopies(2,Allele.NO_CALL)
1297     };
1298 
1299     /**
1300      * Code to ensure that {@link #NOCALL_LISTS} has enough entries beyond the requested ploidy
1301      * @param capacity the requested ploidy.
1302      */
ensureNoCallListsCapacity(final int capacity)1303     private static void ensureNoCallListsCapacity(final int capacity) {
1304         final int currentCapacity = NOCALL_LISTS.length - 1;
1305         if (currentCapacity >= capacity)
1306             return;
1307         NOCALL_LISTS = Arrays.copyOf(NOCALL_LISTS,Math.max(capacity,currentCapacity << 1) + 1);
1308         for (int i = currentCapacity + 1; i < NOCALL_LISTS.length; i++)
1309             NOCALL_LISTS[i] = Collections.nCopies(i,Allele.NO_CALL);
1310     }
1311 
1312     /**
1313      * Returns a {@link Allele#NO_CALL NO_CALL} allele list provided the ploidy.
1314      *
1315      * @param ploidy the required ploidy.
1316      *
1317      * @return never {@code null}, but an empty list if {@code ploidy} is equal or less than 0. The returned list
1318      *   might or might not be mutable.
1319      */
noCallAlleles(final int ploidy)1320     public static List<Allele> noCallAlleles(final int ploidy) {
1321         if (NOCALL_LISTS.length <= ploidy)
1322             ensureNoCallListsCapacity(ploidy);
1323         return NOCALL_LISTS[ploidy];
1324     }
1325 
1326 
mergedSampleName(String trackName, String sampleName, boolean uniquify )1327     public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) {
1328         return uniquify ? sampleName + "." + trackName : sampleName;
1329     }
1330 
1331     /**
1332      * Trim the alleles in inputVC from the reverse direction
1333      *
1334      * @param inputVC a non-null input VC whose alleles might need a haircut
1335      * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up
1336      */
reverseTrimAlleles( final VariantContext inputVC )1337     public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) {
1338         return trimAlleles(inputVC, false, true);
1339     }
1340 
1341     /**
1342      * Trim the alleles in inputVC forward and reverse, as requested
1343      *
1344      * @param inputVC a non-null input VC whose alleles might need a haircut
1345      * @param trimForward should we trim up the alleles from the forward direction?
1346      * @param trimReverse should we trim up the alleles from the reverse direction?
1347      * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles
1348      */
trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse)1349     public static VariantContext trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse) {
1350         Utils.nonNull(inputVC);
1351 
1352         if ( inputVC.getNAlleles() <= 1 || inputVC.getAlleles().stream().anyMatch(a -> a.length() == 1) ) {
1353             return inputVC;
1354         }
1355 
1356         final List<byte[]> sequences = inputVC.getAlleles().stream().filter(a -> !a.isSymbolic()).map(Allele::getBases).collect(Collectors.toList());
1357         final List<IndexRange> ranges = inputVC.getAlleles().stream().filter(a -> !a.isSymbolic()).map(a -> new IndexRange(0, a.length())).collect(Collectors.toList());
1358 
1359         final Pair<Integer, Integer> shifts = AlignmentUtils.normalizeAlleles(sequences, ranges, 0, true);
1360         final int endTrim = shifts.getRight();
1361         final int startTrim = -shifts.getLeft();
1362 
1363         final boolean emptyAllele = ranges.stream().anyMatch(r -> r.size() == 0);
1364         final boolean restoreOneBaseAtEnd = emptyAllele && startTrim == 0;
1365         final boolean restoreOneBaseAtStart = emptyAllele && startTrim > 0;
1366 
1367         // if the end trimming consumed all the bases, leave one base
1368         final int endBasesToClip = restoreOneBaseAtEnd ? endTrim - 1 : endTrim;
1369         final int startBasesToClip = restoreOneBaseAtStart ? startTrim - 1 : startTrim;
1370 
1371         return trimAlleles(inputVC, (trimForward ? startBasesToClip : 0) - 1, trimReverse ? endBasesToClip : 0);
1372     }
1373 
1374     /**
1375      * Trim up alleles in inputVC, cutting out all bases up to fwdTrimEnd inclusive and
1376      * the last revTrim bases from the end
1377      *
1378      * @param inputVC a non-null input VC
1379      * @param fwdTrimEnd bases up to this index (can be -1) will be removed from the start of all alleles
1380      * @param revTrim the last revTrim bases of each allele will be clipped off as well
1381      * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles
1382      */
trimAlleles(final VariantContext inputVC, final int fwdTrimEnd, final int revTrim)1383     protected static VariantContext trimAlleles(final VariantContext inputVC,
1384                                                 final int fwdTrimEnd,
1385                                                 final int revTrim) {
1386         if( fwdTrimEnd == -1 && revTrim == 0 ) // nothing to do, so just return inputVC unmodified
1387             return inputVC;
1388 
1389         final List<Allele> alleles = new LinkedList<>();
1390         final Map<Allele, Allele> originalToTrimmedAlleleMap = new LinkedHashMap<>();
1391 
1392         for (final Allele a : inputVC.getAlleles()) {
1393             if (a.isSymbolic()) {
1394                 alleles.add(a);
1395                 originalToTrimmedAlleleMap.put(a, a);
1396             } else {
1397                 // get bases for current allele and create a new one with trimmed bases
1398                 final byte[] newBases = Arrays.copyOfRange(a.getBases(), fwdTrimEnd+1, a.length()-revTrim);
1399                 final Allele trimmedAllele = Allele.create(newBases, a.isReference());
1400                 alleles.add(trimmedAllele);
1401                 originalToTrimmedAlleleMap.put(a, trimmedAllele);
1402             }
1403         }
1404 
1405         // now we can recreate new genotypes with trimmed alleles
1406         final AlleleMapper alleleMapper = new AlleleMapper(originalToTrimmedAlleleMap);
1407         final GenotypesContext genotypes = updateGenotypesWithMappedAlleles(inputVC.getGenotypes(), alleleMapper);
1408 
1409         final int start = inputVC.getStart() + (fwdTrimEnd + 1);
1410         final VariantContextBuilder builder = new VariantContextBuilder(inputVC);
1411         builder.start(start);
1412         builder.stop(start + alleles.get(0).length() - 1);
1413         builder.alleles(alleles);
1414         builder.genotypes(genotypes);
1415         return builder.make();
1416     }
1417 
updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper)1418     protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) {
1419         final GenotypesContext updatedGenotypes = GenotypesContext.create(originalGenotypes.size());
1420 
1421         for ( final Genotype genotype : originalGenotypes ) {
1422             final List<Allele> updatedAlleles = alleleMapper.remap(genotype.getAlleles());
1423             updatedGenotypes.add(new GenotypeBuilder(genotype).alleles(updatedAlleles).make());
1424         }
1425 
1426         return updatedGenotypes;
1427     }
1428 
1429     protected static class AlleleMapper {
1430         private VariantContext vc = null;
1431         private Map<Allele, Allele> map = null;
AlleleMapper(VariantContext vc)1432         public AlleleMapper(VariantContext vc)          { this.vc = vc; }
AlleleMapper(Map<Allele, Allele> map)1433         public AlleleMapper(Map<Allele, Allele> map)    { this.map = map; }
needsRemapping()1434         public boolean needsRemapping()                 { return this.map != null; }
values()1435         public Collection<Allele> values()              { return map != null ? map.values() : vc.getAlleles(); }
remap(Allele a)1436         public Allele remap(Allele a)                   { return map != null && map.containsKey(a) ? map.get(a) : a; }
1437 
remap(List<Allele> as)1438         public List<Allele> remap(List<Allele> as) {
1439             List<Allele> newAs = as.stream()
1440                     .map(this::remap)
1441                     .collect(Collectors.toList());
1442             return newAs;
1443         }
1444 
1445     }
1446 
1447     private static class CompareByPriority implements Comparator<VariantContext>, Serializable {
1448         private static final long serialVersionUID = 0L;
1449 
1450         List<String> priorityListOfVCs;
CompareByPriority(List<String> priorityListOfVCs)1451         public CompareByPriority(List<String> priorityListOfVCs) {
1452             this.priorityListOfVCs = priorityListOfVCs;
1453         }
1454 
getIndex(VariantContext vc)1455         private int getIndex(VariantContext vc) {
1456             return Utils.validIndex(priorityListOfVCs.indexOf(vc.getSource()), priorityListOfVCs.size());
1457         }
1458 
1459         @Override
compare(VariantContext vc1, VariantContext vc2)1460         public int compare(VariantContext vc1, VariantContext vc2) {
1461             return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2));
1462         }
1463     }
1464 
1465     /**
1466      * For testing purposes only.  Create a site-only VariantContext at contig:start containing alleles
1467      *
1468      * @param name the name of the VC
1469      * @param contig the contig for the VC
1470      * @param start the start of the VC
1471      * @param alleleStrings a non-null, non-empty list of strings for the alleles.  The first will be the ref allele, and others the
1472      *                      alt.  Will compute the stop of the VC from the length of the reference allele
1473      * @return a non-null VariantContext
1474      */
makeFromAlleles(final String name, final String contig, final int start, final List<String> alleleStrings)1475     public static VariantContext makeFromAlleles(final String name, final String contig, final int start, final List<String> alleleStrings) {
1476         Utils.nonNull(alleleStrings, "alleleStrings is null");
1477         Utils.nonEmpty(alleleStrings, "alleleStrings is empty");
1478 
1479         final List<Allele> alleles = new LinkedList<>();
1480         final int length = alleleStrings.get(0).length();
1481 
1482         boolean first = true;
1483         for ( final String alleleString : alleleStrings ) {
1484             alleles.add(Allele.create(alleleString, first));
1485             first = false;
1486         }
1487         return new VariantContextBuilder(name, contig, start, start+length-1, alleles).make();
1488     }
1489 
splitSomaticVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final VCFHeader outputHeader)1490     public static List<VariantContext> splitSomaticVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final VCFHeader outputHeader) {
1491         Utils.nonNull(vc);
1492 
1493         if (!vc.isVariant() || vc.isBiallelic()) {
1494             // non variant or biallelics already satisfy the contract
1495             return Collections.singletonList(vc);
1496         } else {
1497             final List<VariantContext> biallelics = new LinkedList<>();
1498 
1499             List<String> attrsSpecialFormats = new ArrayList<String>(Arrays.asList(GATKVCFConstants.AS_FILTER_STATUS_KEY, GATKVCFConstants.AS_SB_TABLE_KEY));
1500             List<Map<String, Object>> attributesByAllele = splitAttributesIntoPerAlleleLists(vc, attrsSpecialFormats, outputHeader);
1501             splitASSBTable(vc, attributesByAllele);
1502             splitASFilters(vc, attributesByAllele);
1503 
1504             ListIterator<Map<String, Object>> attributesByAlleleIterator = attributesByAllele.listIterator();
1505 
1506             for (final Allele alt : vc.getAlternateAlleles()) {
1507                 final VariantContextBuilder builder = new VariantContextBuilder(vc);
1508 
1509                 // make biallelic alleles
1510                 final List<Allele> alleles = Arrays.asList(vc.getReference(), alt);
1511                 builder.alleles(alleles);
1512                 Map<String, Object> attributes = attributesByAlleleIterator.next();
1513                 builder.attributes(attributes);
1514 
1515                 // now add the allele specific filters to the variant context
1516                 String filters = (String) attributes.get(GATKVCFConstants.AS_FILTER_STATUS_KEY);
1517                 // checking for . and PASS should be able to be removed. these were temporarily used to indicate no allele specific filter
1518                 if (filters != null && !filters.isEmpty() && !filters.equals(VCFConstants.EMPTY_INFO_FIELD) && !filters.equals(GATKVCFConstants.SITE_LEVEL_FILTERS) && !filters.equals((VCFConstants.PASSES_FILTERS_v4))) {
1519                     AnnotationUtils.decodeAnyASList(filters).stream().forEach(filter -> builder.filter(filter));
1520                 }
1521 
1522                 int alleleIndex = vc.getAlleleIndex(alt);
1523                 builder.genotypes(AlleleSubsettingUtils.subsetSomaticAlleles(outputHeader, vc.getGenotypes(), alleles, new int[]{0, alleleIndex}));
1524                 final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true);
1525                 biallelics.add(trimmed);
1526             }
1527             return biallelics;
1528         }
1529     }
1530 
splitASSBTable(VariantContext vc, List<Map<String, Object>> attrsByAllele)1531     public static void splitASSBTable(VariantContext vc, List<Map<String, Object>> attrsByAllele) {
1532         List<String> sbs = StrandBiasUtils.getSBsForAlleles(vc).stream().map(ints -> StrandBiasUtils.encode(ints)).collect(Collectors.toList());
1533         new IndexRange(1, sbs.size()).forEach(i -> {
1534             String newattrs = String.join(AnnotationUtils.ALLELE_SPECIFIC_RAW_DELIM, new ArrayList<String>(Arrays.asList(sbs.get(0), sbs.get(i))));
1535             attrsByAllele.get(i - 1).put(GATKVCFConstants.AS_SB_TABLE_KEY, newattrs);
1536         });
1537     }
1538 
splitASFilters(VariantContext vc, List<Map<String, Object>> attrsByAllele)1539     public static void splitASFilters(VariantContext vc, List<Map<String, Object>> attrsByAllele) {
1540         // the reason we are getting as list and then joining on , is because the default getAttributeAsString for a list will add spaces between items which we don't
1541         // want to have to trim out later in the code
1542         String asfiltersStr = String.join(",", vc.getCommonInfo().getAttributeAsStringList(GATKVCFConstants.AS_FILTER_STATUS_KEY, GATKVCFConstants.SITE_LEVEL_FILTERS));
1543         List<String> filtersList = AnnotationUtils.decodeAnyASListWithRawDelim(asfiltersStr);
1544         new IndexRange(0, filtersList.size()).forEach(i -> attrsByAllele.get(i).put(GATKVCFConstants.AS_FILTER_STATUS_KEY, filtersList.get(i)));
1545     }
1546 
splitAttributesIntoPerAlleleLists(VariantContext vc, List<String> skipAttributes, VCFHeader outputHeader)1547     public static List<Map<String, Object>> splitAttributesIntoPerAlleleLists(VariantContext vc, List<String> skipAttributes, VCFHeader outputHeader) {
1548         List<Map<String, Object>> results = new ArrayList<>(vc.getNAlleles()-1);
1549         vc.getAlternateAlleles().forEach(alt -> results.add(new HashMap<>()));
1550 
1551         Map<String, Object> attributes = vc.getAttributes();
1552         attributes.entrySet().stream().filter(entry -> !skipAttributes.contains(entry.getKey())).forEachOrdered(entry -> {
1553             String key = entry.getKey();
1554             // default to unbounded in case header is not found
1555             VCFHeaderLineCount countType = VCFHeaderLineCount.UNBOUNDED;
1556             try {
1557                 VCFInfoHeaderLine header = outputHeader.getInfoHeaderLine(key);
1558                 countType = header.getCountType();
1559             } catch (IllegalStateException ex) {
1560                 // this happens for DP if we use GATKVCFHeaderLines.getInfoLine(key)
1561                 // shouldn't happen now that we use the generated output header
1562                 logger.warn("Could not find header info for key " + key);
1563             }
1564             // override count type for this attribute
1565             if (key.equals(GATKVCFConstants.REPEATS_PER_ALLELE_KEY)) {
1566                 countType = VCFHeaderLineCount.R;
1567             }
1568             List<Object> attr;
1569             switch (countType) {
1570                 case A:
1571                     attr = vc.getCommonInfo().getAttributeAsList(key);
1572                     ValidationUtils.validateArg(attr.size() == results.size(), "Incorrect attribute size for " + key);
1573                     new IndexRange(0, attr.size()).forEach(i -> results.get(i).put(key, attr.get(i)));
1574                     break;
1575                 case R:
1576                     attr = vc.getCommonInfo().getAttributeAsList(key);
1577                     ValidationUtils.validateArg(attr.size() == vc.getNAlleles(), "Incorrect attribute size for " + key);
1578                     new IndexRange(1, attr.size()).forEach(i -> {
1579                         List<Object> newattrs = new ArrayList<Object>(Arrays.asList(attr.get(0), attr.get(i)));
1580                         results.get(i-1).put(key, newattrs);
1581                     });
1582                     break;
1583                 default:
1584                     results.forEach(altMap -> altMap.put(key, entry.getValue()));
1585 
1586             }
1587 
1588         });
1589         return results;
1590     }
1591 
1592     /**
1593      * Split variant context into its biallelic components if there are more than 2 alleles
1594      * <p>
1595      * For VC has A/B/C alleles, returns A/B and A/C contexts.
1596      * Alleles are right trimmed to satisfy VCF conventions
1597      * <p>
1598      * If vc is biallelic or non-variant it is just returned
1599      * <p>
1600      * Chromosome counts are updated (but they are by definition 0)
1601      *
1602      * @param vc                       a potentially multi-allelic variant context
1603      * @param trimLeft                 if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome
1604      * @param genotypeAssignmentMethod assignment strategy for the (subsetted) PLs
1605      * @param keepOriginalChrCounts    keep the orignal chromosome counts before subsetting
1606      * @return a list of bi-allelic (or monomorphic) variant context
1607      */
splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod, final boolean keepOriginalChrCounts)1608     public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod,
1609                                                                        final boolean keepOriginalChrCounts) {
1610         Utils.nonNull(vc);
1611 
1612         if (!vc.isVariant() || vc.isBiallelic())
1613             // non variant or biallelics already satisfy the contract
1614             return Collections.singletonList(vc);
1615         else {
1616             final List<VariantContext> biallelics = new LinkedList<>();
1617 
1618             // if any of the genotypes are het-not-ref (i.e. 1/2), set all of them to no-call
1619             final GenotypeAssignmentMethod genotypeAssignmentMethodUsed = hasHetNonRef(vc.getGenotypes()) ? GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS : genotypeAssignmentMethod;
1620 
1621             for (final Allele alt : vc.getAlternateAlleles()) {
1622                 final VariantContextBuilder builder = new VariantContextBuilder(vc);
1623 
1624                 // make biallelic alleles
1625                 final List<Allele> alleles = Arrays.asList(vc.getReference(), alt);
1626                 builder.alleles(alleles);
1627 
1628                 // since the VC has been subset, remove the invalid attributes
1629                 for (final String key : vc.getAttributes().keySet()) {
1630                     if (!(key.equals(VCFConstants.ALLELE_COUNT_KEY) || key.equals(VCFConstants.ALLELE_FREQUENCY_KEY) || key.equals(VCFConstants.ALLELE_NUMBER_KEY)) ||
1631                             genotypeAssignmentMethodUsed == GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS) {
1632                         builder.rmAttribute(key);
1633                     }
1634                 }
1635 
1636                 // subset INFO field annotations if available if genotype is called
1637                 if (genotypeAssignmentMethodUsed != GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS &&
1638                         genotypeAssignmentMethodUsed != GenotypeAssignmentMethod.SET_TO_NO_CALL)
1639                     AlleleSubsettingUtils.addInfoFieldAnnotations(vc, builder, keepOriginalChrCounts);
1640 
1641                 builder.genotypes(AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(),2,vc.getAlleles(), alleles, null, genotypeAssignmentMethodUsed,vc.getAttributeAsInt("DP",0)));
1642                 final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true);
1643                 biallelics.add(trimmed);
1644             }
1645 
1646             return biallelics;
1647         }
1648     }
1649 
1650     /**
1651      * Check if any of the genotypes is heterozygous, non-reference (i.e. 1/2)
1652      *
1653      * @param genotypesContext genotype information
1654      * @return true if any of the genotypes are heterozygous, non-reference, false otherwise
1655      */
hasHetNonRef(final GenotypesContext genotypesContext)1656     private static boolean hasHetNonRef(final GenotypesContext genotypesContext) {
1657         for (final Genotype gt : genotypesContext) {
1658             if (gt.isHetNonRef())
1659                 return true;
1660         }
1661         return false;
1662     }
1663 
1664     /**
1665      * Splits the alleles for the provided variant context into its primitive parts.
1666      * Requires that the input VC be bi-allelic, so calling methods should first call splitVariantContextToBiallelics() if needed.
1667      * Currently works only for MNPs.
1668      *
1669      * @param vc  the non-null VC to split
1670      * @return a non-empty list of VCs split into primitive parts or the original VC otherwise
1671      */
splitIntoPrimitiveAlleles(final VariantContext vc)1672     public static List<VariantContext> splitIntoPrimitiveAlleles(final VariantContext vc) {
1673         Utils.nonNull(vc);
1674         if ( !vc.isBiallelic() ) {
1675             throw new IllegalArgumentException("Trying to break a multi-allelic Variant Context into primitive parts");
1676         }
1677 
1678         // currently only works for MNPs
1679         if ( !vc.isMNP() )
1680             return Arrays.asList(vc);
1681 
1682         final byte[] ref = vc.getReference().getBases();
1683         final byte[] alt = vc.getAlternateAllele(0).getBases();
1684 
1685         Utils.validate(ref.length == alt.length, "ref and alt alleles for MNP have different lengths");
1686 
1687         final List<VariantContext> result = new ArrayList<>(ref.length);
1688 
1689         for ( int i = 0; i < ref.length; i++ ) {
1690 
1691             // if the ref and alt bases are different at a given position, create a new SNP record (otherwise do nothing)
1692             if ( ref[i] != alt[i] ) {
1693 
1694                 // create the ref and alt SNP alleles
1695                 final Allele newRefAllele = Allele.create(ref[i], true);
1696                 final Allele newAltAllele = Allele.create(alt[i], false);
1697 
1698                 // create a new VariantContext with the new SNP alleles
1699                 final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele));
1700 
1701                 // create new genotypes with updated alleles
1702                 final Map<Allele, Allele> alleleMap = new LinkedHashMap<>();
1703                 alleleMap.put(vc.getReference(), newRefAllele);
1704                 alleleMap.put(vc.getAlternateAllele(0), newAltAllele);
1705                 final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap));
1706 
1707                 result.add(newVC.genotypes(newGenotypes).make());
1708             }
1709         }
1710 
1711         if ( result.isEmpty() )
1712             result.add(vc);
1713 
1714         return result;
1715     }
1716 
1717     /**
1718      * Add chromosome counts (AC, AN and AF) to the VCF header lines
1719      *
1720      * @param headerLines the VCF header lines
1721      */
addChromosomeCountsToHeader(final Set<VCFHeaderLine> headerLines)1722     public static void addChromosomeCountsToHeader(final Set<VCFHeaderLine> headerLines) {
1723         headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
1724         headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
1725         headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
1726     }
1727 
1728     /**
1729      * Set the builder's filtered genotypes to no-call and update AC, AN and AF
1730      *
1731      * @param builder builder for variant context
1732      * @param vc the VariantContext record to set filtered genotypes to no-call
1733      * @param setFilteredGenotypesToNocall flag to set filtered genotype to NO CALL
1734      * @param filters the filters for each genotype
1735      */
setFilteredGenotypeToNocall(final VariantContextBuilder builder, final VariantContext vc, final boolean setFilteredGenotypesToNocall, BiFunction<VariantContext, Genotype, List<String>> filters)1736     public static void setFilteredGenotypeToNocall(final VariantContextBuilder builder, final VariantContext vc,
1737                                                    final boolean setFilteredGenotypesToNocall, BiFunction<VariantContext, Genotype, List<String>> filters) {
1738         Utils.nonNull(vc);
1739         Utils.nonNull(builder);
1740         Utils.nonNull(filters);
1741 
1742         final GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
1743 
1744         // update genotypes if filtered genotypes are set to no-call
1745         boolean haveFilteredNoCallAlleles = false;
1746         for (final Genotype g : vc.getGenotypes()) {
1747             if (g.isCalled()) {
1748                 List<String> filterNames = filters.apply(vc,g);
1749                 if (!filterNames.isEmpty() && setFilteredGenotypesToNocall) {
1750                     haveFilteredNoCallAlleles = true;
1751                     genotypes.add(new GenotypeBuilder(g).filters(filterNames).alleles(GATKVariantContextUtils.noCallAlleles((g.getPloidy()))).make());
1752                 } else {
1753                     genotypes.add(new GenotypeBuilder(g).filters(filterNames).make());
1754                 }
1755             } else {
1756                 genotypes.add(g);
1757             }
1758         }
1759 
1760         // if filtered genotypes are set to no-call, output recomputed AC, AN, AF
1761         if ( haveFilteredNoCallAlleles ) {
1762             final Map<String, Object> attributes = new LinkedHashMap<>(vc.getAttributes()); // need to make mutable
1763             VariantContextUtils.calculateChromosomeCounts(builder.genotypes(genotypes).make(), attributes, true, vc.getSampleNames());
1764             builder.attributes(attributes);
1765         }
1766 
1767         // update genotypes
1768         builder.genotypes(genotypes);
1769     }
1770 
1771     /**
1772      * Calculate the Genotype Quality (GQ) by subtracting the smallest Phred-scaled likelihoods (PL) from the second smallest.
1773      *
1774      * @param plValues  array of PL values
1775      * @return          the genotype quality corresponding to the PL values
1776      */
calculateGQFromPLs(final int[] plValues)1777     public static int calculateGQFromPLs(final int[] plValues) {
1778         Utils.nonNull(plValues);
1779         Utils.validateArg(plValues.length >= 2, () -> "Array of PL values must contain at least two elements.");
1780 
1781         int first = plValues[0];
1782         int second = plValues[1];
1783         if (first > second) {
1784             second = first;
1785             first = plValues[1];
1786         }
1787         for (int i = 2; i < plValues.length; i++) {
1788             final int candidate = plValues[i];
1789             if (candidate >= second) {
1790                 continue;
1791             }
1792             if (candidate <= first) {
1793                 second = first;
1794                 first = candidate;
1795             } else
1796                 second = candidate;
1797         }
1798         return second - first;
1799     }
1800 
1801     /**
1802      *  Create a string with existing filters plus the one to append
1803      * @param vc VariantContext to base initial filter values.  Not {@code null}
1804      * @param filterToAppend the filter value to append to the strings Not {@code null}
1805      * @return String of filter values.  Sorted alphabetically.
1806      */
createFilterListWithAppend(VariantContext vc, String filterToAppend)1807     public static List<String> createFilterListWithAppend(VariantContext vc, String filterToAppend) {
1808         Utils.nonNull(vc);
1809         Utils.nonNull(filterToAppend);
1810 
1811         final List<String> filtersAsList = new ArrayList<>(vc.getFilters());
1812         filtersAsList.add(filterToAppend);
1813         filtersAsList.sort(String::compareToIgnoreCase);
1814         return filtersAsList;
1815     }
1816 
1817     /**
1818      * Determines whether the given reference and alternate alleles constitute a frameshift mutation.
1819      * @param reference The reference {@link Allele}.  Must not be {@code null}.
1820      * @param alternate The alternate / variant {@link Allele}.  Must not be {@code null}.
1821      * @return {@code true} if replacing the reference with the alternate results in a frameshift.  {@code false} otherwise.
1822      */
isFrameshift(final Allele reference, final Allele alternate)1823     public static boolean isFrameshift(final Allele reference, final Allele alternate) {
1824 
1825         Utils.nonNull(reference);
1826         Utils.nonNull(alternate);
1827 
1828         // We know it's a frameshift if we have a replacement that is not of a
1829         // length evenly divisible by 3 because that's how many bases are read at once:
1830         return ((Math.abs( reference.length() - alternate.length() ) % 3) != 0);
1831     }
1832 
1833     /**
1834      * Determines whether the given reference and alternate alleles constitute a frameshift mutation.
1835      * @param reference The {@link String} containing the bases of the reference allele.  Must not be {@code null}.
1836      * @param alternate The {@link String} containing the bases of the alternate allele.  Must not be {@code null}.
1837      * @return {@code true} if replacing the reference with the alternate results in a frameshift.  {@code false} otherwise.
1838      */
isFrameshift(final String reference, final String alternate)1839     public static boolean isFrameshift(final String reference, final String alternate) {
1840 
1841         Utils.nonNull(reference);
1842         Utils.nonNull(alternate);
1843 
1844         final String refComparable = reference.replaceAll("\\*", "");
1845         final String altComperable = alternate.replaceAll("\\*", "");
1846 
1847         // We know it's a frameshift if we have a replacement that is not of a
1848         // length evenly divisible by 3 because that's how many bases are read at once:
1849         return ((Math.abs( refComparable.length() - altComperable.length() ) % 3) != 0);
1850     }
1851 
1852     /**
1853      * Determines whether the given reference and alternate alleles constitute a frameshift mutation.
1854      * This does not take into account the cases where either or both of the alleles overlap splice sites.
1855      * @param startPos Genomic start position (1-based, inclusive) of the variant.  Must be > 0.
1856      * @param refEnd Genomic end position (1-based, inclusive) of the reference allele.  Must be > 0.
1857      * @param altEnd Genomic end position (1-based, inclusive) of the alternate allele.  Must be > 0.
1858      * @return {@code true} if replacing the reference with the alternate results in a frameshift.  {@code false} otherwise.
1859      */
isFrameshift(final int startPos, final int refEnd, final int altEnd)1860     public static boolean isFrameshift(final int startPos, final int refEnd, final int altEnd) {
1861 
1862         ParamUtils.isPositive( startPos, "Genomic positions must be > 0." );
1863         ParamUtils.isPositive( refEnd, "Genomic positions must be > 0." );
1864         ParamUtils.isPositive( altEnd, "Genomic positions must be > 0." );
1865 
1866         final int refLength = refEnd - startPos + 1;
1867         final int altLength = altEnd - startPos + 1;
1868 
1869         // We know it's a frameshift if we have a replacement that is not of a
1870         // length evenly divisible by 3 because that's how many bases are read at once:
1871         return ((Math.abs( refLength - altLength ) % 3) != 0);
1872     }
1873 
1874     /**
1875      * Determines whether the given reference and alternate alleles constitute an insertion mutation.
1876      * @param reference The reference {@link Allele}.   Must not be {@code null}.
1877      * @param alternate The alternate / variant {@link Allele}.   Must not be {@code null}.
1878      * @return {@code true} if replacing the reference with the alternate results in an insertion.  {@code false} otherwise.
1879      */
isInsertion(final Allele reference, final Allele alternate)1880     public static boolean isInsertion(final Allele reference, final Allele alternate) {
1881 
1882         Utils.nonNull(reference);
1883         Utils.nonNull(alternate);
1884 
1885         // If we have more bases in the alternate, we have an insertion:
1886         return reference.length() < alternate.length();
1887     }
1888 
1889     /**
1890      * Determines whether the given reference and alternate alleles constitute an insertion mutation.
1891      * @param reference A {@link String} containing the bases of the reference allele.  Must not be {@code null}.
1892      * @param alternate A {@link String} containing the bases of the alternate / variant allele.  Must not be {@code null}.
1893      * @return {@code true} if replacing the reference with the alternate results in an insertion.  {@code false} otherwise.
1894      */
isInsertion(final String reference, final String alternate)1895     public static boolean isInsertion(final String reference, final String alternate) {
1896 
1897         Utils.nonNull(reference);
1898         Utils.nonNull(alternate);
1899 
1900         final String refComparable = reference.replaceAll("\\*", "");
1901         final String altComperable = alternate.replaceAll("\\*", "");
1902 
1903         // If we have more bases in the alternate, we have an insertion:
1904         return refComparable.length() < altComperable.length();
1905     }
1906 
1907     /**
1908      * Determines whether the given reference and alternate alleles constitute a deletion mutation.
1909      * @param reference A {@link String} containing the bases of the reference allele.  Must not be {@code null}.
1910      * @param alternate A {@link String} containing the bases of the alternate / variant allele.  Must not be {@code null}.
1911      * @return {@code true} if replacing the reference with the alternate results in a deletion.  {@code false} otherwise.
1912      */
isDeletion(final String reference, final String alternate)1913     public static boolean isDeletion(final String reference, final String alternate) {
1914 
1915         Utils.nonNull(reference);
1916         Utils.nonNull(alternate);
1917 
1918         final String refComparable = reference.replaceAll("\\*", "");
1919         final String altComperable = alternate.replaceAll("\\*", "");
1920 
1921         // If we have fewer bases in the alternate, we have a deletion:
1922         return refComparable.length() > altComperable.length();
1923     }
1924 
1925     /**
1926      * Determines whether the given reference and alternate alleles constitute a deletion mutation.
1927      * @param reference The reference {@link Allele}.  Must not be {@code null}.
1928      * @param alternate The alternate / variant {@link Allele}.  Must not be {@code null}.
1929      * @return {@code true} if replacing the reference with the alternate results in a deletion.  {@code false} otherwise.
1930      */
isDeletion(final Allele reference, final Allele alternate)1931     public static boolean isDeletion(final Allele reference, final Allele alternate) {
1932 
1933         Utils.nonNull(reference);
1934         Utils.nonNull(alternate);
1935 
1936         // If we have fewer bases in the alternate, we have a deletion:
1937         return reference.length() > alternate.length();
1938     }
1939 
1940     /**
1941      * Determines whether the given reference and alternate alleles constitute an insertion or deletion mutation.
1942      * @param reference A {@link String} containing the bases of the reference allele.  Must not be {@code null}.
1943      * @param alternate A {@link String} containing the bases of the alternate / variant allele.  Must not be {@code null}.
1944      * @return {@code true} if replacing the reference with the alternate results in an insertion or deletion.  {@code false} otherwise.
1945      */
isIndel(final String reference, final String alternate)1946     public static boolean isIndel(final String reference, final String alternate) {
1947 
1948         Utils.nonNull(reference);
1949         Utils.nonNull(alternate);
1950 
1951         final String refComparable = reference.replaceAll("\\*", "");
1952         final String altComperable = alternate.replaceAll("\\*", "");
1953 
1954         // If we do not have the same number of bases in the reference and alternate alleles,
1955         // then we have an indel:
1956         return refComparable.length() != altComperable.length();
1957     }
1958 
1959     /**
1960      * Determines whether the given reference and alternate alleles constitute an insertion or deletion mutation.
1961      * @param reference The reference {@link Allele}.  Must not be {@code null}.
1962      * @param alternate The alternate / variant {@link Allele}.  Must not be {@code null}.
1963      * @return {@code true} if replacing the reference with the alternate results in an insertion or deletion.  {@code false} otherwise.
1964      */
isIndel(final Allele reference, final Allele alternate)1965     public static boolean isIndel(final Allele reference, final Allele alternate) {
1966 
1967         Utils.nonNull(reference);
1968         Utils.nonNull(alternate);
1969 
1970         // If we do not have the same number of bases in the reference and alternate alleles,
1971         // then we have an indel:
1972         return reference.length() != alternate.length();
1973     }
1974 
1975     /**
1976      * Determines whether the given reference and alternate alleles constitute a polymorphism in one or more nucleotides (XNP).
1977      * @param reference The reference {@link Allele}.  Must not be {@code null}.
1978      * @param alternate The alternate / variant {@link Allele}.  Must not be {@code null}.
1979      * @return {@code true} if replacing the reference with the alternate results in an XNP.  {@code false} otherwise.
1980      */
isXnp(final Allele reference, final Allele alternate)1981     public static boolean isXnp(final Allele reference, final Allele alternate) {
1982 
1983         Utils.nonNull(reference);
1984         Utils.nonNull(alternate);
1985 
1986         // If we have an equal number of bases in the reference and the alternate, we have an ONP:
1987         return (reference.length() == alternate.length()) && (!reference.equals(alternate));
1988     }
1989 
1990     /**
1991      * Determines whether the given reference and alternate alleles constitute a polymorphism in one or more nucleotides (XNP).
1992      * @param reference A {@link String} containing the bases of the reference allele.  Must not be {@code null}.
1993      * @param alternate A {@link String} containing the bases of the alternate / variant allele.  Must not be {@code null}.
1994      * @return {@code true} if replacing the reference with the alternate results in an XNP.  {@code false} otherwise.
1995      */
isXnp(final String reference, final String alternate)1996     public static boolean isXnp(final String reference, final String alternate) {
1997 
1998         Utils.nonNull(reference);
1999         Utils.nonNull(alternate);
2000 
2001         final String refComparable = reference.replaceAll("\\*", "");
2002         final String altComperable = alternate.replaceAll("\\*", "");
2003 
2004         // If we have an equal number of bases in the reference and the alternate, we have an ONP:
2005         return ((refComparable.length() == altComperable.length()) && (!refComparable.equals(altComperable)));
2006     }
2007 
2008     /**
2009      * @param vc {@link VariantContext to test}
2010      * @return true if the only alternate allele for this VariantContext is a spanning deletion, otherwise false.
2011      */
isSpanningDeletionOnly(final VariantContext vc)2012     public static boolean isSpanningDeletionOnly(final VariantContext vc){
2013         return vc.getAlternateAlleles().size() == 1 && GATKVCFConstants.isSpanningDeletion(vc.getAlternateAllele(0));
2014     }
2015 
2016     /**
2017      *
2018      * Attempt to match allele ref/alt pairs, even if the allele pairs in the given variant contexts are equivalent,
2019      *  but not exact.
2020      *
2021      * For example, if variant 1 and variant2 have the same position, but
2022      * Variant 1: "A", T", "C"
2023      * Variant 2: "ACCAGGCCCAGCTCATGCTTCTTTGCAGCCTCT", "TCCAGGCCCAGCTCATGCTTCTTTGCAGCCTCT", "A", "AC"
2024      *
2025      * Then the returned array would be:  {0, -1}
2026      * Since A>T matches in variant1 matches the the first alt allele in variant 2.  And A>C does not match any allele
2027      *  in variant 2.
2028      *
2029      * Do not use this method for doing a full split of a variant context into biallelic components.  This method
2030      *  ignores (and drops) genotypes and INFO attributes.  It is really meant just for alleles, but works on a
2031      *  VariantContext to better leverage existing code in
2032      *  {@link GATKVariantContextUtils#trimAlleles(VariantContext, boolean, boolean)}
2033      *
2034      * @param variant1 Never {@code null}
2035      * @param variant2 Never {@code null}
2036      * @return Returns indexes into the alternate alleles of variant2.  Note that this method assumes that (when biallelic) the variant
2037      *  contexts are already trimmed.  See {@link GATKVariantContextUtils#trimAlleles(VariantContext, boolean, boolean)}
2038      *  Never {@code null}.  The length will be equal to the number of alternate alleles in variant1.  A value of -1
2039      *  indicates no match in variant2.  If the reference alleles do not match, the output array will be populated
2040      *  exclusively with -1.
2041      */
matchAllelesOnly(final VariantContext variant1, final VariantContext variant2)2042     public static int[] matchAllelesOnly(final VariantContext variant1, final VariantContext variant2) {
2043         Utils.nonNull(variant1);
2044         Utils.nonNull(variant2);
2045 
2046         // Grab the trivial case:
2047         if (variant1.isBiallelic() && variant2.isBiallelic()) {
2048             if (variant1.getAlternateAllele(0).equals(variant2.getAlternateAllele(0)) &&
2049                     (variant1.getReference().equals(variant2.getReference()))) {
2050                 return new int[]{0};
2051             } else {
2052                 return new int[]{-1};
2053             }
2054         }
2055 
2056         // Handle the case where one or both of the input VCs are not biallelic.
2057         final int[] result = new int[variant1.getAlternateAlleles().size()];
2058 
2059         // First split (and trim) all variant contexts into biallelics.  We are only going ot be interested in the alleles.
2060         final List<VariantContext> splitVariants1 = simpleSplitIntoBiallelics(variant1);
2061         final List<VariantContext> splitVariants2 = simpleSplitIntoBiallelics(variant2);
2062 
2063         // Second, match on ref and alt.  If match occurs add it to the output list.
2064         for (int i = 0; i < splitVariants1.size(); i++) {
2065             result[i] = -1;
2066             for (int j = 0; j < splitVariants2.size(); j++) {
2067                 final VariantContext splitVariant1 = splitVariants1.get(i);
2068                 final VariantContext splitVariant2 = splitVariants2.get(j);
2069                 if (splitVariant1.getAlternateAllele(0).equals(splitVariant2.getAlternateAllele(0))
2070                         && splitVariant1.getReference().equals(splitVariant2.getReference())) {
2071                     result[i] = j;
2072                 }
2073             }
2074         }
2075 
2076         return result;
2077     }
2078 
2079     /**
2080      * Do not use this method for doing a full split of a variant context into biallelic components.  This method
2081      *  ignores (and drops) genotypes and INFO attributes.  It is really meant just for alleles, but works on a
2082      *  VariantContext to better leverage existing code in
2083      *  {@link GATKVariantContextUtils#trimAlleles(VariantContext, boolean, boolean)}
2084      *
2085      * This method is trying to be fast, otherwise.
2086      *
2087      * @param vc variant context to split into simple biallelics.  Never {@code null}
2088      * @return a list of variant contexts.  Each will be biallelic.  Length will be the number of alt alleles in the input vc.
2089      * Note that the variant contexts are usually stripped of attributes and genotypes.  Never {@code null}.  Empty list
2090      * if variant context has no alt alleles.
2091      */
simpleSplitIntoBiallelics(final VariantContext vc)2092     private static List<VariantContext> simpleSplitIntoBiallelics(final VariantContext vc) {
2093         Utils.nonNull(vc);
2094         final List<VariantContext> result = new ArrayList<>();
2095 
2096         if (vc.isBiallelic()) {
2097             return Collections.singletonList(vc);
2098         } else {
2099             // Since variant context builders are slow to keep re-creating.  Just create one and spew variant contexts from it, since
2100             //  the only difference will be the alternate allele.  Initialize the VCB with a dummy alternate allele,
2101             //  since it will be overwritten in all cases.
2102             final VariantContextBuilder vcb = new VariantContextBuilder("SimpleSplit", vc.getContig(), vc.getStart(), vc.getEnd(),
2103                     Arrays.asList(vc.getReference(), Allele.NO_CALL));
2104             vc.getAlternateAlleles().forEach(allele -> result.add(GATKVariantContextUtils.trimAlleles(
2105                     vcb.alleles(Arrays.asList(vc.getReference(), allele)).make(true), true, true)
2106                     )
2107             );
2108         }
2109 
2110         return result;
2111     }
2112 
2113     /**
2114      * Returns true if the context represents a MNP. If the context supplied contains the GVCF NON_REF symbolic
2115      * allele, then the determination is performed after discarding the NON_REF symbolic allele.
2116      *
2117      * @param vc a Variant context
2118      * @return true if the context represents an unmixed MNP (i.e. all alleles are non-symbolic and of the same
2119      *         length > 1), false otherwise
2120      */
isUnmixedMnpIgnoringNonRef(final VariantContext vc)2121     public static boolean isUnmixedMnpIgnoringNonRef(final VariantContext vc) {
2122         final List<Allele> alleles = vc.getAlleles();
2123         int length = vc.getReference().length(); // ref allele cannot be symbolic
2124         if (length < 2) { return false; }
2125 
2126         for (final Allele a : alleles) {
2127             if (a.isSymbolic() && !Allele.NON_REF_ALLELE.equals(a)) { return false; }
2128             else if (!a.isSymbolic() && a.length() != length) { return false; }
2129         }
2130 
2131         return true;
2132     }
2133 
removeDataForSymbolicAltAlleles(VariantContext vc, List<T> data)2134     public static <T> List<T> removeDataForSymbolicAltAlleles(VariantContext vc, List<T> data) {
2135         return removeDataForSymbolicAlleles(vc, data, false);
2136     }
2137 
removeDataForSymbolicAlleles(VariantContext vc, List<T> data)2138     public static <T> List<T> removeDataForSymbolicAlleles(VariantContext vc, List<T> data) {
2139         return removeDataForSymbolicAlleles(vc, data, true);
2140     }
2141 
removeDataForSymbolicAlleles(VariantContext vc, List<T> data, boolean dataContainsReference)2142     protected static <T> List<T> removeDataForSymbolicAlleles(VariantContext vc, List<T> data, boolean dataContainsReference) {
2143         if (vc.hasSymbolicAlleles()) {
2144             List<Allele> symbolicAlleles = vc.getAlternateAlleles().stream().filter(allele -> allele.isSymbolic()).collect(Collectors.toList());
2145             // convert allele index to index for data
2146             int offset = dataContainsReference ? 0 : 1;
2147             List<Integer> symAltIndexes = vc.getAlleleIndices(symbolicAlleles).stream().map(i -> i-offset).collect(Collectors.toList());
2148             return removeItemsByIndex(data, symAltIndexes);
2149         } else {
2150             return data;
2151         }
2152     }
2153 
removeItemsByIndex(List<T> data, List<Integer> indexesToRemove)2154     public static <T> List<T> removeItemsByIndex(List<T> data, List<Integer> indexesToRemove) {
2155         List<T> updated = new ArrayList<>();
2156         new IndexRange(0, data.size()).forEach(i -> {
2157             if (!indexesToRemove.contains(i)) {
2158                 updated.add(data.get(i));
2159             }
2160         });
2161         return updated;
2162     }
2163 
2164 
2165 }
2166 
2167