1 package org.broadinstitute.hellbender.utils.variant; 2 3 import htsjdk.samtools.SAMSequenceDictionary; 4 import htsjdk.samtools.util.Locatable; 5 import htsjdk.tribble.TribbleException; 6 import htsjdk.utils.ValidationUtils; 7 import htsjdk.variant.variantcontext.*; 8 import htsjdk.variant.variantcontext.writer.Options; 9 import htsjdk.variant.variantcontext.writer.VariantContextWriter; 10 import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; 11 import htsjdk.variant.vcf.*; 12 import org.apache.commons.io.FilenameUtils; 13 import org.apache.commons.lang3.ArrayUtils; 14 import org.apache.commons.lang3.tuple.MutablePair; 15 import org.apache.commons.lang3.tuple.Pair; 16 import org.apache.logging.log4j.LogManager; 17 import org.apache.logging.log4j.Logger; 18 import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils; 19 import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.StrandBiasUtils; 20 import org.broadinstitute.hellbender.exceptions.GATKException; 21 import org.broadinstitute.hellbender.utils.genotyper.GenotypePriorCalculator; 22 import org.broadinstitute.hellbender.tools.walkers.genotyper.*; 23 import org.broadinstitute.hellbender.utils.*; 24 import org.broadinstitute.hellbender.utils.param.ParamUtils; 25 import org.broadinstitute.hellbender.utils.pileup.PileupElement; 26 import org.broadinstitute.hellbender.utils.read.AlignmentUtils; 27 28 import java.io.Serializable; 29 import java.nio.file.Path; 30 import java.util.*; 31 import java.util.function.BiFunction; 32 import java.util.stream.Collectors; 33 import java.util.stream.IntStream; 34 35 public final class GATKVariantContextUtils { 36 37 private static final Logger logger = LogManager.getLogger(GATKVariantContextUtils.class); 38 39 public static final String MERGE_FILTER_PREFIX = "filterIn"; 40 public static final String MERGE_REF_IN_ALL = "ReferenceInAlln"; 41 public static final String MERGE_FILTER_IN_ALL = "FilteredInAll"; 42 public static final String MERGE_INTERSECTION = "Intersection"; 43 44 public static final int DEFAULT_PLOIDY = HomoSapiensConstants.DEFAULT_PLOIDY; 45 46 private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators(); 47 48 public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. 49 isInformative(final double[] gls)50 public static boolean isInformative(final double[] gls) { 51 return MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL; 52 } 53 54 /** 55 * A method that constructs a mutable set of VCF header lines containing the tool name, version, date and command line, 56 * and return to the requesting tool. 57 * @param toolkitShortName short name for the tool kit, e.g. "gatk" 58 * @param toolName name of the tool 59 * @param versionString version of the tool 60 * @param dateTime date and time at invocation of the tool 61 * @param cmdLine command line (e.g. options, flags) when the tool is invoked. 62 * @return A mutable set of VCF header lines containing the tool name, version, date and command line. 63 */ getDefaultVCFHeaderLines(final String toolkitShortName, final String toolName, final String versionString, final String dateTime, final String cmdLine)64 public static Set<VCFHeaderLine> getDefaultVCFHeaderLines(final String toolkitShortName, final String toolName, 65 final String versionString, final String dateTime, 66 final String cmdLine) { 67 final Set<VCFHeaderLine> defaultVCFHeaderLines = new HashSet<>(); 68 final Map<String, String> simpleHeaderLineMap = new HashMap<>(4); 69 simpleHeaderLineMap.put("ID", toolName); 70 simpleHeaderLineMap.put("Version", versionString); 71 simpleHeaderLineMap.put("Date", dateTime); 72 simpleHeaderLineMap.put("CommandLine", cmdLine); 73 defaultVCFHeaderLines.add(new VCFHeaderLine("source", toolName)); 74 defaultVCFHeaderLines.add(new VCFSimpleHeaderLine(String.format("%sCommandLine", toolkitShortName), simpleHeaderLineMap)); 75 return defaultVCFHeaderLines; 76 } 77 78 /** 79 * Creates a VariantContextWriter whose outputFile type is based on the extension of the output file name. 80 * The default options set by VariantContextWriter are cleared before applying ALLOW_MISSING_FIELDS_IN_HEADER (if 81 * <code>lenientProcessing</code> is set), followed by the set of options specified by any <code>options</code> args. 82 * 83 * @param outPath output Path for this writer. May not be null. 84 * @param referenceDictionary required if on the fly indexing is set, otherwise can be null 85 * @param createMD5 true if an md5 file should be created 86 * @param options variable length list of additional Options to be set for this writer 87 * @returns VariantContextWriter must be closed by the caller 88 */ createVCFWriter( final Path outPath, final SAMSequenceDictionary referenceDictionary, final boolean createMD5, final Options... options)89 public static VariantContextWriter createVCFWriter( 90 final Path outPath, 91 final SAMSequenceDictionary referenceDictionary, 92 final boolean createMD5, 93 final Options... options) 94 { 95 Utils.nonNull(outPath); 96 97 VariantContextWriterBuilder vcWriterBuilder = 98 new VariantContextWriterBuilder().clearOptions().setOutputPath(outPath); 99 100 if (VariantContextWriterBuilder.OutputType.UNSPECIFIED == VariantContextWriterBuilder.determineOutputTypeFromFile(outPath)) { 101 // the only way the user has to specify an output type is by file extension, and htsjdk 102 // throws if it can't map the file extension to a known vcf type, so fallback to a default 103 // of VCF 104 logger.warn(String.format( 105 "Can't determine output variant file format from output file extension \"%s\". Defaulting to VCF.", 106 FilenameUtils.getExtension(outPath.getFileName().toString()))); 107 vcWriterBuilder = vcWriterBuilder.setOutputFileType(VariantContextWriterBuilder.OutputType.VCF); 108 } 109 110 if (createMD5) { 111 vcWriterBuilder.setCreateMD5(); 112 } 113 114 if (null != referenceDictionary) { 115 vcWriterBuilder = vcWriterBuilder.setReferenceDictionary(referenceDictionary); 116 } 117 118 for (Options opt : options) { 119 vcWriterBuilder = vcWriterBuilder.setOption(opt); 120 } 121 122 return vcWriterBuilder.build(); 123 } 124 125 /** 126 * Diploid NO_CALL allele list... 127 * 128 * @deprecated you should use {@link #noCallAlleles(int)} instead. It indicates the presence of a hardcoded diploid assumption which is bad. 129 */ 130 @Deprecated 131 public final static List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); 132 hasPLIncompatibleAlleles(final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2)133 private static boolean hasPLIncompatibleAlleles(final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2) { 134 final Iterator<Allele> it1 = alleleSet1.iterator(); 135 final Iterator<Allele> it2 = alleleSet2.iterator(); 136 137 while ( it1.hasNext() && it2.hasNext() ) { 138 final Allele a1 = it1.next(); 139 final Allele a2 = it2.next(); 140 if ( ! a1.equals(a2) ) 141 return true; 142 } 143 144 // by this point, at least one of the iterators is empty. All of the elements 145 // we've compared are equal up until this point. But it's possible that the 146 // sets aren't the same size, which is indicated by the test below. If they 147 // are of the same size, though, the sets are compatible 148 return it1.hasNext() || it2.hasNext(); 149 } 150 151 /** 152 * Does an allele match any in a list of alleles? 153 * We don't assume that ref1/alt1 are in their minimal representation 154 * @param ref1 155 * @param alt1 156 * @param ref2 157 * @param altList 158 * @return 159 */ isAlleleInList(final Allele ref1, final Allele alt1, final Allele ref2, final List<Allele> altList)160 public static boolean isAlleleInList(final Allele ref1, final Allele alt1, final Allele ref2, final List<Allele> altList) { 161 final Allele commonRef; 162 if (ref1.equals(ref2)) { 163 return altList.contains(alt1); 164 } else { 165 commonRef = determineReferenceAllele(ref1, ref2); 166 } 167 final Map<Allele, Allele> alleleMap; 168 if (ref1.equals(commonRef)) { 169 alleleMap = GATKVariantContextUtils.createAlleleMapping(commonRef, ref2, altList); 170 return alleleMap.values().contains(alt1); 171 } else if (ref2.equals(commonRef)) { 172 alleleMap = GATKVariantContextUtils.createAlleleMapping(commonRef, ref1, Arrays.asList(alt1)); 173 return altList.contains(alleleMap.get(alt1)); 174 } else { 175 throw new IllegalStateException("Reference alleles " + ref1 + " and " + ref2 + " have common reference allele " + commonRef + " which is equal to neither."); 176 } 177 } 178 179 /** 180 * Determines the common reference allele 181 * 182 * @param VCs the list of VariantContexts 183 * @param loc if not null, ignore records that do not begin at this start location 184 * @return possibly null Allele 185 */ determineReferenceAllele(final List<VariantContext> VCs, final Locatable loc)186 public static Allele determineReferenceAllele(final List<VariantContext> VCs, final Locatable loc) { 187 Allele ref = null; 188 189 for ( final VariantContext vc : VCs ) { 190 if ( contextMatchesLoc(vc, loc) ) { 191 final Allele myRef = vc.getReference(); 192 try { 193 ref = determineReferenceAllele(ref, myRef); 194 } catch (IllegalStateException e) { 195 throw new IllegalStateException(String.format("The provided variant file(s) have inconsistent references " + 196 "for the same position(s) at %s:%d, %s vs. %s", vc.getContig(), vc.getStart(), ref, myRef)); 197 } 198 } 199 } 200 return ref; 201 } 202 determineReferenceAllele(final Allele ref1, final Allele ref2)203 public static Allele determineReferenceAllele(final Allele ref1, final Allele ref2) { 204 if ( ref1 == null || ref1.length() < ref2.length() ) { 205 return ref2; 206 } else if ( ref2 == null || ref2.length() < ref1.length()) { 207 return ref1; 208 } 209 else if ( ref1.length() == ref2.length() && ! ref1.equals(ref2) ) { 210 throw new IllegalStateException(String.format("The provided reference alleles do not appear to represent the same position, %s vs. %s", ref1, ref2)); 211 } else { //the lengths are the same and they're equal, so we could return ref1 or ref2 212 return ref1; 213 } 214 } 215 216 /** 217 * Calculates the total ploidy of a variant context as the sum of all plodies across genotypes. 218 * @param vc the target variant context. 219 * @param defaultPloidy the default ploidy to be assume when there is no ploidy information for a genotype. 220 * @return never {@code null}. 221 */ totalPloidy(final VariantContext vc, final int defaultPloidy)222 public static int totalPloidy(final VariantContext vc, final int defaultPloidy) { 223 Utils.nonNull(vc, "the vc provided cannot be null"); 224 Utils.validateArg(defaultPloidy >= 0, "the default ploidy must 0 or greater"); 225 return vc.getGenotypes().stream().mapToInt(Genotype::getPloidy) 226 .map(p -> p <= 0 ? defaultPloidy : p).sum(); 227 } 228 229 /** 230 * Returns the type of the substitution (transition vs transversion) represented by this variant. Only applicable to bi allelic SNPs. 231 */ getSNPSubstitutionType(final VariantContext context)232 public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(final VariantContext context) { 233 Utils.nonNull(context); 234 if (!context.isSNP() || !context.isBiallelic()) { 235 throw new IllegalArgumentException("Requested SNP substitution type for bialleic non-SNP " + context); 236 } 237 return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); 238 } 239 240 /** 241 * If this is a BiAllelic SNP, is it a transition? 242 */ isTransition(final VariantContext context)243 public static boolean isTransition(final VariantContext context) { 244 Utils.nonNull(context); 245 return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; 246 } 247 248 249 /** 250 * Returns a homozygous call allele list given the only allele and the ploidy. 251 * 252 * @param allele the only allele in the allele list. 253 * @param ploidy the ploidy of the resulting allele list. 254 * 255 * @throws IllegalArgumentException if {@code allele} is {@code null} or ploidy is negative. 256 * 257 * @return never {@code null}. 258 */ homozygousAlleleList(final Allele allele, final int ploidy)259 public static List<Allele> homozygousAlleleList(final Allele allele, final int ploidy) { 260 Utils.nonNull(allele); 261 Utils.validateArg(ploidy >= 0, "ploidy cannot be negative"); 262 263 // Use a tailored inner class to implement the list: 264 return Collections.nCopies(ploidy,allele); 265 } 266 267 /** 268 * Add the genotype call (GT) field to GenotypeBuilder using the requested {@link GenotypeAssignmentMethod} 269 * 270 * @param gb the builder where we should put our newly called alleles, cannot be null 271 * @param assignmentMethod the method to use to do the assignment, cannot be null 272 * @param genotypeLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null 273 * @param allelesToUse the alleles with respect to which the likelihoods are defined 274 */ makeGenotypeCall(final int ploidy, final GenotypeBuilder gb, final GenotypeAssignmentMethod assignmentMethod, final double[] genotypeLikelihoods, final List<Allele> allelesToUse, final List<Allele> originalGT, final GenotypePriorCalculator gpc)275 public static void makeGenotypeCall(final int ploidy, 276 final GenotypeBuilder gb, 277 final GenotypeAssignmentMethod assignmentMethod, 278 final double[] genotypeLikelihoods, 279 final List<Allele> allelesToUse, 280 final List<Allele> originalGT, 281 final GenotypePriorCalculator gpc) { 282 if(originalGT == null && assignmentMethod == GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL) { 283 throw new IllegalArgumentException("original GT cannot be null if assignmentMethod is BEST_MATCH_TO_ORIGINAL"); 284 } 285 if (assignmentMethod == GenotypeAssignmentMethod.SET_TO_NO_CALL) { 286 gb.alleles(noCallAlleles(ploidy)).noGQ(); 287 } else if (assignmentMethod == GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN) { 288 if ( genotypeLikelihoods == null || !isInformative(genotypeLikelihoods) ) { 289 gb.alleles(noCallAlleles(ploidy)).noGQ(); 290 } else { 291 final int maxLikelihoodIndex = MathUtils.maxElementIndex(genotypeLikelihoods); 292 final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, allelesToUse.size()); 293 final GenotypeAlleleCounts alleleCounts = glCalc.genotypeAlleleCountsAt(maxLikelihoodIndex); 294 295 final List<Allele> finalAlleles = alleleCounts.asAlleleList(allelesToUse); 296 if (finalAlleles.contains(Allele.NON_REF_ALLELE)) { 297 gb.alleles(GATKVariantContextUtils.noCallAlleles(ploidy)); 298 gb.PL(new int[genotypeLikelihoods.length]); 299 } else { 300 gb.alleles(finalAlleles); 301 } 302 final int numAltAlleles = allelesToUse.size() - 1; 303 if ( numAltAlleles > 0 ) { 304 gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(maxLikelihoodIndex, genotypeLikelihoods)); 305 } 306 } 307 } else if (assignmentMethod == GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS) { 308 gb.alleles(noCallAlleles(ploidy)).noGQ().noAD().noPL().noAttributes(); 309 } else if (assignmentMethod == GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL) { 310 final List<Allele> best = new LinkedList<>(); 311 final Allele ref = allelesToUse.get(0); 312 for (final Allele originalAllele : originalGT) { 313 best.add((allelesToUse.contains(originalAllele) || originalAllele.isNoCall()) ? originalAllele : ref); 314 } 315 gb.alleles(best); 316 } else if (assignmentMethod == GenotypeAssignmentMethod.USE_POSTERIOR_PROBABILITIES) { 317 if (gpc == null) { 318 throw new GATKException("cannot uses posteriors without an genotype prior calculator present"); 319 } else { 320 // Calculate posteriors. 321 final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, allelesToUse.size()); 322 final double[] log10Priors = gpc.getLog10Priors(glCalc, allelesToUse); 323 final double[] log10Posteriors = MathUtils.ebeAdd(log10Priors, genotypeLikelihoods); 324 final double[] normalizedLog10Posteriors = MathUtils.scaleLogSpaceArrayForNumericalStability(log10Posteriors); 325 // Update GP and PG annotations: 326 gb.attribute(VCFConstants.GENOTYPE_POSTERIORS_KEY, Arrays.stream(normalizedLog10Posteriors) 327 .map(v -> v == 0.0 ? 0.0 : v * -10) // the reason for the == 0.0 is to avoid a signed 0 output "-0.0" 328 .mapToObj(GATKVariantContextUtils::formatGP).toArray()); 329 gb.attribute(GATKVCFConstants.GENOTYPE_PRIOR_KEY, Arrays.stream(log10Priors) 330 .map(v -> v == 0.0 ? 0.0 : v * -10) 331 .mapToObj(GATKVariantContextUtils::formatGP).toArray()); 332 // Set the GQ accordingly 333 final int maxPosteriorIndex = MathUtils.maxElementIndex(log10Posteriors); 334 if ( allelesToUse.size() > 0 ) { 335 gb.log10PError(getGQLog10FromPosteriors(maxPosteriorIndex, normalizedLog10Posteriors)); 336 } 337 // Finally we update the genotype alleles. 338 gb.alleles(glCalc.genotypeAlleleCountsAt(maxPosteriorIndex).asAlleleList(allelesToUse)); 339 } 340 } 341 } 342 getGQLog10FromPosteriors(final int bestGenotypeIndex, final double[] log10Posteriors)343 private static double getGQLog10FromPosteriors(final int bestGenotypeIndex, final double[] /**/log10Posteriors) { 344 if (bestGenotypeIndex < 0) { 345 return CommonInfo.NO_LOG10_PERROR; 346 } else { 347 switch (log10Posteriors.length) { 348 case 0: 349 case 1: return CommonInfo.NO_LOG10_PERROR; 350 case 2: return bestGenotypeIndex == 0 ? log10Posteriors[1] : log10Posteriors[0]; 351 case 3: return Math.min(0, MathUtils.log10SumLog10( 352 log10Posteriors[ bestGenotypeIndex == 0 ? 2 : bestGenotypeIndex - 1], 353 log10Posteriors[ bestGenotypeIndex == 2 ? 0 : bestGenotypeIndex + 1])); 354 default: 355 if (bestGenotypeIndex == 0) { 356 return MathUtils.log10SumLog10(log10Posteriors, 1, log10Posteriors.length); 357 } else if (bestGenotypeIndex == log10Posteriors.length - 1) { 358 return MathUtils.log10SumLog10(log10Posteriors, 0, bestGenotypeIndex); 359 } else { 360 return Math.min(0.0, MathUtils.log10SumLog10( 361 MathUtils.log10sumLog10(log10Posteriors, 0, bestGenotypeIndex), 362 MathUtils.log10sumLog10(log10Posteriors, bestGenotypeIndex + 1, log10Posteriors.length) 363 )); 364 } 365 } 366 } 367 } 368 formatGP(final double gp)369 private static String formatGP(final double gp) { 370 final String formatted = String.format("%.2f", gp); 371 int last = formatted.length() - 1; 372 if (formatted.charAt(last) == '0') { 373 if (formatted.charAt(--last) == '0') { 374 return formatted.substring(0, --last); // exclude the '.' as-well. 375 } else { 376 return formatted.substring(0, ++last); 377 } 378 } else { 379 return formatted; 380 } 381 } 382 383 /** 384 * 385 * @param ploidy 386 * @param allele 387 * @return (return a single no-call allele for ploidy 0, as on Y for females) 388 */ makePloidyLengthAlleleList(final int ploidy, final Allele allele)389 public static List<Allele> makePloidyLengthAlleleList(final int ploidy, final Allele allele) { 390 if (ploidy == 0) { 391 return Arrays.asList(Allele.NO_CALL); 392 } 393 final List<Allele> repeatedList = new ArrayList<>(); 394 for (int i = 0; i < ploidy; i++) { 395 repeatedList.add(allele); 396 } 397 return repeatedList; 398 } 399 makeGenotypeCall(final int ploidy, final GenotypeBuilder gb, final GenotypeAssignmentMethod assignmentMethod, final double[] genotypeLikelihoods, final List<Allele> allelesToUse, final GenotypePriorCalculator gpc)400 public static void makeGenotypeCall(final int ploidy, 401 final GenotypeBuilder gb, 402 final GenotypeAssignmentMethod assignmentMethod, 403 final double[] genotypeLikelihoods, 404 final List<Allele> allelesToUse, 405 final GenotypePriorCalculator gpc){ 406 makeGenotypeCall(ploidy,gb,assignmentMethod,genotypeLikelihoods,allelesToUse,null, gpc); 407 } 408 409 /** 410 * Return the rightmost variant context in maybeOverlapping that overlaps curPos 411 * 412 * @param curPos non-null genome loc 413 * @param maybeOverlapping a collection of variant contexts that might overlap curPos 414 * @return the rightmost VariantContext, or null if none overlaps 415 */ getOverlappingVariantContext(final Locatable curPos, final Collection<VariantContext> maybeOverlapping)416 public static VariantContext getOverlappingVariantContext(final Locatable curPos, final Collection<VariantContext> maybeOverlapping) { 417 VariantContext overlaps = null; 418 for ( final VariantContext vc : maybeOverlapping ) { 419 if ( curPos.overlaps(vc) ) { 420 if ( overlaps == null || vc.getStart() > overlaps.getStart() ) { 421 overlaps = vc; 422 } 423 } 424 } 425 return overlaps; 426 } 427 428 /** 429 * Determines whether the provided VariantContext has real alternate alleles. 430 * 431 * @param vc the VariantContext to evaluate 432 * @return true if it has proper alternate alleles, false otherwise 433 */ isProperlyPolymorphic(final VariantContext vc)434 public static boolean isProperlyPolymorphic(final VariantContext vc) { 435 //obvious cases 436 if (vc == null || vc.getAlternateAlleles().isEmpty()) { 437 return false; 438 } else if (vc.isBiallelic()) { 439 return !(GATKVCFConstants.isSpanningDeletion(vc.getAlternateAllele(0)) || vc.isSymbolic()); 440 } else if (GATKVCFConstants.isSpanningDeletion(vc.getAlternateAllele(0)) && vc.getAlternateAllele(1).equals(Allele.NON_REF_ALLELE)){ 441 return false; 442 } else { 443 return true; 444 } 445 } 446 447 /** This is lifted directly from htsjdk with some minor modifications! However, it is a private method there. 448 * 449 * This method cannot return {@link VariantContext.Type} MIXED 450 * 451 * Please see https://github.com/samtools/htsjdk/issues/999 452 * 453 * <p>Here are some cases that will not work properly, though this may not be an issue in practice: </p> 454 * <ul> 455 * <li>"CGT" --> "GGA" this will be a MNP, but really it is two SNPs.</li> 456 * <li>Spanning deletions for alternate will show as {@link VariantContext.Type} NO_VARIATION</li> 457 * <li>Spanning deletions for reference will throw exception. </li> 458 * <li>Reference that is symbolic will throw an exception.</li> 459 * </ul> 460 * 461 * @param ref reference allele. Never {@code null} 462 * @param allele alternate allele to compare. Never {@code null} 463 * @return 464 */ typeOfVariant(final Allele ref, final Allele allele)465 public static VariantContext.Type typeOfVariant(final Allele ref, final Allele allele) { 466 Utils.nonNull(ref); 467 Utils.nonNull(allele); 468 469 if ( ref.isSymbolic() ) 470 throw new IllegalStateException("Unexpected error: encountered a record with a symbolic reference allele"); 471 472 if ( allele.isSymbolic() ) 473 return VariantContext.Type.SYMBOLIC; 474 475 if (allele.equals(Allele.SPAN_DEL)) { 476 return VariantContext.Type.NO_VARIATION; 477 } 478 479 if ( ref.equals(Allele.SPAN_DEL) ) 480 throw new IllegalStateException("Unexpected error: encountered a record with a spanning deletion reference allele"); 481 482 if ( ref.length() == allele.length() ) { 483 if (ref.basesMatch(allele)) { 484 return VariantContext.Type.NO_VARIATION; 485 } else if ( allele.length() == 1 ) 486 return VariantContext.Type.SNP; 487 488 // If the two alleles are the same length and only differ by one base, then still a SNP. 489 else if (IntStream.range(0, ref.length()).filter(i -> ref.getBases()[i] != allele.getBases()[i]).count() == 1) { 490 return VariantContext.Type.SNP; 491 } else 492 return VariantContext.Type.MNP; 493 } 494 495 // Important note: previously we were checking that one allele is the prefix of the other. However, that's not an 496 // appropriate check as can be seen from the following example: 497 // REF = CTTA and ALT = C,CT,CA 498 // This should be assigned the INDEL type but was being marked as a MIXED type because of the prefix check. 499 // In truth, it should be absolutely impossible to return a MIXED type from this method because it simply 500 // performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type 501 // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point 502 // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. 503 504 return VariantContext.Type.INDEL; 505 506 // old incorrect logic: 507 // if (oneIsPrefixOfOther(ref, allele)) 508 // return Type.INDEL; 509 // else 510 // return Type.MIXED; 511 } 512 513 /** 514 * This method should only be run on variants that are known to be indels. See {@code typeOfVariant} 515 * 516 *<p>Here are some cases that will not work properly, though this may not be an issue in practice: </p> 517 * <ul> 518 * <li>"CT" --> "CATT" this is really just a simple AT insertion, but this will show up as complex.</li> 519 * </ul> 520 * @param ref reference allele. Never {@code null} 521 * @param allele alternate allele to compare. Never {@code null} 522 * @return true if the indel is complex (for example, also includes a SNP), false if simple indel. If the input alleles define a variant that is not 523 * an indel, then the behavior of this method is undefined (though will probably just return false). 524 * 525 */ isComplexIndel(final Allele ref, final Allele allele)526 public static boolean isComplexIndel(final Allele ref, final Allele allele) { 527 528 Utils.nonNull(ref); 529 Utils.nonNull(allele); 530 531 // Symbolic --> false 532 if (ref.isSymbolic() || (ref.length() == 0)) { 533 return false; 534 } 535 if (allele.isSymbolic() || (allele.length() == 0)) { 536 return false; 537 } 538 539 // SNP, MNP, or no variation --> false 540 if ( ref.length() == allele.length() ) { 541 return false; 542 } 543 544 // obvious simple del or simple indel 545 if ((allele.length() == 1) || (ref.length() == 1)) { 546 return false; 547 } 548 549 // If the ref starts with the alt or vice versa, this is still simple. 550 if (allele.length() > ref.length()) { 551 final boolean isAltStartsWithRef = IntStream.range(0, ref.length()).allMatch(i -> ref.getBases()[i] == allele.getBases()[i]); 552 return !isAltStartsWithRef; 553 } else { 554 final boolean isRefStartsWithAlt = IntStream.range(0, allele.length()).allMatch(i -> ref.getBases()[i] == allele.getBases()[i]); 555 return !isRefStartsWithAlt; 556 } 557 } 558 559 /** 560 * Given a set of alleles (reference and alternate), choose the allele that is the best match for the given read (and offset) 561 * TODO: This method cannot recognize equivalent alleles (See https://github.com/broadinstitute/gatk/issues/5061) 562 * @param pileupElement read and offset. Never {@code null} 563 * @param referenceAllele Reference allele. Never {@code null} 564 * @param altAlleles List of candidate alternate alleles. Never {@code null} 565 * @param minBaseQualityCutoff minimum base quality for the bases that match the allele in order to be counted. 566 * Must be positive or zero. If you do not want any filtering, specify 0. 567 * @return The allele (reference or from the altAlleles) that matches. {@code null} if none are a match or the base qualities 568 * corresponding to the allele don't all exceed the minimum. 569 */ chooseAlleleForRead(final PileupElement pileupElement, final Allele referenceAllele, final List<Allele> altAlleles, int minBaseQualityCutoff)570 public static Allele chooseAlleleForRead(final PileupElement pileupElement, final Allele referenceAllele, final List<Allele> altAlleles, int minBaseQualityCutoff) { 571 Utils.nonNull(pileupElement); 572 Utils.nonNull(referenceAllele); 573 Utils.nonNull(altAlleles); 574 ParamUtils.isPositiveOrZero(minBaseQualityCutoff, "Minimum base quality must be positive or zero."); 575 576 final boolean isRef = referenceAllele.basesMatch(getBasesForAlleleInRead(pileupElement, referenceAllele)) 577 && !pileupElement.isBeforeDeletionStart() && !pileupElement.isBeforeInsertion(); 578 579 Allele pileupAllele = null; 580 if (!isRef) { 581 582 for (Allele altAllele : altAlleles) { 583 final VariantContext.Type variantType = typeOfVariant(referenceAllele, altAllele); 584 585 if (variantType == VariantContext.Type.INDEL) { 586 if (isIndelInThePileupElement(pileupElement, referenceAllele, altAllele)) { 587 pileupAllele = altAllele; 588 } 589 590 } else if (variantType == VariantContext.Type.MNP || variantType == VariantContext.Type.SNP) { 591 if (doesReadContainAllele(pileupElement, altAllele) == Trilean.TRUE) { 592 pileupAllele = altAllele; 593 } 594 } 595 } 596 } else { 597 pileupAllele = referenceAllele; 598 } 599 600 if ((pileupAllele != null) && (getMinBaseQualityForAlleleInRead(pileupElement, pileupAllele) < minBaseQualityCutoff)) { 601 pileupAllele = null; 602 } 603 604 return pileupAllele; 605 } 606 isIndelInThePileupElement(final PileupElement pileupElement, final Allele referenceAllele, final Allele altAllele)607 private static boolean isIndelInThePileupElement(final PileupElement pileupElement, final Allele referenceAllele, final Allele altAllele) { 608 boolean isAltAlleleInThePileup = false; 609 610 // Check insertion 611 if (pileupElement.isBeforeInsertion()) { 612 final String insertionBases = pileupElement.getBasesOfImmediatelyFollowingInsertion(); 613 // edge case: ignore a deletion immediately preceding an insertion as p.getBasesOfImmediatelyFollowingInsertion() returns null [EB] 614 if ((insertionBases != null) && (Allele.extend(referenceAllele, insertionBases.getBytes()).basesMatch(altAllele))) { 615 isAltAlleleInThePileup = true; 616 } 617 } else if (pileupElement.isBeforeDeletionStart()) { 618 final int deletionLength = pileupElement.getLengthOfImmediatelyFollowingIndel(); 619 if ((referenceAllele.getBases().length - altAllele.getBases().length) == deletionLength) { 620 isAltAlleleInThePileup = true; 621 } 622 } 623 return isAltAlleleInThePileup; 624 } 625 626 /** 627 * @param pileupElement pileup element representing the read. Never {@code null} 628 * @param allele allele to get overlapping bases in read. Never {@code null} 629 * @return array of the bytes that correspond to the allele in the pileup element. Note that, if the read ends, this 630 * list can be smaller than the length of the allele. 631 */ getBasesForAlleleInRead(final PileupElement pileupElement, final Allele allele)632 private static byte[] getBasesForAlleleInRead(final PileupElement pileupElement, final Allele allele) { 633 Utils.nonNull(pileupElement); 634 Utils.nonNull(allele); 635 return ArrayUtils.subarray(pileupElement.getRead().getBases(), pileupElement.getOffset(), pileupElement.getOffset() + allele.getBases().length); 636 } 637 638 /** 639 * TODO: Test. And make sure to test with reference alleles to make sure "*" is not included. 640 * @param pileupElement pileup element representing the read. Never {@code null} 641 * @param allele query allele. Never {@code null} 642 * @return Whether the read contains the allele. Note that unknown can occur as well. 643 */ doesReadContainAllele(final PileupElement pileupElement, final Allele allele)644 public static Trilean doesReadContainAllele(final PileupElement pileupElement, final Allele allele) { 645 Utils.nonNull(pileupElement); 646 Utils.nonNull(allele); 647 648 final byte[] readBases = ArrayUtils.subarray(pileupElement.getRead().getBases(), pileupElement.getOffset(), pileupElement.getOffset() + allele.getBases().length); 649 650 if (readBases.length < allele.getBases().length) { 651 return Trilean.UNKNOWN; 652 } 653 654 if (allele.basesMatch(readBases)) { 655 return Trilean.TRUE; 656 } else { 657 return Trilean.FALSE; 658 } 659 } 660 661 /** 662 * Find the minimum base quality for all bases in a read that correspond to a given allele. 663 * 664 * @param pileupElement pileup element representing the read. Never {@code null} 665 * @param allele query allele. Never {@code null} 666 * @return lowest base quality seen in the corresponding bases of the read 667 */ getMinBaseQualityForAlleleInRead(final PileupElement pileupElement, final Allele allele)668 private static int getMinBaseQualityForAlleleInRead(final PileupElement pileupElement, final Allele allele) { 669 Utils.nonNull(pileupElement); 670 Utils.nonNull(allele); 671 final byte[] alleleBases = allele.getBases(); 672 final byte[] pileupBaseQualities = ArrayUtils.subarray(pileupElement.getRead().getBaseQualities(), pileupElement.getOffset(), pileupElement.getOffset() + alleleBases.length); 673 final OptionalInt minQuality = IntStream.range(0, pileupBaseQualities.length).map(i -> Byte.toUnsignedInt(pileupBaseQualities[i])).min(); 674 return minQuality.orElse(-1); 675 } 676 containsInlineIndel(final VariantContext vc)677 public static boolean containsInlineIndel(final VariantContext vc) { 678 final List<Allele> alleles = vc.getAlleles(); 679 final int refLength = alleles.get(0).length(); 680 for (int i = 1; i < alleles.size(); i++) { 681 final Allele alt = alleles.get(i); 682 if (!alt.isSymbolic() && alt != Allele.SPAN_DEL && alt.length() != refLength) { 683 return true; 684 } 685 } 686 return false; 687 } 688 689 public enum GenotypeMergeType { 690 /** 691 * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. 692 */ 693 UNIQUIFY, 694 /** 695 * Take genotypes in priority order (see the priority argument). 696 */ 697 PRIORITIZE, 698 /** 699 * Take the genotypes in any order. 700 */ 701 UNSORTED, 702 /** 703 * Require that all samples/genotypes be unique between all inputs. 704 */ 705 REQUIRE_UNIQUE 706 } 707 708 public enum FilteredRecordMergeType { 709 /** 710 * Union - leaves the record if any record is unfiltered. 711 */ 712 KEEP_IF_ANY_UNFILTERED, 713 /** 714 * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. 715 */ 716 KEEP_IF_ALL_UNFILTERED, 717 /** 718 * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. 719 */ 720 KEEP_UNCONDITIONAL 721 } 722 723 /** 724 * Returns true iff VC is an non-complex indel where every allele represents an expansion or 725 * contraction of a series of identical bases in the reference. 726 * 727 * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT 728 * 729 * If VC = -/CT, then this function returns true because the CT insertion matches exactly the 730 * upcoming reference. 731 * If VC = -/CTA then this function returns false because the CTA isn't a perfect match 732 * 733 * Now consider deletions: 734 * 735 * If VC = CT/- then again the same logic applies and this returns true 736 * The case of CTA/- makes no sense because it doesn't actually match the reference bases. 737 * 738 * The logic of this function is pretty simple. Take all of the non-null alleles in VC. For 739 * each insertion allele of n bases, check if that allele matches the next n reference bases. 740 * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n, 741 * as it must necessarily match the first n bases. If this test returns true for all 742 * alleles you are a tandem repeat, otherwise you are not. 743 * 744 * @param vc 745 * @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference 746 * @return 747 */ isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad)748 public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { 749 final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); 750 if ( ! vc.isIndel() ) // only indels are tandem repeats 751 return false; 752 753 final Allele ref = vc.getReference(); 754 755 for ( final Allele allele : vc.getAlternateAlleles() ) { 756 if ( ! isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) ) 757 return false; 758 } 759 760 // we've passed all of the tests, so we are a repeat 761 return true; 762 } 763 764 /** 765 * 766 * @param vc 767 * @param refBasesStartingAtVCWithoutPad Ref bases excluding the initial base of the variant context where the alt matches the ref. 768 * For example, if the reference sequence is GATCCACCACCAGTCGA and we have a deletion 769 * of one STR unit CCA, it is represented as a variant context TCCA -> T, where the 'T' is 770 * the padding base. In this case, {@code refBasesStartingAtVCWithoutPad} is CCACCACCAGTCGA. 771 * @return 772 */ getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithoutPad)773 public static Pair<List<Integer>, byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithoutPad) { 774 Utils.nonNull(vc); 775 Utils.nonNull(refBasesStartingAtVCWithoutPad); 776 777 if ( ! vc.isIndel() ){ // only indels are tandem repeats 778 return null; 779 } 780 781 final Allele refAllele = vc.getReference(); 782 final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); 783 784 byte[] repeatUnit = null; 785 final List<Integer> lengths = new ArrayList<>(); 786 787 for ( final Allele allele : vc.getAlternateAlleles() ) { 788 Pair<int[],byte[]> result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad); 789 790 final int[] repetitionCount = result.getLeft(); 791 // repetition count = 0 means allele is not a tandem expansion of context 792 if (repetitionCount[0] == 0 || repetitionCount[1] == 0) 793 return null; 794 795 if (lengths.isEmpty()) { 796 lengths.add(repetitionCount[0]); // add ref allele length only once 797 } 798 lengths.add(repetitionCount[1]); // add this alt allele's length 799 800 repeatUnit = result.getRight(); 801 } 802 803 return new MutablePair<>(lengths,repeatUnit); 804 } 805 getNumTandemRepeatUnits(final byte[] refBases, final byte[] altBases, final byte[] remainingRefContext)806 public static Pair<int[],byte[]> getNumTandemRepeatUnits(final byte[] refBases, final byte[] altBases, final byte[] remainingRefContext) { 807 /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units. 808 Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2. 809 */ 810 811 byte[] longB; 812 // find first repeat unit based on either ref or alt, whichever is longer 813 if (altBases.length > refBases.length) 814 longB = altBases; 815 else 816 longB = refBases; 817 818 // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units 819 // for example, -*,CACA needs to first be decomposed into (CA)2 820 final int repeatUnitLength = findRepeatedSubstring(longB); 821 final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength); 822 823 final int[] repetitionCount = new int[2]; 824 // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases) 825 int repetitionsInRef = findNumberOfRepetitions(repeatUnit, refBases, true); 826 repetitionCount[0] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; 827 repetitionCount[1] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; 828 829 return new MutablePair<>(repetitionCount, repeatUnit); 830 831 } 832 833 /** 834 * Find out if a string can be represented as a tandem number of substrings. 835 * For example ACTACT is a 2-tandem of ACT, 836 * but ACTACA is not. 837 * 838 * @param bases String to be tested 839 * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't 840 * be represented as one, it will be just the length of the input string) 841 */ findRepeatedSubstring(byte[] bases)842 public static int findRepeatedSubstring(byte[] bases) { 843 844 int repLength; 845 for (repLength=1; repLength <=bases.length; repLength++) { 846 final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength); 847 boolean allBasesMatch = true; 848 for (int start = repLength; start < bases.length; start += repLength ) { 849 // check that remaining of string is exactly equal to repeat unit 850 final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length); 851 if (!Arrays.equals(candidateRepeatUnit, basePiece)) { 852 allBasesMatch = false; 853 break; 854 } 855 } 856 if (allBasesMatch) 857 return repLength; 858 } 859 860 return repLength; 861 } 862 863 /** 864 * Finds number of repetitions a string consists of. 865 * For example, for string ATAT and repeat unit AT, number of repetitions = 2 866 * @param repeatUnit Non-empty substring represented by byte array 867 * @param testString String to test (represented by byte array), may be empty 868 * @param leadingRepeats Look for leading (at the beginning of string) or trailing (at end of string) repetitions 869 * For example: 870 * GATAT has 0 leading repeats of AT but 2 trailing repeats of AT 871 * ATATG has 1 leading repeat of A but 2 leading repeats of AT 872 * CCCCCCCC has 2 leading and 2 trailing repeats of CCC 873 * 874 * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's, including the case of empty testString) 875 */ findNumberOfRepetitions(byte[] repeatUnit, byte[] testString, boolean leadingRepeats)876 public static int findNumberOfRepetitions(byte[] repeatUnit, byte[] testString, boolean leadingRepeats) { 877 Utils.nonNull(repeatUnit, "repeatUnit"); 878 Utils.nonNull(testString, "testString"); 879 Utils.validateArg(repeatUnit.length != 0, "empty repeatUnit"); 880 if (testString.length == 0){ 881 return 0; 882 } 883 return findNumberOfRepetitions(repeatUnit, 0, repeatUnit.length, testString, 0, testString.length, leadingRepeats); 884 } 885 886 /** 887 * Finds number of repetitions a string consists of. 888 * Same as {@link #findNumberOfRepetitions} but operates on subarrays of a bigger array to save on copying. 889 * For example, for string ATAT and repeat unit AT, number of repetitions = 2 890 * @param repeatUnitFull Non-empty substring represented by byte array 891 * @param offsetInRepeatUnitFull the offset in repeatUnitFull from which to read the repeat unit 892 * @param repeatUnitLength length of the repeat unit 893 * @param testStringFull string to test (represented by byte array), may be empty 894 * @param offsetInTestStringFull the offset in offsetInRepeatUnitFull from which to read the test string 895 * @param testStringLength length of the test string 896 * @param leadingRepeats Look for leading (at the beginning of string) or trailing (at end of string) repetitions 897 * For example: 898 * GATAT has 0 leading repeats of AT but 2 trailing repeats of AT 899 * ATATG has 1 leading repeat of A but 2 leading repeats of AT 900 * CCCCCCCC has 2 leading and 2 trailing repeats of CCC 901 * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's, including the case of empty testString) 902 */ findNumberOfRepetitions(final byte[] repeatUnitFull, final int offsetInRepeatUnitFull, final int repeatUnitLength, final byte[] testStringFull, final int offsetInTestStringFull, final int testStringLength, final boolean leadingRepeats)903 public static int findNumberOfRepetitions(final byte[] repeatUnitFull, final int offsetInRepeatUnitFull, final int repeatUnitLength, final byte[] testStringFull, final int offsetInTestStringFull, final int testStringLength, final boolean leadingRepeats) { 904 Utils.nonNull(repeatUnitFull, "repeatUnit"); 905 Utils.nonNull(testStringFull, "testString"); 906 Utils.validIndex(offsetInRepeatUnitFull, repeatUnitFull.length); 907 Utils.validateArg(repeatUnitLength >= 0 && repeatUnitLength <= repeatUnitFull.length, "repeatUnitLength"); 908 if (testStringLength == 0){ 909 return 0; 910 } 911 Utils.validIndex(offsetInTestStringFull, testStringFull.length); 912 Utils.validateArg(testStringLength >= 0 && testStringLength <= testStringFull.length, "testStringLength"); 913 final int lengthDifference = testStringLength - repeatUnitLength; 914 915 if (leadingRepeats) { 916 int numRepeats = 0; 917 // look forward on the test string 918 for (int start = 0; start <= lengthDifference; start += repeatUnitLength) { 919 if(Utils.equalRange(testStringFull, start + offsetInTestStringFull, repeatUnitFull, offsetInRepeatUnitFull, repeatUnitLength)) { 920 numRepeats++; 921 } else { 922 return numRepeats; 923 } 924 } 925 return numRepeats; 926 } else { 927 // look backward. For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2 928 int numRepeats = 0; 929 // look backward on the test string 930 for (int start = lengthDifference; start >= 0; start -= repeatUnitLength) { 931 if (Utils.equalRange(testStringFull, start + offsetInTestStringFull, repeatUnitFull, offsetInRepeatUnitFull, repeatUnitLength)) { 932 numRepeats++; 933 } else { 934 return numRepeats; 935 } 936 } 937 return numRepeats; 938 } 939 } 940 941 /** 942 * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference 943 */ isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad)944 protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) { 945 if ( ! Allele.oneIsPrefixOfOther(ref, alt) ) 946 return false; // we require one allele be a prefix of another 947 948 if ( ref.length() > alt.length() ) { // we are a deletion 949 return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2); 950 } else { // we are an insertion 951 return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); 952 } 953 } 954 basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches)955 protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) { 956 final String potentialRepeat = l.substring(s.length()); // skip s bases 957 958 for ( int i = 0; i < minNumberOfMatches; i++) { 959 final int start = i * potentialRepeat.length(); 960 final int end = (i+1) * potentialRepeat.length(); 961 if ( ref.length() < end ) 962 return false; // we ran out of bases to test 963 final String refSub = ref.substring(start, end); 964 if ( ! refSub.equals(potentialRepeat) ) 965 return false; // repeat didn't match, fail 966 } 967 968 return true; // we passed all tests, we matched 969 } 970 971 /** 972 * Subset the samples in VC to reference only information with ref call alleles 973 * 974 * Preserves DP if present 975 * 976 * @param vc the variant context to subset down to 977 * @param defaultPloidy defaultPloidy to use if a genotype doesn't have any alleles 978 * @return a GenotypesContext 979 */ subsetToRefOnly(final VariantContext vc, final int defaultPloidy)980 public static GenotypesContext subsetToRefOnly(final VariantContext vc, final int defaultPloidy) { 981 Utils.nonNull(vc == null, "vc cannot be null"); 982 Utils.validateArg(defaultPloidy >= 1, () -> "defaultPloidy must be >= 1 but got " + defaultPloidy); 983 984 final GenotypesContext oldGTs = vc.getGenotypes(); 985 if (oldGTs.isEmpty()) return oldGTs; 986 final GenotypesContext newGTs = GenotypesContext.create(oldGTs.size()); 987 988 final Allele ref = vc.getReference(); 989 final List<Allele> diploidRefAlleles = Arrays.asList(ref, ref); 990 991 // create the new genotypes 992 for ( final Genotype g : vc.getGenotypes() ) { 993 final int gPloidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy(); 994 final List<Allele> refAlleles = gPloidy == 2 ? diploidRefAlleles : Collections.nCopies(gPloidy, ref); 995 final GenotypeBuilder gb = new GenotypeBuilder(g.getSampleName(), refAlleles); 996 if ( g.hasDP() ) gb.DP(g.getDP()); 997 if ( g.hasGQ() ) gb.GQ(g.getGQ()); 998 newGTs.add(gb.make()); 999 } 1000 1001 return newGTs; 1002 } 1003 removePLsAndAD(final Genotype g)1004 public static Genotype removePLsAndAD(final Genotype g) { 1005 return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; 1006 } 1007 1008 //TODO consider refactor variant-context merging code so that we share as much as possible between 1009 //TODO simpleMerge and referenceConfidenceMerge 1010 //TODO likely using a separate helper class or hierarchy. 1011 /** 1012 * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. 1013 * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with 1014 * the sample name 1015 * 1016 * @param unsortedVCs collection of unsorted VCs 1017 * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs 1018 * @param filteredRecordMergeType merge type for filtered records 1019 * @param genotypeMergeOptions merge option for genotypes 1020 * @param filteredAreUncalled are filtered records uncalled? 1021 * @return new VariantContext representing the merge of unsortedVCs 1022 */ simpleMerge(final Collection<VariantContext> unsortedVCs, final List<String> priorityListOfVCs, final FilteredRecordMergeType filteredRecordMergeType, final GenotypeMergeType genotypeMergeOptions, final boolean filteredAreUncalled)1023 public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs, 1024 final List<String> priorityListOfVCs, 1025 final FilteredRecordMergeType filteredRecordMergeType, 1026 final GenotypeMergeType genotypeMergeOptions, 1027 final boolean filteredAreUncalled) { 1028 int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); 1029 return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled); 1030 } 1031 1032 /** 1033 * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. 1034 * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with 1035 * the sample name. 1036 * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use 1037 * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. 1038 * 1039 * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ 1040 * 1041 * @param unsortedVCs collection of unsorted VCs 1042 * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs 1043 * @param filteredRecordMergeType merge type for filtered records 1044 * @param genotypeMergeOptions merge option for genotypes 1045 * @param filteredAreUncalled are filtered records uncalled? 1046 * @return new VariantContext representing the merge of unsortedVCs 1047 */ simpleMerge(final Collection<VariantContext> unsortedVCs, final List<String> priorityListOfVCs, final int originalNumOfVCs, final FilteredRecordMergeType filteredRecordMergeType, final GenotypeMergeType genotypeMergeOptions, final boolean filteredAreUncalled)1048 public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs, 1049 final List<String> priorityListOfVCs, 1050 final int originalNumOfVCs, 1051 final FilteredRecordMergeType filteredRecordMergeType, 1052 final GenotypeMergeType genotypeMergeOptions, 1053 final boolean filteredAreUncalled) { 1054 if ( unsortedVCs == null || unsortedVCs.isEmpty() ) 1055 return null; 1056 1057 if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) 1058 throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); 1059 1060 final List<VariantContext> preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); 1061 // Make sure all variant contexts are padded with reference base in case of indels if necessary 1062 List<VariantContext> VCs = preFilteredVCs.stream() 1063 .filter(vc -> !filteredAreUncalled || vc.isNotFiltered()) 1064 .collect(Collectors.toList()); 1065 1066 if ( VCs.isEmpty() ) // everything is filtered out and we're filteredAreUncalled 1067 return null; 1068 1069 // establish the baseline info from the first VC 1070 final VariantContext first = VCs.get(0); 1071 final String name = first.getSource(); 1072 final Allele refAllele = determineReferenceAllele(VCs); 1073 1074 final LinkedHashSet<Allele> alleles = new LinkedHashSet<>(); 1075 final Set<String> filters = new LinkedHashSet<>(); 1076 final Map<String, Object> attributes = new LinkedHashMap<>(); 1077 final Set<String> inconsistentAttributes = new LinkedHashSet<>(); 1078 final Set<String> variantSources = new LinkedHashSet<>(); // contains the set of sources we found in our set of VCs that are variant 1079 final Set<String> rsIDs = new LinkedHashSet<>(1); // most of the time there's one id 1080 1081 VariantContext longestVC = first; 1082 int depth = 0; 1083 double log10PError = CommonInfo.NO_LOG10_PERROR; 1084 boolean anyVCHadFiltersApplied = false; 1085 GenotypesContext genotypes = GenotypesContext.create(); 1086 1087 // counting the number of filtered and variant VCs 1088 int nFiltered = 0; 1089 1090 // cycle through and add info from the other VCs, making sure the loc/reference matches 1091 for ( final VariantContext vc : VCs ) { 1092 Utils.validate(longestVC.getStart() == vc.getStart(), () -> "BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); 1093 1094 if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) 1095 longestVC = vc; // get the longest location 1096 1097 nFiltered += vc.isFiltered() ? 1 : 0; 1098 if ( vc.isVariant() ) variantSources.add(vc.getSource()); 1099 1100 AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc); 1101 1102 alleles.addAll(alleleMapping.values()); 1103 1104 mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); 1105 1106 // We always take the QUAL of the first VC with a non-MISSING qual for the combined value 1107 if ( log10PError == CommonInfo.NO_LOG10_PERROR ) 1108 log10PError = vc.getLog10PError(); 1109 1110 filters.addAll(vc.getFilters()); 1111 anyVCHadFiltersApplied |= vc.filtersWereApplied(); 1112 1113 // 1114 // add attributes 1115 // 1116 // special case DP (add it up) and ID (just preserve it) 1117 // 1118 if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) 1119 depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); 1120 if ( vc.hasID() ) rsIDs.add(vc.getID()); 1121 1122 for (final Map.Entry<String, Object> p : vc.getAttributes().entrySet()) { 1123 final String key = p.getKey(); 1124 final Object value = p.getValue(); 1125 // only output annotations that have the same value in every input VC 1126 // if we don't like the key already, don't go anywhere 1127 if ( ! inconsistentAttributes.contains(key) ) { 1128 final boolean alreadyFound = attributes.containsKey(key); 1129 final Object boundValue = attributes.get(key); 1130 final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); 1131 1132 if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { 1133 // we found the value but we're inconsistent, put it in the exclude list 1134 inconsistentAttributes.add(key); 1135 attributes.remove(key); 1136 } else if ( ! alreadyFound || boundIsMissingValue ) { // no value 1137 attributes.put(key, value); 1138 } 1139 } 1140 } 1141 } 1142 1143 // if we have more alternate alleles in the merged VC than in one or more of the 1144 // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD 1145 for ( final VariantContext vc : VCs ) { 1146 if (vc.getAlleles().size() == 1) 1147 continue; 1148 if ( hasPLIncompatibleAlleles(alleles, vc.getAlleles())) { 1149 if ( ! genotypes.isEmpty() ) { 1150 logger.debug(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. single=%s", 1151 vc.getContig(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles())); 1152 } 1153 genotypes = stripPLsAndAD(genotypes); 1154 // this will remove stale AC,AF attributed from vc 1155 VariantContextUtils.calculateChromosomeCounts(vc, attributes, true); 1156 break; 1157 } 1158 } 1159 1160 // if at least one record was unfiltered and we want a union, clear all of the filters 1161 if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) 1162 filters.clear(); 1163 1164 if ( depth > 0 ) 1165 attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); 1166 1167 final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); 1168 1169 final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); 1170 builder.loc(longestVC.getContig(), longestVC.getStart(), longestVC.getEnd()); 1171 builder.alleles(alleles); 1172 builder.genotypes(genotypes); 1173 builder.log10PError(log10PError); 1174 if ( anyVCHadFiltersApplied ) { 1175 builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); 1176 } 1177 builder.attributes(new TreeMap<>(attributes)); 1178 1179 // Trim the padded bases of all alleles if necessary 1180 final VariantContext merged = builder.make(); 1181 return merged; 1182 } 1183 1184 1185 //TODO as part of a larger refactoring effort remapAlleles can be merged with createAlleleMapping. 1186 stripPLsAndAD(final GenotypesContext genotypes)1187 public static GenotypesContext stripPLsAndAD(final GenotypesContext genotypes) { 1188 final GenotypesContext newGs = GenotypesContext.create(genotypes.size()); 1189 for ( final Genotype g : genotypes ) { 1190 newGs.add(removePLsAndAD(g)); 1191 } 1192 return newGs; 1193 } 1194 determineReferenceAllele(final List<VariantContext> VCs)1195 private static Allele determineReferenceAllele(final List<VariantContext> VCs) { 1196 return determineReferenceAllele(VCs, null); 1197 } 1198 contextMatchesLoc(final VariantContext vc, final Locatable loc)1199 public static boolean contextMatchesLoc(final VariantContext vc, final Locatable loc) { 1200 return loc == null || loc.getStart() == vc.getStart(); 1201 } 1202 resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc)1203 public static AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc) { 1204 if ( refAllele.equals(vc.getReference()) ) 1205 return new AlleleMapper(vc); 1206 else { 1207 final Map<Allele, Allele> map = createAlleleMapping(refAllele, vc); 1208 map.put(vc.getReference(), refAllele); 1209 return new AlleleMapper(map); 1210 } 1211 } 1212 createAlleleMapping(final Allele refAllele, final VariantContext oneVC)1213 public static Map<Allele, Allele> createAlleleMapping(final Allele refAllele, 1214 final VariantContext oneVC) { 1215 return createAlleleMapping(refAllele, oneVC.getReference(), oneVC.getAlternateAlleles()); 1216 } 1217 1218 //TODO as part of a larger refactoring effort {@link #createAlleleMapping} can be merged with {@link ReferenceConfidenceVariantContextMerger#remapAlleles}. 1219 /** 1220 * Create an allele mapping for the given context where its reference allele must (potentially) be extended to the given allele 1221 * 1222 * The refAllele is the longest reference allele seen at this start site. 1223 * So imagine it is: 1224 * refAllele: ACGTGA 1225 * myRef: ACGT 1226 * myAlt: A 1227 * 1228 * We need to remap all of the alleles in vc to include the extra GA so that 1229 * myRef => refAllele and myAlt => AGA 1230 * 1231 * @param refAllele the new (extended) reference allele 1232 * @param inputRef the reference allele that may need to be extended 1233 * @param inputAlts the alternate alleles that may need to be extended 1234 * @return a non-null mapping of original alleles to new (extended) ones 1235 */ createAlleleMapping(final Allele refAllele, final Allele inputRef, final List<Allele> inputAlts)1236 public static Map<Allele, Allele> createAlleleMapping(final Allele refAllele, 1237 final Allele inputRef, final List<Allele> inputAlts) { 1238 Utils.validate( refAllele.length() > inputRef.length(), () -> "BUG: inputRef="+inputRef+" is longer than refAllele="+refAllele); 1239 final byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), inputRef.length(), refAllele.length()); 1240 1241 final Map<Allele, Allele> map = new LinkedHashMap<>(); 1242 for ( final Allele a : inputAlts ) { 1243 if ( isNonSymbolicExtendableAllele(a) ) { 1244 Allele extended = Allele.extend(a, extraBases); 1245 map.put(a, extended); 1246 } else if (a.equals(Allele.SPAN_DEL)) { 1247 map.put(a, a); 1248 } 1249 } 1250 1251 return map; 1252 } 1253 isNonSymbolicExtendableAllele(final Allele allele)1254 private static boolean isNonSymbolicExtendableAllele(final Allele allele) { 1255 return ! (allele.isReference() || allele.isSymbolic() || allele.equals(Allele.SPAN_DEL)); 1256 } 1257 sortVariantContextsByPriority(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, GenotypeMergeType mergeOption )1258 public static List<VariantContext> sortVariantContextsByPriority(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, GenotypeMergeType mergeOption ) { 1259 if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) 1260 throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); 1261 1262 if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) 1263 return new ArrayList<>(unsortedVCs); 1264 else { 1265 ArrayList<VariantContext> sorted = new ArrayList<>(unsortedVCs); 1266 Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); 1267 return sorted; 1268 } 1269 } 1270 mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples)1271 private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples) { 1272 //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE 1273 for ( final Genotype g : oneVC.getGenotypes() ) { 1274 final String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniquifySamples); 1275 if ( ! mergedGenotypes.containsSample(name) ) { 1276 // only add if the name is new 1277 Genotype newG = g; 1278 1279 if ( uniquifySamples || alleleMapping.needsRemapping() ) { 1280 final List<Allele> alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); 1281 newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); 1282 } 1283 1284 mergedGenotypes.add(newG); 1285 } 1286 } 1287 } 1288 1289 /** 1290 * Cached NO_CALL immutable lists where the position ith contains the list with i elements. 1291 */ 1292 @SuppressWarnings({"rawtypes", "unchecked"}) 1293 private static List<Allele>[] NOCALL_LISTS = new List[] { 1294 Collections.emptyList(), 1295 Collections.singletonList(Allele.NO_CALL), 1296 Collections.nCopies(2,Allele.NO_CALL) 1297 }; 1298 1299 /** 1300 * Code to ensure that {@link #NOCALL_LISTS} has enough entries beyond the requested ploidy 1301 * @param capacity the requested ploidy. 1302 */ ensureNoCallListsCapacity(final int capacity)1303 private static void ensureNoCallListsCapacity(final int capacity) { 1304 final int currentCapacity = NOCALL_LISTS.length - 1; 1305 if (currentCapacity >= capacity) 1306 return; 1307 NOCALL_LISTS = Arrays.copyOf(NOCALL_LISTS,Math.max(capacity,currentCapacity << 1) + 1); 1308 for (int i = currentCapacity + 1; i < NOCALL_LISTS.length; i++) 1309 NOCALL_LISTS[i] = Collections.nCopies(i,Allele.NO_CALL); 1310 } 1311 1312 /** 1313 * Returns a {@link Allele#NO_CALL NO_CALL} allele list provided the ploidy. 1314 * 1315 * @param ploidy the required ploidy. 1316 * 1317 * @return never {@code null}, but an empty list if {@code ploidy} is equal or less than 0. The returned list 1318 * might or might not be mutable. 1319 */ noCallAlleles(final int ploidy)1320 public static List<Allele> noCallAlleles(final int ploidy) { 1321 if (NOCALL_LISTS.length <= ploidy) 1322 ensureNoCallListsCapacity(ploidy); 1323 return NOCALL_LISTS[ploidy]; 1324 } 1325 1326 mergedSampleName(String trackName, String sampleName, boolean uniquify )1327 public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { 1328 return uniquify ? sampleName + "." + trackName : sampleName; 1329 } 1330 1331 /** 1332 * Trim the alleles in inputVC from the reverse direction 1333 * 1334 * @param inputVC a non-null input VC whose alleles might need a haircut 1335 * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up 1336 */ reverseTrimAlleles( final VariantContext inputVC )1337 public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { 1338 return trimAlleles(inputVC, false, true); 1339 } 1340 1341 /** 1342 * Trim the alleles in inputVC forward and reverse, as requested 1343 * 1344 * @param inputVC a non-null input VC whose alleles might need a haircut 1345 * @param trimForward should we trim up the alleles from the forward direction? 1346 * @param trimReverse should we trim up the alleles from the reverse direction? 1347 * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles 1348 */ trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse)1349 public static VariantContext trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse) { 1350 Utils.nonNull(inputVC); 1351 1352 if ( inputVC.getNAlleles() <= 1 || inputVC.getAlleles().stream().anyMatch(a -> a.length() == 1) ) { 1353 return inputVC; 1354 } 1355 1356 final List<byte[]> sequences = inputVC.getAlleles().stream().filter(a -> !a.isSymbolic()).map(Allele::getBases).collect(Collectors.toList()); 1357 final List<IndexRange> ranges = inputVC.getAlleles().stream().filter(a -> !a.isSymbolic()).map(a -> new IndexRange(0, a.length())).collect(Collectors.toList()); 1358 1359 final Pair<Integer, Integer> shifts = AlignmentUtils.normalizeAlleles(sequences, ranges, 0, true); 1360 final int endTrim = shifts.getRight(); 1361 final int startTrim = -shifts.getLeft(); 1362 1363 final boolean emptyAllele = ranges.stream().anyMatch(r -> r.size() == 0); 1364 final boolean restoreOneBaseAtEnd = emptyAllele && startTrim == 0; 1365 final boolean restoreOneBaseAtStart = emptyAllele && startTrim > 0; 1366 1367 // if the end trimming consumed all the bases, leave one base 1368 final int endBasesToClip = restoreOneBaseAtEnd ? endTrim - 1 : endTrim; 1369 final int startBasesToClip = restoreOneBaseAtStart ? startTrim - 1 : startTrim; 1370 1371 return trimAlleles(inputVC, (trimForward ? startBasesToClip : 0) - 1, trimReverse ? endBasesToClip : 0); 1372 } 1373 1374 /** 1375 * Trim up alleles in inputVC, cutting out all bases up to fwdTrimEnd inclusive and 1376 * the last revTrim bases from the end 1377 * 1378 * @param inputVC a non-null input VC 1379 * @param fwdTrimEnd bases up to this index (can be -1) will be removed from the start of all alleles 1380 * @param revTrim the last revTrim bases of each allele will be clipped off as well 1381 * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles 1382 */ trimAlleles(final VariantContext inputVC, final int fwdTrimEnd, final int revTrim)1383 protected static VariantContext trimAlleles(final VariantContext inputVC, 1384 final int fwdTrimEnd, 1385 final int revTrim) { 1386 if( fwdTrimEnd == -1 && revTrim == 0 ) // nothing to do, so just return inputVC unmodified 1387 return inputVC; 1388 1389 final List<Allele> alleles = new LinkedList<>(); 1390 final Map<Allele, Allele> originalToTrimmedAlleleMap = new LinkedHashMap<>(); 1391 1392 for (final Allele a : inputVC.getAlleles()) { 1393 if (a.isSymbolic()) { 1394 alleles.add(a); 1395 originalToTrimmedAlleleMap.put(a, a); 1396 } else { 1397 // get bases for current allele and create a new one with trimmed bases 1398 final byte[] newBases = Arrays.copyOfRange(a.getBases(), fwdTrimEnd+1, a.length()-revTrim); 1399 final Allele trimmedAllele = Allele.create(newBases, a.isReference()); 1400 alleles.add(trimmedAllele); 1401 originalToTrimmedAlleleMap.put(a, trimmedAllele); 1402 } 1403 } 1404 1405 // now we can recreate new genotypes with trimmed alleles 1406 final AlleleMapper alleleMapper = new AlleleMapper(originalToTrimmedAlleleMap); 1407 final GenotypesContext genotypes = updateGenotypesWithMappedAlleles(inputVC.getGenotypes(), alleleMapper); 1408 1409 final int start = inputVC.getStart() + (fwdTrimEnd + 1); 1410 final VariantContextBuilder builder = new VariantContextBuilder(inputVC); 1411 builder.start(start); 1412 builder.stop(start + alleles.get(0).length() - 1); 1413 builder.alleles(alleles); 1414 builder.genotypes(genotypes); 1415 return builder.make(); 1416 } 1417 updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper)1418 protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) { 1419 final GenotypesContext updatedGenotypes = GenotypesContext.create(originalGenotypes.size()); 1420 1421 for ( final Genotype genotype : originalGenotypes ) { 1422 final List<Allele> updatedAlleles = alleleMapper.remap(genotype.getAlleles()); 1423 updatedGenotypes.add(new GenotypeBuilder(genotype).alleles(updatedAlleles).make()); 1424 } 1425 1426 return updatedGenotypes; 1427 } 1428 1429 protected static class AlleleMapper { 1430 private VariantContext vc = null; 1431 private Map<Allele, Allele> map = null; AlleleMapper(VariantContext vc)1432 public AlleleMapper(VariantContext vc) { this.vc = vc; } AlleleMapper(Map<Allele, Allele> map)1433 public AlleleMapper(Map<Allele, Allele> map) { this.map = map; } needsRemapping()1434 public boolean needsRemapping() { return this.map != null; } values()1435 public Collection<Allele> values() { return map != null ? map.values() : vc.getAlleles(); } remap(Allele a)1436 public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } 1437 remap(List<Allele> as)1438 public List<Allele> remap(List<Allele> as) { 1439 List<Allele> newAs = as.stream() 1440 .map(this::remap) 1441 .collect(Collectors.toList()); 1442 return newAs; 1443 } 1444 1445 } 1446 1447 private static class CompareByPriority implements Comparator<VariantContext>, Serializable { 1448 private static final long serialVersionUID = 0L; 1449 1450 List<String> priorityListOfVCs; CompareByPriority(List<String> priorityListOfVCs)1451 public CompareByPriority(List<String> priorityListOfVCs) { 1452 this.priorityListOfVCs = priorityListOfVCs; 1453 } 1454 getIndex(VariantContext vc)1455 private int getIndex(VariantContext vc) { 1456 return Utils.validIndex(priorityListOfVCs.indexOf(vc.getSource()), priorityListOfVCs.size()); 1457 } 1458 1459 @Override compare(VariantContext vc1, VariantContext vc2)1460 public int compare(VariantContext vc1, VariantContext vc2) { 1461 return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); 1462 } 1463 } 1464 1465 /** 1466 * For testing purposes only. Create a site-only VariantContext at contig:start containing alleles 1467 * 1468 * @param name the name of the VC 1469 * @param contig the contig for the VC 1470 * @param start the start of the VC 1471 * @param alleleStrings a non-null, non-empty list of strings for the alleles. The first will be the ref allele, and others the 1472 * alt. Will compute the stop of the VC from the length of the reference allele 1473 * @return a non-null VariantContext 1474 */ makeFromAlleles(final String name, final String contig, final int start, final List<String> alleleStrings)1475 public static VariantContext makeFromAlleles(final String name, final String contig, final int start, final List<String> alleleStrings) { 1476 Utils.nonNull(alleleStrings, "alleleStrings is null"); 1477 Utils.nonEmpty(alleleStrings, "alleleStrings is empty"); 1478 1479 final List<Allele> alleles = new LinkedList<>(); 1480 final int length = alleleStrings.get(0).length(); 1481 1482 boolean first = true; 1483 for ( final String alleleString : alleleStrings ) { 1484 alleles.add(Allele.create(alleleString, first)); 1485 first = false; 1486 } 1487 return new VariantContextBuilder(name, contig, start, start+length-1, alleles).make(); 1488 } 1489 splitSomaticVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final VCFHeader outputHeader)1490 public static List<VariantContext> splitSomaticVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final VCFHeader outputHeader) { 1491 Utils.nonNull(vc); 1492 1493 if (!vc.isVariant() || vc.isBiallelic()) { 1494 // non variant or biallelics already satisfy the contract 1495 return Collections.singletonList(vc); 1496 } else { 1497 final List<VariantContext> biallelics = new LinkedList<>(); 1498 1499 List<String> attrsSpecialFormats = new ArrayList<String>(Arrays.asList(GATKVCFConstants.AS_FILTER_STATUS_KEY, GATKVCFConstants.AS_SB_TABLE_KEY)); 1500 List<Map<String, Object>> attributesByAllele = splitAttributesIntoPerAlleleLists(vc, attrsSpecialFormats, outputHeader); 1501 splitASSBTable(vc, attributesByAllele); 1502 splitASFilters(vc, attributesByAllele); 1503 1504 ListIterator<Map<String, Object>> attributesByAlleleIterator = attributesByAllele.listIterator(); 1505 1506 for (final Allele alt : vc.getAlternateAlleles()) { 1507 final VariantContextBuilder builder = new VariantContextBuilder(vc); 1508 1509 // make biallelic alleles 1510 final List<Allele> alleles = Arrays.asList(vc.getReference(), alt); 1511 builder.alleles(alleles); 1512 Map<String, Object> attributes = attributesByAlleleIterator.next(); 1513 builder.attributes(attributes); 1514 1515 // now add the allele specific filters to the variant context 1516 String filters = (String) attributes.get(GATKVCFConstants.AS_FILTER_STATUS_KEY); 1517 // checking for . and PASS should be able to be removed. these were temporarily used to indicate no allele specific filter 1518 if (filters != null && !filters.isEmpty() && !filters.equals(VCFConstants.EMPTY_INFO_FIELD) && !filters.equals(GATKVCFConstants.SITE_LEVEL_FILTERS) && !filters.equals((VCFConstants.PASSES_FILTERS_v4))) { 1519 AnnotationUtils.decodeAnyASList(filters).stream().forEach(filter -> builder.filter(filter)); 1520 } 1521 1522 int alleleIndex = vc.getAlleleIndex(alt); 1523 builder.genotypes(AlleleSubsettingUtils.subsetSomaticAlleles(outputHeader, vc.getGenotypes(), alleles, new int[]{0, alleleIndex})); 1524 final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); 1525 biallelics.add(trimmed); 1526 } 1527 return biallelics; 1528 } 1529 } 1530 splitASSBTable(VariantContext vc, List<Map<String, Object>> attrsByAllele)1531 public static void splitASSBTable(VariantContext vc, List<Map<String, Object>> attrsByAllele) { 1532 List<String> sbs = StrandBiasUtils.getSBsForAlleles(vc).stream().map(ints -> StrandBiasUtils.encode(ints)).collect(Collectors.toList()); 1533 new IndexRange(1, sbs.size()).forEach(i -> { 1534 String newattrs = String.join(AnnotationUtils.ALLELE_SPECIFIC_RAW_DELIM, new ArrayList<String>(Arrays.asList(sbs.get(0), sbs.get(i)))); 1535 attrsByAllele.get(i - 1).put(GATKVCFConstants.AS_SB_TABLE_KEY, newattrs); 1536 }); 1537 } 1538 splitASFilters(VariantContext vc, List<Map<String, Object>> attrsByAllele)1539 public static void splitASFilters(VariantContext vc, List<Map<String, Object>> attrsByAllele) { 1540 // the reason we are getting as list and then joining on , is because the default getAttributeAsString for a list will add spaces between items which we don't 1541 // want to have to trim out later in the code 1542 String asfiltersStr = String.join(",", vc.getCommonInfo().getAttributeAsStringList(GATKVCFConstants.AS_FILTER_STATUS_KEY, GATKVCFConstants.SITE_LEVEL_FILTERS)); 1543 List<String> filtersList = AnnotationUtils.decodeAnyASListWithRawDelim(asfiltersStr); 1544 new IndexRange(0, filtersList.size()).forEach(i -> attrsByAllele.get(i).put(GATKVCFConstants.AS_FILTER_STATUS_KEY, filtersList.get(i))); 1545 } 1546 splitAttributesIntoPerAlleleLists(VariantContext vc, List<String> skipAttributes, VCFHeader outputHeader)1547 public static List<Map<String, Object>> splitAttributesIntoPerAlleleLists(VariantContext vc, List<String> skipAttributes, VCFHeader outputHeader) { 1548 List<Map<String, Object>> results = new ArrayList<>(vc.getNAlleles()-1); 1549 vc.getAlternateAlleles().forEach(alt -> results.add(new HashMap<>())); 1550 1551 Map<String, Object> attributes = vc.getAttributes(); 1552 attributes.entrySet().stream().filter(entry -> !skipAttributes.contains(entry.getKey())).forEachOrdered(entry -> { 1553 String key = entry.getKey(); 1554 // default to unbounded in case header is not found 1555 VCFHeaderLineCount countType = VCFHeaderLineCount.UNBOUNDED; 1556 try { 1557 VCFInfoHeaderLine header = outputHeader.getInfoHeaderLine(key); 1558 countType = header.getCountType(); 1559 } catch (IllegalStateException ex) { 1560 // this happens for DP if we use GATKVCFHeaderLines.getInfoLine(key) 1561 // shouldn't happen now that we use the generated output header 1562 logger.warn("Could not find header info for key " + key); 1563 } 1564 // override count type for this attribute 1565 if (key.equals(GATKVCFConstants.REPEATS_PER_ALLELE_KEY)) { 1566 countType = VCFHeaderLineCount.R; 1567 } 1568 List<Object> attr; 1569 switch (countType) { 1570 case A: 1571 attr = vc.getCommonInfo().getAttributeAsList(key); 1572 ValidationUtils.validateArg(attr.size() == results.size(), "Incorrect attribute size for " + key); 1573 new IndexRange(0, attr.size()).forEach(i -> results.get(i).put(key, attr.get(i))); 1574 break; 1575 case R: 1576 attr = vc.getCommonInfo().getAttributeAsList(key); 1577 ValidationUtils.validateArg(attr.size() == vc.getNAlleles(), "Incorrect attribute size for " + key); 1578 new IndexRange(1, attr.size()).forEach(i -> { 1579 List<Object> newattrs = new ArrayList<Object>(Arrays.asList(attr.get(0), attr.get(i))); 1580 results.get(i-1).put(key, newattrs); 1581 }); 1582 break; 1583 default: 1584 results.forEach(altMap -> altMap.put(key, entry.getValue())); 1585 1586 } 1587 1588 }); 1589 return results; 1590 } 1591 1592 /** 1593 * Split variant context into its biallelic components if there are more than 2 alleles 1594 * <p> 1595 * For VC has A/B/C alleles, returns A/B and A/C contexts. 1596 * Alleles are right trimmed to satisfy VCF conventions 1597 * <p> 1598 * If vc is biallelic or non-variant it is just returned 1599 * <p> 1600 * Chromosome counts are updated (but they are by definition 0) 1601 * 1602 * @param vc a potentially multi-allelic variant context 1603 * @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome 1604 * @param genotypeAssignmentMethod assignment strategy for the (subsetted) PLs 1605 * @param keepOriginalChrCounts keep the orignal chromosome counts before subsetting 1606 * @return a list of bi-allelic (or monomorphic) variant context 1607 */ splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod, final boolean keepOriginalChrCounts)1608 public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod, 1609 final boolean keepOriginalChrCounts) { 1610 Utils.nonNull(vc); 1611 1612 if (!vc.isVariant() || vc.isBiallelic()) 1613 // non variant or biallelics already satisfy the contract 1614 return Collections.singletonList(vc); 1615 else { 1616 final List<VariantContext> biallelics = new LinkedList<>(); 1617 1618 // if any of the genotypes are het-not-ref (i.e. 1/2), set all of them to no-call 1619 final GenotypeAssignmentMethod genotypeAssignmentMethodUsed = hasHetNonRef(vc.getGenotypes()) ? GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS : genotypeAssignmentMethod; 1620 1621 for (final Allele alt : vc.getAlternateAlleles()) { 1622 final VariantContextBuilder builder = new VariantContextBuilder(vc); 1623 1624 // make biallelic alleles 1625 final List<Allele> alleles = Arrays.asList(vc.getReference(), alt); 1626 builder.alleles(alleles); 1627 1628 // since the VC has been subset, remove the invalid attributes 1629 for (final String key : vc.getAttributes().keySet()) { 1630 if (!(key.equals(VCFConstants.ALLELE_COUNT_KEY) || key.equals(VCFConstants.ALLELE_FREQUENCY_KEY) || key.equals(VCFConstants.ALLELE_NUMBER_KEY)) || 1631 genotypeAssignmentMethodUsed == GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS) { 1632 builder.rmAttribute(key); 1633 } 1634 } 1635 1636 // subset INFO field annotations if available if genotype is called 1637 if (genotypeAssignmentMethodUsed != GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS && 1638 genotypeAssignmentMethodUsed != GenotypeAssignmentMethod.SET_TO_NO_CALL) 1639 AlleleSubsettingUtils.addInfoFieldAnnotations(vc, builder, keepOriginalChrCounts); 1640 1641 builder.genotypes(AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(),2,vc.getAlleles(), alleles, null, genotypeAssignmentMethodUsed,vc.getAttributeAsInt("DP",0))); 1642 final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); 1643 biallelics.add(trimmed); 1644 } 1645 1646 return biallelics; 1647 } 1648 } 1649 1650 /** 1651 * Check if any of the genotypes is heterozygous, non-reference (i.e. 1/2) 1652 * 1653 * @param genotypesContext genotype information 1654 * @return true if any of the genotypes are heterozygous, non-reference, false otherwise 1655 */ hasHetNonRef(final GenotypesContext genotypesContext)1656 private static boolean hasHetNonRef(final GenotypesContext genotypesContext) { 1657 for (final Genotype gt : genotypesContext) { 1658 if (gt.isHetNonRef()) 1659 return true; 1660 } 1661 return false; 1662 } 1663 1664 /** 1665 * Splits the alleles for the provided variant context into its primitive parts. 1666 * Requires that the input VC be bi-allelic, so calling methods should first call splitVariantContextToBiallelics() if needed. 1667 * Currently works only for MNPs. 1668 * 1669 * @param vc the non-null VC to split 1670 * @return a non-empty list of VCs split into primitive parts or the original VC otherwise 1671 */ splitIntoPrimitiveAlleles(final VariantContext vc)1672 public static List<VariantContext> splitIntoPrimitiveAlleles(final VariantContext vc) { 1673 Utils.nonNull(vc); 1674 if ( !vc.isBiallelic() ) { 1675 throw new IllegalArgumentException("Trying to break a multi-allelic Variant Context into primitive parts"); 1676 } 1677 1678 // currently only works for MNPs 1679 if ( !vc.isMNP() ) 1680 return Arrays.asList(vc); 1681 1682 final byte[] ref = vc.getReference().getBases(); 1683 final byte[] alt = vc.getAlternateAllele(0).getBases(); 1684 1685 Utils.validate(ref.length == alt.length, "ref and alt alleles for MNP have different lengths"); 1686 1687 final List<VariantContext> result = new ArrayList<>(ref.length); 1688 1689 for ( int i = 0; i < ref.length; i++ ) { 1690 1691 // if the ref and alt bases are different at a given position, create a new SNP record (otherwise do nothing) 1692 if ( ref[i] != alt[i] ) { 1693 1694 // create the ref and alt SNP alleles 1695 final Allele newRefAllele = Allele.create(ref[i], true); 1696 final Allele newAltAllele = Allele.create(alt[i], false); 1697 1698 // create a new VariantContext with the new SNP alleles 1699 final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); 1700 1701 // create new genotypes with updated alleles 1702 final Map<Allele, Allele> alleleMap = new LinkedHashMap<>(); 1703 alleleMap.put(vc.getReference(), newRefAllele); 1704 alleleMap.put(vc.getAlternateAllele(0), newAltAllele); 1705 final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); 1706 1707 result.add(newVC.genotypes(newGenotypes).make()); 1708 } 1709 } 1710 1711 if ( result.isEmpty() ) 1712 result.add(vc); 1713 1714 return result; 1715 } 1716 1717 /** 1718 * Add chromosome counts (AC, AN and AF) to the VCF header lines 1719 * 1720 * @param headerLines the VCF header lines 1721 */ addChromosomeCountsToHeader(final Set<VCFHeaderLine> headerLines)1722 public static void addChromosomeCountsToHeader(final Set<VCFHeaderLine> headerLines) { 1723 headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY)); 1724 headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY)); 1725 headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY)); 1726 } 1727 1728 /** 1729 * Set the builder's filtered genotypes to no-call and update AC, AN and AF 1730 * 1731 * @param builder builder for variant context 1732 * @param vc the VariantContext record to set filtered genotypes to no-call 1733 * @param setFilteredGenotypesToNocall flag to set filtered genotype to NO CALL 1734 * @param filters the filters for each genotype 1735 */ setFilteredGenotypeToNocall(final VariantContextBuilder builder, final VariantContext vc, final boolean setFilteredGenotypesToNocall, BiFunction<VariantContext, Genotype, List<String>> filters)1736 public static void setFilteredGenotypeToNocall(final VariantContextBuilder builder, final VariantContext vc, 1737 final boolean setFilteredGenotypesToNocall, BiFunction<VariantContext, Genotype, List<String>> filters) { 1738 Utils.nonNull(vc); 1739 Utils.nonNull(builder); 1740 Utils.nonNull(filters); 1741 1742 final GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); 1743 1744 // update genotypes if filtered genotypes are set to no-call 1745 boolean haveFilteredNoCallAlleles = false; 1746 for (final Genotype g : vc.getGenotypes()) { 1747 if (g.isCalled()) { 1748 List<String> filterNames = filters.apply(vc,g); 1749 if (!filterNames.isEmpty() && setFilteredGenotypesToNocall) { 1750 haveFilteredNoCallAlleles = true; 1751 genotypes.add(new GenotypeBuilder(g).filters(filterNames).alleles(GATKVariantContextUtils.noCallAlleles((g.getPloidy()))).make()); 1752 } else { 1753 genotypes.add(new GenotypeBuilder(g).filters(filterNames).make()); 1754 } 1755 } else { 1756 genotypes.add(g); 1757 } 1758 } 1759 1760 // if filtered genotypes are set to no-call, output recomputed AC, AN, AF 1761 if ( haveFilteredNoCallAlleles ) { 1762 final Map<String, Object> attributes = new LinkedHashMap<>(vc.getAttributes()); // need to make mutable 1763 VariantContextUtils.calculateChromosomeCounts(builder.genotypes(genotypes).make(), attributes, true, vc.getSampleNames()); 1764 builder.attributes(attributes); 1765 } 1766 1767 // update genotypes 1768 builder.genotypes(genotypes); 1769 } 1770 1771 /** 1772 * Calculate the Genotype Quality (GQ) by subtracting the smallest Phred-scaled likelihoods (PL) from the second smallest. 1773 * 1774 * @param plValues array of PL values 1775 * @return the genotype quality corresponding to the PL values 1776 */ calculateGQFromPLs(final int[] plValues)1777 public static int calculateGQFromPLs(final int[] plValues) { 1778 Utils.nonNull(plValues); 1779 Utils.validateArg(plValues.length >= 2, () -> "Array of PL values must contain at least two elements."); 1780 1781 int first = plValues[0]; 1782 int second = plValues[1]; 1783 if (first > second) { 1784 second = first; 1785 first = plValues[1]; 1786 } 1787 for (int i = 2; i < plValues.length; i++) { 1788 final int candidate = plValues[i]; 1789 if (candidate >= second) { 1790 continue; 1791 } 1792 if (candidate <= first) { 1793 second = first; 1794 first = candidate; 1795 } else 1796 second = candidate; 1797 } 1798 return second - first; 1799 } 1800 1801 /** 1802 * Create a string with existing filters plus the one to append 1803 * @param vc VariantContext to base initial filter values. Not {@code null} 1804 * @param filterToAppend the filter value to append to the strings Not {@code null} 1805 * @return String of filter values. Sorted alphabetically. 1806 */ createFilterListWithAppend(VariantContext vc, String filterToAppend)1807 public static List<String> createFilterListWithAppend(VariantContext vc, String filterToAppend) { 1808 Utils.nonNull(vc); 1809 Utils.nonNull(filterToAppend); 1810 1811 final List<String> filtersAsList = new ArrayList<>(vc.getFilters()); 1812 filtersAsList.add(filterToAppend); 1813 filtersAsList.sort(String::compareToIgnoreCase); 1814 return filtersAsList; 1815 } 1816 1817 /** 1818 * Determines whether the given reference and alternate alleles constitute a frameshift mutation. 1819 * @param reference The reference {@link Allele}. Must not be {@code null}. 1820 * @param alternate The alternate / variant {@link Allele}. Must not be {@code null}. 1821 * @return {@code true} if replacing the reference with the alternate results in a frameshift. {@code false} otherwise. 1822 */ isFrameshift(final Allele reference, final Allele alternate)1823 public static boolean isFrameshift(final Allele reference, final Allele alternate) { 1824 1825 Utils.nonNull(reference); 1826 Utils.nonNull(alternate); 1827 1828 // We know it's a frameshift if we have a replacement that is not of a 1829 // length evenly divisible by 3 because that's how many bases are read at once: 1830 return ((Math.abs( reference.length() - alternate.length() ) % 3) != 0); 1831 } 1832 1833 /** 1834 * Determines whether the given reference and alternate alleles constitute a frameshift mutation. 1835 * @param reference The {@link String} containing the bases of the reference allele. Must not be {@code null}. 1836 * @param alternate The {@link String} containing the bases of the alternate allele. Must not be {@code null}. 1837 * @return {@code true} if replacing the reference with the alternate results in a frameshift. {@code false} otherwise. 1838 */ isFrameshift(final String reference, final String alternate)1839 public static boolean isFrameshift(final String reference, final String alternate) { 1840 1841 Utils.nonNull(reference); 1842 Utils.nonNull(alternate); 1843 1844 final String refComparable = reference.replaceAll("\\*", ""); 1845 final String altComperable = alternate.replaceAll("\\*", ""); 1846 1847 // We know it's a frameshift if we have a replacement that is not of a 1848 // length evenly divisible by 3 because that's how many bases are read at once: 1849 return ((Math.abs( refComparable.length() - altComperable.length() ) % 3) != 0); 1850 } 1851 1852 /** 1853 * Determines whether the given reference and alternate alleles constitute a frameshift mutation. 1854 * This does not take into account the cases where either or both of the alleles overlap splice sites. 1855 * @param startPos Genomic start position (1-based, inclusive) of the variant. Must be > 0. 1856 * @param refEnd Genomic end position (1-based, inclusive) of the reference allele. Must be > 0. 1857 * @param altEnd Genomic end position (1-based, inclusive) of the alternate allele. Must be > 0. 1858 * @return {@code true} if replacing the reference with the alternate results in a frameshift. {@code false} otherwise. 1859 */ isFrameshift(final int startPos, final int refEnd, final int altEnd)1860 public static boolean isFrameshift(final int startPos, final int refEnd, final int altEnd) { 1861 1862 ParamUtils.isPositive( startPos, "Genomic positions must be > 0." ); 1863 ParamUtils.isPositive( refEnd, "Genomic positions must be > 0." ); 1864 ParamUtils.isPositive( altEnd, "Genomic positions must be > 0." ); 1865 1866 final int refLength = refEnd - startPos + 1; 1867 final int altLength = altEnd - startPos + 1; 1868 1869 // We know it's a frameshift if we have a replacement that is not of a 1870 // length evenly divisible by 3 because that's how many bases are read at once: 1871 return ((Math.abs( refLength - altLength ) % 3) != 0); 1872 } 1873 1874 /** 1875 * Determines whether the given reference and alternate alleles constitute an insertion mutation. 1876 * @param reference The reference {@link Allele}. Must not be {@code null}. 1877 * @param alternate The alternate / variant {@link Allele}. Must not be {@code null}. 1878 * @return {@code true} if replacing the reference with the alternate results in an insertion. {@code false} otherwise. 1879 */ isInsertion(final Allele reference, final Allele alternate)1880 public static boolean isInsertion(final Allele reference, final Allele alternate) { 1881 1882 Utils.nonNull(reference); 1883 Utils.nonNull(alternate); 1884 1885 // If we have more bases in the alternate, we have an insertion: 1886 return reference.length() < alternate.length(); 1887 } 1888 1889 /** 1890 * Determines whether the given reference and alternate alleles constitute an insertion mutation. 1891 * @param reference A {@link String} containing the bases of the reference allele. Must not be {@code null}. 1892 * @param alternate A {@link String} containing the bases of the alternate / variant allele. Must not be {@code null}. 1893 * @return {@code true} if replacing the reference with the alternate results in an insertion. {@code false} otherwise. 1894 */ isInsertion(final String reference, final String alternate)1895 public static boolean isInsertion(final String reference, final String alternate) { 1896 1897 Utils.nonNull(reference); 1898 Utils.nonNull(alternate); 1899 1900 final String refComparable = reference.replaceAll("\\*", ""); 1901 final String altComperable = alternate.replaceAll("\\*", ""); 1902 1903 // If we have more bases in the alternate, we have an insertion: 1904 return refComparable.length() < altComperable.length(); 1905 } 1906 1907 /** 1908 * Determines whether the given reference and alternate alleles constitute a deletion mutation. 1909 * @param reference A {@link String} containing the bases of the reference allele. Must not be {@code null}. 1910 * @param alternate A {@link String} containing the bases of the alternate / variant allele. Must not be {@code null}. 1911 * @return {@code true} if replacing the reference with the alternate results in a deletion. {@code false} otherwise. 1912 */ isDeletion(final String reference, final String alternate)1913 public static boolean isDeletion(final String reference, final String alternate) { 1914 1915 Utils.nonNull(reference); 1916 Utils.nonNull(alternate); 1917 1918 final String refComparable = reference.replaceAll("\\*", ""); 1919 final String altComperable = alternate.replaceAll("\\*", ""); 1920 1921 // If we have fewer bases in the alternate, we have a deletion: 1922 return refComparable.length() > altComperable.length(); 1923 } 1924 1925 /** 1926 * Determines whether the given reference and alternate alleles constitute a deletion mutation. 1927 * @param reference The reference {@link Allele}. Must not be {@code null}. 1928 * @param alternate The alternate / variant {@link Allele}. Must not be {@code null}. 1929 * @return {@code true} if replacing the reference with the alternate results in a deletion. {@code false} otherwise. 1930 */ isDeletion(final Allele reference, final Allele alternate)1931 public static boolean isDeletion(final Allele reference, final Allele alternate) { 1932 1933 Utils.nonNull(reference); 1934 Utils.nonNull(alternate); 1935 1936 // If we have fewer bases in the alternate, we have a deletion: 1937 return reference.length() > alternate.length(); 1938 } 1939 1940 /** 1941 * Determines whether the given reference and alternate alleles constitute an insertion or deletion mutation. 1942 * @param reference A {@link String} containing the bases of the reference allele. Must not be {@code null}. 1943 * @param alternate A {@link String} containing the bases of the alternate / variant allele. Must not be {@code null}. 1944 * @return {@code true} if replacing the reference with the alternate results in an insertion or deletion. {@code false} otherwise. 1945 */ isIndel(final String reference, final String alternate)1946 public static boolean isIndel(final String reference, final String alternate) { 1947 1948 Utils.nonNull(reference); 1949 Utils.nonNull(alternate); 1950 1951 final String refComparable = reference.replaceAll("\\*", ""); 1952 final String altComperable = alternate.replaceAll("\\*", ""); 1953 1954 // If we do not have the same number of bases in the reference and alternate alleles, 1955 // then we have an indel: 1956 return refComparable.length() != altComperable.length(); 1957 } 1958 1959 /** 1960 * Determines whether the given reference and alternate alleles constitute an insertion or deletion mutation. 1961 * @param reference The reference {@link Allele}. Must not be {@code null}. 1962 * @param alternate The alternate / variant {@link Allele}. Must not be {@code null}. 1963 * @return {@code true} if replacing the reference with the alternate results in an insertion or deletion. {@code false} otherwise. 1964 */ isIndel(final Allele reference, final Allele alternate)1965 public static boolean isIndel(final Allele reference, final Allele alternate) { 1966 1967 Utils.nonNull(reference); 1968 Utils.nonNull(alternate); 1969 1970 // If we do not have the same number of bases in the reference and alternate alleles, 1971 // then we have an indel: 1972 return reference.length() != alternate.length(); 1973 } 1974 1975 /** 1976 * Determines whether the given reference and alternate alleles constitute a polymorphism in one or more nucleotides (XNP). 1977 * @param reference The reference {@link Allele}. Must not be {@code null}. 1978 * @param alternate The alternate / variant {@link Allele}. Must not be {@code null}. 1979 * @return {@code true} if replacing the reference with the alternate results in an XNP. {@code false} otherwise. 1980 */ isXnp(final Allele reference, final Allele alternate)1981 public static boolean isXnp(final Allele reference, final Allele alternate) { 1982 1983 Utils.nonNull(reference); 1984 Utils.nonNull(alternate); 1985 1986 // If we have an equal number of bases in the reference and the alternate, we have an ONP: 1987 return (reference.length() == alternate.length()) && (!reference.equals(alternate)); 1988 } 1989 1990 /** 1991 * Determines whether the given reference and alternate alleles constitute a polymorphism in one or more nucleotides (XNP). 1992 * @param reference A {@link String} containing the bases of the reference allele. Must not be {@code null}. 1993 * @param alternate A {@link String} containing the bases of the alternate / variant allele. Must not be {@code null}. 1994 * @return {@code true} if replacing the reference with the alternate results in an XNP. {@code false} otherwise. 1995 */ isXnp(final String reference, final String alternate)1996 public static boolean isXnp(final String reference, final String alternate) { 1997 1998 Utils.nonNull(reference); 1999 Utils.nonNull(alternate); 2000 2001 final String refComparable = reference.replaceAll("\\*", ""); 2002 final String altComperable = alternate.replaceAll("\\*", ""); 2003 2004 // If we have an equal number of bases in the reference and the alternate, we have an ONP: 2005 return ((refComparable.length() == altComperable.length()) && (!refComparable.equals(altComperable))); 2006 } 2007 2008 /** 2009 * @param vc {@link VariantContext to test} 2010 * @return true if the only alternate allele for this VariantContext is a spanning deletion, otherwise false. 2011 */ isSpanningDeletionOnly(final VariantContext vc)2012 public static boolean isSpanningDeletionOnly(final VariantContext vc){ 2013 return vc.getAlternateAlleles().size() == 1 && GATKVCFConstants.isSpanningDeletion(vc.getAlternateAllele(0)); 2014 } 2015 2016 /** 2017 * 2018 * Attempt to match allele ref/alt pairs, even if the allele pairs in the given variant contexts are equivalent, 2019 * but not exact. 2020 * 2021 * For example, if variant 1 and variant2 have the same position, but 2022 * Variant 1: "A", T", "C" 2023 * Variant 2: "ACCAGGCCCAGCTCATGCTTCTTTGCAGCCTCT", "TCCAGGCCCAGCTCATGCTTCTTTGCAGCCTCT", "A", "AC" 2024 * 2025 * Then the returned array would be: {0, -1} 2026 * Since A>T matches in variant1 matches the the first alt allele in variant 2. And A>C does not match any allele 2027 * in variant 2. 2028 * 2029 * Do not use this method for doing a full split of a variant context into biallelic components. This method 2030 * ignores (and drops) genotypes and INFO attributes. It is really meant just for alleles, but works on a 2031 * VariantContext to better leverage existing code in 2032 * {@link GATKVariantContextUtils#trimAlleles(VariantContext, boolean, boolean)} 2033 * 2034 * @param variant1 Never {@code null} 2035 * @param variant2 Never {@code null} 2036 * @return Returns indexes into the alternate alleles of variant2. Note that this method assumes that (when biallelic) the variant 2037 * contexts are already trimmed. See {@link GATKVariantContextUtils#trimAlleles(VariantContext, boolean, boolean)} 2038 * Never {@code null}. The length will be equal to the number of alternate alleles in variant1. A value of -1 2039 * indicates no match in variant2. If the reference alleles do not match, the output array will be populated 2040 * exclusively with -1. 2041 */ matchAllelesOnly(final VariantContext variant1, final VariantContext variant2)2042 public static int[] matchAllelesOnly(final VariantContext variant1, final VariantContext variant2) { 2043 Utils.nonNull(variant1); 2044 Utils.nonNull(variant2); 2045 2046 // Grab the trivial case: 2047 if (variant1.isBiallelic() && variant2.isBiallelic()) { 2048 if (variant1.getAlternateAllele(0).equals(variant2.getAlternateAllele(0)) && 2049 (variant1.getReference().equals(variant2.getReference()))) { 2050 return new int[]{0}; 2051 } else { 2052 return new int[]{-1}; 2053 } 2054 } 2055 2056 // Handle the case where one or both of the input VCs are not biallelic. 2057 final int[] result = new int[variant1.getAlternateAlleles().size()]; 2058 2059 // First split (and trim) all variant contexts into biallelics. We are only going ot be interested in the alleles. 2060 final List<VariantContext> splitVariants1 = simpleSplitIntoBiallelics(variant1); 2061 final List<VariantContext> splitVariants2 = simpleSplitIntoBiallelics(variant2); 2062 2063 // Second, match on ref and alt. If match occurs add it to the output list. 2064 for (int i = 0; i < splitVariants1.size(); i++) { 2065 result[i] = -1; 2066 for (int j = 0; j < splitVariants2.size(); j++) { 2067 final VariantContext splitVariant1 = splitVariants1.get(i); 2068 final VariantContext splitVariant2 = splitVariants2.get(j); 2069 if (splitVariant1.getAlternateAllele(0).equals(splitVariant2.getAlternateAllele(0)) 2070 && splitVariant1.getReference().equals(splitVariant2.getReference())) { 2071 result[i] = j; 2072 } 2073 } 2074 } 2075 2076 return result; 2077 } 2078 2079 /** 2080 * Do not use this method for doing a full split of a variant context into biallelic components. This method 2081 * ignores (and drops) genotypes and INFO attributes. It is really meant just for alleles, but works on a 2082 * VariantContext to better leverage existing code in 2083 * {@link GATKVariantContextUtils#trimAlleles(VariantContext, boolean, boolean)} 2084 * 2085 * This method is trying to be fast, otherwise. 2086 * 2087 * @param vc variant context to split into simple biallelics. Never {@code null} 2088 * @return a list of variant contexts. Each will be biallelic. Length will be the number of alt alleles in the input vc. 2089 * Note that the variant contexts are usually stripped of attributes and genotypes. Never {@code null}. Empty list 2090 * if variant context has no alt alleles. 2091 */ simpleSplitIntoBiallelics(final VariantContext vc)2092 private static List<VariantContext> simpleSplitIntoBiallelics(final VariantContext vc) { 2093 Utils.nonNull(vc); 2094 final List<VariantContext> result = new ArrayList<>(); 2095 2096 if (vc.isBiallelic()) { 2097 return Collections.singletonList(vc); 2098 } else { 2099 // Since variant context builders are slow to keep re-creating. Just create one and spew variant contexts from it, since 2100 // the only difference will be the alternate allele. Initialize the VCB with a dummy alternate allele, 2101 // since it will be overwritten in all cases. 2102 final VariantContextBuilder vcb = new VariantContextBuilder("SimpleSplit", vc.getContig(), vc.getStart(), vc.getEnd(), 2103 Arrays.asList(vc.getReference(), Allele.NO_CALL)); 2104 vc.getAlternateAlleles().forEach(allele -> result.add(GATKVariantContextUtils.trimAlleles( 2105 vcb.alleles(Arrays.asList(vc.getReference(), allele)).make(true), true, true) 2106 ) 2107 ); 2108 } 2109 2110 return result; 2111 } 2112 2113 /** 2114 * Returns true if the context represents a MNP. If the context supplied contains the GVCF NON_REF symbolic 2115 * allele, then the determination is performed after discarding the NON_REF symbolic allele. 2116 * 2117 * @param vc a Variant context 2118 * @return true if the context represents an unmixed MNP (i.e. all alleles are non-symbolic and of the same 2119 * length > 1), false otherwise 2120 */ isUnmixedMnpIgnoringNonRef(final VariantContext vc)2121 public static boolean isUnmixedMnpIgnoringNonRef(final VariantContext vc) { 2122 final List<Allele> alleles = vc.getAlleles(); 2123 int length = vc.getReference().length(); // ref allele cannot be symbolic 2124 if (length < 2) { return false; } 2125 2126 for (final Allele a : alleles) { 2127 if (a.isSymbolic() && !Allele.NON_REF_ALLELE.equals(a)) { return false; } 2128 else if (!a.isSymbolic() && a.length() != length) { return false; } 2129 } 2130 2131 return true; 2132 } 2133 removeDataForSymbolicAltAlleles(VariantContext vc, List<T> data)2134 public static <T> List<T> removeDataForSymbolicAltAlleles(VariantContext vc, List<T> data) { 2135 return removeDataForSymbolicAlleles(vc, data, false); 2136 } 2137 removeDataForSymbolicAlleles(VariantContext vc, List<T> data)2138 public static <T> List<T> removeDataForSymbolicAlleles(VariantContext vc, List<T> data) { 2139 return removeDataForSymbolicAlleles(vc, data, true); 2140 } 2141 removeDataForSymbolicAlleles(VariantContext vc, List<T> data, boolean dataContainsReference)2142 protected static <T> List<T> removeDataForSymbolicAlleles(VariantContext vc, List<T> data, boolean dataContainsReference) { 2143 if (vc.hasSymbolicAlleles()) { 2144 List<Allele> symbolicAlleles = vc.getAlternateAlleles().stream().filter(allele -> allele.isSymbolic()).collect(Collectors.toList()); 2145 // convert allele index to index for data 2146 int offset = dataContainsReference ? 0 : 1; 2147 List<Integer> symAltIndexes = vc.getAlleleIndices(symbolicAlleles).stream().map(i -> i-offset).collect(Collectors.toList()); 2148 return removeItemsByIndex(data, symAltIndexes); 2149 } else { 2150 return data; 2151 } 2152 } 2153 removeItemsByIndex(List<T> data, List<Integer> indexesToRemove)2154 public static <T> List<T> removeItemsByIndex(List<T> data, List<Integer> indexesToRemove) { 2155 List<T> updated = new ArrayList<>(); 2156 new IndexRange(0, data.size()).forEach(i -> { 2157 if (!indexesToRemove.contains(i)) { 2158 updated.add(data.get(i)); 2159 } 2160 }); 2161 return updated; 2162 } 2163 2164 2165 } 2166 2167