1 package org.broadinstitute.hellbender.tools.walkers.annotator;
2 
3 import htsjdk.variant.variantcontext.Allele;
4 import htsjdk.variant.variantcontext.VariantContext;
5 import org.apache.commons.lang3.tuple.Pair;
6 import org.broadinstitute.barclay.help.DocumentedFeature;
7 import org.broadinstitute.hellbender.engine.ReferenceContext;
8 import org.broadinstitute.hellbender.utils.Utils;
9 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
10 import org.broadinstitute.hellbender.utils.help.HelpConstants;
11 import org.broadinstitute.hellbender.utils.read.GATKRead;
12 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
13 import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
14 
15 import java.util.*;
16 
17 /**
18  * Tandem repeat unit composition and counts per allele
19  *
20  * <p>This annotation tags variants that fall within tandem repeat sets. It also provides the composition of the tandem repeat units and the number of times they are repeated for each allele (including the REF allele).</p>
21  *
22  * <p>A tandem repeat unit is composed of one or more nucleotides that are repeated multiple times in series. Repetitive sequences are difficult to map to the reference because they are associated with multiple alignment possibilities. Knowing the number of repeat units in a set of tandem repeats tells you the number of different positions the tandem repeat can be placed in. The observation of many tandem repeat units multiplies the number of possible representations that can be made of the region.
23  */
24 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Tandem repeat unit composition and counts per allele (STR, RU, RPA)")
25 public final class TandemRepeat extends InfoFieldAnnotation implements StandardMutectAnnotation {
26 
27     @Override
annotate(final ReferenceContext ref, final VariantContext vc, final AlleleLikelihoods<GATKRead, Allele> likelihoods)28     public Map<String, Object> annotate(final ReferenceContext ref,
29                                         final VariantContext vc,
30                                         final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
31         Utils.nonNull(vc);
32         if ( !vc.isIndel()) {
33             return Collections.emptyMap();
34         }
35 
36         final Pair<List<Integer>,byte[]> result = getNumTandemRepeatUnits(ref, vc);
37         if (result == null) {
38             return Collections.emptyMap();
39         }
40 
41         final byte[] repeatUnit = result.getRight();
42         final List<Integer> numUnits = result.getLeft();
43 
44         final Map<String, Object> map = new LinkedHashMap<>();
45         map.put(GATKVCFConstants.STR_PRESENT_KEY, true);
46         map.put(GATKVCFConstants.REPEAT_UNIT_KEY, new String(repeatUnit));
47         map.put(GATKVCFConstants.REPEATS_PER_ALLELE_KEY, numUnits);
48         return Collections.unmodifiableMap(map);
49     }
50 
getNumTandemRepeatUnits(final ReferenceContext ref, final VariantContext vc)51     public static Pair<List<Integer>, byte[]> getNumTandemRepeatUnits(final ReferenceContext ref, final VariantContext vc) {
52         final byte[] refBases = ref.getBases();
53         final int startIndex = vc.getStart() + 1 - ref.getWindow().getStart();  // +1 to exclude leading match base common to VC's ref and alt alleles
54         return GATKVariantContextUtils.getNumTandemRepeatUnits(vc, Arrays.copyOfRange(refBases, startIndex, refBases.length));
55     }
56 
57     @Override
getKeyNames()58     public List<String> getKeyNames() {
59         return Arrays.asList(
60                 GATKVCFConstants.STR_PRESENT_KEY,
61                 GATKVCFConstants.REPEAT_UNIT_KEY,
62                 GATKVCFConstants.REPEATS_PER_ALLELE_KEY);
63     }
64 
65 }
66