1 package org.broadinstitute.hellbender.transformers;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.samtools.Cigar;
5 import htsjdk.samtools.CigarElement;
6 import htsjdk.samtools.CigarOperator;
7 import org.broadinstitute.hellbender.utils.Utils;
8 import org.broadinstitute.hellbender.utils.read.GATKRead;
9 
10 /**
11  * A read transformer that refactors NDN cigar elements to one N element.
12  *
13  *  <p>
14  *     This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements).
15  *     This is intended primarily for users of RNA-Seq data handling programs such as TopHat2.
16  *     Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
17  *     those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset.
18  *  </p>
19  *
20  */
21 
22 public final class NDNCigarReadTransformer implements ReadTransformer {
23     private static final long serialVersionUID = 1L;
24 
25     @Override
apply(final GATKRead read)26     public GATKRead apply(final GATKRead read) {
27         final Cigar originalCigar = read.getCigar();
28         Utils.validateArg(originalCigar.isValid(read.getName(), -1) == null, () -> "try to transform a read with non-valid cigar string: readName: "+read.getName()+" Cigar String: "+originalCigar);
29         read.setCigar(refactorNDNtoN(originalCigar));
30         return read;
31     }
32 
33     /**
34      * Refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements).
35      */
36     @VisibleForTesting
refactorNDNtoN(final Cigar originalCigar)37     protected static Cigar refactorNDNtoN(final Cigar originalCigar) {
38         final Cigar refactoredCigar = new Cigar();
39         final int cigarLength = originalCigar.numCigarElements();
40         for(int i = 0; i < cigarLength; i++){
41             final CigarElement element = originalCigar.getCigarElement(i);
42             if(element.getOperator() == CigarOperator.N && thereAreAtLeast2MoreElements(i,cigarLength)){
43                 final CigarElement nextElement = originalCigar.getCigarElement(i+1);
44                 final CigarElement nextNextElement = originalCigar.getCigarElement(i+2);
45 
46                 // if it is N-D-N replace with N (with the total length) otherwise just add the first N.
47                 if(nextElement.getOperator() == CigarOperator.D && nextNextElement.getOperator() == CigarOperator.N){
48                     final int threeElementsLength = element.getLength() + nextElement.getLength() + nextNextElement.getLength();
49                     final CigarElement refactoredElement = new CigarElement(threeElementsLength,CigarOperator.N);
50                     refactoredCigar.add(refactoredElement);
51                     i += 2; //skip the elements that were refactored
52                 } else {
53                     refactoredCigar.add(element);  // add only the first N
54                 }
55             } else {
56                 refactoredCigar.add(element);  // add any non-N element
57             }
58         }
59         return refactoredCigar;
60     }
61 
thereAreAtLeast2MoreElements(final int index, final int cigarLength)62     private static boolean thereAreAtLeast2MoreElements(final int index, final int cigarLength){
63         return index < cigarLength - 2;
64     }
65 
66 }
67 
68