1 package org.broadinstitute.hellbender.transformers; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import htsjdk.samtools.Cigar; 5 import htsjdk.samtools.CigarElement; 6 import htsjdk.samtools.CigarOperator; 7 import org.broadinstitute.hellbender.utils.Utils; 8 import org.broadinstitute.hellbender.utils.read.GATKRead; 9 10 /** 11 * A read transformer that refactors NDN cigar elements to one N element. 12 * 13 * <p> 14 * This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements). 15 * This is intended primarily for users of RNA-Seq data handling programs such as TopHat2. 16 * Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of 17 * those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset. 18 * </p> 19 * 20 */ 21 22 public final class NDNCigarReadTransformer implements ReadTransformer { 23 private static final long serialVersionUID = 1L; 24 25 @Override apply(final GATKRead read)26 public GATKRead apply(final GATKRead read) { 27 final Cigar originalCigar = read.getCigar(); 28 Utils.validateArg(originalCigar.isValid(read.getName(), -1) == null, () -> "try to transform a read with non-valid cigar string: readName: "+read.getName()+" Cigar String: "+originalCigar); 29 read.setCigar(refactorNDNtoN(originalCigar)); 30 return read; 31 } 32 33 /** 34 * Refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements). 35 */ 36 @VisibleForTesting refactorNDNtoN(final Cigar originalCigar)37 protected static Cigar refactorNDNtoN(final Cigar originalCigar) { 38 final Cigar refactoredCigar = new Cigar(); 39 final int cigarLength = originalCigar.numCigarElements(); 40 for(int i = 0; i < cigarLength; i++){ 41 final CigarElement element = originalCigar.getCigarElement(i); 42 if(element.getOperator() == CigarOperator.N && thereAreAtLeast2MoreElements(i,cigarLength)){ 43 final CigarElement nextElement = originalCigar.getCigarElement(i+1); 44 final CigarElement nextNextElement = originalCigar.getCigarElement(i+2); 45 46 // if it is N-D-N replace with N (with the total length) otherwise just add the first N. 47 if(nextElement.getOperator() == CigarOperator.D && nextNextElement.getOperator() == CigarOperator.N){ 48 final int threeElementsLength = element.getLength() + nextElement.getLength() + nextNextElement.getLength(); 49 final CigarElement refactoredElement = new CigarElement(threeElementsLength,CigarOperator.N); 50 refactoredCigar.add(refactoredElement); 51 i += 2; //skip the elements that were refactored 52 } else { 53 refactoredCigar.add(element); // add only the first N 54 } 55 } else { 56 refactoredCigar.add(element); // add any non-N element 57 } 58 } 59 return refactoredCigar; 60 } 61 thereAreAtLeast2MoreElements(final int index, final int cigarLength)62 private static boolean thereAreAtLeast2MoreElements(final int index, final int cigarLength){ 63 return index < cigarLength - 2; 64 } 65 66 } 67 68