1 /*
2  *  Copyright (C) 2010  Regents of the University of Michigan
3  *
4  *   This program is free software: you can redistribute it and/or modify
5  *   it under the terms of the GNU General Public License as published by
6  *   the Free Software Foundation, either version 3 of the License, or
7  *   (at your option) any later version.
8  *
9  *   This program is distributed in the hope that it will be useful,
10  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *   GNU General Public License for more details.
13  *
14  *   You should have received a copy of the GNU General Public License
15  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 //////////////////////////////////////////////////////////////////////////
19 
20 
21 #include "SamFilter.h"
22 
23 #include "SamQuerySeqWithRefHelper.h"
24 #include "BaseUtilities.h"
25 #include "SamFlag.h"
26 
clipOnMismatchThreshold(SamRecord & record,GenomeSequence & refSequence,double mismatchThreshold)27 SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold(SamRecord& record,
28                                                            GenomeSequence& refSequence,
29                                                            double mismatchThreshold)
30 {
31     // Read & clip from the left & right.
32     SamQuerySeqWithRefIter iterFromFront(record, refSequence, true);
33     SamQuerySeqWithRefIter iterFromBack(record, refSequence, false);
34 
35     SamSingleBaseMatchInfo baseMatchInfo;
36 
37     int32_t readLength = record.getReadLength();
38     // Init last front clip to be prior to the lastFront index (0).
39     const int32_t initialLastFrontClipPos = -1;
40     int32_t lastFrontClipPos = initialLastFrontClipPos;
41     // Init first back clip to be past the last index (readLength).
42     int32_t firstBackClipPos = readLength;
43 
44     bool fromFrontComplete = false;
45     bool fromBackComplete = false;
46     int32_t numBasesFromFront = 0;
47     int32_t numBasesFromBack = 0;
48     int32_t numMismatchFromFront = 0;
49     int32_t numMismatchFromBack = 0;
50 
51     //////////////////////////////////////////////////////////
52     // Determining the clip positions.
53     while(!fromFrontComplete || !fromBackComplete)
54     {
55         // Read from the front (left to right) of the read until
56         // more have been read from that direction than the opposite direction.
57         while(!fromFrontComplete &&
58               ((numBasesFromFront <= numBasesFromBack) ||
59                (fromBackComplete)))
60         {
61             if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false)
62             {
63                 // Nothing more to read in this direction.
64                 fromFrontComplete = true;
65                 break;
66             }
67             // Got a read.  Check to see if it is to or past the last clip.
68             if(baseMatchInfo.getQueryIndex() >= firstBackClipPos)
69             {
70                 // This base is past where we are clipping, so we
71                 // are done reading in this direction.
72                 fromFrontComplete = true;
73                 break;
74             }
75             // This is an actual base read from the left to the
76             // right, so up the counter and determine if it was a mismatch.
77             ++numBasesFromFront;
78 
79             if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
80             {
81                 // Mismatch
82                 ++numMismatchFromFront;
83                 // Check to see if it is over the threshold.
84                 double mismatchPercent =
85                     (double)numMismatchFromFront / numBasesFromFront;
86                 if(mismatchPercent > mismatchThreshold)
87                 {
88                     // Need to clip.
89                     lastFrontClipPos = baseMatchInfo.getQueryIndex();
90                     // Reset the counters.
91                     numBasesFromFront = 0;
92                     numMismatchFromFront = 0;
93                 }
94             }
95         }
96 
97         // Now, read from right to left until more have been read
98         // from the back than from the front.
99         while(!fromBackComplete &&
100               ((numBasesFromBack <= numBasesFromFront) ||
101                (fromFrontComplete)))
102         {
103             if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false)
104             {
105                 // Nothing more to read in this direction.
106                 fromBackComplete = true;
107                 break;
108             }
109             // Got a read.  Check to see if it is to or past the first clip.
110             if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos)
111             {
112                 // This base is past where we are clipping, so we
113                 // are done reading in this direction.
114                 fromBackComplete = true;
115                 break;
116             }
117             // This is an actual base read from the right to the
118             // left, so up the counter and determine if it was a mismatch.
119             ++numBasesFromBack;
120 
121             if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
122             {
123                 // Mismatch
124                 ++numMismatchFromBack;
125                 // Check to see if it is over the threshold.
126                 double mismatchPercent =
127                     (double)numMismatchFromBack / numBasesFromBack;
128                 if(mismatchPercent > mismatchThreshold)
129                 {
130                     // Need to clip.
131                     firstBackClipPos = baseMatchInfo.getQueryIndex();
132                     // Reset the counters.
133                     numBasesFromBack = 0;
134                     numMismatchFromBack = 0;
135                 }
136             }
137         }
138     }
139 
140     //////////////////////////////////////////////////////////
141     // Done determining the clip positions, so clip.
142     // To determine the number of clips from the front, add 1 to the
143     // lastFrontClipPos since the index starts at 0.
144     // To determine the number of clips from the back, subtract the
145     // firstBackClipPos from the readLength.
146     // Example:
147     // Pos:  012345
148     // Read: AAAAAA
149     // Read Length = 6.  If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2.
150     return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos));
151 }
152 
153 
154 // Soft clip the record from the front and/or the back.
softClip(SamRecord & record,int32_t numFrontClips,int32_t numBackClips)155 SamFilter::FilterStatus SamFilter::softClip(SamRecord& record,
156                                             int32_t numFrontClips,
157                                             int32_t numBackClips)
158 {
159     //////////////////////////////////////////////////////////
160     Cigar* cigar = record.getCigarInfo();
161     FilterStatus status = NONE;
162     int32_t startPos = record.get0BasedPosition();
163     CigarRoller updatedCigar;
164 
165     status = softClip(*cigar, numFrontClips, numBackClips,
166                       startPos, updatedCigar);
167 
168     if(status == FILTERED)
169     {
170         /////////////////////////////
171         // The entire read is clipped, so rather than clipping it,
172         // filter it out.
173         filterRead(record);
174         return(FILTERED);
175     }
176     else if(status == CLIPPED)
177     {
178         // Part of the read was clipped, and now that we have
179         // an updated cigar, update the read.
180         record.setCigar(updatedCigar);
181 
182         // Update the starting position.
183         record.set0BasedPosition(startPos);
184     }
185     return(status);
186 }
187 
188 
189 // Soft clip the cigar from the front and/or the back, writing the value
190 // into the new cigar.
softClip(Cigar & oldCigar,int32_t numFrontClips,int32_t numBackClips,int32_t & startPos,CigarRoller & updatedCigar)191 SamFilter::FilterStatus SamFilter::softClip(Cigar& oldCigar,
192                                             int32_t numFrontClips,
193                                             int32_t numBackClips,
194                                             int32_t& startPos,
195                                             CigarRoller& updatedCigar)
196 {
197     int32_t readLength = oldCigar.getExpectedQueryBaseCount();
198     int32_t endClipPos = readLength - numBackClips;
199     FilterStatus status = NONE;
200 
201     if((numFrontClips != 0) || (numBackClips != 0))
202     {
203         // Clipping from front and/or from the back.
204 
205         // Check to see if the entire read was clipped.
206         int32_t totalClips = numFrontClips + numBackClips;
207         if(totalClips >= readLength)
208         {
209             /////////////////////////////
210             // The entire read is clipped, so rather than clipping it,
211             // filter it out.
212             return(FILTERED);
213         }
214 
215         // Part of the read was clipped.
216         status = CLIPPED;
217 
218         // Loop through, creating an updated cigar.
219         int origCigarOpIndex = 0;
220 
221         // Track how many read positions are covered up to this
222         // point by the cigar to determine up to up to what
223         // point in the cigar is affected by this clipping.
224         int32_t numPositions = 0;
225 
226         // Track if any non-clips are in the new cigar.
227         bool onlyClips = true;
228 
229         const Cigar::CigarOperator* op = NULL;
230 
231         //////////////////
232         // Clip from front
233         while((origCigarOpIndex < oldCigar.size()) &&
234               (numPositions < numFrontClips))
235         {
236             op = &(oldCigar.getOperator(origCigarOpIndex));
237             switch(op->operation)
238             {
239                 case Cigar::hardClip:
240                     // Keep this operation as the new clips do not
241                     // affect other clips.
242                     updatedCigar += *op;
243                     break;
244                 case Cigar::del:
245                 case Cigar::skip:
246                     // Skip and delete are going to be dropped, and
247                     // are not in the read, so the read index doesn't
248                     // need to be updated
249                     break;
250                 case Cigar::insert:
251                 case Cigar::match:
252                 case Cigar::mismatch:
253                 case Cigar::softClip:
254                     // Update the read index as these types
255                     // are found in the read.
256                     numPositions += op->count;
257                     break;
258                 case Cigar::none:
259                 default:
260                     // Nothing to do for none.
261                     break;
262             };
263             ++origCigarOpIndex;
264         }
265 
266         // If bases were clipped from the front, add the clip and
267         // any partial cigar operation as necessary.
268         if(numFrontClips != 0)
269         {
270             // Add the softclip to the front of the read.
271             updatedCigar.Add(Cigar::softClip, numFrontClips);
272 
273             // Add the rest of the last Cigar operation if
274             // it is not entirely clipped.
275             int32_t newCount = numPositions - numFrontClips;
276             if(newCount > 0)
277             {
278                 // Before adding it, check to see if the same
279                 // operation is clipped from the end.
280                 // numPositions greater than the endClipPos
281                 // means that it is equal or past that position,
282                 // so shorten the number of positions.
283                 if(numPositions > endClipPos)
284                 {
285                     newCount -= (numPositions - endClipPos);
286                 }
287                 if(newCount > 0)
288                 {
289                     updatedCigar.Add(op->operation, newCount);
290                     if(!Cigar::isClip(op->operation))
291                     {
292                         onlyClips = false;
293                     }
294                 }
295             }
296         }
297 
298         // Add operations until the point of the end clip is reached.
299         // For example...
300         //   2M1D3M = MMDMMM  readLength = 5
301         // readIndex: 01 234
302         //   at cigarOpIndex 0 (2M), numPositions = 2.
303         //   at cigarOpIndex 1 (1D), numPositions = 2.
304         //   at cigarOpIndex 2 (3M), numPositions = 5.
305         // if endClipPos = 2, we still want to consume the 1D, so
306         // need to keep looping until numPositions > endClipPos
307         while((origCigarOpIndex < oldCigar.size()) &&
308               (numPositions <= endClipPos))
309         {
310             op = &(oldCigar.getOperator(origCigarOpIndex));
311 
312             // Update the numPositions count if the operations indicates
313             // bases within the read.
314             if(!Cigar::foundInQuery(op->operation))
315             {
316                 // This operation is not in the query read sequence,
317                 // so it is not yet to the endClipPos, just add the
318                 // operation do not increment the number of positions.
319                 updatedCigar += *op;
320                 if(!Cigar::isClip(op->operation))
321                 {
322                     onlyClips = false;
323                 }
324             }
325             else
326             {
327                 // This operation appears in the query sequence, so
328                 // check to see if the clip occurs in this operation.
329 
330                 // endClipPos is 0 based & numPositions is a count.
331                 // If endClipPos is 4, then it is the 5th position.
332                 // If 4 positions are covered so far (numPositions = 4),
333                 // then we are right at endCLipPos: 4-4 = 0, none of
334                 // this operation should be kept.
335                 // If only 3 positions were covered, then we are at offset
336                 // 3, so offset 3 should be added: 4-3 = 1.
337                 uint32_t numPosTilClip = endClipPos - numPositions;
338 
339                 if(numPosTilClip < op->count)
340                 {
341                     // this operation is partially clipped, write the part
342                     // that was not clipped if it is not all clipped.
343                     if(numPosTilClip != 0)
344                     {
345                         updatedCigar.Add(op->operation,
346                                      numPosTilClip);
347                         if(!Cigar::isClip(op->operation))
348                         {
349                             onlyClips = false;
350                         }
351                     }
352                 }
353                 else
354                 {
355                     // This operation is not clipped, so add it
356                     updatedCigar += *op;
357                     if(!Cigar::isClip(op->operation))
358                     {
359                         onlyClips = false;
360                     }
361                 }
362                 // This operation occurs in the query sequence, so
363                 // increment the number of positions covered.
364                 numPositions += op->count;
365             }
366 
367             // Move to the next cigar position.
368             ++origCigarOpIndex;
369         }
370 
371         //////////////////
372         // Add the softclip to the back.
373         if(numBackClips != 0)
374         {
375             // Add the softclip to the end
376             updatedCigar.Add(Cigar::softClip, numBackClips);
377         }
378 
379         //////////////////
380         // Add any hardclips remaining in the original cigar to the back.
381         while(origCigarOpIndex < oldCigar.size())
382         {
383             op = &(oldCigar.getOperator(origCigarOpIndex));
384             if(op->operation == Cigar::hardClip)
385             {
386                 // Keep this operation as the new clips do not
387                 // affect other clips.
388                 updatedCigar += *op;
389             }
390             ++origCigarOpIndex;
391         }
392 
393         // Check to see if the new cigar is only clips.
394         if(onlyClips)
395         {
396             // Only clips in the new cigar, so mark the read as filtered
397             // instead of updating the cigar.
398             /////////////////////////////
399             // The entire read was clipped.
400             status = FILTERED;
401         }
402         else
403         {
404             // Part of the read was clipped.
405             // Update the starting position if a clip was added to
406             // the front.
407             if(numFrontClips > 0)
408             {
409                 // Convert from query index to reference position (from the
410                 // old cigar)
411                 // Get the position for the last front clipped position by
412                 // getting the position associated with the clipped base on
413                 // the reference.  Then add one to get to the first
414                 // non-clipped position.
415                 int32_t lastFrontClipPos = numFrontClips - 1;
416                 int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos,
417                                                               startPos);
418                 if(newStartPos != Cigar::INDEX_NA)
419                 {
420                     // Add one to get first non-clipped position.
421                     startPos = newStartPos + 1;
422                 }
423             }
424         }
425     }
426     return(status);
427 }
428 
429 
filterOnMismatchQuality(SamRecord & record,GenomeSequence & refSequence,uint32_t qualityThreshold,uint8_t defaultQualityInt)430 SamFilter::FilterStatus SamFilter::filterOnMismatchQuality(SamRecord& record,
431                                                            GenomeSequence& refSequence,
432                                                            uint32_t qualityThreshold,
433                                                            uint8_t defaultQualityInt)
434 {
435     uint32_t totalMismatchQuality =
436         sumMismatchQuality(record, refSequence, defaultQualityInt);
437 
438     // If the total mismatch quality is over the threshold,
439     // filter the read.
440     if(totalMismatchQuality > qualityThreshold)
441     {
442         filterRead(record);
443         return(FILTERED);
444     }
445     return(NONE);
446 }
447 
448 
449 // NOTE: Only positions where the reference and read both have bases that
450 //       are different and not 'N' are considered mismatches.
sumMismatchQuality(SamRecord & record,GenomeSequence & refSequence,uint8_t defaultQualityInt)451 uint32_t SamFilter::sumMismatchQuality(SamRecord& record,
452                                        GenomeSequence& refSequence,
453                                        uint8_t defaultQualityInt)
454 {
455     // Track the mismatch info.
456     int mismatchQual = 0;
457     int numMismatch = 0;
458 
459     SamQuerySeqWithRefIter sequenceIter(record, refSequence);
460 
461     SamSingleBaseMatchInfo baseMatchInfo;
462     while(sequenceIter.getNextMatchMismatch(baseMatchInfo))
463     {
464         if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
465         {
466             // Got a mismatch, get the associated quality.
467             char readQualityChar =
468                 record.getQuality(baseMatchInfo.getQueryIndex());
469             uint8_t readQualityInt =
470                 BaseUtilities::getPhredBaseQuality(readQualityChar);
471 
472             if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT)
473             {
474                 // Quality was not specified, so use the configured setting.
475                 readQualityInt = defaultQualityInt;
476             }
477             mismatchQual += readQualityInt;
478             ++numMismatch;
479         }
480     }
481 
482     return(mismatchQual);
483 }
484 
485 
filterRead(SamRecord & record)486 void SamFilter::filterRead(SamRecord& record)
487 {
488     // Filter the read by marking it as unmapped.
489     uint16_t flag = record.getFlag();
490     SamFlag::setUnmapped(flag);
491     // Clear N/A flags.
492     flag &= ~SamFlag::PROPER_PAIR;
493     flag &= ~SamFlag::SECONDARY_ALIGNMENT;
494     flag &= ~SamFlag::SUPPLEMENTARY_ALIGNMENT;
495     record.setFlag(flag);
496     // Clear Cigar
497     record.setCigar("*");
498     // Clear mapping quality
499     record.setMapQuality(0);
500 }
501