1 /*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 //////////////////////////////////////////////////////////////////////////
19
20
21 #include "SamFilter.h"
22
23 #include "SamQuerySeqWithRefHelper.h"
24 #include "BaseUtilities.h"
25 #include "SamFlag.h"
26
clipOnMismatchThreshold(SamRecord & record,GenomeSequence & refSequence,double mismatchThreshold)27 SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold(SamRecord& record,
28 GenomeSequence& refSequence,
29 double mismatchThreshold)
30 {
31 // Read & clip from the left & right.
32 SamQuerySeqWithRefIter iterFromFront(record, refSequence, true);
33 SamQuerySeqWithRefIter iterFromBack(record, refSequence, false);
34
35 SamSingleBaseMatchInfo baseMatchInfo;
36
37 int32_t readLength = record.getReadLength();
38 // Init last front clip to be prior to the lastFront index (0).
39 const int32_t initialLastFrontClipPos = -1;
40 int32_t lastFrontClipPos = initialLastFrontClipPos;
41 // Init first back clip to be past the last index (readLength).
42 int32_t firstBackClipPos = readLength;
43
44 bool fromFrontComplete = false;
45 bool fromBackComplete = false;
46 int32_t numBasesFromFront = 0;
47 int32_t numBasesFromBack = 0;
48 int32_t numMismatchFromFront = 0;
49 int32_t numMismatchFromBack = 0;
50
51 //////////////////////////////////////////////////////////
52 // Determining the clip positions.
53 while(!fromFrontComplete || !fromBackComplete)
54 {
55 // Read from the front (left to right) of the read until
56 // more have been read from that direction than the opposite direction.
57 while(!fromFrontComplete &&
58 ((numBasesFromFront <= numBasesFromBack) ||
59 (fromBackComplete)))
60 {
61 if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false)
62 {
63 // Nothing more to read in this direction.
64 fromFrontComplete = true;
65 break;
66 }
67 // Got a read. Check to see if it is to or past the last clip.
68 if(baseMatchInfo.getQueryIndex() >= firstBackClipPos)
69 {
70 // This base is past where we are clipping, so we
71 // are done reading in this direction.
72 fromFrontComplete = true;
73 break;
74 }
75 // This is an actual base read from the left to the
76 // right, so up the counter and determine if it was a mismatch.
77 ++numBasesFromFront;
78
79 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
80 {
81 // Mismatch
82 ++numMismatchFromFront;
83 // Check to see if it is over the threshold.
84 double mismatchPercent =
85 (double)numMismatchFromFront / numBasesFromFront;
86 if(mismatchPercent > mismatchThreshold)
87 {
88 // Need to clip.
89 lastFrontClipPos = baseMatchInfo.getQueryIndex();
90 // Reset the counters.
91 numBasesFromFront = 0;
92 numMismatchFromFront = 0;
93 }
94 }
95 }
96
97 // Now, read from right to left until more have been read
98 // from the back than from the front.
99 while(!fromBackComplete &&
100 ((numBasesFromBack <= numBasesFromFront) ||
101 (fromFrontComplete)))
102 {
103 if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false)
104 {
105 // Nothing more to read in this direction.
106 fromBackComplete = true;
107 break;
108 }
109 // Got a read. Check to see if it is to or past the first clip.
110 if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos)
111 {
112 // This base is past where we are clipping, so we
113 // are done reading in this direction.
114 fromBackComplete = true;
115 break;
116 }
117 // This is an actual base read from the right to the
118 // left, so up the counter and determine if it was a mismatch.
119 ++numBasesFromBack;
120
121 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
122 {
123 // Mismatch
124 ++numMismatchFromBack;
125 // Check to see if it is over the threshold.
126 double mismatchPercent =
127 (double)numMismatchFromBack / numBasesFromBack;
128 if(mismatchPercent > mismatchThreshold)
129 {
130 // Need to clip.
131 firstBackClipPos = baseMatchInfo.getQueryIndex();
132 // Reset the counters.
133 numBasesFromBack = 0;
134 numMismatchFromBack = 0;
135 }
136 }
137 }
138 }
139
140 //////////////////////////////////////////////////////////
141 // Done determining the clip positions, so clip.
142 // To determine the number of clips from the front, add 1 to the
143 // lastFrontClipPos since the index starts at 0.
144 // To determine the number of clips from the back, subtract the
145 // firstBackClipPos from the readLength.
146 // Example:
147 // Pos: 012345
148 // Read: AAAAAA
149 // Read Length = 6. If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2.
150 return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos));
151 }
152
153
154 // Soft clip the record from the front and/or the back.
softClip(SamRecord & record,int32_t numFrontClips,int32_t numBackClips)155 SamFilter::FilterStatus SamFilter::softClip(SamRecord& record,
156 int32_t numFrontClips,
157 int32_t numBackClips)
158 {
159 //////////////////////////////////////////////////////////
160 Cigar* cigar = record.getCigarInfo();
161 FilterStatus status = NONE;
162 int32_t startPos = record.get0BasedPosition();
163 CigarRoller updatedCigar;
164
165 status = softClip(*cigar, numFrontClips, numBackClips,
166 startPos, updatedCigar);
167
168 if(status == FILTERED)
169 {
170 /////////////////////////////
171 // The entire read is clipped, so rather than clipping it,
172 // filter it out.
173 filterRead(record);
174 return(FILTERED);
175 }
176 else if(status == CLIPPED)
177 {
178 // Part of the read was clipped, and now that we have
179 // an updated cigar, update the read.
180 record.setCigar(updatedCigar);
181
182 // Update the starting position.
183 record.set0BasedPosition(startPos);
184 }
185 return(status);
186 }
187
188
189 // Soft clip the cigar from the front and/or the back, writing the value
190 // into the new cigar.
softClip(Cigar & oldCigar,int32_t numFrontClips,int32_t numBackClips,int32_t & startPos,CigarRoller & updatedCigar)191 SamFilter::FilterStatus SamFilter::softClip(Cigar& oldCigar,
192 int32_t numFrontClips,
193 int32_t numBackClips,
194 int32_t& startPos,
195 CigarRoller& updatedCigar)
196 {
197 int32_t readLength = oldCigar.getExpectedQueryBaseCount();
198 int32_t endClipPos = readLength - numBackClips;
199 FilterStatus status = NONE;
200
201 if((numFrontClips != 0) || (numBackClips != 0))
202 {
203 // Clipping from front and/or from the back.
204
205 // Check to see if the entire read was clipped.
206 int32_t totalClips = numFrontClips + numBackClips;
207 if(totalClips >= readLength)
208 {
209 /////////////////////////////
210 // The entire read is clipped, so rather than clipping it,
211 // filter it out.
212 return(FILTERED);
213 }
214
215 // Part of the read was clipped.
216 status = CLIPPED;
217
218 // Loop through, creating an updated cigar.
219 int origCigarOpIndex = 0;
220
221 // Track how many read positions are covered up to this
222 // point by the cigar to determine up to up to what
223 // point in the cigar is affected by this clipping.
224 int32_t numPositions = 0;
225
226 // Track if any non-clips are in the new cigar.
227 bool onlyClips = true;
228
229 const Cigar::CigarOperator* op = NULL;
230
231 //////////////////
232 // Clip from front
233 while((origCigarOpIndex < oldCigar.size()) &&
234 (numPositions < numFrontClips))
235 {
236 op = &(oldCigar.getOperator(origCigarOpIndex));
237 switch(op->operation)
238 {
239 case Cigar::hardClip:
240 // Keep this operation as the new clips do not
241 // affect other clips.
242 updatedCigar += *op;
243 break;
244 case Cigar::del:
245 case Cigar::skip:
246 // Skip and delete are going to be dropped, and
247 // are not in the read, so the read index doesn't
248 // need to be updated
249 break;
250 case Cigar::insert:
251 case Cigar::match:
252 case Cigar::mismatch:
253 case Cigar::softClip:
254 // Update the read index as these types
255 // are found in the read.
256 numPositions += op->count;
257 break;
258 case Cigar::none:
259 default:
260 // Nothing to do for none.
261 break;
262 };
263 ++origCigarOpIndex;
264 }
265
266 // If bases were clipped from the front, add the clip and
267 // any partial cigar operation as necessary.
268 if(numFrontClips != 0)
269 {
270 // Add the softclip to the front of the read.
271 updatedCigar.Add(Cigar::softClip, numFrontClips);
272
273 // Add the rest of the last Cigar operation if
274 // it is not entirely clipped.
275 int32_t newCount = numPositions - numFrontClips;
276 if(newCount > 0)
277 {
278 // Before adding it, check to see if the same
279 // operation is clipped from the end.
280 // numPositions greater than the endClipPos
281 // means that it is equal or past that position,
282 // so shorten the number of positions.
283 if(numPositions > endClipPos)
284 {
285 newCount -= (numPositions - endClipPos);
286 }
287 if(newCount > 0)
288 {
289 updatedCigar.Add(op->operation, newCount);
290 if(!Cigar::isClip(op->operation))
291 {
292 onlyClips = false;
293 }
294 }
295 }
296 }
297
298 // Add operations until the point of the end clip is reached.
299 // For example...
300 // 2M1D3M = MMDMMM readLength = 5
301 // readIndex: 01 234
302 // at cigarOpIndex 0 (2M), numPositions = 2.
303 // at cigarOpIndex 1 (1D), numPositions = 2.
304 // at cigarOpIndex 2 (3M), numPositions = 5.
305 // if endClipPos = 2, we still want to consume the 1D, so
306 // need to keep looping until numPositions > endClipPos
307 while((origCigarOpIndex < oldCigar.size()) &&
308 (numPositions <= endClipPos))
309 {
310 op = &(oldCigar.getOperator(origCigarOpIndex));
311
312 // Update the numPositions count if the operations indicates
313 // bases within the read.
314 if(!Cigar::foundInQuery(op->operation))
315 {
316 // This operation is not in the query read sequence,
317 // so it is not yet to the endClipPos, just add the
318 // operation do not increment the number of positions.
319 updatedCigar += *op;
320 if(!Cigar::isClip(op->operation))
321 {
322 onlyClips = false;
323 }
324 }
325 else
326 {
327 // This operation appears in the query sequence, so
328 // check to see if the clip occurs in this operation.
329
330 // endClipPos is 0 based & numPositions is a count.
331 // If endClipPos is 4, then it is the 5th position.
332 // If 4 positions are covered so far (numPositions = 4),
333 // then we are right at endCLipPos: 4-4 = 0, none of
334 // this operation should be kept.
335 // If only 3 positions were covered, then we are at offset
336 // 3, so offset 3 should be added: 4-3 = 1.
337 uint32_t numPosTilClip = endClipPos - numPositions;
338
339 if(numPosTilClip < op->count)
340 {
341 // this operation is partially clipped, write the part
342 // that was not clipped if it is not all clipped.
343 if(numPosTilClip != 0)
344 {
345 updatedCigar.Add(op->operation,
346 numPosTilClip);
347 if(!Cigar::isClip(op->operation))
348 {
349 onlyClips = false;
350 }
351 }
352 }
353 else
354 {
355 // This operation is not clipped, so add it
356 updatedCigar += *op;
357 if(!Cigar::isClip(op->operation))
358 {
359 onlyClips = false;
360 }
361 }
362 // This operation occurs in the query sequence, so
363 // increment the number of positions covered.
364 numPositions += op->count;
365 }
366
367 // Move to the next cigar position.
368 ++origCigarOpIndex;
369 }
370
371 //////////////////
372 // Add the softclip to the back.
373 if(numBackClips != 0)
374 {
375 // Add the softclip to the end
376 updatedCigar.Add(Cigar::softClip, numBackClips);
377 }
378
379 //////////////////
380 // Add any hardclips remaining in the original cigar to the back.
381 while(origCigarOpIndex < oldCigar.size())
382 {
383 op = &(oldCigar.getOperator(origCigarOpIndex));
384 if(op->operation == Cigar::hardClip)
385 {
386 // Keep this operation as the new clips do not
387 // affect other clips.
388 updatedCigar += *op;
389 }
390 ++origCigarOpIndex;
391 }
392
393 // Check to see if the new cigar is only clips.
394 if(onlyClips)
395 {
396 // Only clips in the new cigar, so mark the read as filtered
397 // instead of updating the cigar.
398 /////////////////////////////
399 // The entire read was clipped.
400 status = FILTERED;
401 }
402 else
403 {
404 // Part of the read was clipped.
405 // Update the starting position if a clip was added to
406 // the front.
407 if(numFrontClips > 0)
408 {
409 // Convert from query index to reference position (from the
410 // old cigar)
411 // Get the position for the last front clipped position by
412 // getting the position associated with the clipped base on
413 // the reference. Then add one to get to the first
414 // non-clipped position.
415 int32_t lastFrontClipPos = numFrontClips - 1;
416 int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos,
417 startPos);
418 if(newStartPos != Cigar::INDEX_NA)
419 {
420 // Add one to get first non-clipped position.
421 startPos = newStartPos + 1;
422 }
423 }
424 }
425 }
426 return(status);
427 }
428
429
filterOnMismatchQuality(SamRecord & record,GenomeSequence & refSequence,uint32_t qualityThreshold,uint8_t defaultQualityInt)430 SamFilter::FilterStatus SamFilter::filterOnMismatchQuality(SamRecord& record,
431 GenomeSequence& refSequence,
432 uint32_t qualityThreshold,
433 uint8_t defaultQualityInt)
434 {
435 uint32_t totalMismatchQuality =
436 sumMismatchQuality(record, refSequence, defaultQualityInt);
437
438 // If the total mismatch quality is over the threshold,
439 // filter the read.
440 if(totalMismatchQuality > qualityThreshold)
441 {
442 filterRead(record);
443 return(FILTERED);
444 }
445 return(NONE);
446 }
447
448
449 // NOTE: Only positions where the reference and read both have bases that
450 // are different and not 'N' are considered mismatches.
sumMismatchQuality(SamRecord & record,GenomeSequence & refSequence,uint8_t defaultQualityInt)451 uint32_t SamFilter::sumMismatchQuality(SamRecord& record,
452 GenomeSequence& refSequence,
453 uint8_t defaultQualityInt)
454 {
455 // Track the mismatch info.
456 int mismatchQual = 0;
457 int numMismatch = 0;
458
459 SamQuerySeqWithRefIter sequenceIter(record, refSequence);
460
461 SamSingleBaseMatchInfo baseMatchInfo;
462 while(sequenceIter.getNextMatchMismatch(baseMatchInfo))
463 {
464 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
465 {
466 // Got a mismatch, get the associated quality.
467 char readQualityChar =
468 record.getQuality(baseMatchInfo.getQueryIndex());
469 uint8_t readQualityInt =
470 BaseUtilities::getPhredBaseQuality(readQualityChar);
471
472 if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT)
473 {
474 // Quality was not specified, so use the configured setting.
475 readQualityInt = defaultQualityInt;
476 }
477 mismatchQual += readQualityInt;
478 ++numMismatch;
479 }
480 }
481
482 return(mismatchQual);
483 }
484
485
filterRead(SamRecord & record)486 void SamFilter::filterRead(SamRecord& record)
487 {
488 // Filter the read by marking it as unmapped.
489 uint16_t flag = record.getFlag();
490 SamFlag::setUnmapped(flag);
491 // Clear N/A flags.
492 flag &= ~SamFlag::PROPER_PAIR;
493 flag &= ~SamFlag::SECONDARY_ALIGNMENT;
494 flag &= ~SamFlag::SUPPLEMENTARY_ALIGNMENT;
495 record.setFlag(flag);
496 // Clear Cigar
497 record.setCigar("*");
498 // Clear mapping quality
499 record.setMapQuality(0);
500 }
501