1 /*
2 * Copyright (C) 2010-2016, 2019 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include <iostream>
19 #include <algorithm>
20 #include <cstdlib>
21 #include <string>
22 #include <unistd.h>
23 #include <getopt.h>
24 #include "SamFile.h"
25 #include "Dedup.h"
26 #include "Logger.h"
27 #include "SamHelper.h"
28 #include "SamStatus.h"
29 #include "BgzfFileType.h"
30
31 const int Dedup::DEFAULT_MIN_QUAL = 15;
32 const uint32_t Dedup::CLIP_OFFSET = 1000;
33
~Dedup()34 Dedup::~Dedup()
35 {
36 // clean up the maps.
37 // First free any fragment records.
38 for(FragmentMap::iterator iter = myFragmentMap.begin();
39 iter != myFragmentMap.end(); iter++)
40 {
41 mySamPool.releaseRecord(iter->second.recordPtr);
42 }
43 myFragmentMap.clear();
44
45 // Free any paired records.
46 for(PairedMap::iterator iterator = myPairedMap.begin();
47 iterator != myPairedMap.end(); iterator++)
48 {
49 // These are not duplicates, but we are done with them, so release them.
50 mySamPool.releaseRecord(iterator->second.record1Ptr);
51 mySamPool.releaseRecord(iterator->second.record2Ptr);
52 }
53 myPairedMap.clear();
54
55 // Free any entries in the mate map.
56 for(MateMap::iterator iter = myMateMap.begin();
57 iter != myMateMap.end(); iter++)
58 {
59 mySamPool.releaseRecord(iter->second.recordPtr);
60 }
61 myMateMap.clear();
62 }
63
printDedupDescription(std::ostream & os)64 void Dedup::printDedupDescription(std::ostream& os)
65 {
66 os << " dedup - Mark Duplicates\n";
67 }
68
69
printDescription(std::ostream & os)70 void Dedup::printDescription(std::ostream& os)
71 {
72 printDedupDescription(os);
73 }
74
75
printUsage(std::ostream & os)76 void Dedup::printUsage(std::ostream& os)
77 {
78 os << "Usage: ./bam dedup --in <InputBamFile> --out <OutputBamFile> [--minQual <minPhred>] [--log <logFile>] [--oneChrom] [--rmDups] [--force] [--excludeFlags <flag>] [--verbose] [--noeof] [--params] [--recab] ";
79 myRecab.printRecabSpecificUsageLine(os);
80 os << std::endl << std::endl;
81 os << "Required parameters :" << std::endl;
82 os << "\t--in <infile> : Input BAM file name (must be sorted)" << std::endl;
83 os << "\t--out <outfile> : Output BAM file name (same order with original file)" << std::endl;
84 os << "Optional parameters : " << std::endl;
85 os << "\t--minQual <int> : Only add scores over this phred quality when determining a read's quality (default: "
86 << DEFAULT_MIN_QUAL << ")" << std::endl;
87 os << "\t--log <logfile> : Log and summary statistics (default: [outfile].log, or stderr if --out starts with '-')" << std::endl;
88 os << "\t--oneChrom : Treat reads with mates on different chromosomes as single-ended." << std::endl;
89 os << "\t--rmDups : Remove duplicates (default is to mark duplicates)" << std::endl;
90 os << "\t--force : Allow an already mark-duplicated BAM file, unmarking any previously marked " << std::endl;
91 os << "\t duplicates and apply this duplicate marking logic. Default is to throw errors" << std::endl;
92 os << "\t and exit when trying to run on an already mark-duplicated BAM" << std::endl;
93 os << "\t--excludeFlags <flag> : exclude reads with any of these flags set when determining or marking duplicates" << std::endl;
94 os << "\t by default (0xB04): exclude unmapped, secondary reads, QC failures, and supplementary reads" << std::endl;
95 os << "\t--verbose : Turn on verbose mode" << std::endl;
96 os << "\t--noeof : Do not expect an EOF block on a bam file." << std::endl;
97 os << "\t--params : Print the parameter settings" << std::endl;
98 os << "\t--recab : Recalibrate in addition to deduping" << std::endl;
99 myRecab.printRecabSpecificUsage(os);
100 os<< "\n" << std::endl;
101 }
102
execute(int argc,char ** argv)103 int Dedup::execute(int argc, char** argv)
104 {
105 /* --------------------------------
106 * process the arguments
107 * -------------------------------*/
108 String inFile, outFile, logFile;
109 myDoRecab = false;
110 bool removeFlag = false;
111 bool verboseFlag = false;
112 myForceFlag = false;
113 myNumMissingMate = 0;
114 myMinQual = DEFAULT_MIN_QUAL;
115 String excludeFlags = "0xB04";
116 uint16_t intExcludeFlags = 0;
117 bool noeof = false;
118 bool params = false;
119
120 LongParamContainer parameters;
121 parameters.addGroup("Required Parameters");
122 parameters.addString("in", &inFile);
123 parameters.addString("out", &outFile);
124 parameters.addGroup("Optional Parameters");
125 parameters.addInt("minQual", & myMinQual);
126 parameters.addString("log", &logFile);
127 parameters.addBool("oneChrom", &myOneChrom);
128 parameters.addBool("recab", &myDoRecab);
129 parameters.addBool("rmDups", &removeFlag);
130 parameters.addBool("force", &myForceFlag);
131 parameters.addString("excludeFlags", &excludeFlags);
132 parameters.addBool("verbose", &verboseFlag);
133 parameters.addBool("noeof", &noeof);
134 parameters.addBool("params", ¶ms);
135 parameters.addPhoneHome(VERSION);
136 myRecab.addRecabSpecificParameters(parameters);
137
138 ParameterList inputParameters;
139 inputParameters.Add(new LongParameters ("Input Parameters",
140 parameters.getLongParameterList()));
141
142 // parameters start at index 2 rather than 1.
143 inputParameters.Read(argc, argv, 2);
144
145 // If no eof block is required for a bgzf file, set the bgzf file type to
146 // not look for it.
147 if(noeof)
148 {
149 // Set that the eof block is not required.
150 BgzfFileType::setRequireEofBlock(false);
151 }
152
153 if(inFile.IsEmpty())
154 {
155 printUsage(std::cerr);
156 inputParameters.Status();
157 std::cerr << "Specify an input file" << std::endl;
158 return EXIT_FAILURE;
159 }
160 // inFile is not empty, so there is at least one character. Check if
161 // it is specifing stdin since that is not supported for Dedup.
162 if(inFile[0] == '-')
163 {
164 // ERROR: stdin specified, but since Dedup requires 2 passes through
165 // the input file, stdin is not supported.
166 printUsage(std::cerr);
167 inputParameters.Status();
168 std::cerr << "ERROR: stdin ('" << inFile << "') is not a supported input file because Dedup requires two passes through the input file." << std::endl;
169 return EXIT_FAILURE;
170 }
171
172 if(outFile.IsEmpty())
173 {
174 printUsage(std::cerr);
175 inputParameters.Status();
176 std::cerr << "Specify an output file" << std::endl;
177 return EXIT_FAILURE;
178 }
179
180 intExcludeFlags = excludeFlags.AsInteger();
181
182 if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags))
183 {
184 printUsage(std::cerr);
185 inputParameters.Status();
186 std::cerr << "Cannot specify --force and Duplicate in the excludeFlags. Since --force indicates to override"
187 << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n";
188 return EXIT_FAILURE;
189 }
190
191 if(!SamFlag::isSecondary(intExcludeFlags))
192 {
193 printUsage(std::cerr);
194 inputParameters.Status();
195 std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n";
196 return EXIT_FAILURE;
197 }
198
199 if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT))
200 {
201 printUsage(std::cerr);
202 inputParameters.Status();
203 std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n";
204 return EXIT_FAILURE;
205 }
206
207 if(logFile.IsEmpty())
208 {
209 logFile = outFile + ".log";
210 }
211
212 if(myDoRecab)
213 {
214 int status = myRecab.processRecabParam();
215 if(status != 0)
216 {
217 inputParameters.Status();
218 return(status);
219 }
220 }
221
222 if(params)
223 {
224 inputParameters.Status();
225 }
226
227 Logger::gLogger = new Logger(logFile.c_str(), verboseFlag);
228
229 /* -------------------------------------------------------------------
230 * The arguments are processed. Prepare the input BAM file,
231 * instantiate dedup, and construct the read group library map
232 * ------------------------------------------------------------------*/
233
234 SamFile samIn;
235
236 samIn.OpenForRead(inFile.c_str());
237 // If the file isn't sorted it will throw an exception.
238 samIn.setSortedValidation(SamFile::COORDINATE);
239
240 SamFileHeader header;
241 samIn.ReadHeader(header);
242
243 buildReadGroupLibraryMap(header);
244
245 lastReference = -1;
246 lastCoordinate = -1;
247
248 // for keeping some basic statistics
249 uint32_t recordCount = 0;
250 uint32_t pairedCount = 0;
251 uint32_t properPairCount = 0;
252 uint32_t unmappedCount = 0;
253 uint32_t reverseCount = 0;
254 uint32_t qualCheckFailCount = 0;
255 uint32_t secondaryCount = 0;
256 uint32_t supplementaryCount = 0;
257 uint32_t excludedCount = 0;
258
259 // Now we start reading records
260 SamRecord* recordPtr;
261 SamStatus::Status returnStatus = SamStatus::SUCCESS;
262 while(returnStatus == SamStatus::SUCCESS)
263 {
264 recordPtr = mySamPool.getRecord();
265 if(recordPtr == NULL)
266 {
267 std::cerr << "Failed to allocate enough records\n";
268 return(-1);
269 }
270 if(!samIn.ReadRecord(header, *recordPtr))
271 {
272 returnStatus = samIn.GetStatus();
273 continue;
274 }
275 // Take note of properties of this record
276 int flag = recordPtr->getFlag();
277 if(SamFlag::isPaired(flag)) ++pairedCount;
278 if(SamFlag::isProperPair(flag)) ++properPairCount;
279 if(SamFlag::isReverse(flag)) ++reverseCount;
280 if(SamFlag::isQCFailure(flag)) ++qualCheckFailCount;
281 if(SamFlag::isSecondary(flag)) ++secondaryCount;
282 if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT) ++supplementaryCount;
283 if(!SamFlag::isMapped(flag)) ++unmappedCount;
284
285 // put the record in the appropriate maps:
286 // single reads go in myFragmentMap
287 // paired reads go in myPairedMap
288 recordCount = samIn.GetCurrentRecordCount();
289
290 // if we have moved to a new position, look back at previous reads for duplicates
291 if (hasPositionChanged(*recordPtr))
292 {
293 cleanupPriorReads(recordPtr);
294 }
295
296 // Determine if this read should be checked for duplicates.
297 if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0))
298 {
299 ++excludedCount;
300
301 // No deduping done on this record, but still build the recab table.
302 if(myDoRecab)
303 {
304 myRecab.processReadBuildTable(*recordPtr);
305 }
306 // Nothing more to do with this record, so
307 // release the pointer.
308 mySamPool.releaseRecord(recordPtr);
309 }
310 else
311 {
312 if(SamFlag::isDuplicate(flag) && !myForceFlag)
313 {
314 // Error: Marked duplicates, and duplicates aren't excluded.
315 Logger::gLogger->error("There are records already duplicate marked.");
316 Logger::gLogger->error("Use -f to clear the duplicate flag and start the deduping procedure over");
317 }
318
319 checkDups(*recordPtr, recordCount);
320 }
321 // let the user know we're not napping
322 if (verboseFlag && (recordCount % 100000 == 0))
323 {
324 Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u",
325 recordCount, myFragmentMap.size(),
326 myPairedMap.size(),
327 myMateMap.size());
328 }
329 }
330
331 // we're finished reading record so clean up the duplicate search and
332 // close the input file
333 cleanupPriorReads(NULL);
334 samIn.Close();
335
336 // print some statistics
337 Logger::gLogger->writeLog("--------------------------------------------------------------------------");
338 Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS");
339 Logger::gLogger->writeLog("Total number of reads: %u",recordCount);
340 Logger::gLogger->writeLog("Total number of paired-end reads: %u",
341 pairedCount);
342 Logger::gLogger->writeLog("Total number of properly paired reads: %u",
343 properPairCount);
344 Logger::gLogger->writeLog("Total number of unmapped reads: %u",
345 unmappedCount);
346 Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u",
347 reverseCount);
348 Logger::gLogger->writeLog("Total number of QC-failed reads: %u",
349 qualCheckFailCount);
350 Logger::gLogger->writeLog("Total number of secondary reads: %u",
351 secondaryCount);
352 Logger::gLogger->writeLog("Total number of supplementary reads: %u",
353 supplementaryCount);
354 Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u",
355 myFragmentMap.size());
356 Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u",
357 myPairedMap.size());
358 Logger::gLogger->writeLog("Total number of missing mates: %u",
359 myNumMissingMate);
360 Logger::gLogger->writeLog("Total number of reads excluded from duplicate checking: %u",
361 excludedCount);
362 Logger::gLogger->writeLog("--------------------------------------------------------------------------");
363 Logger::gLogger->writeLog("Sorting the indices of %d duplicated records",
364 myDupList.size());
365
366 // sort the indices of duplicate records
367 std::sort(myDupList.begin(), myDupList.end(),
368 std::less<uint32_t> ());
369
370 // get ready to write the output file by making a second pass
371 // through the input file
372 samIn.OpenForRead(inFile.c_str());
373 samIn.ReadHeader(header);
374
375 SamFile samOut;
376 samOut.OpenForWrite(outFile.c_str());
377 samOut.WriteHeader(header);
378
379 // If we are recalibrating, output the model information.
380 if(myDoRecab)
381 {
382 myRecab.modelFitPrediction(outFile);
383 }
384
385 // an iterator to run through the duplicate indices
386 int currentDupIndex = 0;
387 bool moreDups = !myDupList.empty();
388
389 // let the user know what we're doing
390 Logger::gLogger->writeLog("\nWriting %s", outFile.c_str());
391
392 // count the duplicate records as a check
393 uint32_t singleDuplicates(0), pairedDuplicates(0);
394
395 // start reading records and writing them out
396 SamRecord record;
397 while(samIn.ReadRecord(header, record))
398 {
399 uint32_t currentIndex = samIn.GetCurrentRecordCount();
400
401 bool foundDup = moreDups &&
402 (currentIndex == myDupList[currentDupIndex]);
403
404 // modify the duplicate flag and write out the record,
405 // if it's appropriate
406 int flag = record.getFlag();
407 if (foundDup)
408 {
409 // this record is a duplicate, so mark it.
410 record.setFlag( flag | 0x400 );
411 currentDupIndex++;
412 // increment duplicate counters to verify we found them all
413 if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) )
414 { // unpaired or mate unmapped
415 singleDuplicates++;
416 }
417 else
418 {
419 pairedDuplicates++;
420 }
421 // recalibrate if necessary.
422 if(myDoRecab)
423 {
424 myRecab.processReadApplyTable(record);
425 }
426
427 // write the record if we are not removing duplicates
428 if (!removeFlag ) samOut.WriteRecord(header, record);
429 }
430 else
431 {
432 if(myForceFlag)
433 {
434 // this is not a duplicate we've identified but we want to
435 // remove any duplicate marking
436 record.setFlag( flag & 0xfffffbff ); // unmark duplicate
437 }
438 // Not a duplicate, so recalibrate if necessary.
439 if(myDoRecab)
440 {
441 myRecab.processReadApplyTable(record);
442 }
443 samOut.WriteRecord(header, record);
444 }
445
446 // Let the user know we're still here
447 if (verboseFlag && (currentIndex % 100000 == 0)) {
448 Logger::gLogger->writeLog("recordCount=%u", currentIndex);
449 }
450 }
451
452 // We're done. Close the files and print triumphant messages.
453 samIn.Close();
454 samOut.Close();
455
456 Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads",
457 removeFlag ? "removed" : "marked" ,
458 singleDuplicates,
459 pairedDuplicates/2);
460 Logger::gLogger->writeLog("\nDedup complete!");
461 return 0;
462 }
463
464 // Now that we've reached coordinate on chromosome reference, look back and
465 // clean up any previous positions from being tracked.
cleanupPriorReads(SamRecord * record)466 void Dedup::cleanupPriorReads(SamRecord* record)
467 {
468 static DupKey emptyKey;
469 static DupKey tempKey2;
470
471 // Set where to stop cleaning out the structures.
472 // Initialize to the end of the structures.
473 FragmentMap::iterator fragmentFinish = myFragmentMap.end();
474 PairedMap::iterator pairedFinish = myPairedMap.end();
475 uint64_t mateStopPos = 0;
476
477 // If a record was specified, stop before this record.
478 if(record != NULL)
479 {
480 int32_t reference = record->getReferenceID();
481 int32_t coordinate = record->get0BasedPosition();
482 tempKey2.cleanupKey(reference, coordinate);
483 fragmentFinish = myFragmentMap.lower_bound(tempKey2);
484 // Now do the same thing with the paired reads
485 PairedKey pairedKey(emptyKey, tempKey2);
486 pairedFinish = myPairedMap.lower_bound(pairedKey);
487 mateStopPos =
488 SamHelper::combineChromPos(reference,
489 coordinate);
490 }
491
492 // For each key k < fragmentFinish, release the record since we are
493 // done with that position and it is not a duplicate.
494 for(FragmentMap::iterator iter = myFragmentMap.begin();
495 iter != fragmentFinish; iter++)
496 {
497 // If it is not paired, we are done with this record.
498 if(!iter->second.paired)
499 {
500 // Unpaired, non-duplicate, so perform any additional handling.
501 handleNonDuplicate(iter->second.recordPtr);
502 }
503 }
504 // Erase the entries from the map.
505 if(fragmentFinish != myFragmentMap.begin())
506 {
507 myFragmentMap.erase(myFragmentMap.begin(), fragmentFinish);
508 }
509
510 // Now do the same thing with the paired reads
511 for(PairedMap::iterator iter = myPairedMap.begin();
512 iter != pairedFinish; iter++)
513 {
514 PairedData* pairedData = &(iter->second);
515 // These are not duplicates, but we are done with them,
516 // so perform any additional handling.
517 handleNonDuplicate(pairedData->record1Ptr);
518 handleNonDuplicate(pairedData->record2Ptr);
519 }
520 // Erase the entries.
521 if (pairedFinish != myPairedMap.begin())
522 {
523 myPairedMap.erase(myPairedMap.begin(), pairedFinish);
524 }
525
526 // Clean up the Mate map from any reads whose mates were not found.
527 // Loop through the mate map and release records prior to this position.
528 MateMap::iterator mateIter;
529 for(mateIter = myMateMap.begin(); mateIter != myMateMap.end(); mateIter++)
530 {
531 // stop if a record was specified and we have gone past the mate
532 // stop position. If no record was specified, we want to clean
533 // it all out.
534 if((record != NULL) && (mateIter->first >= mateStopPos))
535 {
536 break;
537 }
538 // Passed the mate, but it was not found.
539 handleMissingMate(mateIter->second.recordPtr);
540 }
541 // Erase the entries.
542 if(mateIter != myMateMap.begin())
543 {
544 myMateMap.erase(myMateMap.begin(), mateIter);
545 }
546 return;
547 }
548
549
550 // determine whether the record's position is different from the previous record
hasPositionChanged(SamRecord & record)551 bool Dedup::hasPositionChanged(SamRecord& record)
552 {
553 if (lastReference != record.getReferenceID() ||
554 lastCoordinate < record.get0BasedPosition())
555 {
556 if (lastReference != record.getReferenceID())
557 {
558 lastReference = record.getReferenceID();
559 Logger::gLogger->writeLog("Reading ReferenceID %d\n", lastReference);
560 }
561 lastCoordinate = record.get0BasedPosition();
562 return true;
563 }
564 return false;
565 }
566
567 // When a record is read, check if it is a duplicate or
568 // store for future checking.
checkDups(SamRecord & record,uint32_t recordCount)569 void Dedup::checkDups(SamRecord& record, uint32_t recordCount)
570 {
571 // Only inside this method if the record is mapped.
572
573 // Get the key for this record.
574 static DupKey key;
575 static DupKey mateKey;
576 key.updateKey(record, getLibraryID(record));
577
578 int flag = record.getFlag();
579 bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
580 int sumBaseQual = getBaseQuality(record);
581
582 int32_t chromID = record.getReferenceID();
583 int32_t mateChromID = record.getMateReferenceID();
584
585 // If we are one-chrom and the mate is not on the same chromosome,
586 // mark it as not paired.
587 if(myOneChrom && (chromID != mateChromID))
588 {
589 recordPaired = false;
590 }
591
592 // Look in the map to see if an entry for this key exists.
593 FragmentMapInsertReturn ireturn =
594 myFragmentMap.insert(std::make_pair(key, ReadData()));
595
596 ReadData* readData = &(ireturn.first->second);
597
598 // Mark this record's data in the fragment record if this is the first
599 // entry or if it is a duplicate and the old record is not paired and
600 // the new record is paired or the has a higher quality.
601 if((ireturn.second == true) ||
602 ((readData->paired == false) &&
603 (recordPaired || (sumBaseQual > readData->sumBaseQual))))
604 {
605 // If there was a previous record, mark it duplicate and release
606 // the old record
607 if(ireturn.second == false)
608 {
609 // Mark the old record as a DUPLICATE!
610 handleDuplicate(readData->recordIndex, readData->recordPtr);
611 }
612 // Store this record for later duplicate checking.
613 readData->sumBaseQual = sumBaseQual;
614 readData->recordIndex = recordCount;
615 readData->paired = recordPaired;
616 if(recordPaired)
617 {
618 readData->recordPtr = NULL;
619 }
620 else
621 {
622 readData->recordPtr = &record;
623 }
624 }
625 else
626 {
627 // The old record is not a duplicate so the new record is
628 // a duplicate if it is not paired.
629 if(recordPaired == false)
630 {
631 // This record is a duplicate, so mark it and release it.
632 handleDuplicate(recordCount, &record);
633 }
634 }
635
636 // Only paired processing is left, so return if not paired.
637 if(recordPaired == false)
638 {
639 // Not paired, no more operations required, so return.
640 return;
641 }
642
643 // This is a paired record, so check for its mate.
644 uint64_t readPos =
645 SamHelper::combineChromPos(chromID,
646 record.get0BasedPosition());
647 uint64_t matePos =
648 SamHelper::combineChromPos(mateChromID,
649 record.get0BasedMatePosition());
650 SamRecord* mateRecord = NULL;
651 int mateIndex = 0;
652
653 // Check to see if the mate is prior to this record.
654 if(matePos <= readPos)
655 {
656 // The mate map is stored by the mate position, so look for this
657 // record's position.
658 // The mate should be in the mate map, so find it.
659 std::pair<MateMap::iterator,MateMap::iterator> matches =
660 myMateMap.equal_range(readPos);
661 // Loop through the elements that matched the pos looking for the mate.
662 for(MateMap::iterator iter = matches.first;
663 iter != matches.second; iter++)
664 {
665 if(strcmp((*iter).second.recordPtr->getReadName(),
666 record.getReadName()) == 0)
667 {
668 // Found the match.
669 ReadData* mateData = &((*iter).second);
670 // Update the quality and track the mate record and index.
671 sumBaseQual += mateData->sumBaseQual;
672 mateIndex = mateData->recordIndex;
673 mateRecord = mateData->recordPtr;
674 // Remove the entry from the map.
675 myMateMap.erase(iter);
676 break;
677 }
678 }
679 }
680 if((mateRecord == NULL) && (matePos >= readPos))
681 {
682 // Haven't gotten to the mate yet, so store this record.
683 MateMap::iterator mateIter =
684 myMateMap.insert(std::make_pair(matePos, ReadData()));
685 mateIter->second.sumBaseQual = sumBaseQual;
686 mateIter->second.recordPtr = &record;
687 mateIter->second.recordIndex = recordCount;
688 // No more processing for this record is necessary.
689 return;
690 }
691
692 if(mateRecord == NULL)
693 {
694 // Passed the mate, but it was not found.
695 handleMissingMate(&record);
696 return;
697 }
698
699 // Make the paired key.
700 mateKey.updateKey(*mateRecord, getLibraryID(*mateRecord));
701 PairedKey pkey(key, mateKey);
702
703 // Check to see if this pair is a duplicate.
704 PairedMapInsertReturn pairedReturn =
705 myPairedMap.insert(std::make_pair(pkey,PairedData()));
706 PairedData* storedPair = &(pairedReturn.first->second);
707
708 // Get the index for "record 1" - the one with the earlier coordinate.
709 int record1Index = getFirstIndex(key, recordCount,
710 mateKey, mateIndex);
711
712 // Check if we have already found a duplicate pair.
713 // If there is no duplicate found, there is nothing more to do.
714 if(pairedReturn.second == false)
715 {
716 // Duplicate found.
717 bool keepStored = true;
718 if(pairedReturn.first->second.sumBaseQual < sumBaseQual)
719 {
720 // The new pair has higher quality, so keep that.
721 keepStored = false;
722 }
723 else if(pairedReturn.first->second.sumBaseQual == sumBaseQual)
724 {
725 // Same quality, so keep the one with the earlier record1Index.
726 if(record1Index < storedPair->record1Index)
727 {
728 // The new pair has an earlier lower coordinate read,
729 // so keep that.
730 keepStored = false;
731 }
732 }
733 // Check to see which one should be kept by checking qualities.
734 if(keepStored)
735 {
736 // The old pair had higher quality so mark the new pair as a
737 // duplicate and release them.
738 handleDuplicate(mateIndex, mateRecord);
739 handleDuplicate(recordCount, &record);
740 }
741 else
742 {
743 // The new pair has higher quality, so keep that.
744 // First mark the previous one as duplicates and release them.
745 handleDuplicate(storedPair->record1Index, storedPair->record1Ptr);
746 handleDuplicate(storedPair->record2Index, storedPair->record2Ptr);
747 // Store this pair's information.
748 if(record1Index == mateIndex)
749 {
750 // Mate has a lower coordinate, so make mate
751 // record1.
752 storedPair->sumBaseQual = sumBaseQual;
753 storedPair->record1Ptr = mateRecord;
754 storedPair->record2Ptr = &record;
755 storedPair->record1Index = mateIndex;
756 storedPair->record2Index = recordCount;
757 }
758 else
759 {
760 // This record has a lower coordinate, so make it
761 // record1.
762 storedPair->sumBaseQual = sumBaseQual;
763 storedPair->record1Ptr = &record;
764 storedPair->record2Ptr = mateRecord;
765 storedPair->record1Index = recordCount;
766 storedPair->record2Index = mateIndex;
767 }
768 }
769 }
770 else
771 {
772 // Store this pair's information.
773 storedPair->sumBaseQual = sumBaseQual;
774
775 if(record1Index == mateIndex)
776 {
777 // Mate has a lower coordinate, so make mate
778 // record1.
779 storedPair->record1Ptr = mateRecord;
780 storedPair->record2Ptr = &record;
781 storedPair->record1Index = mateIndex;
782 storedPair->record2Index = recordCount;
783 }
784 else
785 {
786 // This record has a lower coordinate, so make it
787 // record1.
788 storedPair->record1Ptr = &record;
789 storedPair->record2Ptr = mateRecord;
790 storedPair->record1Index = recordCount;
791 storedPair->record2Index = mateIndex;
792 }
793 }
794 }
795
796
797 // Finds the total base quality of a read
getBaseQuality(SamRecord & record)798 int Dedup::getBaseQuality(SamRecord & record) {
799 const char* baseQualities = record.getQuality();
800 int readLength = record.getReadLength();
801 int quality = 0.;
802 if(strcmp(baseQualities, "*") == 0)
803 {
804 return(0);
805 }
806 for(int i=0; i < readLength; ++i) {
807 int q = static_cast<int>(baseQualities[i])-33;
808 if ( q >= myMinQual ) quality += q;
809 }
810 return quality;
811 }
812
813
814 // build the read group library map
buildReadGroupLibraryMap(SamFileHeader & header)815 void Dedup::buildReadGroupLibraryMap(SamFileHeader& header) {
816 rgidLibMap.clear();
817 numLibraries = 0;
818 std::map<std::string,uint32_t> libNameMap;
819
820 SamHeaderRecord * headerRecord = header.getNextRGRecord();
821 while(headerRecord != NULL) {
822 std::string ID = headerRecord->getTagValue("ID");
823 std::string LB = headerRecord->getTagValue("LB");
824
825 if ( ID.empty() ) {
826 std::string headerRecordString;
827 headerRecord->appendString(headerRecordString);
828 Logger::gLogger->error("Cannot find readGroup ID information in the header line %s",
829 headerRecordString.c_str());
830 }
831 if ( rgidLibMap.find(ID) != rgidLibMap.end() ) {
832 Logger::gLogger->error("The readGroup ID %s is not a unique identifier",ID.c_str());
833 }
834
835 if ( LB.empty() ) {
836 std::string headerRecordString;
837 headerRecord->appendString(headerRecordString);
838 Logger::gLogger->warning("Cannot find library information in the header line %s. Using empty string for library name",
839 headerRecordString.c_str());
840 }
841
842 if ( libNameMap.find( LB ) != libNameMap.end() ) {
843 rgidLibMap[ID] = libNameMap[LB];
844 }
845 else {
846 numLibraries = libNameMap.size()+1;
847 libNameMap[LB] = numLibraries;
848 rgidLibMap[ID] = numLibraries;
849 }
850 headerRecord = header.getNextRGRecord();
851 }
852
853 if (numLibraries > 0xff) {
854 Logger::gLogger->error("More than 255 library names are identified. Dedup currently only allows up to 255 library names");
855 }
856 }
857
858 // get the libraryID of a record
getLibraryID(SamRecord & record,bool checkTags)859 uint32_t Dedup::getLibraryID(SamRecord& record, bool checkTags) {
860 if ( ( checkTags == false ) && ( numLibraries <= 1 ) ) {
861 return 0;
862 } else {
863 char tag[3];
864 char vtype;
865 void* value;
866 std::string rgID;
867 record.resetTagIter();
868 while( record.getNextSamTag(tag,vtype,&value) != false ) {
869 if ( ( tag[0] == 'R' ) && ( tag[1] == 'G' ) && ( vtype == 'Z' ) ) {
870 if ( !rgID.empty() ) {
871 Logger::gLogger->error("Multiple RG tag found in one record. ReadName is %s",record.getReadName());
872 }
873 else if ( record.isStringType(vtype) ) {
874 String s = (String)*(String*)value;
875 rgID = s.c_str();
876 }
877 else {
878 Logger::gLogger->error("vtype is not string (Z) for RG tag");
879 }
880 }
881 }
882 if ( rgID.empty() ) {
883 Logger::gLogger->error("No RG tag is found in read %s",record.getReadName());
884 return 0;
885 }
886 else {
887 std::map<std::string,uint32_t>::iterator it = rgidLibMap.find(rgID);
888 if ( it != rgidLibMap.end() ) {
889 return it->second;
890 }
891 else {
892 Logger::gLogger->warning("RG tag %s does not exist in the header",rgID.c_str());
893 return 0; // cannot be reached
894 }
895 }
896 }
897 }
898
899
handleNonDuplicate(SamRecord * recordPtr)900 void Dedup::handleNonDuplicate(SamRecord* recordPtr)
901 {
902 if(recordPtr == NULL)
903 {
904 return;
905 }
906 if(myDoRecab)
907 {
908 if(myForceFlag)
909 {
910 // this is not a duplicate we've identified but we want to
911 // remove any duplicate marking
912 uint16_t flag = recordPtr->getFlag();
913 if(SamFlag::isDuplicate(flag))
914 {
915 SamFlag::setNotDuplicate(flag);
916 recordPtr->setFlag(flag); // unmark duplicate
917 }
918 }
919 // Add to recalibration matrix.
920 myRecab.processReadBuildTable(*recordPtr);
921 }
922
923 // Release the record.
924 mySamPool.releaseRecord(recordPtr);
925 }
926
927
handleMissingMate(SamRecord * recordPtr)928 void Dedup::handleMissingMate(SamRecord* recordPtr)
929 {
930 static bool firstDifferChrom = true;
931 static bool firstSameChrom = true;
932
933 if(recordPtr == NULL)
934 {
935 return;
936 }
937
938 // Passed the mate, but it was not found.
939 if(recordPtr->getMateReferenceID() != recordPtr->getReferenceID())
940 {
941 if(firstDifferChrom)
942 {
943 std::cerr << "Mate on different chromosome was not found.\n"
944 << "If you are running single chromosome, consider "
945 << "using --oneChrom to treat reads with mates on "
946 << "different chromosomes as single-ended.\n";
947 firstDifferChrom = false;
948 }
949 }
950 else if(firstSameChrom)
951 {
952 std::cerr << "WARNING: Records with missing mate can't be checked for "
953 << "duplicates.\n";
954 firstSameChrom = false;
955 }
956
957 // Don't consider this record to be a duplicate.
958 // Release this record since there is nothing more to do with it.
959 ++myNumMissingMate;
960 handleNonDuplicate(recordPtr);
961 }
962
963
handleDuplicate(uint32_t index,SamRecord * recordPtr)964 void Dedup::handleDuplicate(uint32_t index, SamRecord* recordPtr)
965 {
966 if(recordPtr == NULL)
967 {
968 return;
969 }
970 if(myDoRecab)
971 {
972 // Mark the record as a duplicate
973 uint16_t flag = recordPtr->getFlag();
974 SamFlag::setDuplicate(flag);
975 recordPtr->setFlag(flag);
976
977 // Add to recalibration matrix.
978 myRecab.processReadBuildTable(*recordPtr);
979 }
980
981 // Add the index to the duplicate list.
982 myDupList.push_back(index);
983 // Release the record.
984 mySamPool.releaseRecord(recordPtr);
985 }
986