1 /*
2  *  Copyright (C) 2010-2016, 2019  Regents of the University of Michigan
3  *
4  *   This program is free software: you can redistribute it and/or modify
5  *   it under the terms of the GNU General Public License as published by
6  *   the Free Software Foundation, either version 3 of the License, or
7  *   (at your option) any later version.
8  *
9  *   This program is distributed in the hope that it will be useful,
10  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *   GNU General Public License for more details.
13  *
14  *   You should have received a copy of the GNU General Public License
15  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <iostream>
19 #include <algorithm>
20 #include <cstdlib>
21 #include <string>
22 #include <unistd.h>
23 #include <getopt.h>
24 #include "SamFile.h"
25 #include "Dedup.h"
26 #include "Logger.h"
27 #include "SamHelper.h"
28 #include "SamStatus.h"
29 #include "BgzfFileType.h"
30 
31 const int Dedup::DEFAULT_MIN_QUAL = 15;
32 const uint32_t Dedup::CLIP_OFFSET = 1000;
33 
~Dedup()34 Dedup::~Dedup()
35 {
36     // clean up the maps.
37     // First free any fragment records.
38     for(FragmentMap::iterator iter = myFragmentMap.begin();
39         iter != myFragmentMap.end(); iter++)
40     {
41         mySamPool.releaseRecord(iter->second.recordPtr);
42     }
43     myFragmentMap.clear();
44 
45     // Free any paired records.
46     for(PairedMap::iterator iterator = myPairedMap.begin();
47         iterator != myPairedMap.end(); iterator++)
48     {
49         // These are not duplicates, but we are done with them, so release them.
50         mySamPool.releaseRecord(iterator->second.record1Ptr);
51         mySamPool.releaseRecord(iterator->second.record2Ptr);
52     }
53     myPairedMap.clear();
54 
55     // Free any entries in the mate map.
56     for(MateMap::iterator iter = myMateMap.begin();
57         iter != myMateMap.end(); iter++)
58     {
59         mySamPool.releaseRecord(iter->second.recordPtr);
60     }
61     myMateMap.clear();
62 }
63 
printDedupDescription(std::ostream & os)64 void Dedup::printDedupDescription(std::ostream& os)
65 {
66     os << " dedup - Mark Duplicates\n";
67 }
68 
69 
printDescription(std::ostream & os)70 void Dedup::printDescription(std::ostream& os)
71 {
72     printDedupDescription(os);
73 }
74 
75 
printUsage(std::ostream & os)76 void Dedup::printUsage(std::ostream& os)
77 {
78     os << "Usage: ./bam dedup --in <InputBamFile> --out <OutputBamFile> [--minQual <minPhred>] [--log <logFile>] [--oneChrom] [--rmDups] [--force] [--excludeFlags <flag>] [--verbose] [--noeof] [--params] [--recab] ";
79     myRecab.printRecabSpecificUsageLine(os);
80     os << std::endl << std::endl;
81     os << "Required parameters :" << std::endl;
82     os << "\t--in <infile>   : Input BAM file name (must be sorted)" << std::endl;
83     os << "\t--out <outfile> : Output BAM file name (same order with original file)" << std::endl;
84     os << "Optional parameters : " << std::endl;
85     os << "\t--minQual <int> : Only add scores over this phred quality when determining a read's quality (default: "
86               << DEFAULT_MIN_QUAL << ")" << std::endl;
87     os << "\t--log <logfile> : Log and summary statistics (default: [outfile].log, or stderr if --out starts with '-')" << std::endl;
88     os << "\t--oneChrom      : Treat reads with mates on different chromosomes as single-ended." << std::endl;
89     os << "\t--rmDups        : Remove duplicates (default is to mark duplicates)" << std::endl;
90     os << "\t--force         : Allow an already mark-duplicated BAM file, unmarking any previously marked " << std::endl;
91     os << "\t                  duplicates and apply this duplicate marking logic.  Default is to throw errors" << std::endl;
92     os << "\t                  and exit when trying to run on an already mark-duplicated BAM" << std::endl;
93     os << "\t--excludeFlags <flag>    : exclude reads with any of these flags set when determining or marking duplicates" << std::endl;
94     os << "\t                           by default (0xB04): exclude unmapped, secondary reads, QC failures, and supplementary reads" << std::endl;
95     os << "\t--verbose       : Turn on verbose mode" << std::endl;
96     os << "\t--noeof         : Do not expect an EOF block on a bam file." << std::endl;
97     os << "\t--params        : Print the parameter settings" << std::endl;
98     os << "\t--recab         : Recalibrate in addition to deduping" << std::endl;
99     myRecab.printRecabSpecificUsage(os);
100     os<< "\n" << std::endl;
101 }
102 
execute(int argc,char ** argv)103 int Dedup::execute(int argc, char** argv)
104 {
105     /* --------------------------------
106      * process the arguments
107      * -------------------------------*/
108     String inFile, outFile, logFile;
109     myDoRecab = false;
110     bool removeFlag = false;
111     bool verboseFlag = false;
112     myForceFlag = false;
113     myNumMissingMate = 0;
114     myMinQual = DEFAULT_MIN_QUAL;
115     String excludeFlags = "0xB04";
116     uint16_t intExcludeFlags = 0;
117     bool noeof = false;
118     bool params = false;
119 
120     LongParamContainer parameters;
121     parameters.addGroup("Required Parameters");
122     parameters.addString("in", &inFile);
123     parameters.addString("out", &outFile);
124     parameters.addGroup("Optional Parameters");
125     parameters.addInt("minQual", & myMinQual);
126     parameters.addString("log", &logFile);
127     parameters.addBool("oneChrom", &myOneChrom);
128     parameters.addBool("recab", &myDoRecab);
129     parameters.addBool("rmDups", &removeFlag);
130     parameters.addBool("force", &myForceFlag);
131     parameters.addString("excludeFlags", &excludeFlags);
132     parameters.addBool("verbose", &verboseFlag);
133     parameters.addBool("noeof", &noeof);
134     parameters.addBool("params", &params);
135     parameters.addPhoneHome(VERSION);
136     myRecab.addRecabSpecificParameters(parameters);
137 
138     ParameterList inputParameters;
139     inputParameters.Add(new LongParameters ("Input Parameters",
140                                             parameters.getLongParameterList()));
141 
142     // parameters start at index 2 rather than 1.
143     inputParameters.Read(argc, argv, 2);
144 
145     // If no eof block is required for a bgzf file, set the bgzf file type to
146     // not look for it.
147     if(noeof)
148     {
149         // Set that the eof block is not required.
150         BgzfFileType::setRequireEofBlock(false);
151     }
152 
153     if(inFile.IsEmpty())
154     {
155         printUsage(std::cerr);
156         inputParameters.Status();
157         std::cerr << "Specify an input file" << std::endl;
158         return EXIT_FAILURE;
159     }
160     // inFile is not empty, so there is at least one character.  Check if
161     // it is specifing stdin since that is not supported for Dedup.
162     if(inFile[0] == '-')
163     {
164         // ERROR: stdin specified, but since Dedup requires 2 passes through
165         // the input file, stdin is not supported.
166         printUsage(std::cerr);
167         inputParameters.Status();
168         std::cerr << "ERROR: stdin ('" << inFile << "') is not a supported input file because Dedup requires two passes through the input file." << std::endl;
169         return EXIT_FAILURE;
170     }
171 
172     if(outFile.IsEmpty())
173     {
174         printUsage(std::cerr);
175         inputParameters.Status();
176         std::cerr << "Specify an output file" << std::endl;
177         return EXIT_FAILURE;
178     }
179 
180     intExcludeFlags = excludeFlags.AsInteger();
181 
182     if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags))
183     {
184         printUsage(std::cerr);
185         inputParameters.Status();
186         std::cerr << "Cannot specify --force and Duplicate in the excludeFlags.  Since --force indicates to override"
187                   << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n";
188         return EXIT_FAILURE;
189     }
190 
191     if(!SamFlag::isSecondary(intExcludeFlags))
192     {
193         printUsage(std::cerr);
194         inputParameters.Status();
195         std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n";
196         return EXIT_FAILURE;
197     }
198 
199     if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT))
200     {
201         printUsage(std::cerr);
202         inputParameters.Status();
203         std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n";
204         return EXIT_FAILURE;
205     }
206 
207     if(logFile.IsEmpty())
208     {
209         logFile = outFile + ".log";
210     }
211 
212     if(myDoRecab)
213     {
214         int status = myRecab.processRecabParam();
215         if(status != 0)
216         {
217             inputParameters.Status();
218             return(status);
219         }
220     }
221 
222     if(params)
223     {
224         inputParameters.Status();
225     }
226 
227     Logger::gLogger = new Logger(logFile.c_str(), verboseFlag);
228 
229     /* -------------------------------------------------------------------
230      * The arguments are processed.  Prepare the input BAM file,
231      * instantiate dedup, and construct the read group library map
232      * ------------------------------------------------------------------*/
233 
234     SamFile samIn;
235 
236     samIn.OpenForRead(inFile.c_str());
237     // If the file isn't sorted it will throw an exception.
238     samIn.setSortedValidation(SamFile::COORDINATE);
239 
240     SamFileHeader header;
241     samIn.ReadHeader(header);
242 
243     buildReadGroupLibraryMap(header);
244 
245     lastReference = -1;
246     lastCoordinate = -1;
247 
248     // for keeping some basic statistics
249     uint32_t recordCount = 0;
250     uint32_t pairedCount = 0;
251     uint32_t properPairCount = 0;
252     uint32_t unmappedCount = 0;
253     uint32_t reverseCount = 0;
254     uint32_t qualCheckFailCount = 0;
255     uint32_t secondaryCount = 0;
256     uint32_t supplementaryCount = 0;
257     uint32_t excludedCount = 0;
258 
259     // Now we start reading records
260     SamRecord* recordPtr;
261     SamStatus::Status returnStatus = SamStatus::SUCCESS;
262     while(returnStatus == SamStatus::SUCCESS)
263     {
264         recordPtr = mySamPool.getRecord();
265         if(recordPtr == NULL)
266         {
267             std::cerr << "Failed to allocate enough records\n";
268             return(-1);
269         }
270         if(!samIn.ReadRecord(header, *recordPtr))
271         {
272             returnStatus = samIn.GetStatus();
273             continue;
274         }
275         // Take note of properties of this record
276         int flag = recordPtr->getFlag();
277         if(SamFlag::isPaired(flag))     ++pairedCount;
278         if(SamFlag::isProperPair(flag)) ++properPairCount;
279         if(SamFlag::isReverse(flag))    ++reverseCount;
280         if(SamFlag::isQCFailure(flag))  ++qualCheckFailCount;
281         if(SamFlag::isSecondary(flag))  ++secondaryCount;
282         if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT)  ++supplementaryCount;
283         if(!SamFlag::isMapped(flag))    ++unmappedCount;
284 
285         // put the record in the appropriate maps:
286         //   single reads go in myFragmentMap
287         //   paired reads go in myPairedMap
288         recordCount = samIn.GetCurrentRecordCount();
289 
290         // if we have moved to a new position, look back at previous reads for duplicates
291         if (hasPositionChanged(*recordPtr))
292         {
293             cleanupPriorReads(recordPtr);
294         }
295 
296         // Determine if this read should be checked for duplicates.
297         if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0))
298         {
299             ++excludedCount;
300 
301             // No deduping done on this record, but still build the recab table.
302             if(myDoRecab)
303             {
304                 myRecab.processReadBuildTable(*recordPtr);
305             }
306             // Nothing more to do with this record, so
307             // release the pointer.
308             mySamPool.releaseRecord(recordPtr);
309         }
310         else
311         {
312             if(SamFlag::isDuplicate(flag) && !myForceFlag)
313             {
314                 // Error: Marked duplicates, and duplicates aren't excluded.
315                 Logger::gLogger->error("There are records already duplicate marked.");
316                 Logger::gLogger->error("Use -f to clear the duplicate flag and start the deduping procedure over");
317             }
318 
319             checkDups(*recordPtr, recordCount);
320         }
321         // let the user know we're not napping
322         if (verboseFlag && (recordCount % 100000 == 0))
323         {
324             Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u",
325                                       recordCount, myFragmentMap.size(),
326                                       myPairedMap.size(),
327                                       myMateMap.size());
328         }
329     }
330 
331     // we're finished reading record so clean up the duplicate search and
332     //  close the input file
333     cleanupPriorReads(NULL);
334     samIn.Close();
335 
336     // print some statistics
337     Logger::gLogger->writeLog("--------------------------------------------------------------------------");
338     Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS");
339     Logger::gLogger->writeLog("Total number of reads: %u",recordCount);
340     Logger::gLogger->writeLog("Total number of paired-end reads: %u",
341                               pairedCount);
342     Logger::gLogger->writeLog("Total number of properly paired reads: %u",
343                               properPairCount);
344     Logger::gLogger->writeLog("Total number of unmapped reads: %u",
345                               unmappedCount);
346     Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u",
347                               reverseCount);
348     Logger::gLogger->writeLog("Total number of QC-failed reads: %u",
349                               qualCheckFailCount);
350     Logger::gLogger->writeLog("Total number of secondary reads: %u",
351                               secondaryCount);
352     Logger::gLogger->writeLog("Total number of supplementary reads: %u",
353                               supplementaryCount);
354     Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u",
355                               myFragmentMap.size());
356     Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u",
357                               myPairedMap.size());
358     Logger::gLogger->writeLog("Total number of missing mates: %u",
359                               myNumMissingMate);
360     Logger::gLogger->writeLog("Total number of reads excluded from duplicate checking: %u",
361                               excludedCount);
362     Logger::gLogger->writeLog("--------------------------------------------------------------------------");
363     Logger::gLogger->writeLog("Sorting the indices of %d duplicated records",
364                               myDupList.size());
365 
366     // sort the indices of duplicate records
367     std::sort(myDupList.begin(), myDupList.end(),
368               std::less<uint32_t> ());
369 
370     // get ready to write the output file by making a second pass
371     // through the input file
372     samIn.OpenForRead(inFile.c_str());
373     samIn.ReadHeader(header);
374 
375     SamFile samOut;
376     samOut.OpenForWrite(outFile.c_str());
377     samOut.WriteHeader(header);
378 
379     // If we are recalibrating, output the model information.
380     if(myDoRecab)
381     {
382         myRecab.modelFitPrediction(outFile);
383     }
384 
385     // an iterator to run through the duplicate indices
386     int currentDupIndex = 0;
387     bool moreDups = !myDupList.empty();
388 
389     // let the user know what we're doing
390     Logger::gLogger->writeLog("\nWriting %s", outFile.c_str());
391 
392     // count the duplicate records as a check
393     uint32_t singleDuplicates(0), pairedDuplicates(0);
394 
395     // start reading records and writing them out
396     SamRecord record;
397     while(samIn.ReadRecord(header, record))
398     {
399         uint32_t currentIndex = samIn.GetCurrentRecordCount();
400 
401         bool foundDup = moreDups &&
402             (currentIndex == myDupList[currentDupIndex]);
403 
404         // modify the duplicate flag and write out the record,
405         // if it's appropriate
406         int flag = record.getFlag();
407         if (foundDup)
408         {
409             // this record is a duplicate, so mark it.
410             record.setFlag( flag | 0x400 );
411             currentDupIndex++;
412             // increment duplicate counters to verify we found them all
413             if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) )
414             { // unpaired or mate unmapped
415                 singleDuplicates++;
416             }
417             else
418             {
419                 pairedDuplicates++;
420             }
421             // recalibrate if necessary.
422             if(myDoRecab)
423             {
424                 myRecab.processReadApplyTable(record);
425             }
426 
427             // write the record if we are not removing duplicates
428             if (!removeFlag ) samOut.WriteRecord(header, record);
429         }
430         else
431         {
432             if(myForceFlag)
433             {
434                 // this is not a duplicate we've identified but we want to
435                 // remove any duplicate marking
436                 record.setFlag( flag & 0xfffffbff ); // unmark duplicate
437             }
438             // Not a duplicate, so recalibrate if necessary.
439             if(myDoRecab)
440             {
441                 myRecab.processReadApplyTable(record);
442             }
443             samOut.WriteRecord(header, record);
444         }
445 
446         // Let the user know we're still here
447         if (verboseFlag && (currentIndex % 100000 == 0)) {
448             Logger::gLogger->writeLog("recordCount=%u", currentIndex);
449         }
450     }
451 
452     // We're done.  Close the files and print triumphant messages.
453     samIn.Close();
454     samOut.Close();
455 
456     Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads",
457                               removeFlag ? "removed" : "marked" ,
458                               singleDuplicates,
459                               pairedDuplicates/2);
460     Logger::gLogger->writeLog("\nDedup complete!");
461     return 0;
462 }
463 
464 // Now that we've reached coordinate on chromosome reference, look back and
465 // clean up any previous positions from being tracked.
cleanupPriorReads(SamRecord * record)466 void Dedup::cleanupPriorReads(SamRecord* record)
467 {
468     static DupKey emptyKey;
469     static DupKey tempKey2;
470 
471     // Set where to stop cleaning out the structures.
472     // Initialize to the end of the structures.
473     FragmentMap::iterator fragmentFinish = myFragmentMap.end();
474     PairedMap::iterator pairedFinish = myPairedMap.end();
475     uint64_t mateStopPos = 0;
476 
477     // If a record was specified, stop before this record.
478     if(record != NULL)
479     {
480         int32_t reference = record->getReferenceID();
481         int32_t coordinate = record->get0BasedPosition();
482         tempKey2.cleanupKey(reference, coordinate);
483         fragmentFinish = myFragmentMap.lower_bound(tempKey2);
484         // Now do the same thing with the paired reads
485         PairedKey pairedKey(emptyKey, tempKey2);
486         pairedFinish = myPairedMap.lower_bound(pairedKey);
487         mateStopPos =
488             SamHelper::combineChromPos(reference,
489                                        coordinate);
490     }
491 
492     // For each key k < fragmentFinish, release the record since we are
493     // done with that position and it is not a duplicate.
494     for(FragmentMap::iterator iter = myFragmentMap.begin();
495         iter != fragmentFinish; iter++)
496     {
497         // If it is not paired, we are done with this record.
498         if(!iter->second.paired)
499         {
500             // Unpaired, non-duplicate, so perform any additional handling.
501             handleNonDuplicate(iter->second.recordPtr);
502         }
503     }
504     // Erase the entries from the map.
505     if(fragmentFinish != myFragmentMap.begin())
506     {
507         myFragmentMap.erase(myFragmentMap.begin(), fragmentFinish);
508     }
509 
510     // Now do the same thing with the paired reads
511     for(PairedMap::iterator iter = myPairedMap.begin();
512         iter != pairedFinish; iter++)
513     {
514         PairedData* pairedData = &(iter->second);
515         // These are not duplicates, but we are done with them,
516         // so perform any additional handling.
517         handleNonDuplicate(pairedData->record1Ptr);
518         handleNonDuplicate(pairedData->record2Ptr);
519     }
520     // Erase the entries.
521     if (pairedFinish != myPairedMap.begin())
522     {
523         myPairedMap.erase(myPairedMap.begin(), pairedFinish);
524     }
525 
526     // Clean up the Mate map from any reads whose mates were not found.
527     // Loop through the mate map and release records prior to this position.
528     MateMap::iterator mateIter;
529     for(mateIter = myMateMap.begin(); mateIter != myMateMap.end(); mateIter++)
530     {
531         // stop if a record was specified and we have gone past the mate
532         // stop position.  If no record was specified, we want to clean
533         // it all out.
534         if((record != NULL) && (mateIter->first >= mateStopPos))
535         {
536             break;
537         }
538         // Passed the mate, but it was not found.
539         handleMissingMate(mateIter->second.recordPtr);
540     }
541     // Erase the entries.
542     if(mateIter != myMateMap.begin())
543     {
544         myMateMap.erase(myMateMap.begin(), mateIter);
545     }
546     return;
547 }
548 
549 
550 // determine whether the record's position is different from the previous record
hasPositionChanged(SamRecord & record)551 bool Dedup::hasPositionChanged(SamRecord& record)
552 {
553     if (lastReference != record.getReferenceID() ||
554         lastCoordinate < record.get0BasedPosition())
555     {
556         if (lastReference != record.getReferenceID())
557         {
558             lastReference = record.getReferenceID();
559             Logger::gLogger->writeLog("Reading ReferenceID %d\n", lastReference);
560         }
561         lastCoordinate = record.get0BasedPosition();
562         return true;
563     }
564     return false;
565 }
566 
567 // When a record is read, check if it is a duplicate or
568 // store for future checking.
checkDups(SamRecord & record,uint32_t recordCount)569 void Dedup::checkDups(SamRecord& record, uint32_t recordCount)
570 {
571     // Only inside this method if the record is mapped.
572 
573     // Get the key for this record.
574     static DupKey key;
575     static DupKey mateKey;
576     key.updateKey(record, getLibraryID(record));
577 
578     int flag = record.getFlag();
579     bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
580     int sumBaseQual = getBaseQuality(record);
581 
582     int32_t chromID = record.getReferenceID();
583     int32_t mateChromID = record.getMateReferenceID();
584 
585     // If we are one-chrom and the mate is not on the same chromosome,
586     // mark it as not paired.
587     if(myOneChrom && (chromID != mateChromID))
588     {
589         recordPaired = false;
590     }
591 
592     // Look in the map to see if an entry for this key exists.
593     FragmentMapInsertReturn ireturn =
594         myFragmentMap.insert(std::make_pair(key, ReadData()));
595 
596     ReadData* readData = &(ireturn.first->second);
597 
598     // Mark this record's data in the fragment record if this is the first
599     // entry or if it is a duplicate and the old record is not paired and
600     // the new record is paired or the has a higher quality.
601     if((ireturn.second == true) ||
602        ((readData->paired == false) &&
603         (recordPaired || (sumBaseQual > readData->sumBaseQual))))
604     {
605         // If there was a previous record, mark it duplicate and release
606         // the old record
607         if(ireturn.second == false)
608         {
609             // Mark the old record as a DUPLICATE!
610             handleDuplicate(readData->recordIndex, readData->recordPtr);
611         }
612         // Store this record for later duplicate checking.
613         readData->sumBaseQual = sumBaseQual;
614         readData->recordIndex = recordCount;
615         readData->paired = recordPaired;
616         if(recordPaired)
617         {
618             readData->recordPtr = NULL;
619         }
620         else
621         {
622             readData->recordPtr = &record;
623         }
624     }
625     else
626     {
627         // The old record is not a duplicate so the new record is
628         // a duplicate if it is not paired.
629         if(recordPaired == false)
630         {
631             // This record is a duplicate, so mark it and release it.
632             handleDuplicate(recordCount, &record);
633         }
634     }
635 
636     // Only paired processing is left, so return if not paired.
637     if(recordPaired == false)
638     {
639         // Not paired, no more operations required, so return.
640         return;
641     }
642 
643     // This is a paired record, so check for its mate.
644     uint64_t readPos =
645         SamHelper::combineChromPos(chromID,
646                                    record.get0BasedPosition());
647     uint64_t matePos =
648         SamHelper::combineChromPos(mateChromID,
649                                    record.get0BasedMatePosition());
650     SamRecord* mateRecord = NULL;
651     int mateIndex = 0;
652 
653     // Check to see if the mate is prior to this record.
654     if(matePos <= readPos)
655     {
656         // The mate map is stored by the mate position, so look for this
657         // record's position.
658         // The mate should be in the mate map, so find it.
659         std::pair<MateMap::iterator,MateMap::iterator> matches =
660             myMateMap.equal_range(readPos);
661         // Loop through the elements that matched the pos looking for the mate.
662         for(MateMap::iterator iter = matches.first;
663             iter != matches.second; iter++)
664         {
665             if(strcmp((*iter).second.recordPtr->getReadName(),
666                       record.getReadName()) == 0)
667             {
668                 // Found the match.
669                 ReadData* mateData = &((*iter).second);
670                 // Update the quality and track the mate record and index.
671                 sumBaseQual += mateData->sumBaseQual;
672                 mateIndex = mateData->recordIndex;
673                 mateRecord = mateData->recordPtr;
674                 // Remove the entry from the map.
675                 myMateMap.erase(iter);
676                 break;
677             }
678         }
679     }
680     if((mateRecord == NULL) && (matePos >= readPos))
681     {
682         // Haven't gotten to the mate yet, so store this record.
683         MateMap::iterator mateIter =
684             myMateMap.insert(std::make_pair(matePos, ReadData()));
685         mateIter->second.sumBaseQual = sumBaseQual;
686         mateIter->second.recordPtr = &record;
687         mateIter->second.recordIndex = recordCount;
688         // No more processing for this record is necessary.
689         return;
690     }
691 
692     if(mateRecord == NULL)
693     {
694         // Passed the mate, but it was not found.
695         handleMissingMate(&record);
696         return;
697     }
698 
699     // Make the paired key.
700     mateKey.updateKey(*mateRecord, getLibraryID(*mateRecord));
701     PairedKey pkey(key, mateKey);
702 
703     // Check to see if this pair is a duplicate.
704     PairedMapInsertReturn pairedReturn =
705         myPairedMap.insert(std::make_pair(pkey,PairedData()));
706     PairedData* storedPair = &(pairedReturn.first->second);
707 
708     // Get the index for "record 1" - the one with the earlier coordinate.
709     int record1Index = getFirstIndex(key, recordCount,
710                                           mateKey, mateIndex);
711 
712     // Check if we have already found a duplicate pair.
713     // If there is no duplicate found, there is nothing more to do.
714     if(pairedReturn.second == false)
715     {
716         // Duplicate found.
717         bool keepStored = true;
718         if(pairedReturn.first->second.sumBaseQual < sumBaseQual)
719         {
720             // The new pair has higher quality, so keep that.
721             keepStored = false;
722         }
723         else if(pairedReturn.first->second.sumBaseQual == sumBaseQual)
724         {
725             // Same quality, so keep the one with the earlier record1Index.
726             if(record1Index < storedPair->record1Index)
727             {
728                 // The new pair has an earlier lower coordinate read,
729                 // so keep that.
730                 keepStored = false;
731             }
732         }
733         // Check to see which one should be kept by checking qualities.
734         if(keepStored)
735         {
736             // The old pair had higher quality so mark the new pair as a
737             // duplicate and release them.
738             handleDuplicate(mateIndex, mateRecord);
739             handleDuplicate(recordCount, &record);
740         }
741         else
742         {
743             // The new pair has higher quality, so keep that.
744             // First mark the previous one as duplicates and release them.
745             handleDuplicate(storedPair->record1Index, storedPair->record1Ptr);
746             handleDuplicate(storedPair->record2Index, storedPair->record2Ptr);
747             // Store this pair's information.
748             if(record1Index == mateIndex)
749             {
750                 // Mate has a lower coordinate, so make mate
751                 // record1.
752                 storedPair->sumBaseQual = sumBaseQual;
753                 storedPair->record1Ptr = mateRecord;
754                 storedPair->record2Ptr = &record;
755                 storedPair->record1Index = mateIndex;
756                 storedPair->record2Index = recordCount;
757             }
758             else
759             {
760                 // This record has a lower coordinate, so make it
761                 // record1.
762                 storedPair->sumBaseQual = sumBaseQual;
763                 storedPair->record1Ptr = &record;
764                 storedPair->record2Ptr = mateRecord;
765                 storedPair->record1Index = recordCount;
766                 storedPair->record2Index = mateIndex;
767             }
768         }
769     }
770     else
771     {
772         // Store this pair's information.
773         storedPair->sumBaseQual = sumBaseQual;
774 
775         if(record1Index == mateIndex)
776         {
777             // Mate has a lower coordinate, so make mate
778             // record1.
779             storedPair->record1Ptr = mateRecord;
780             storedPair->record2Ptr = &record;
781             storedPair->record1Index = mateIndex;
782             storedPair->record2Index = recordCount;
783         }
784         else
785         {
786             // This record has a lower coordinate, so make it
787             // record1.
788             storedPair->record1Ptr = &record;
789             storedPair->record2Ptr = mateRecord;
790             storedPair->record1Index = recordCount;
791             storedPair->record2Index = mateIndex;
792         }
793     }
794 }
795 
796 
797 // Finds the total base quality of a read
getBaseQuality(SamRecord & record)798 int Dedup::getBaseQuality(SamRecord & record) {
799     const char* baseQualities = record.getQuality();
800     int readLength = record.getReadLength();
801     int quality = 0.;
802     if(strcmp(baseQualities, "*") == 0)
803     {
804         return(0);
805     }
806     for(int i=0; i < readLength; ++i) {
807         int q = static_cast<int>(baseQualities[i])-33;
808         if ( q >= myMinQual ) quality += q;
809     }
810     return quality;
811 }
812 
813 
814 // build the read group library map
buildReadGroupLibraryMap(SamFileHeader & header)815 void Dedup::buildReadGroupLibraryMap(SamFileHeader& header) {
816     rgidLibMap.clear();
817     numLibraries = 0;
818     std::map<std::string,uint32_t> libNameMap;
819 
820     SamHeaderRecord * headerRecord = header.getNextRGRecord();
821     while(headerRecord != NULL) {
822         std::string ID = headerRecord->getTagValue("ID");
823         std::string LB = headerRecord->getTagValue("LB");
824 
825         if ( ID.empty() ) {
826             std::string headerRecordString;
827             headerRecord->appendString(headerRecordString);
828             Logger::gLogger->error("Cannot find readGroup ID information in the header line %s",
829                                    headerRecordString.c_str());
830         }
831         if ( rgidLibMap.find(ID) != rgidLibMap.end() ) {
832             Logger::gLogger->error("The readGroup ID %s is not a unique identifier",ID.c_str());
833         }
834 
835         if ( LB.empty() ) {
836             std::string headerRecordString;
837             headerRecord->appendString(headerRecordString);
838             Logger::gLogger->warning("Cannot find library information in the header line %s. Using empty string for library name",
839                                      headerRecordString.c_str());
840         }
841 
842         if ( libNameMap.find( LB ) != libNameMap.end() ) {
843             rgidLibMap[ID] = libNameMap[LB];
844         }
845         else {
846             numLibraries = libNameMap.size()+1;
847             libNameMap[LB] = numLibraries;
848             rgidLibMap[ID] = numLibraries;
849         }
850         headerRecord = header.getNextRGRecord();
851     }
852 
853     if (numLibraries > 0xff) {
854         Logger::gLogger->error("More than 255 library names are identified. Dedup currently only allows up to 255 library names");
855     }
856 }
857 
858 // get the libraryID of a record
getLibraryID(SamRecord & record,bool checkTags)859 uint32_t Dedup::getLibraryID(SamRecord& record, bool checkTags) {
860     if ( ( checkTags == false ) && ( numLibraries <= 1 ) ) {
861         return 0;
862     } else {
863         char tag[3];
864         char vtype;
865         void* value;
866         std::string rgID;
867         record.resetTagIter();
868         while( record.getNextSamTag(tag,vtype,&value) != false ) {
869             if ( ( tag[0] == 'R' ) && ( tag[1] == 'G' ) && ( vtype == 'Z' ) ) {
870                 if ( !rgID.empty() ) {
871                     Logger::gLogger->error("Multiple RG tag found in one record. ReadName is %s",record.getReadName());
872                 }
873                 else if ( record.isStringType(vtype) ) {
874                     String s = (String)*(String*)value;
875                     rgID = s.c_str();
876                 }
877                 else {
878                     Logger::gLogger->error("vtype is not string (Z) for RG tag");
879                 }
880             }
881         }
882         if ( rgID.empty() ) {
883             Logger::gLogger->error("No RG tag is found in read %s",record.getReadName());
884             return 0;
885         }
886         else {
887             std::map<std::string,uint32_t>::iterator it = rgidLibMap.find(rgID);
888             if ( it != rgidLibMap.end() ) {
889                 return it->second;
890             }
891             else {
892                 Logger::gLogger->warning("RG tag %s does not exist in the header",rgID.c_str());
893                 return 0; // cannot be reached
894             }
895         }
896     }
897 }
898 
899 
handleNonDuplicate(SamRecord * recordPtr)900 void Dedup::handleNonDuplicate(SamRecord* recordPtr)
901 {
902     if(recordPtr == NULL)
903     {
904         return;
905     }
906     if(myDoRecab)
907     {
908         if(myForceFlag)
909         {
910             // this is not a duplicate we've identified but we want to
911             // remove any duplicate marking
912             uint16_t flag = recordPtr->getFlag();
913             if(SamFlag::isDuplicate(flag))
914             {
915                 SamFlag::setNotDuplicate(flag);
916                 recordPtr->setFlag(flag); // unmark duplicate
917             }
918         }
919         // Add to recalibration matrix.
920         myRecab.processReadBuildTable(*recordPtr);
921     }
922 
923     // Release the record.
924     mySamPool.releaseRecord(recordPtr);
925 }
926 
927 
handleMissingMate(SamRecord * recordPtr)928 void Dedup::handleMissingMate(SamRecord* recordPtr)
929 {
930     static bool firstDifferChrom = true;
931     static bool firstSameChrom = true;
932 
933     if(recordPtr == NULL)
934     {
935         return;
936     }
937 
938     // Passed the mate, but it was not found.
939     if(recordPtr->getMateReferenceID() != recordPtr->getReferenceID())
940     {
941         if(firstDifferChrom)
942         {
943             std::cerr << "Mate on different chromosome was not found.\n"
944                       << "If you are running single chromosome, consider "
945                       << "using --oneChrom to treat reads with mates on "
946                       << "different chromosomes as single-ended.\n";
947             firstDifferChrom = false;
948         }
949     }
950     else if(firstSameChrom)
951     {
952         std::cerr << "WARNING: Records with missing mate can't be checked for "
953                   << "duplicates.\n";
954         firstSameChrom = false;
955     }
956 
957     // Don't consider this record to be a duplicate.
958     // Release this record since there is nothing more to do with it.
959     ++myNumMissingMate;
960     handleNonDuplicate(recordPtr);
961 }
962 
963 
handleDuplicate(uint32_t index,SamRecord * recordPtr)964 void Dedup::handleDuplicate(uint32_t index, SamRecord* recordPtr)
965 {
966     if(recordPtr == NULL)
967     {
968         return;
969     }
970     if(myDoRecab)
971     {
972         // Mark the record as a duplicate
973         uint16_t flag = recordPtr->getFlag();
974         SamFlag::setDuplicate(flag);
975         recordPtr->setFlag(flag);
976 
977         // Add to recalibration matrix.
978         myRecab.processReadBuildTable(*recordPtr);
979     }
980 
981     // Add the index to the duplicate list.
982     myDupList.push_back(index);
983     // Release the record.
984     mySamPool.releaseRecord(recordPtr);
985 }
986