1 /*
2  *  Copyright (C) 2010-2016  Regents of the University of Michigan
3  *
4  *   This program is free software: you can redistribute it and/or modify
5  *   it under the terms of the GNU General Public License as published by
6  *   the Free Software Foundation, either version 3 of the License, or
7  *   (at your option) any later version.
8  *
9  *   This program is distributed in the hope that it will be useful,
10  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *   GNU General Public License for more details.
13  *
14  *   You should have received a copy of the GNU General Public License
15  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <iostream>
19 #include <algorithm>
20 #include <cstdlib>
21 #include <string>
22 #include <unistd.h>
23 #include <getopt.h>
24 #include "SamFile.h"
25 #include "Dedup_LowMem.h"
26 #include "Logger.h"
27 #include "SamHelper.h"
28 #include "SamStatus.h"
29 #include "BgzfFileType.h"
30 
31 const int Dedup_LowMem::DEFAULT_MIN_QUAL = 15;
32 const uint32_t Dedup_LowMem::CLIP_OFFSET = 1000;
33 
~Dedup_LowMem()34 Dedup_LowMem::~Dedup_LowMem()
35 {
36     // clean up the maps.
37     // First free any fragment records.
38     myFragmentMap.clear();
39 
40     // Free any paired records.
41     myPairedMap.clear();
42 
43     // Free any entries in the mate map.
44     myMateMap.clear();
45 }
46 
printDedup_LowMemDescription(std::ostream & os)47 void Dedup_LowMem::printDedup_LowMemDescription(std::ostream& os)
48 {
49     os << " dedup_LowMem - Mark Duplicates using only a little memory\n";
50 }
51 
52 
printDescription(std::ostream & os)53 void Dedup_LowMem::printDescription(std::ostream& os)
54 {
55     printDedup_LowMemDescription(os);
56 }
57 
58 
printUsage(std::ostream & os)59 void Dedup_LowMem::printUsage(std::ostream& os)
60 {
61     os << "Usage: ./bam dedup_LowMem --in <InputBamFile> --out <OutputBamFile> [--minQual <minPhred>] [--log <logFile>] [--oneChrom] [--rmDups] [--force] [--excludeFlags <flag>] [--verbose] [--noeof] [--params] [--recab] ";
62     myRecab.printRecabSpecificUsageLine(os);
63     os << std::endl << std::endl;
64     os << "Required parameters :" << std::endl;
65     os << "\t--in <infile>   : Input BAM file name (must be sorted)" << std::endl;
66     os << "\t--out <outfile> : Output BAM file name (same order with original file)" << std::endl;
67     os << "Optional parameters : " << std::endl;
68     os << "\t--minQual <int> : Only add scores over this phred quality when determining a read's quality (default: "
69               << DEFAULT_MIN_QUAL << ")" << std::endl;
70     os << "\t--log <logfile> : Log and summary statistics (default: [outfile].log, or stderr if --out starts with '-')" << std::endl;
71     os << "\t--oneChrom      : Treat reads with mates on different chromosomes as single-ended." << std::endl;
72     os << "\t--rmDups        : Remove duplicates (default is to mark duplicates)" << std::endl;
73     os << "\t--force         : Allow an already mark-duplicated BAM file, unmarking any previously marked " << std::endl;
74     os << "\t                  duplicates and apply this duplicate marking logic.  Default is to throw errors" << std::endl;
75     os << "\t                  and exit when trying to run on an already mark-duplicated BAM" << std::endl;
76     os << "\t--excludeFlags <flag>    : exclude reads with any of these flags set when determining or marking duplicates" << std::endl;
77     os << "\t                           by default (0xB04): exclude unmapped, secondary reads, QC failures, and supplementary reads" << std::endl;
78     os << "\t--verbose       : Turn on verbose mode" << std::endl;
79     os << "\t--noeof         : Do not expect an EOF block on a bam file." << std::endl;
80     os << "\t--params        : Print the parameter settings" << std::endl;
81     os << "\t--recab         : Recalibrate in addition to dedup_LowMem" << std::endl;
82     myRecab.printRecabSpecificUsage(os);
83     os << "\n" << std::endl;
84 }
85 
execute(int argc,char ** argv)86 int Dedup_LowMem::execute(int argc, char** argv)
87 {
88     /* --------------------------------
89      * process the arguments
90      * -------------------------------*/
91     String inFile, outFile, logFile;
92     myDoRecab = false;
93     bool removeFlag = false;
94     bool verboseFlag = false;
95     myForceFlag = false;
96     myNumMissingMate = 0;
97     myMinQual = DEFAULT_MIN_QUAL;
98     String excludeFlags = "0xB04";
99     uint16_t intExcludeFlags = 0;
100     bool noeof = false;
101     bool params = false;
102 
103     LongParamContainer parameters;
104     parameters.addGroup("Required Parameters");
105     parameters.addString("in", &inFile);
106     parameters.addString("out", &outFile);
107     parameters.addGroup("Optional Parameters");
108     parameters.addInt("minQual", & myMinQual);
109     parameters.addString("log", &logFile);
110     parameters.addBool("oneChrom", &myOneChrom);
111     parameters.addBool("recab", &myDoRecab);
112     parameters.addBool("rmDups", &removeFlag);
113     parameters.addBool("force", &myForceFlag);
114     parameters.addString("excludeFlags", &excludeFlags);
115     parameters.addBool("verbose", &verboseFlag);
116     parameters.addBool("noeof", &noeof);
117     parameters.addBool("params", &params);
118     parameters.addPhoneHome(VERSION);
119     myRecab.addRecabSpecificParameters(parameters);
120 
121     ParameterList inputParameters;
122     inputParameters.Add(new LongParameters ("Input Parameters",
123                                             parameters.getLongParameterList()));
124 
125     // parameters start at index 2 rather than 1.
126     inputParameters.Read(argc, argv, 2);
127 
128     // If no eof block is required for a bgzf file, set the bgzf file type to
129     // not look for it.
130     if(noeof)
131     {
132         // Set that the eof block is not required.
133         BgzfFileType::setRequireEofBlock(false);
134     }
135 
136     if(inFile.IsEmpty())
137     {
138         printUsage(std::cerr);
139         inputParameters.Status();
140         std::cerr << "Specify an input file" << std::endl;
141         return EXIT_FAILURE;
142     }
143 
144     if(outFile.IsEmpty())
145     {
146         printUsage(std::cerr);
147         inputParameters.Status();
148         std::cerr << "Specify an output file" << std::endl;
149         return EXIT_FAILURE;
150     }
151 
152     intExcludeFlags = excludeFlags.AsInteger();
153 
154     if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags))
155     {
156         printUsage(std::cerr);
157         inputParameters.Status();
158         std::cerr << "Cannot specify --force and Duplicate in the excludeFlags.  Since --force indicates to override"
159                   << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n";
160         return EXIT_FAILURE;
161     }
162 
163     if(!SamFlag::isSecondary(intExcludeFlags))
164     {
165         printUsage(std::cerr);
166         inputParameters.Status();
167         std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n";
168         return EXIT_FAILURE;
169     }
170 
171     if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT))
172     {
173         printUsage(std::cerr);
174         inputParameters.Status();
175         std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n";
176         return EXIT_FAILURE;
177     }
178 
179     if(logFile.IsEmpty())
180     {
181         logFile = outFile + ".log";
182     }
183 
184     if(myDoRecab)
185     {
186         int status = myRecab.processRecabParam();
187         if(status != 0)
188         {
189             inputParameters.Status();
190             return(status);
191         }
192     }
193 
194     if(params)
195     {
196         inputParameters.Status();
197     }
198 
199     Logger::gLogger = new Logger(logFile.c_str(), verboseFlag);
200 
201     /* -------------------------------------------------------------------
202      * The arguments are processed.  Prepare the input BAM file,
203      * instantiate dedup_LowMem, and construct the read group library map
204      * ------------------------------------------------------------------*/
205 
206     SamFile samIn;
207 
208     samIn.OpenForRead(inFile.c_str());
209     // If the file isn't sorted it will throw an exception.
210     samIn.setSortedValidation(SamFile::COORDINATE);
211 
212     SamFileHeader header;
213     samIn.ReadHeader(header);
214 
215     buildReadGroupLibraryMap(header);
216 
217     lastReference = -1;
218     lastCoordinate = -1;
219 
220     // for keeping some basic statistics
221     uint32_t recordCount = 0;
222     uint32_t pairedCount = 0;
223     uint32_t properPairCount = 0;
224     uint32_t unmappedCount = 0;
225     uint32_t reverseCount = 0;
226     uint32_t qualCheckFailCount = 0;
227     uint32_t secondaryCount = 0;
228     uint32_t supplementaryCount = 0;
229     uint32_t excludedCount = 0;
230 
231     // Now we start reading records
232     SamRecord* recordPtr;
233     SamStatus::Status returnStatus = SamStatus::SUCCESS;
234     while(returnStatus == SamStatus::SUCCESS)
235     {
236         recordPtr = mySamPool.getRecord();
237         if(recordPtr == NULL)
238         {
239             std::cerr << "Failed to allocate enough records\n";
240             return(-1);
241         }
242         if(!samIn.ReadRecord(header, *recordPtr))
243         {
244             returnStatus = samIn.GetStatus();
245             continue;
246         }
247         // Take note of properties of this record
248         int flag = recordPtr->getFlag();
249         if(SamFlag::isPaired(flag))     ++pairedCount;
250         if(SamFlag::isProperPair(flag)) ++properPairCount;
251         if(SamFlag::isReverse(flag))    ++reverseCount;
252         if(SamFlag::isQCFailure(flag))  ++qualCheckFailCount;
253         if(SamFlag::isSecondary(flag))  ++secondaryCount;
254         if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT)  ++supplementaryCount;
255         if(!SamFlag::isMapped(flag))    ++unmappedCount;
256 
257         // put the record in the appropriate maps:
258         //   single reads go in myFragmentMap
259         //   paired reads go in myPairedMap
260         recordCount = samIn.GetCurrentRecordCount();
261 
262         // if we have moved to a new position, look back at previous reads for duplicates
263         if (hasPositionChanged(*recordPtr))
264         {
265             cleanupPriorReads(recordPtr);
266         }
267 
268         // Determine if this read should be checked for duplicates.
269         if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0))
270         {
271             ++excludedCount;
272 
273             // No deduping done on this record, but still build the recab table.
274             if(myDoRecab)
275             {
276                 myRecab.processReadBuildTable(*recordPtr);
277             }
278             // Nothing more to do with this record, so
279             // release the pointer.
280             mySamPool.releaseRecord(recordPtr);
281         }
282         else
283         {
284             if(SamFlag::isDuplicate(flag) && !myForceFlag)
285             {
286                 // Error: Marked duplicates, and duplicates aren't excluded.
287                 Logger::gLogger->error("There are records already duplicate marked.");
288                 Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over");
289             }
290 
291             checkDups(*recordPtr, recordCount);
292             mySamPool.releaseRecord(recordPtr);
293         }
294         // let the user know we're not napping
295         if (verboseFlag && (recordCount % 100000 == 0))
296         {
297             Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u",
298                                       recordCount, myFragmentMap.size(),
299                                       myPairedMap.size(),
300                                       myMateMap.size());
301         }
302     }
303 
304     // we're finished reading record so clean up the duplicate search and
305     //  close the input file
306     cleanupPriorReads(NULL);
307     samIn.Close();
308 
309     // print some statistics
310     Logger::gLogger->writeLog("--------------------------------------------------------------------------");
311     Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS");
312     Logger::gLogger->writeLog("Total number of reads: %u",recordCount);
313     Logger::gLogger->writeLog("Total number of paired-end reads: %u",
314                               pairedCount);
315     Logger::gLogger->writeLog("Total number of properly paired reads: %u",
316                               properPairCount);
317     Logger::gLogger->writeLog("Total number of unmapped reads: %u",
318                               unmappedCount);
319     Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u",
320                               reverseCount);
321     Logger::gLogger->writeLog("Total number of QC-failed reads: %u",
322                               qualCheckFailCount);
323     Logger::gLogger->writeLog("Total number of secondary reads: %u",
324                               secondaryCount);
325     Logger::gLogger->writeLog("Total number of supplementary reads: %u",
326                               supplementaryCount);
327     Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u",
328                               myFragmentMap.size());
329     Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u",
330                               myPairedMap.size());
331     Logger::gLogger->writeLog("Total number of missing mates: %u",
332                               myNumMissingMate);
333     Logger::gLogger->writeLog("Total number of reads excluded from duplicate checking: %u",
334                               excludedCount);
335     Logger::gLogger->writeLog("--------------------------------------------------------------------------");
336     Logger::gLogger->writeLog("Sorting the indices of %d duplicated records",
337                               myDupList.size());
338 
339     // sort the indices of duplicate records
340     std::sort(myDupList.begin(), myDupList.end(),
341               std::less<uint32_t> ());
342 
343     // get ready to write the output file by making a second pass
344     // through the input file
345     samIn.OpenForRead(inFile.c_str());
346     samIn.ReadHeader(header);
347 
348     SamFile samOut;
349     samOut.OpenForWrite(outFile.c_str());
350     samOut.WriteHeader(header);
351 
352     // If we are recalibrating, output the model information.
353     if(myDoRecab)
354     {
355         myRecab.modelFitPrediction(outFile);
356     }
357 
358     // an iterator to run through the duplicate indices
359     int currentDupIndex = 0;
360     bool moreDups = !myDupList.empty();
361 
362     // let the user know what we're doing
363     Logger::gLogger->writeLog("\nWriting %s", outFile.c_str());
364 
365     // count the duplicate records as a check
366     uint32_t singleDuplicates(0), pairedDuplicates(0);
367 
368     // start reading records and writing them out
369     SamRecord record;
370     while(samIn.ReadRecord(header, record))
371     {
372         uint32_t currentIndex = samIn.GetCurrentRecordCount();
373 
374         bool foundDup = moreDups &&
375             (currentIndex == myDupList[currentDupIndex]);
376 
377         // modify the duplicate flag and write out the record,
378         // if it's appropriate
379         int flag = record.getFlag();
380         if (foundDup)
381         {
382             // this record is a duplicate, so mark it.
383             record.setFlag( flag | 0x400 );
384             currentDupIndex++;
385             // increment duplicate counters to verify we found them all
386             if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) )
387             { // unpaired or mate unmapped
388                 singleDuplicates++;
389             }
390             else
391             {
392                 pairedDuplicates++;
393             }
394             // recalibrate if necessary.
395             if(myDoRecab)
396             {
397                 myRecab.processReadApplyTable(record);
398             }
399 
400             // write the record if we are not removing duplicates
401             if (!removeFlag ) samOut.WriteRecord(header, record);
402         }
403         else
404         {
405             if(myForceFlag)
406             {
407                 // this is not a duplicate we've identified but we want to
408                 // remove any duplicate marking
409                 record.setFlag( flag & 0xfffffbff ); // unmark duplicate
410             }
411             // Not a duplicate, so recalibrate if necessary.
412             if(myDoRecab)
413             {
414                 myRecab.processReadApplyTable(record);
415             }
416             samOut.WriteRecord(header, record);
417         }
418 
419         // Let the user know we're still here
420         if (verboseFlag && (currentIndex % 100000 == 0)) {
421             Logger::gLogger->writeLog("recordCount=%u", currentIndex);
422         }
423     }
424 
425     // We're done.  Close the files and print triumphant messages.
426     samIn.Close();
427     samOut.Close();
428 
429     Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads",
430                               removeFlag ? "removed" : "marked" ,
431                               singleDuplicates,
432                               pairedDuplicates/2);
433     Logger::gLogger->writeLog("\nDedup_LowMem complete!");
434     return 0;
435 }
436 
437 // Now that we've reached coordinate on chromosome reference, look back and
438 // clean up any previous positions from being tracked.
cleanupPriorReads(SamRecord * record)439 void Dedup_LowMem::cleanupPriorReads(SamRecord* record)
440 {
441     static DupKey emptyKey;
442     static DupKey tempKey2;
443 
444     // Set where to stop cleaning out the structures.
445     // Initialize to the end of the structures.
446     FragmentMap::iterator fragmentFinish = myFragmentMap.end();
447     PairedMap::iterator pairedFinish = myPairedMap.end();
448     uint64_t mateStopPos = 0;
449 
450     // If a record was specified, stop before this record.
451     if(record != NULL)
452     {
453         int32_t reference = record->getReferenceID();
454         int32_t coordinate = record->get0BasedPosition();
455         tempKey2.cleanupKey(reference, coordinate);
456         fragmentFinish = myFragmentMap.lower_bound(tempKey2);
457         // Now do the same thing with the paired reads
458         PairedKey pairedKey(emptyKey, tempKey2);
459         pairedFinish = myPairedMap.lower_bound(pairedKey);
460         mateStopPos =
461             SamHelper::combineChromPos(reference,
462                                        coordinate);
463     }
464 
465     // For each key k < fragmentFinish, release the record since we are
466     // done with that position and it is not a duplicate.
467     for(FragmentMap::iterator iter = myFragmentMap.begin();
468         iter != fragmentFinish; iter++)
469     {
470         // If it is not paired, we are done with this record.
471         // If it is paired, it will be handled separately.
472         if(!iter->second.paired)
473         {
474             // Unpaired, non-duplicate, so perform any additional handling.
475             handleNonDuplicate();
476         }
477     }
478     // Erase the entries from the map.
479     if(fragmentFinish != myFragmentMap.begin())
480     {
481         myFragmentMap.erase(myFragmentMap.begin(), fragmentFinish);
482     }
483 
484     // Now do the same thing with the paired reads
485     for(PairedMap::iterator iter = myPairedMap.begin();
486         iter != pairedFinish; iter++)
487     {
488         // These are not duplicates, but we are done with them,
489         // so perform any additional handling.
490         handleNonDuplicatePair();
491     }
492     // Erase the entries.
493     if (pairedFinish != myPairedMap.begin())
494     {
495         myPairedMap.erase(myPairedMap.begin(), pairedFinish);
496     }
497 
498     // Clean up the Mate map from any reads whose mates were not found.
499     // Loop through the mate map and release records prior to this position.
500     MateMap::iterator mateIter;
501     for(mateIter = myMateMap.begin(); mateIter != myMateMap.end(); mateIter++)
502     {
503         // stop if a record was specified and we have gone past the mate
504         // stop position.  If no record was specified, we want to clean
505         // it all out.
506         if((record != NULL) && (mateIter->first >= mateStopPos))
507         {
508             break;
509         }
510         // Passed the mate, but it was not found.
511         handleMissingMate(mateIter->second.key.reference,
512                           mateIter->first >> 32);
513     }
514     // Erase the entries.
515     if(mateIter != myMateMap.begin())
516     {
517         myMateMap.erase(myMateMap.begin(), mateIter);
518     }
519     return;
520 }
521 
522 
523 // determine whether the record's position is different from the previous record
hasPositionChanged(SamRecord & record)524 bool Dedup_LowMem::hasPositionChanged(SamRecord& record)
525 {
526     if (lastReference != record.getReferenceID() ||
527         lastCoordinate < record.get0BasedPosition())
528     {
529         if (lastReference != record.getReferenceID())
530         {
531             lastReference = record.getReferenceID();
532             Logger::gLogger->writeLog("Reading ReferenceID %d\n", lastReference);
533         }
534         lastCoordinate = record.get0BasedPosition();
535         return true;
536     }
537     return false;
538 }
539 
540 // When a record is read, check if it is a duplicate or
541 // store for future checking.
checkDups(SamRecord & record,uint32_t recordCount)542 void Dedup_LowMem::checkDups(SamRecord& record, uint32_t recordCount)
543 {
544     // Only inside this method if the record is mapped.
545 
546     // Get the key for this record.
547     static DupKey key;
548     key.initKey(record, getLibraryID(record));
549 
550     int flag = record.getFlag();
551     bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
552     int sumBaseQual = getBaseQuality(record);
553 
554     int32_t chromID = record.getReferenceID();
555     int32_t mateChromID = record.getMateReferenceID();
556 
557     // If we are one-chrom and the mate is not on the same chromosome,
558     // mark it as not paired.
559     if(myOneChrom && (chromID != mateChromID))
560     {
561         recordPaired = false;
562     }
563 
564     // Look in the fragment map to see if an entry for this key exists.
565     FragmentMapInsertReturn ireturn =
566         myFragmentMap.insert(std::make_pair(key, FragData()));
567 
568     FragData* fragData = &(ireturn.first->second);
569 
570     // Enter the new record in the fragData if (any of the below):
571     // 1) there is no previous entry for this key (ireturn.second == true)
572     // or
573     // 2) the previous entry is not paired
574     //    AND
575     //     a) the new record is paired
576     //     or
577     //     b) the new record has higher quality
578     if((ireturn.second == true) ||
579        ((fragData->paired == false) &&
580         (recordPaired || (sumBaseQual > fragData->sumBaseQual))))
581     {
582         // Check if this is a new key.
583         if(ireturn.second == true)
584         {
585             // New entry, so build the recalibration table now.
586             if(myDoRecab)
587             {
588                 myRecab.processReadBuildTable(record);
589             }
590         }
591         else if(fragData->paired == false)
592         {
593             // There was a previous record and it is not paired,
594             // so mark it as a duplicate.
595             // Duplicate checking/marking for pairs is handled below.
596             handleDuplicate(fragData->recordIndex);
597         }
598 
599         // Store this record for later duplicate checking.
600         fragData->sumBaseQual = sumBaseQual;
601         fragData->recordIndex = recordCount;
602         fragData->paired = recordPaired;
603     }
604     else
605     {
606         // Leave the old record in fragData.
607         // If the new record is not paired, handle it as a duplicate.
608         if(recordPaired == false)
609         {
610             // This record is a duplicate, so mark it and release it.
611             handleDuplicate(recordCount);
612         }
613     }
614 
615     // Only paired processing is left, so return if not paired.
616     if(recordPaired == false)
617     {
618         // Not paired, no more operations required, so return.
619         return;
620     }
621 
622     // This is a paired record, so check for its mate.
623     uint64_t readPos =
624         SamHelper::combineChromPos(chromID,
625                                    record.get0BasedPosition());
626     uint64_t matePos =
627         SamHelper::combineChromPos(mateChromID,
628                                    record.get0BasedMatePosition());
629     int mateIndex = -1;
630     MateData* mateData = NULL;
631 
632     // Check to see if the mate is prior to this record.
633     if(matePos <= readPos)
634     {
635         // The mate map is stored by the mate position, so look for this
636         // record's position.
637         // The mate should be in the mate map, so find it.
638         std::pair<MateMap::iterator,MateMap::iterator> matches =
639             myMateMap.equal_range(readPos);
640         // Loop through the elements that matched the pos looking for the mate.
641         for(MateMap::iterator iter = matches.first;
642             iter != matches.second; iter++)
643         {
644             if(strcmp((*iter).second.readName.c_str(),
645                       record.getReadName()) == 0)
646             {
647                 // Found the match.
648                 mateData = &((*iter).second);
649                 // Update the quality and track the mate record and index.
650                 sumBaseQual += mateData->sumBaseQual;
651                 mateIndex = mateData->recordIndex;
652                 // Remove the entry from the map.
653                 myMateMap.erase(iter);
654                 break;
655             }
656         }
657     }
658     if(mateData == NULL)
659     {
660         if(matePos >= readPos)
661         {
662             // Haven't gotten to the mate yet, so store this record.
663             MateMap::iterator mateIter =
664                 myMateMap.insert(std::make_pair(matePos, MateData()));
665             mateIter->second.sumBaseQual = sumBaseQual;
666             mateIter->second.recordIndex = recordCount;
667             mateIter->second.key.copy(key);
668             mateIter->second.readName = record.getReadName();
669         }
670         else
671         {
672             // Passed the mate, but it was not found.
673             handleMissingMate(record.getReferenceID(), record.getMateReferenceID());
674         }
675         return;
676     }
677 
678     // Make the paired key.
679     PairedKey pkey(key, mateData->key);
680 
681     // Check to see if this pair is a duplicate.
682     PairedMapInsertReturn pairedReturn =
683         myPairedMap.insert(std::make_pair(pkey,PairedData()));
684     PairedData* storedPair = &(pairedReturn.first->second);
685 
686     // Get the index for "record 1" - the one with the earlier coordinate.
687     int record1Index = getFirstIndex(key, recordCount,
688                                      mateData->key, mateIndex);
689 
690     // Check if we have already found a duplicate pair.
691     // If there is no duplicate found, there is nothing more to do.
692     if(pairedReturn.second == false)
693     {
694         // Duplicate found.
695         bool keepStored = true;
696         if(pairedReturn.first->second.sumBaseQual < sumBaseQual)
697         {
698             // The new pair has higher quality, so keep that.
699             keepStored = false;
700         }
701         else if(pairedReturn.first->second.sumBaseQual == sumBaseQual)
702         {
703             // Same quality, so keep the one with the earlier record1Index.
704             if(record1Index < storedPair->record1Index)
705             {
706                 // The new pair has an earlier lower coordinate read,
707                 // so keep that.
708                 keepStored = false;
709             }
710         }
711         // Check to see which one should be kept by checking qualities.
712         if(keepStored)
713         {
714             // The old pair had higher quality so mark the new pair as a
715             // duplicate and release them.
716             handleDuplicate(mateIndex);
717             handleDuplicate(recordCount);
718         }
719         else
720         {
721             // The new pair has higher quality, so keep that.
722             // First mark the previous one as duplicates and release them.
723             handleDuplicate(storedPair->record1Index);
724             handleDuplicate(storedPair->record2Index);
725             // Store this pair's information.
726             if(record1Index == mateIndex)
727             {
728                 // Mate has a lower coordinate, so make mate
729                 // record1.
730                 storedPair->sumBaseQual = sumBaseQual;
731                 storedPair->record1Index = mateIndex;
732                 storedPair->record2Index = recordCount;
733             }
734             else
735             {
736                 // This record has a lower coordinate, so make it
737                 // record1.
738                 storedPair->sumBaseQual = sumBaseQual;
739                 storedPair->record1Index = recordCount;
740                 storedPair->record2Index = mateIndex;
741             }
742         }
743     }
744     else
745     {
746         // Store this pair's information.
747         storedPair->sumBaseQual = sumBaseQual;
748 
749         if(record1Index == mateIndex)
750         {
751             // Mate has a lower coordinate, so make mate
752             // record1.
753             storedPair->record1Index = mateIndex;
754             storedPair->record2Index = recordCount;
755         }
756         else
757         {
758             // This record has a lower coordinate, so make it
759             // record1.
760             storedPair->record1Index = recordCount;
761             storedPair->record2Index = mateIndex;
762         }
763     }
764 }
765 
766 
767 // Finds the total base quality of a read
getBaseQuality(SamRecord & record)768 int Dedup_LowMem::getBaseQuality(SamRecord & record) {
769     const char* baseQualities = record.getQuality();
770     int readLength = record.getReadLength();
771     int quality = 0.;
772     if(strcmp(baseQualities, "*") == 0)
773     {
774         return(0);
775     }
776     for(int i=0; i < readLength; ++i) {
777         int q = static_cast<int>(baseQualities[i])-33;
778         if ( q >= myMinQual ) quality += q;
779     }
780     return quality;
781 }
782 
783 
784 // build the read group library map
buildReadGroupLibraryMap(SamFileHeader & header)785 void Dedup_LowMem::buildReadGroupLibraryMap(SamFileHeader& header) {
786     rgidLibMap.clear();
787     numLibraries = 0;
788     std::map<std::string,uint32_t> libNameMap;
789 
790     SamHeaderRecord * headerRecord = header.getNextRGRecord();
791     while(headerRecord != NULL) {
792         std::string ID = headerRecord->getTagValue("ID");
793         std::string LB = headerRecord->getTagValue("LB");
794 
795         if ( ID.empty() ) {
796             std::string headerRecordString;
797             headerRecord->appendString(headerRecordString);
798             Logger::gLogger->error("Cannot find readGroup ID information in the header line %s",
799                                    headerRecordString.c_str());
800         }
801         if ( rgidLibMap.find(ID) != rgidLibMap.end() ) {
802             Logger::gLogger->error("The readGroup ID %s is not a unique identifier",ID.c_str());
803         }
804 
805         if ( LB.empty() ) {
806             std::string headerRecordString;
807             headerRecord->appendString(headerRecordString);
808             Logger::gLogger->warning("Cannot find library information in the header line %s. Using empty string for library name",
809                                      headerRecordString.c_str());
810         }
811 
812         if ( libNameMap.find( LB ) != libNameMap.end() ) {
813             rgidLibMap[ID] = libNameMap[LB];
814         }
815         else {
816             numLibraries = libNameMap.size()+1;
817             libNameMap[LB] = numLibraries;
818             rgidLibMap[ID] = numLibraries;
819         }
820         headerRecord = header.getNextRGRecord();
821     }
822 
823     if (numLibraries > 0xff) {
824         Logger::gLogger->error("More than 255 library names are identified. Dedup_LowMem currently only allows up to 255 library names");
825     }
826 }
827 
828 // get the libraryID of a record
getLibraryID(SamRecord & record,bool checkTags)829 uint32_t Dedup_LowMem::getLibraryID(SamRecord& record, bool checkTags) {
830     if ( ( checkTags == false ) && ( numLibraries <= 1 ) ) {
831         return 0;
832     } else {
833         char tag[3];
834         char vtype;
835         void* value;
836         std::string rgID;
837         record.resetTagIter();
838         while( record.getNextSamTag(tag,vtype,&value) != false ) {
839             if ( ( tag[0] == 'R' ) && ( tag[1] == 'G' ) && ( vtype == 'Z' ) ) {
840                 if ( !rgID.empty() ) {
841                     Logger::gLogger->error("Multiple RG tag found in one record. ReadName is %s",record.getReadName());
842                 }
843                 else if ( record.isStringType(vtype) ) {
844                     String s = (String)*(String*)value;
845                     rgID = s.c_str();
846                 }
847                 else {
848                     Logger::gLogger->error("vtype is not string (Z) for RG tag");
849                 }
850             }
851         }
852         if ( rgID.empty() ) {
853             Logger::gLogger->error("No RG tag is found in read %s",record.getReadName());
854             return 0;
855         }
856         else {
857             std::map<std::string,uint32_t>::iterator it = rgidLibMap.find(rgID);
858             if ( it != rgidLibMap.end() ) {
859                 return it->second;
860             }
861             else {
862                 Logger::gLogger->warning("RG tag %s does not exist in the header",rgID.c_str());
863                 return 0; // cannot be reached
864             }
865         }
866     }
867 }
868 
869 
handleMissingMate(int32_t refID,int32_t mateRefID)870 void Dedup_LowMem::handleMissingMate(int32_t refID, int32_t mateRefID)
871 {
872     static bool firstDifferChrom = true;
873     static bool firstSameChrom = true;
874 
875     ++myNumMissingMate;
876 
877     // Passed the mate, but it was not found.
878     if(refID != mateRefID)
879     {
880         if(firstDifferChrom)
881         {
882             std::cerr << "Mate on different chromosome was not found.\n"
883                       << "If you are running single chromosome, consider "
884                       << "using --oneChrom to treat reads with mates on "
885                       << "different chromosomes as single-ended.\n";
886             firstDifferChrom = false;
887         }
888     }
889     else if(firstSameChrom)
890     {
891         std::cerr << "ERROR: Records with missing mate can't be checked for "
892                   << "duplicates.\n";
893         firstSameChrom = false;
894     }
895 
896     // Don't consider this record to be a duplicate.
897     handleNonDuplicate();
898 }
899 
900 
handleDuplicate(uint32_t index)901 void Dedup_LowMem::handleDuplicate(uint32_t index)
902 {
903     // Add the index to the duplicate list.
904     myDupList.push_back(index);
905 }
906