1 /*
2 * Copyright (C) 2010-2016 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include <iostream>
19 #include <algorithm>
20 #include <cstdlib>
21 #include <string>
22 #include <unistd.h>
23 #include <getopt.h>
24 #include "SamFile.h"
25 #include "Dedup_LowMem.h"
26 #include "Logger.h"
27 #include "SamHelper.h"
28 #include "SamStatus.h"
29 #include "BgzfFileType.h"
30
31 const int Dedup_LowMem::DEFAULT_MIN_QUAL = 15;
32 const uint32_t Dedup_LowMem::CLIP_OFFSET = 1000;
33
~Dedup_LowMem()34 Dedup_LowMem::~Dedup_LowMem()
35 {
36 // clean up the maps.
37 // First free any fragment records.
38 myFragmentMap.clear();
39
40 // Free any paired records.
41 myPairedMap.clear();
42
43 // Free any entries in the mate map.
44 myMateMap.clear();
45 }
46
printDedup_LowMemDescription(std::ostream & os)47 void Dedup_LowMem::printDedup_LowMemDescription(std::ostream& os)
48 {
49 os << " dedup_LowMem - Mark Duplicates using only a little memory\n";
50 }
51
52
printDescription(std::ostream & os)53 void Dedup_LowMem::printDescription(std::ostream& os)
54 {
55 printDedup_LowMemDescription(os);
56 }
57
58
printUsage(std::ostream & os)59 void Dedup_LowMem::printUsage(std::ostream& os)
60 {
61 os << "Usage: ./bam dedup_LowMem --in <InputBamFile> --out <OutputBamFile> [--minQual <minPhred>] [--log <logFile>] [--oneChrom] [--rmDups] [--force] [--excludeFlags <flag>] [--verbose] [--noeof] [--params] [--recab] ";
62 myRecab.printRecabSpecificUsageLine(os);
63 os << std::endl << std::endl;
64 os << "Required parameters :" << std::endl;
65 os << "\t--in <infile> : Input BAM file name (must be sorted)" << std::endl;
66 os << "\t--out <outfile> : Output BAM file name (same order with original file)" << std::endl;
67 os << "Optional parameters : " << std::endl;
68 os << "\t--minQual <int> : Only add scores over this phred quality when determining a read's quality (default: "
69 << DEFAULT_MIN_QUAL << ")" << std::endl;
70 os << "\t--log <logfile> : Log and summary statistics (default: [outfile].log, or stderr if --out starts with '-')" << std::endl;
71 os << "\t--oneChrom : Treat reads with mates on different chromosomes as single-ended." << std::endl;
72 os << "\t--rmDups : Remove duplicates (default is to mark duplicates)" << std::endl;
73 os << "\t--force : Allow an already mark-duplicated BAM file, unmarking any previously marked " << std::endl;
74 os << "\t duplicates and apply this duplicate marking logic. Default is to throw errors" << std::endl;
75 os << "\t and exit when trying to run on an already mark-duplicated BAM" << std::endl;
76 os << "\t--excludeFlags <flag> : exclude reads with any of these flags set when determining or marking duplicates" << std::endl;
77 os << "\t by default (0xB04): exclude unmapped, secondary reads, QC failures, and supplementary reads" << std::endl;
78 os << "\t--verbose : Turn on verbose mode" << std::endl;
79 os << "\t--noeof : Do not expect an EOF block on a bam file." << std::endl;
80 os << "\t--params : Print the parameter settings" << std::endl;
81 os << "\t--recab : Recalibrate in addition to dedup_LowMem" << std::endl;
82 myRecab.printRecabSpecificUsage(os);
83 os << "\n" << std::endl;
84 }
85
execute(int argc,char ** argv)86 int Dedup_LowMem::execute(int argc, char** argv)
87 {
88 /* --------------------------------
89 * process the arguments
90 * -------------------------------*/
91 String inFile, outFile, logFile;
92 myDoRecab = false;
93 bool removeFlag = false;
94 bool verboseFlag = false;
95 myForceFlag = false;
96 myNumMissingMate = 0;
97 myMinQual = DEFAULT_MIN_QUAL;
98 String excludeFlags = "0xB04";
99 uint16_t intExcludeFlags = 0;
100 bool noeof = false;
101 bool params = false;
102
103 LongParamContainer parameters;
104 parameters.addGroup("Required Parameters");
105 parameters.addString("in", &inFile);
106 parameters.addString("out", &outFile);
107 parameters.addGroup("Optional Parameters");
108 parameters.addInt("minQual", & myMinQual);
109 parameters.addString("log", &logFile);
110 parameters.addBool("oneChrom", &myOneChrom);
111 parameters.addBool("recab", &myDoRecab);
112 parameters.addBool("rmDups", &removeFlag);
113 parameters.addBool("force", &myForceFlag);
114 parameters.addString("excludeFlags", &excludeFlags);
115 parameters.addBool("verbose", &verboseFlag);
116 parameters.addBool("noeof", &noeof);
117 parameters.addBool("params", ¶ms);
118 parameters.addPhoneHome(VERSION);
119 myRecab.addRecabSpecificParameters(parameters);
120
121 ParameterList inputParameters;
122 inputParameters.Add(new LongParameters ("Input Parameters",
123 parameters.getLongParameterList()));
124
125 // parameters start at index 2 rather than 1.
126 inputParameters.Read(argc, argv, 2);
127
128 // If no eof block is required for a bgzf file, set the bgzf file type to
129 // not look for it.
130 if(noeof)
131 {
132 // Set that the eof block is not required.
133 BgzfFileType::setRequireEofBlock(false);
134 }
135
136 if(inFile.IsEmpty())
137 {
138 printUsage(std::cerr);
139 inputParameters.Status();
140 std::cerr << "Specify an input file" << std::endl;
141 return EXIT_FAILURE;
142 }
143
144 if(outFile.IsEmpty())
145 {
146 printUsage(std::cerr);
147 inputParameters.Status();
148 std::cerr << "Specify an output file" << std::endl;
149 return EXIT_FAILURE;
150 }
151
152 intExcludeFlags = excludeFlags.AsInteger();
153
154 if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags))
155 {
156 printUsage(std::cerr);
157 inputParameters.Status();
158 std::cerr << "Cannot specify --force and Duplicate in the excludeFlags. Since --force indicates to override"
159 << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n";
160 return EXIT_FAILURE;
161 }
162
163 if(!SamFlag::isSecondary(intExcludeFlags))
164 {
165 printUsage(std::cerr);
166 inputParameters.Status();
167 std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n";
168 return EXIT_FAILURE;
169 }
170
171 if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT))
172 {
173 printUsage(std::cerr);
174 inputParameters.Status();
175 std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n";
176 return EXIT_FAILURE;
177 }
178
179 if(logFile.IsEmpty())
180 {
181 logFile = outFile + ".log";
182 }
183
184 if(myDoRecab)
185 {
186 int status = myRecab.processRecabParam();
187 if(status != 0)
188 {
189 inputParameters.Status();
190 return(status);
191 }
192 }
193
194 if(params)
195 {
196 inputParameters.Status();
197 }
198
199 Logger::gLogger = new Logger(logFile.c_str(), verboseFlag);
200
201 /* -------------------------------------------------------------------
202 * The arguments are processed. Prepare the input BAM file,
203 * instantiate dedup_LowMem, and construct the read group library map
204 * ------------------------------------------------------------------*/
205
206 SamFile samIn;
207
208 samIn.OpenForRead(inFile.c_str());
209 // If the file isn't sorted it will throw an exception.
210 samIn.setSortedValidation(SamFile::COORDINATE);
211
212 SamFileHeader header;
213 samIn.ReadHeader(header);
214
215 buildReadGroupLibraryMap(header);
216
217 lastReference = -1;
218 lastCoordinate = -1;
219
220 // for keeping some basic statistics
221 uint32_t recordCount = 0;
222 uint32_t pairedCount = 0;
223 uint32_t properPairCount = 0;
224 uint32_t unmappedCount = 0;
225 uint32_t reverseCount = 0;
226 uint32_t qualCheckFailCount = 0;
227 uint32_t secondaryCount = 0;
228 uint32_t supplementaryCount = 0;
229 uint32_t excludedCount = 0;
230
231 // Now we start reading records
232 SamRecord* recordPtr;
233 SamStatus::Status returnStatus = SamStatus::SUCCESS;
234 while(returnStatus == SamStatus::SUCCESS)
235 {
236 recordPtr = mySamPool.getRecord();
237 if(recordPtr == NULL)
238 {
239 std::cerr << "Failed to allocate enough records\n";
240 return(-1);
241 }
242 if(!samIn.ReadRecord(header, *recordPtr))
243 {
244 returnStatus = samIn.GetStatus();
245 continue;
246 }
247 // Take note of properties of this record
248 int flag = recordPtr->getFlag();
249 if(SamFlag::isPaired(flag)) ++pairedCount;
250 if(SamFlag::isProperPair(flag)) ++properPairCount;
251 if(SamFlag::isReverse(flag)) ++reverseCount;
252 if(SamFlag::isQCFailure(flag)) ++qualCheckFailCount;
253 if(SamFlag::isSecondary(flag)) ++secondaryCount;
254 if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT) ++supplementaryCount;
255 if(!SamFlag::isMapped(flag)) ++unmappedCount;
256
257 // put the record in the appropriate maps:
258 // single reads go in myFragmentMap
259 // paired reads go in myPairedMap
260 recordCount = samIn.GetCurrentRecordCount();
261
262 // if we have moved to a new position, look back at previous reads for duplicates
263 if (hasPositionChanged(*recordPtr))
264 {
265 cleanupPriorReads(recordPtr);
266 }
267
268 // Determine if this read should be checked for duplicates.
269 if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0))
270 {
271 ++excludedCount;
272
273 // No deduping done on this record, but still build the recab table.
274 if(myDoRecab)
275 {
276 myRecab.processReadBuildTable(*recordPtr);
277 }
278 // Nothing more to do with this record, so
279 // release the pointer.
280 mySamPool.releaseRecord(recordPtr);
281 }
282 else
283 {
284 if(SamFlag::isDuplicate(flag) && !myForceFlag)
285 {
286 // Error: Marked duplicates, and duplicates aren't excluded.
287 Logger::gLogger->error("There are records already duplicate marked.");
288 Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over");
289 }
290
291 checkDups(*recordPtr, recordCount);
292 mySamPool.releaseRecord(recordPtr);
293 }
294 // let the user know we're not napping
295 if (verboseFlag && (recordCount % 100000 == 0))
296 {
297 Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u",
298 recordCount, myFragmentMap.size(),
299 myPairedMap.size(),
300 myMateMap.size());
301 }
302 }
303
304 // we're finished reading record so clean up the duplicate search and
305 // close the input file
306 cleanupPriorReads(NULL);
307 samIn.Close();
308
309 // print some statistics
310 Logger::gLogger->writeLog("--------------------------------------------------------------------------");
311 Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS");
312 Logger::gLogger->writeLog("Total number of reads: %u",recordCount);
313 Logger::gLogger->writeLog("Total number of paired-end reads: %u",
314 pairedCount);
315 Logger::gLogger->writeLog("Total number of properly paired reads: %u",
316 properPairCount);
317 Logger::gLogger->writeLog("Total number of unmapped reads: %u",
318 unmappedCount);
319 Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u",
320 reverseCount);
321 Logger::gLogger->writeLog("Total number of QC-failed reads: %u",
322 qualCheckFailCount);
323 Logger::gLogger->writeLog("Total number of secondary reads: %u",
324 secondaryCount);
325 Logger::gLogger->writeLog("Total number of supplementary reads: %u",
326 supplementaryCount);
327 Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u",
328 myFragmentMap.size());
329 Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u",
330 myPairedMap.size());
331 Logger::gLogger->writeLog("Total number of missing mates: %u",
332 myNumMissingMate);
333 Logger::gLogger->writeLog("Total number of reads excluded from duplicate checking: %u",
334 excludedCount);
335 Logger::gLogger->writeLog("--------------------------------------------------------------------------");
336 Logger::gLogger->writeLog("Sorting the indices of %d duplicated records",
337 myDupList.size());
338
339 // sort the indices of duplicate records
340 std::sort(myDupList.begin(), myDupList.end(),
341 std::less<uint32_t> ());
342
343 // get ready to write the output file by making a second pass
344 // through the input file
345 samIn.OpenForRead(inFile.c_str());
346 samIn.ReadHeader(header);
347
348 SamFile samOut;
349 samOut.OpenForWrite(outFile.c_str());
350 samOut.WriteHeader(header);
351
352 // If we are recalibrating, output the model information.
353 if(myDoRecab)
354 {
355 myRecab.modelFitPrediction(outFile);
356 }
357
358 // an iterator to run through the duplicate indices
359 int currentDupIndex = 0;
360 bool moreDups = !myDupList.empty();
361
362 // let the user know what we're doing
363 Logger::gLogger->writeLog("\nWriting %s", outFile.c_str());
364
365 // count the duplicate records as a check
366 uint32_t singleDuplicates(0), pairedDuplicates(0);
367
368 // start reading records and writing them out
369 SamRecord record;
370 while(samIn.ReadRecord(header, record))
371 {
372 uint32_t currentIndex = samIn.GetCurrentRecordCount();
373
374 bool foundDup = moreDups &&
375 (currentIndex == myDupList[currentDupIndex]);
376
377 // modify the duplicate flag and write out the record,
378 // if it's appropriate
379 int flag = record.getFlag();
380 if (foundDup)
381 {
382 // this record is a duplicate, so mark it.
383 record.setFlag( flag | 0x400 );
384 currentDupIndex++;
385 // increment duplicate counters to verify we found them all
386 if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) )
387 { // unpaired or mate unmapped
388 singleDuplicates++;
389 }
390 else
391 {
392 pairedDuplicates++;
393 }
394 // recalibrate if necessary.
395 if(myDoRecab)
396 {
397 myRecab.processReadApplyTable(record);
398 }
399
400 // write the record if we are not removing duplicates
401 if (!removeFlag ) samOut.WriteRecord(header, record);
402 }
403 else
404 {
405 if(myForceFlag)
406 {
407 // this is not a duplicate we've identified but we want to
408 // remove any duplicate marking
409 record.setFlag( flag & 0xfffffbff ); // unmark duplicate
410 }
411 // Not a duplicate, so recalibrate if necessary.
412 if(myDoRecab)
413 {
414 myRecab.processReadApplyTable(record);
415 }
416 samOut.WriteRecord(header, record);
417 }
418
419 // Let the user know we're still here
420 if (verboseFlag && (currentIndex % 100000 == 0)) {
421 Logger::gLogger->writeLog("recordCount=%u", currentIndex);
422 }
423 }
424
425 // We're done. Close the files and print triumphant messages.
426 samIn.Close();
427 samOut.Close();
428
429 Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads",
430 removeFlag ? "removed" : "marked" ,
431 singleDuplicates,
432 pairedDuplicates/2);
433 Logger::gLogger->writeLog("\nDedup_LowMem complete!");
434 return 0;
435 }
436
437 // Now that we've reached coordinate on chromosome reference, look back and
438 // clean up any previous positions from being tracked.
cleanupPriorReads(SamRecord * record)439 void Dedup_LowMem::cleanupPriorReads(SamRecord* record)
440 {
441 static DupKey emptyKey;
442 static DupKey tempKey2;
443
444 // Set where to stop cleaning out the structures.
445 // Initialize to the end of the structures.
446 FragmentMap::iterator fragmentFinish = myFragmentMap.end();
447 PairedMap::iterator pairedFinish = myPairedMap.end();
448 uint64_t mateStopPos = 0;
449
450 // If a record was specified, stop before this record.
451 if(record != NULL)
452 {
453 int32_t reference = record->getReferenceID();
454 int32_t coordinate = record->get0BasedPosition();
455 tempKey2.cleanupKey(reference, coordinate);
456 fragmentFinish = myFragmentMap.lower_bound(tempKey2);
457 // Now do the same thing with the paired reads
458 PairedKey pairedKey(emptyKey, tempKey2);
459 pairedFinish = myPairedMap.lower_bound(pairedKey);
460 mateStopPos =
461 SamHelper::combineChromPos(reference,
462 coordinate);
463 }
464
465 // For each key k < fragmentFinish, release the record since we are
466 // done with that position and it is not a duplicate.
467 for(FragmentMap::iterator iter = myFragmentMap.begin();
468 iter != fragmentFinish; iter++)
469 {
470 // If it is not paired, we are done with this record.
471 // If it is paired, it will be handled separately.
472 if(!iter->second.paired)
473 {
474 // Unpaired, non-duplicate, so perform any additional handling.
475 handleNonDuplicate();
476 }
477 }
478 // Erase the entries from the map.
479 if(fragmentFinish != myFragmentMap.begin())
480 {
481 myFragmentMap.erase(myFragmentMap.begin(), fragmentFinish);
482 }
483
484 // Now do the same thing with the paired reads
485 for(PairedMap::iterator iter = myPairedMap.begin();
486 iter != pairedFinish; iter++)
487 {
488 // These are not duplicates, but we are done with them,
489 // so perform any additional handling.
490 handleNonDuplicatePair();
491 }
492 // Erase the entries.
493 if (pairedFinish != myPairedMap.begin())
494 {
495 myPairedMap.erase(myPairedMap.begin(), pairedFinish);
496 }
497
498 // Clean up the Mate map from any reads whose mates were not found.
499 // Loop through the mate map and release records prior to this position.
500 MateMap::iterator mateIter;
501 for(mateIter = myMateMap.begin(); mateIter != myMateMap.end(); mateIter++)
502 {
503 // stop if a record was specified and we have gone past the mate
504 // stop position. If no record was specified, we want to clean
505 // it all out.
506 if((record != NULL) && (mateIter->first >= mateStopPos))
507 {
508 break;
509 }
510 // Passed the mate, but it was not found.
511 handleMissingMate(mateIter->second.key.reference,
512 mateIter->first >> 32);
513 }
514 // Erase the entries.
515 if(mateIter != myMateMap.begin())
516 {
517 myMateMap.erase(myMateMap.begin(), mateIter);
518 }
519 return;
520 }
521
522
523 // determine whether the record's position is different from the previous record
hasPositionChanged(SamRecord & record)524 bool Dedup_LowMem::hasPositionChanged(SamRecord& record)
525 {
526 if (lastReference != record.getReferenceID() ||
527 lastCoordinate < record.get0BasedPosition())
528 {
529 if (lastReference != record.getReferenceID())
530 {
531 lastReference = record.getReferenceID();
532 Logger::gLogger->writeLog("Reading ReferenceID %d\n", lastReference);
533 }
534 lastCoordinate = record.get0BasedPosition();
535 return true;
536 }
537 return false;
538 }
539
540 // When a record is read, check if it is a duplicate or
541 // store for future checking.
checkDups(SamRecord & record,uint32_t recordCount)542 void Dedup_LowMem::checkDups(SamRecord& record, uint32_t recordCount)
543 {
544 // Only inside this method if the record is mapped.
545
546 // Get the key for this record.
547 static DupKey key;
548 key.initKey(record, getLibraryID(record));
549
550 int flag = record.getFlag();
551 bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
552 int sumBaseQual = getBaseQuality(record);
553
554 int32_t chromID = record.getReferenceID();
555 int32_t mateChromID = record.getMateReferenceID();
556
557 // If we are one-chrom and the mate is not on the same chromosome,
558 // mark it as not paired.
559 if(myOneChrom && (chromID != mateChromID))
560 {
561 recordPaired = false;
562 }
563
564 // Look in the fragment map to see if an entry for this key exists.
565 FragmentMapInsertReturn ireturn =
566 myFragmentMap.insert(std::make_pair(key, FragData()));
567
568 FragData* fragData = &(ireturn.first->second);
569
570 // Enter the new record in the fragData if (any of the below):
571 // 1) there is no previous entry for this key (ireturn.second == true)
572 // or
573 // 2) the previous entry is not paired
574 // AND
575 // a) the new record is paired
576 // or
577 // b) the new record has higher quality
578 if((ireturn.second == true) ||
579 ((fragData->paired == false) &&
580 (recordPaired || (sumBaseQual > fragData->sumBaseQual))))
581 {
582 // Check if this is a new key.
583 if(ireturn.second == true)
584 {
585 // New entry, so build the recalibration table now.
586 if(myDoRecab)
587 {
588 myRecab.processReadBuildTable(record);
589 }
590 }
591 else if(fragData->paired == false)
592 {
593 // There was a previous record and it is not paired,
594 // so mark it as a duplicate.
595 // Duplicate checking/marking for pairs is handled below.
596 handleDuplicate(fragData->recordIndex);
597 }
598
599 // Store this record for later duplicate checking.
600 fragData->sumBaseQual = sumBaseQual;
601 fragData->recordIndex = recordCount;
602 fragData->paired = recordPaired;
603 }
604 else
605 {
606 // Leave the old record in fragData.
607 // If the new record is not paired, handle it as a duplicate.
608 if(recordPaired == false)
609 {
610 // This record is a duplicate, so mark it and release it.
611 handleDuplicate(recordCount);
612 }
613 }
614
615 // Only paired processing is left, so return if not paired.
616 if(recordPaired == false)
617 {
618 // Not paired, no more operations required, so return.
619 return;
620 }
621
622 // This is a paired record, so check for its mate.
623 uint64_t readPos =
624 SamHelper::combineChromPos(chromID,
625 record.get0BasedPosition());
626 uint64_t matePos =
627 SamHelper::combineChromPos(mateChromID,
628 record.get0BasedMatePosition());
629 int mateIndex = -1;
630 MateData* mateData = NULL;
631
632 // Check to see if the mate is prior to this record.
633 if(matePos <= readPos)
634 {
635 // The mate map is stored by the mate position, so look for this
636 // record's position.
637 // The mate should be in the mate map, so find it.
638 std::pair<MateMap::iterator,MateMap::iterator> matches =
639 myMateMap.equal_range(readPos);
640 // Loop through the elements that matched the pos looking for the mate.
641 for(MateMap::iterator iter = matches.first;
642 iter != matches.second; iter++)
643 {
644 if(strcmp((*iter).second.readName.c_str(),
645 record.getReadName()) == 0)
646 {
647 // Found the match.
648 mateData = &((*iter).second);
649 // Update the quality and track the mate record and index.
650 sumBaseQual += mateData->sumBaseQual;
651 mateIndex = mateData->recordIndex;
652 // Remove the entry from the map.
653 myMateMap.erase(iter);
654 break;
655 }
656 }
657 }
658 if(mateData == NULL)
659 {
660 if(matePos >= readPos)
661 {
662 // Haven't gotten to the mate yet, so store this record.
663 MateMap::iterator mateIter =
664 myMateMap.insert(std::make_pair(matePos, MateData()));
665 mateIter->second.sumBaseQual = sumBaseQual;
666 mateIter->second.recordIndex = recordCount;
667 mateIter->second.key.copy(key);
668 mateIter->second.readName = record.getReadName();
669 }
670 else
671 {
672 // Passed the mate, but it was not found.
673 handleMissingMate(record.getReferenceID(), record.getMateReferenceID());
674 }
675 return;
676 }
677
678 // Make the paired key.
679 PairedKey pkey(key, mateData->key);
680
681 // Check to see if this pair is a duplicate.
682 PairedMapInsertReturn pairedReturn =
683 myPairedMap.insert(std::make_pair(pkey,PairedData()));
684 PairedData* storedPair = &(pairedReturn.first->second);
685
686 // Get the index for "record 1" - the one with the earlier coordinate.
687 int record1Index = getFirstIndex(key, recordCount,
688 mateData->key, mateIndex);
689
690 // Check if we have already found a duplicate pair.
691 // If there is no duplicate found, there is nothing more to do.
692 if(pairedReturn.second == false)
693 {
694 // Duplicate found.
695 bool keepStored = true;
696 if(pairedReturn.first->second.sumBaseQual < sumBaseQual)
697 {
698 // The new pair has higher quality, so keep that.
699 keepStored = false;
700 }
701 else if(pairedReturn.first->second.sumBaseQual == sumBaseQual)
702 {
703 // Same quality, so keep the one with the earlier record1Index.
704 if(record1Index < storedPair->record1Index)
705 {
706 // The new pair has an earlier lower coordinate read,
707 // so keep that.
708 keepStored = false;
709 }
710 }
711 // Check to see which one should be kept by checking qualities.
712 if(keepStored)
713 {
714 // The old pair had higher quality so mark the new pair as a
715 // duplicate and release them.
716 handleDuplicate(mateIndex);
717 handleDuplicate(recordCount);
718 }
719 else
720 {
721 // The new pair has higher quality, so keep that.
722 // First mark the previous one as duplicates and release them.
723 handleDuplicate(storedPair->record1Index);
724 handleDuplicate(storedPair->record2Index);
725 // Store this pair's information.
726 if(record1Index == mateIndex)
727 {
728 // Mate has a lower coordinate, so make mate
729 // record1.
730 storedPair->sumBaseQual = sumBaseQual;
731 storedPair->record1Index = mateIndex;
732 storedPair->record2Index = recordCount;
733 }
734 else
735 {
736 // This record has a lower coordinate, so make it
737 // record1.
738 storedPair->sumBaseQual = sumBaseQual;
739 storedPair->record1Index = recordCount;
740 storedPair->record2Index = mateIndex;
741 }
742 }
743 }
744 else
745 {
746 // Store this pair's information.
747 storedPair->sumBaseQual = sumBaseQual;
748
749 if(record1Index == mateIndex)
750 {
751 // Mate has a lower coordinate, so make mate
752 // record1.
753 storedPair->record1Index = mateIndex;
754 storedPair->record2Index = recordCount;
755 }
756 else
757 {
758 // This record has a lower coordinate, so make it
759 // record1.
760 storedPair->record1Index = recordCount;
761 storedPair->record2Index = mateIndex;
762 }
763 }
764 }
765
766
767 // Finds the total base quality of a read
getBaseQuality(SamRecord & record)768 int Dedup_LowMem::getBaseQuality(SamRecord & record) {
769 const char* baseQualities = record.getQuality();
770 int readLength = record.getReadLength();
771 int quality = 0.;
772 if(strcmp(baseQualities, "*") == 0)
773 {
774 return(0);
775 }
776 for(int i=0; i < readLength; ++i) {
777 int q = static_cast<int>(baseQualities[i])-33;
778 if ( q >= myMinQual ) quality += q;
779 }
780 return quality;
781 }
782
783
784 // build the read group library map
buildReadGroupLibraryMap(SamFileHeader & header)785 void Dedup_LowMem::buildReadGroupLibraryMap(SamFileHeader& header) {
786 rgidLibMap.clear();
787 numLibraries = 0;
788 std::map<std::string,uint32_t> libNameMap;
789
790 SamHeaderRecord * headerRecord = header.getNextRGRecord();
791 while(headerRecord != NULL) {
792 std::string ID = headerRecord->getTagValue("ID");
793 std::string LB = headerRecord->getTagValue("LB");
794
795 if ( ID.empty() ) {
796 std::string headerRecordString;
797 headerRecord->appendString(headerRecordString);
798 Logger::gLogger->error("Cannot find readGroup ID information in the header line %s",
799 headerRecordString.c_str());
800 }
801 if ( rgidLibMap.find(ID) != rgidLibMap.end() ) {
802 Logger::gLogger->error("The readGroup ID %s is not a unique identifier",ID.c_str());
803 }
804
805 if ( LB.empty() ) {
806 std::string headerRecordString;
807 headerRecord->appendString(headerRecordString);
808 Logger::gLogger->warning("Cannot find library information in the header line %s. Using empty string for library name",
809 headerRecordString.c_str());
810 }
811
812 if ( libNameMap.find( LB ) != libNameMap.end() ) {
813 rgidLibMap[ID] = libNameMap[LB];
814 }
815 else {
816 numLibraries = libNameMap.size()+1;
817 libNameMap[LB] = numLibraries;
818 rgidLibMap[ID] = numLibraries;
819 }
820 headerRecord = header.getNextRGRecord();
821 }
822
823 if (numLibraries > 0xff) {
824 Logger::gLogger->error("More than 255 library names are identified. Dedup_LowMem currently only allows up to 255 library names");
825 }
826 }
827
828 // get the libraryID of a record
getLibraryID(SamRecord & record,bool checkTags)829 uint32_t Dedup_LowMem::getLibraryID(SamRecord& record, bool checkTags) {
830 if ( ( checkTags == false ) && ( numLibraries <= 1 ) ) {
831 return 0;
832 } else {
833 char tag[3];
834 char vtype;
835 void* value;
836 std::string rgID;
837 record.resetTagIter();
838 while( record.getNextSamTag(tag,vtype,&value) != false ) {
839 if ( ( tag[0] == 'R' ) && ( tag[1] == 'G' ) && ( vtype == 'Z' ) ) {
840 if ( !rgID.empty() ) {
841 Logger::gLogger->error("Multiple RG tag found in one record. ReadName is %s",record.getReadName());
842 }
843 else if ( record.isStringType(vtype) ) {
844 String s = (String)*(String*)value;
845 rgID = s.c_str();
846 }
847 else {
848 Logger::gLogger->error("vtype is not string (Z) for RG tag");
849 }
850 }
851 }
852 if ( rgID.empty() ) {
853 Logger::gLogger->error("No RG tag is found in read %s",record.getReadName());
854 return 0;
855 }
856 else {
857 std::map<std::string,uint32_t>::iterator it = rgidLibMap.find(rgID);
858 if ( it != rgidLibMap.end() ) {
859 return it->second;
860 }
861 else {
862 Logger::gLogger->warning("RG tag %s does not exist in the header",rgID.c_str());
863 return 0; // cannot be reached
864 }
865 }
866 }
867 }
868
869
handleMissingMate(int32_t refID,int32_t mateRefID)870 void Dedup_LowMem::handleMissingMate(int32_t refID, int32_t mateRefID)
871 {
872 static bool firstDifferChrom = true;
873 static bool firstSameChrom = true;
874
875 ++myNumMissingMate;
876
877 // Passed the mate, but it was not found.
878 if(refID != mateRefID)
879 {
880 if(firstDifferChrom)
881 {
882 std::cerr << "Mate on different chromosome was not found.\n"
883 << "If you are running single chromosome, consider "
884 << "using --oneChrom to treat reads with mates on "
885 << "different chromosomes as single-ended.\n";
886 firstDifferChrom = false;
887 }
888 }
889 else if(firstSameChrom)
890 {
891 std::cerr << "ERROR: Records with missing mate can't be checked for "
892 << "duplicates.\n";
893 firstSameChrom = false;
894 }
895
896 // Don't consider this record to be a duplicate.
897 handleNonDuplicate();
898 }
899
900
handleDuplicate(uint32_t index)901 void Dedup_LowMem::handleDuplicate(uint32_t index)
902 {
903 // Add the index to the duplicate list.
904 myDupList.push_back(index);
905 }
906