1 /*
2 * RecordOutputMgr.cpp
3 *
4 * Created on: May 28, 2013
5 * Author: nek3d
6 */
7
8 #include "RecordOutputMgr.h"
9 #include "ContextBase.h"
10 #include "ContextIntersect.h"
11 #include "ContextClosest.h"
12 #include "ContextGroupBy.h"
13 #include "BlockMgr.h"
14 #include "Bed3Interval.h"
15 #include "Bed4Interval.h"
16 #include "BedGraphInterval.h"
17 #include "Bed5Interval.h"
18 #include "Bed6Interval.h"
19 #include "BedPlusInterval.h"
20 #include "Bed12Interval.h"
21 #include "BamRecord.h"
22 #include "VcfRecord.h"
23 #include "GffRecord.h"
24 #include "NoPosPlusRecord.h"
25
26
27
28 #include <cstdio>
29
30
RecordOutputMgr()31 RecordOutputMgr::RecordOutputMgr()
32 : _context(NULL),
33 _printable(true),
34 _bamWriter(NULL),
35 _currBamBlockList(NULL),
36 _bamBlockMgr(NULL)
37 {
38 _bamBlockMgr = new BlockMgr();
39 }
40
~RecordOutputMgr()41 RecordOutputMgr::~RecordOutputMgr()
42 {
43 // In the rare case when a file had a header but was otherwise empty,
44 // we'll need to make a last check to see if the header still needs to be printed.
45 checkForHeader();
46
47 if (_outBuf.size() > 0) {
48 flush();
49 }
50 if (_bamWriter != NULL) {
51 _bamWriter->Close();
52 delete _bamWriter;
53 _bamWriter = NULL;
54 }
55 delete _bamBlockMgr;
56 _bamBlockMgr = NULL;
57
58 }
59
init(ContextBase * context)60 void RecordOutputMgr::init(ContextBase *context) {
61 _context = context;
62 if (_context->getOutputFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
63 //set-up BAM writer.
64 _bamWriter = new BamTools::BamWriter();
65 _bamWriter->SetCompressionMode(_context->getUncompressedBam() ? BamTools::BamWriter::Uncompressed : BamTools::BamWriter::Compressed);
66
67 int bamFileIdx = _context->getBamHeaderAndRefIdx();
68
69 if(!_context->isCram())
70 _bamWriter->Open("stdout", _context->getFile(bamFileIdx)->getHeader().c_str(), _context->getFile(bamFileIdx)->getBamReferences());
71 else
72 _bamWriter->Open("stdout", _context->getFile(bamFileIdx)->getHeader().c_str(),
73 _context->getFile(bamFileIdx)->getBamReferences(),
74 _context->getFile(bamFileIdx)->getCramRefs());
75 } else {
76 //for everything but BAM, we'll copy output to an output buffer before printing.
77 _outBuf.reserve(MAX_OUTBUF_SIZE);
78 }
79 if (_context->getProgram() == ContextBase::INTERSECT) {
80 if ((static_cast<ContextIntersect *>(_context))->getAnyHit() ||
81 (static_cast<ContextIntersect *>(_context))->getNoHit() ||
82 (static_cast<ContextIntersect *>(_context))->getWriteCount() ||
83 (static_cast<ContextIntersect *>(_context))->getWriteCountsPerDatabase()) {
84 _printable = false;
85 }
86 }
87 }
88
printKeyAndTerminate(RecordKeyVector & keyList)89 bool RecordOutputMgr::printKeyAndTerminate(RecordKeyVector &keyList) {
90 if (_context->getProgram() == ContextBase::MERGE) {
91 //when printing merged records, we want to force the printing into
92 //bed3 format, which is surprisingly difficult to do. Had to use the following:
93 const Bed3Interval *bed3 = static_cast<const Bed3Interval *>(keyList.getKey());
94 bed3->Bed3Interval::print(_outBuf);
95 return false;
96 }
97 printBamType bamCode = printBamRecord(keyList);
98 if (bamCode == BAM_AS_BAM) {
99 return true;
100 } else if (bamCode == NOT_BAM) {
101 keyList.getKey()->print(_outBuf);
102 return false;
103 }
104 //otherwise, it was BAM_AS_BED, and the key was printed.
105 return false;
106
107 }
108
printBamRecord(RecordKeyVector & keyList,bool bamOutputOnly)109 RecordOutputMgr::printBamType RecordOutputMgr::printBamRecord(RecordKeyVector &keyList, bool bamOutputOnly)
110 {
111 const Record *record = keyList.getKey();
112 if (record->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) {
113 if (_context->getOutputFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
114 _bamWriter->SaveAlignment(static_cast<const BamRecord *>(record)->getAlignment());
115 return BAM_AS_BAM;
116 } else {
117 if (!bamOutputOnly) {
118 if (record->isUnmapped()) {
119 record->printUnmapped(_outBuf);
120 } else {
121 static_cast<const BamRecord *>(record)->print(_outBuf, _currBamBlockList);
122 }
123 }
124 return BAM_AS_BED;
125 }
126 }
127 return NOT_BAM;
128 }
129
printRecord(Record * record)130 void RecordOutputMgr::printRecord(Record *record)
131 {
132 RecordKeyVector keyList(record);
133 printRecord(keyList);
134 }
135
printRecord(Record * record,const string & value)136 void RecordOutputMgr::printRecord(Record *record, const string & value)
137 {
138 checkForHeader();
139
140 _afterVal = value;
141 bool recordPrinted = false;
142 if (record != NULL) {
143 printRecord(record);
144 recordPrinted = true;
145 }
146 if (!value.empty()) {
147 if (recordPrinted) tab();
148 _outBuf.append(value);
149 }
150 newline();
151
152 if (needsFlush()) flush();
153 }
154
printClosest(RecordKeyVector & keyList,const vector<CHRPOS> * dists)155 void RecordOutputMgr::printClosest(RecordKeyVector &keyList, const vector<CHRPOS> *dists) {
156
157 //The first time we print a record is when we print any header, because the header
158 //hasn't been read from the query file until after the first record has also been read.
159 checkForHeader();
160
161 const ContextClosest *context = static_cast<const ContextClosest *>(_context);
162 bool deleteBlocks = false;
163 Record *keyRec = keyList.getKey();
164 RecordKeyVector blockList(keyRec);
165 if (keyRec->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) {
166 _bamBlockMgr->getBlocks(blockList, deleteBlocks);
167 _currBamBlockList = &blockList;
168 }
169 if (!keyList.empty()) {
170 if (context->getNumClosestHitsWanted() > keyList.size())
171 {
172 cerr << "Warning: Fewer hits ("
173 << keyList.size()
174 << ") found on "
175 << keyRec->getChrName()
176 << " than requested ("
177 << context->getNumClosestHitsWanted()
178 << "). It is likely that there are fewer total records"
179 << " on that chromosome than requested."
180 << endl;
181 }
182 int distCount = 0;
183 for (RecordKeyVector::iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next())
184 {
185 const Record *hitRec = *iter;
186 printKey(keyRec, keyRec->getStartPosStr(), keyRec->getEndPosStr());
187 tab();
188 addDbFileId(hitRec->getFileIdx());
189 printKey(hitRec, hitRec->getStartPosStr(), hitRec->getEndPosStr());
190 if (dists != NULL) {
191 tab();
192 CHRPOS dist = (*dists)[distCount];
193 //if not using sign distance, use absolute value instead.
194 dist = context->signDistance() ? dist : abs(dist);
195 ostringstream s;
196 s << dist;
197 _outBuf.append(s.str());
198 distCount++;
199 }
200 newline();
201 if (needsFlush()) flush();
202 }
203 } else {
204 printKey(keyRec, keyRec->getStartPosStr(), keyRec->getEndPosStr());
205 tab();
206 // need to add a dummy file id if multiple DB files are used
207 if (_context->getNumInputFiles() > 2) {
208 _outBuf.append(".");
209 tab();
210 }
211 null(false, true);
212 if (context->reportDistance()) {
213 tab();
214 _outBuf.append("-1");
215 }
216 newline();
217 }
218 if (deleteBlocks) {
219 _bamBlockMgr->deleteBlocks(blockList);
220 _currBamBlockList = NULL;
221 }
222 return;
223 }
224
225
printRecord(RecordKeyVector & keyList)226 void RecordOutputMgr::printRecord(RecordKeyVector &keyList) {
227 if (keyList.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) {
228 RecordKeyVector blockList(keyList.getKey());
229 bool deleteBlocks = false;
230 _bamBlockMgr->getBlocks(blockList, deleteBlocks);
231 printRecord(keyList, &blockList);
232 if (deleteBlocks) {
233 _bamBlockMgr->deleteBlocks(blockList);
234 }
235 return;
236 }
237 printRecord(keyList, NULL);
238
239 }
240
printRecord(RecordKeyVector & keyList,RecordKeyVector * blockList)241 void RecordOutputMgr::printRecord(RecordKeyVector &keyList, RecordKeyVector *blockList)
242 {
243 if (needsFlush()) {
244 flush();
245 }
246
247 //The first time we print a record is when we print any header, because the header
248 //hasn't been read from the query file until after the first record has also been read.
249 checkForHeader();
250
251 const_cast<Record *>(keyList.getKey())->undoZeroLength();
252 _currBamBlockList = blockList;
253
254 if (_context->getProgram() == ContextBase::INTERSECT || _context->getProgram() == ContextBase::SUBTRACT) {
255 if (_printable) {
256 if (keyList.empty()) {
257 if ((static_cast<ContextIntersect *>(_context))->getWriteAllOverlap())
258 {
259 // -wao the user wants to force the reporting of 0 overlap
260 if (printKeyAndTerminate(keyList)) {
261 _currBamBlockList = NULL;
262 const_cast<Record *>(keyList.getKey())->adjustZeroLength();
263
264 return;
265 }
266 tab();
267 // need to add a dummy file id if multiple DB files are used
268 if (_context->getNumInputFiles() > 2) {
269 _outBuf.append(".");
270 tab();
271 }
272 null(false, true);
273 tab();
274 _outBuf.append("0");
275 newline();
276 if (needsFlush()) flush();
277 }
278 else if ((static_cast<ContextIntersect *>(_context))->getLeftJoin())
279 {
280 if (printKeyAndTerminate(keyList)) {
281 _currBamBlockList = NULL;
282
283 const_cast<Record *>(keyList.getKey())->adjustZeroLength();
284 return;
285 }
286 tab();
287 // need to add a dummy file id if multiple DB files are used
288 if (_context->getNumInputFiles() > 2) {
289 _outBuf.append(".");
290 tab();
291 }
292 null(false, true);
293 newline();
294 if (needsFlush()) flush();
295 _currBamBlockList = NULL;
296
297 return;
298 }
299 }
300 else
301 {
302 if (printBamRecord(keyList, true) == BAM_AS_BAM) {
303 _currBamBlockList = NULL;
304
305 const_cast<Record *>(keyList.getKey())->adjustZeroLength();
306 return;
307 }
308 int hitIdx = 0;
309 for (RecordKeyVector::iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next())
310 {
311 // a hit can be invalid if there was no enough overlap, etc.
312 //if ((*iter)->isValid())
313 //{
314 reportOverlapDetail(keyList.getKey(), *iter, hitIdx);
315 hitIdx++;
316 //}
317 }
318 }
319 } else { // not printable
320 reportOverlapSummary(keyList);
321 }
322 _currBamBlockList = NULL;
323 } else if (_context->getProgram() == ContextBase::SAMPLE) {
324 if (!printKeyAndTerminate(keyList)) {
325 newline();
326 }
327 } else { // if (_context->getProgram() == ContextBase::MAP || _context->getProgram() == ContextBase::MERGE) {
328 printKeyAndTerminate(keyList);
329 }
330 _currBamBlockList = NULL;
331 const_cast<Record *>(keyList.getKey())->adjustZeroLength();
332
333 }
334
checkForHeader()335 void RecordOutputMgr::checkForHeader() {
336 // Do we need to print a header?
337 if (!_context->getPrintHeader()) return;
338
339 //If the tool is groupBy, and outheader was set, but the header is empty, we need to print groupBy's
340 //default header
341 if (_context->getProgram() == ContextBase::GROUP_BY) {
342 const string &header = _context->getFile(0)->getHeader();
343 if (header.empty()) {
344 const string &defaultHeader = (static_cast<ContextGroupBy *>(_context))->getDefaultHeader();
345 _outBuf.append(defaultHeader);
346 } else {
347 _outBuf.append(header);
348 }
349 } else if (_context->hasIntersectMethods()) {
350 //if the tool is based on intersection, we want the header from the query file.
351
352 int queryIdx = (static_cast<ContextIntersect *>(_context))->getQueryFileIdx();
353 const string &header = _context->getFile(queryIdx)->getHeader();
354 _outBuf.append(header);
355 } else {
356 _outBuf.append(_context->getFile(0)->getHeader());
357 }
358
359 _context->setPrintHeader(false);
360 flush();
361 }
362
reportOverlapDetail(const Record * keyRecord,const Record * hitRecord,int hitIdx)363 void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record *hitRecord, int hitIdx)
364 {
365
366 // overlap interval is defined by min(e1,e2) - max(s1,s2)
367 CHRPOS maxStart = max(keyRecord->getStartPos(), hitRecord->getStartPos());
368 //cout << keyRecord->getStartPos() << "," << hitRecord->getStartPos();
369 CHRPOS minEnd = min(keyRecord->getEndPos(), hitRecord->getEndPos());
370
371 // need to undo our conversion of 1-based start coordinates to 0-based
372 if (!keyRecord->isZeroBased())
373 maxStart++;
374
375 // all of the different printing scenarios based upon the options used.
376 if (!(static_cast<ContextIntersect *>(_context))->getWriteA() && !(static_cast<ContextIntersect *>(_context))->getWriteB()
377 && !(static_cast<ContextIntersect *>(_context))->getWriteOverlap() && !(static_cast<ContextIntersect *>(_context))->getLeftJoin()) {
378 const_cast<Record *>(keyRecord)->undoZeroLength();
379 printKey(keyRecord, maxStart, minEnd);
380 }
381 else if (((static_cast<ContextIntersect *>(_context))->getWriteA() &&
382 (static_cast<ContextIntersect *>(_context))->getWriteB()) || (static_cast<ContextIntersect *>(_context))->getLeftJoin()) {
383 const_cast<Record *>(keyRecord)->undoZeroLength();
384 printKey(keyRecord);
385 tab();
386 const_cast<Record *>(hitRecord)->undoZeroLength();
387 addDbFileId(hitRecord->getFileIdx());
388 hitRecord->print(_outBuf);
389 }
390 else if ((static_cast<ContextIntersect *>(_context))->getWriteA()) {
391 const_cast<Record *>(keyRecord)->undoZeroLength();
392 printKey(keyRecord);
393 }
394 else if ((static_cast<ContextIntersect *>(_context))->getWriteB()) {
395 printKey(keyRecord, maxStart, minEnd);
396 tab();
397 addDbFileId(hitRecord->getFileIdx());
398 const_cast<Record *>(hitRecord)->undoZeroLength();
399 hitRecord->print(_outBuf);
400 }
401 else if ((static_cast<ContextIntersect *>(_context))->getWriteOverlap()) {
402 int overlapBases = 0;
403 if (_context->getObeySplits()) {
404 overlapBases = _context->getSplitBlockInfo()->getOverlapBases(hitIdx);
405 } else {
406 // if one of the records was zerolength, the number of
407 // overlapping bases needs to be corrected
408 if (keyRecord->isZeroLength() || hitRecord->isZeroLength())
409 {
410 maxStart++;
411 minEnd--;
412 }
413 overlapBases = minEnd - maxStart;
414 // add one to overlapBases since we decremented minStart
415 // for 1-based records.
416 if (!keyRecord->isZeroBased())
417 overlapBases++;
418 }
419 const_cast<Record *>(keyRecord)->undoZeroLength();
420 printKey(keyRecord);
421 tab();
422 addDbFileId(hitRecord->getFileIdx());
423 const_cast<Record *>(hitRecord)->undoZeroLength();
424 hitRecord->print(_outBuf);
425 tab();
426 int2str(overlapBases, _outBuf, true);
427 }
428 newline();
429 if (needsFlush()) flush();
430 const_cast<Record *>(hitRecord)->adjustZeroLength();
431 }
432
reportOverlapSummary(RecordKeyVector & keyList)433 void RecordOutputMgr::reportOverlapSummary(RecordKeyVector &keyList)
434 {
435 int numOverlapsFound = (int)keyList.size();
436 if ((static_cast<ContextIntersect *>(_context))->getAnyHit() && numOverlapsFound > 0) {
437 if (printKeyAndTerminate(keyList)) {
438 return;
439 }
440 newline();
441 if (needsFlush()) flush();
442 } else if ((static_cast<ContextIntersect *>(_context))->getWriteCount()) {
443 if (printKeyAndTerminate(keyList)) {
444 return;
445 }
446 tab();
447 int2str(numOverlapsFound, _outBuf, true);
448 newline();
449 if (needsFlush()) flush();
450 }
451 else if ((static_cast<ContextIntersect *>(_context))->getWriteCountsPerDatabase()) {
452 // build a map of the hit counts per database
453 map<int, int> db_counts;
454 // initialize to 0 for all files (-A is file 0)
455 for (size_t i = 1; i < _context->getNumInputFiles(); i++)
456 {
457 db_counts[i] = 0;
458 }
459 // tally hits per database
460 for (auto & hit : keyList) {
461 db_counts[hit->getFileIdx()]+=1;
462 }
463
464 // report A with a separate line for each db and its hit count
465 for (auto it=db_counts.begin(); it!=db_counts.end(); ++it)
466 {
467 if (printKeyAndTerminate(keyList)) {
468 return;
469 }
470 tab();
471 addDbFileId(it->first);
472 int2str(it->second, _outBuf, true);
473 newline();
474 }
475 if (needsFlush()) flush();
476 }
477 else if ((static_cast<ContextIntersect *>(_context))->getNoHit() && numOverlapsFound == 0) {
478 if (printKeyAndTerminate(keyList)) {
479 return;
480 }
481 newline();
482 if (needsFlush()) flush();
483 }
484 }
485
addDbFileId(int fileId)486 void RecordOutputMgr::addDbFileId(int fileId) {
487 ostringstream s;
488 if ((static_cast<ContextIntersect *>(_context))->getNumDatabaseFiles() == 1) return;
489 if (!_context->getUseDBnameTags() && (!_context->getUseDBfileNames())) {
490 s << fileId;
491 } else if (_context->getUseDBnameTags()){
492 s << (static_cast<ContextIntersect *>(_context))->getDatabaseNameTag((static_cast<ContextIntersect *>(_context))->getDbIdx(fileId));
493 } else {
494 s << _context->getInputFileName(fileId);
495 }
496 _outBuf.append(s.str());
497 tab();
498 }
499
null(bool queryType,bool dbType)500 void RecordOutputMgr::null(bool queryType, bool dbType)
501 {
502 FileRecordTypeChecker::RECORD_TYPE recordType = FileRecordTypeChecker::UNKNOWN_RECORD_TYPE;
503 if (_context->hasIntersectMethods()) {
504 if (queryType) {
505 recordType = (static_cast<ContextIntersect *>(_context))->getQueryRecordType();
506 } else if (dbType) {
507 recordType = (static_cast<ContextIntersect *>(_context))->getDatabaseRecordType(0);
508 }
509 } else {
510 recordType = _context->getFile(0)->getRecordType();
511 }
512 //This is kind of a hack. Need an instance of the correct class of record in order to call it's printNull method.
513 Record *dummyRecord = NULL;
514 switch (recordType) {
515 case FileRecordTypeChecker::BED3_RECORD_TYPE:
516 dummyRecord = new Bed3Interval();
517 break;
518 case FileRecordTypeChecker::BED4_RECORD_TYPE:
519 dummyRecord = new Bed4Interval();
520 break;
521 case FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE:
522 dummyRecord = new BedGraphInterval();
523 break;
524 case FileRecordTypeChecker::BED5_RECORD_TYPE:
525 dummyRecord = new Bed5Interval();
526 break;
527 case FileRecordTypeChecker::BED6_RECORD_TYPE:
528 dummyRecord = new Bed6Interval();
529 break;
530 case FileRecordTypeChecker::BED12_RECORD_TYPE:
531 dummyRecord = new Bed12Interval();
532 break;
533 case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE:
534 dummyRecord = new BedPlusInterval();
535 (static_cast<BedPlusInterval *>(dummyRecord))->setNumPrintFields((static_cast<ContextIntersect *>(_context))->getMaxNumDatabaseFields());
536 break;
537 case FileRecordTypeChecker::BED6_PLUS_RECORD_TYPE:
538 dummyRecord = new BedPlusInterval();
539 (static_cast<BedPlusInterval *>(dummyRecord))->setNumPrintFields((static_cast<ContextIntersect *>(_context))->getMaxNumDatabaseFields());
540 break;
541 case FileRecordTypeChecker::VCF_RECORD_TYPE:
542 dummyRecord = new VcfRecord();
543 (static_cast<VcfRecord *>(dummyRecord))->setNumPrintFields((static_cast<ContextIntersect *>(_context))->getMaxNumDatabaseFields());
544 break;
545 case FileRecordTypeChecker::BAM_RECORD_TYPE:
546 dummyRecord = new BamRecord();
547 break;
548 case FileRecordTypeChecker::GFF_RECORD_TYPE:
549 dummyRecord = new GffRecord();
550 (static_cast<GffRecord *>(dummyRecord))->setNumFields((static_cast<ContextIntersect *>(_context))->getMaxNumDatabaseFields());
551 break;
552 case FileRecordTypeChecker::GFF_PLUS_RECORD_TYPE:
553 dummyRecord = new GffPlusRecord();
554 (static_cast<GffRecord *>(dummyRecord))->setNumFields((static_cast<ContextIntersect *>(_context))->getMaxNumDatabaseFields());
555 break;
556 default:
557 dummyRecord = new Bed3Interval();
558 break;
559 }
560 if (dummyRecord) {
561 dummyRecord->printNull(_outBuf);
562 delete dummyRecord;
563 }
564 }
565
printKey(const Record * key,const string & start,const string & end)566 void RecordOutputMgr::printKey(const Record *key, const string & start, const string & end)
567 {
568 if (key->getType() != FileRecordTypeChecker::BAM_RECORD_TYPE) {
569 key->print(_outBuf, start, end);
570 } else {
571 static_cast<const BamRecord *>(key)->print(_outBuf, start, end, _currBamBlockList);
572 }
573 }
574
printKey(const Record * key,CHRPOS start,CHRPOS end)575 void RecordOutputMgr::printKey(const Record *key, CHRPOS start, CHRPOS end)
576 {
577 if (key->getType() != FileRecordTypeChecker::BAM_RECORD_TYPE) {
578 key->print(_outBuf, start, end);
579 } else {
580 static_cast<const BamRecord *>(key)->print(_outBuf, start, end, _currBamBlockList);
581 }
582 }
583
printKey(const Record * key)584 void RecordOutputMgr::printKey(const Record *key)
585 {
586 if (key->getType() != FileRecordTypeChecker::BAM_RECORD_TYPE) {
587 key->print(_outBuf);
588 } else {
589 static_cast<const BamRecord *>(key)->print(_outBuf, _currBamBlockList);
590 }
591 }
592
flush()593 void RecordOutputMgr::flush() {
594 fwrite(_outBuf.c_str(), 1, _outBuf.size(), stdout);
595 _outBuf.clear();
596 }
597