1 /*****************************************************************************
2 bedFile.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licensed under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "bedFile.h"
13
14
15 /***********************************************
16 Sorting comparison functions
17 ************************************************/
sortByChrom(BED const & a,BED const & b)18 bool sortByChrom(BED const &a, BED const &b) {
19 if (a.chrom < b.chrom) return true;
20 else return false;
21 };
22
sortByStart(const BED & a,const BED & b)23 bool sortByStart(const BED &a, const BED &b) {
24 CHRPOS a_corrected = a.start;
25 if(a.zeroLength)
26 a_corrected++;
27 CHRPOS b_corrected = b.start;
28 if(b.zeroLength)
29 b_corrected++;
30
31 if (a_corrected < b_corrected) return true;
32 else return false;
33 };
34
sortBySizeAsc(const BED & a,const BED & b)35 bool sortBySizeAsc(const BED &a, const BED &b) {
36
37 CHRPOS aLen = a.end - a.start;
38 CHRPOS bLen = b.end - b.start;
39
40 if (aLen < bLen) return true;
41 else if (aLen > bLen) return false;
42 // If they're the same size, sort by position (as expected by tests)
43 else return byChromThenStart(a, b);
44 };
45
sortBySizeDesc(const BED & a,const BED & b)46 bool sortBySizeDesc(const BED &a, const BED &b) {
47
48 CHRPOS aLen = a.end - a.start;
49 CHRPOS bLen = b.end - b.start;
50
51 if (aLen > bLen) return true;
52 else return false;
53 };
54
sortByScoreAsc(const BED & a,const BED & b)55 bool sortByScoreAsc(const BED &a, const BED &b) {
56 if (a.score < b.score) return true;
57 else return false;
58 };
59
sortByScoreDesc(const BED & a,const BED & b)60 bool sortByScoreDesc(const BED &a, const BED &b) {
61 if (a.score > b.score) return true;
62 else return false;
63 };
64
byChromThenStart(BED const & a,BED const & b)65 bool byChromThenStart(BED const &a, BED const &b) {
66
67 if (a.chrom < b.chrom) return true;
68 else if (a.chrom > b.chrom) return false;
69
70 if (a.start < b.start) return true;
71 else if (a.start >= b.start) return false;
72
73 return false;
74 };
75
sortByWeight(const BED & a,const BED & b)76 bool sortByWeight(const BED &a, const BED &b) {
77 if (a.weight > b.weight) return true;
78 else return false;
79 };
80 /*******************************************
81 Class methods
82 *******************************************/
83
84 // Constructor
BedFile(string & bedFile)85 BedFile::BedFile(string &bedFile)
86 : bedFile(bedFile),
87 _isGff(false),
88 _isVcf(false),
89 _typeIsKnown(false),
90 _merged_start(-1),
91 _merged_end(-1),
92 _merged_chrom(""),
93 _prev_start(-1),
94 _prev_chrom(""),
95 _total_length(0)
96 {}
97
BedFile(void)98 BedFile::BedFile(void)
99 : _isGff(false),
100 _isVcf(false),
101 _typeIsKnown(false),
102 _merged_start(-1),
103 _merged_end(-1),
104 _merged_chrom(""),
105 _prev_start(-1),
106 _prev_chrom(""),
107 _total_length(0)
108 {}
109
110 // Destructor
~BedFile(void)111 BedFile::~BedFile(void) {
112 }
113
114
Open(void)115 void BedFile::Open(void) {
116
117 _bedFields.reserve(12);
118
119 if (bedFile == "stdin" || bedFile == "-") {
120 _bedStream = &cin;
121 }
122 else {
123 _bedStream = new ifstream(bedFile.c_str(), ios::in);
124
125 if( isGzipFile(_bedStream) ) {
126 delete _bedStream;
127 _bedStream = new igzstream(bedFile.c_str(), ios::in);
128 }
129 if ( _bedStream->fail() ) {
130 cerr << "Error: The requested file ("
131 << bedFile
132 << ") "
133 << "could not be opened. "
134 << "Error message: ("
135 << strerror(errno)
136 << "). Exiting!" << endl;
137 exit (1);
138 }
139 }
140 // save the file's header (if there is one)
141 GetHeader();
142 }
143
144 // Rewind the pointer back to the beginning of the file
Rewind(void)145 void BedFile::Rewind(void) {
146 _bedStream->seekg(0, ios::beg);
147
148 _prev_start = -1;
149 _prev_chrom = "";
150 }
151
152 // Jump to a specific byte in the file
Seek(unsigned long offset)153 void BedFile::Seek(unsigned long offset) {
154 _bedStream->seekg(offset);
155 }
156
157 // are the any intervals left in the file?
Empty(void)158 bool BedFile::Empty(void) {
159 return _status == BED_INVALID || _status == BED_BLANK;
160 }
161
162 // Close the BED file
Close(void)163 void BedFile::Close(void) {
164 if (bedFile != "stdin" && bedFile != "-")
165 delete _bedStream;
166 }
167
168 // Extract and store the header for the file.
GetHeader(void)169 void BedFile::GetHeader(void) {
170 while(getline(*_bedStream, _bedLine))
171 {
172 _lineNum++;
173 // look for header lines. ^# headers can span multiple lines,
174 // but ^[browser|track|chrom] headers must occur on the 1st line.
175 if ( (_bedLine.find("#") == 0) ||
176 (_bedLine.find("browser") == 0) ||
177 (_bedLine.find("track") == 0)
178 )
179 {
180 _header += _bedLine + '\n';
181
182 if (_bedLine.find("##fileformat=VCF") == 0) {
183 _typeIsKnown = true;
184 setFileType(VCF_FILETYPE);
185 setGff(false);
186 setVcf(true);
187 }
188 }
189 // we are done with the header. stop looking
190 // and indicate that the first data line has been read
191 // (i.e., _bedLine now houses the first data line)
192 else
193 {
194 _firstLine = true;
195 break;
196 }
197 }
198 }
199
200 // Dump the header
PrintHeader(void)201 void BedFile::PrintHeader(void) {
202 cout << _header;
203 }
204
205
GetNextBed(BED & bed,bool forceSorted)206 bool BedFile::GetNextBed(BED &bed, bool forceSorted) {
207
208 // make sure there are still lines to process.
209 // if so, tokenize, validate and return the BED entry.
210 _bedFields.clear();
211 // clear out the previous bed's data
212
213 // read the next line in the file (unless this is the first line,
214 // which has already been read by GetHeader()).
215 if (!_firstLine) {
216 if (!getline(*_bedStream, _bedLine)) {
217 _status = BED_INVALID;
218 return false;
219 }
220 _lineNum++;
221 }
222 // ditch \r for Windows if necessary.
223 if (_bedLine.size() && _bedLine[_bedLine.size()-1] == '\r') {
224 _bedLine.resize(_bedLine.size()-1);
225 }
226
227 // split into a string vector.
228 Tokenize(_bedLine, _bedFields);
229
230 if (_firstLine) {
231 _firstLine = false;
232 setBedType(_bedFields.size());
233 }
234
235 // load the BED struct as long as it's a valid BED entry.
236
237 _numFields = _bedFields.size();
238 _status = parseLine(bed, _bedFields);
239
240 if (_status == BED_VALID) {
241 if (bed.chrom == _prev_chrom) {
242 if (bed.start >= _prev_start) {
243 _prev_chrom = bed.chrom;
244 _prev_start = bed.start;
245 }
246 else if (forceSorted) {
247 cerr << "ERROR: input file: (" << bedFile
248 << ") is not sorted by chrom then start." << endl
249 << " The start coordinate at line " << _lineNum
250 << " is less than the start at line " << _lineNum-1
251 << endl;
252 exit(1);
253 }
254 }
255 else if (bed.chrom != _prev_chrom) {
256 _prev_chrom = bed.chrom;
257 _prev_start = bed.start;
258 }
259 _total_length += (bed.end - bed.start);
260 return true;
261 }
262 else if (_status == BED_HEADER || _status == BED_BLANK)
263 {
264 return true;
265 }
266 else
267 {
268 _status = BED_INVALID;
269 return false;
270 }
271 }
272
273
GetNextMergedBed(BED & merged_bed)274 bool BedFile::GetNextMergedBed(BED &merged_bed) {
275
276 if (_bedStream->good()) {
277 BED bed;
278 // force sorting; hence third param = true
279 while (GetNextBed(bed, true)) {
280 if (_status == BED_VALID) {
281 if ((bed.start - _merged_end > 0) ||
282 (_merged_end < 0) ||
283 (bed.chrom != _merged_chrom))
284 {
285 if (_merged_start >= 0) {
286 merged_bed.chrom = _merged_chrom;
287 merged_bed.start = _merged_start;
288 merged_bed.end = _merged_end;
289
290 _merged_chrom = bed.chrom;
291 _merged_start = bed.start;
292 _merged_end = bed.end;
293
294 _total_flattened_length += \
295 (merged_bed.end - merged_bed.start);
296 return true;
297 }
298 else {
299 _merged_start = bed.start;
300 _merged_chrom = bed.chrom;
301 _merged_end = bed.end;
302 }
303 }
304 else if (bed.end > _merged_end)
305 {
306 _merged_end = bed.end;
307 }
308 }
309 }
310
311 // handle the last merged block in the file.
312 if (_status == BED_INVALID)
313 {
314 _status = BED_VALID;
315 merged_bed.chrom = _merged_chrom;
316 merged_bed.start = _merged_start;
317 merged_bed.end = _merged_end;
318
319 _total_flattened_length += \
320 (merged_bed.end - merged_bed.start);
321 return true;
322 }
323 }
324 _status = BED_INVALID;
325 return false;
326 }
327
328
getTotalLength(void)329 unsigned long BedFile::getTotalLength(void) {
330 return _total_length;
331 }
332
getTotalFlattenedLength(void)333 unsigned long BedFile::getTotalFlattenedLength(void) {
334 return _total_flattened_length;
335 }
336
allHits(string chrom,CHRPOS start,CHRPOS end,string strand,vector<BED> & hits,bool sameStrand,bool diffStrand,float overlapFraction,bool reciprocal)337 void BedFile::allHits(string chrom, CHRPOS start,
338 CHRPOS end, string strand,
339 vector<BED> &hits, bool sameStrand,
340 bool diffStrand, float overlapFraction,
341 bool reciprocal)
342 {
343
344 BIN startBin, endBin;
345 startBin = (start >> _binFirstShift);
346 endBin = ((end-1) >> _binFirstShift);
347 CHRPOS aLength = (end - start);
348
349 /* SYNOPSIS:
350 1. We loop through each UCSC BIN level for feature A's chrom.
351 2. For each BIN, we loop through each B feature and add it to
352 hits if it meets all of the user's requests, which include:
353 (a) overlap fractio, (b) strandedness, (c) reciprocal overlap
354 */
355 for (BINLEVEL i = 0; i < _binLevels; ++i) {
356 BIN offset = _binOffsetsExtended[i];
357 for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
358 // move to the next bin if this one is empty
359 if (bedMap[chrom][j].empty()) continue;
360 vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin();
361 vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end();
362 for (; bedItr != bedEnd; ++bedItr) {
363 CHRPOS s = max(start, bedItr->start);
364 CHRPOS e = min(end, bedItr->end);
365 int overlapBases = (int)(e - s);
366 // 1. is there sufficient overlap w.r.t A?
367 if ( (float) overlapBases
368 /
369 (float) aLength >= overlapFraction)
370 {
371 CHRPOS bLength = (bedItr->end - bedItr->start);
372 float bOverlap = ( (float) overlapBases / (float) bLength );
373 bool strands_are_same = (strand == bedItr->strand);
374 // 2. does the overlap meet the user's strand repuirements?
375 if ( (sameStrand == false && diffStrand == false)
376 ||
377 (sameStrand == true && strands_are_same == true)
378 ||
379 (diffStrand == true && strands_are_same == false)
380 )
381 {
382 // 3. did the user request reciprocal overlap
383 // (i.e. sufficient overlap w.r.t. both A and B?)
384 if (!reciprocal)
385 hits.push_back(*bedItr);
386 else if (bOverlap >= overlapFraction)
387 hits.push_back(*bedItr);
388 }
389 }
390 }
391 }
392 startBin >>= _binNextShift;
393 endBin >>= _binNextShift;
394 }
395 }
396
397
anyHits(string chrom,CHRPOS start,CHRPOS end,string strand,bool sameStrand,bool diffStrand,float overlapFraction,bool reciprocal)398 bool BedFile::anyHits(string chrom, CHRPOS start, CHRPOS end, string strand,
399 bool sameStrand, bool diffStrand, float overlapFraction, bool reciprocal) {
400
401 BIN startBin, endBin;
402 startBin = (start >> _binFirstShift);
403 endBin = ((end-1) >> _binFirstShift);
404 CHRPOS aLength = (end - start);
405
406 /* SYNOPSIS:
407 1. We loop through each UCSC BIN level for feature A's chrom.
408 2. For each BIN, we loop through each B feature and return true
409 if it meets all of the user's requests, which include:
410 (a) overlap fractio, (b) strandedness, (c) reciprocal overlap.
411 Otherwise, return false.
412 */
413 for (BINLEVEL i = 0; i < _binLevels; ++i) {
414 BIN offset = _binOffsetsExtended[i];
415 for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
416 // move to the next bin if this one is empty
417 if (bedMap[chrom][j].empty()) continue;
418 vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin();
419 vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end();
420 for (; bedItr != bedEnd; ++bedItr) {
421 CHRPOS s = max(start, bedItr->start);
422 CHRPOS e = min(end, bedItr->end);
423 int overlapBases = (int)(e - s);
424 // 1. is there sufficient overlap w.r.t A?
425 if ( (float) overlapBases
426 /
427 (float) aLength >= overlapFraction)
428 {
429 CHRPOS bLength = (bedItr->end - bedItr->start);
430 float bOverlap = ( (float) overlapBases / (float) bLength );
431 bool strands_are_same = (strand == bedItr->strand);
432 // 2. does the overlap meet the user's strand repuirements?
433 if ( (sameStrand == false && diffStrand == false)
434 ||
435 (sameStrand == true && strands_are_same == true)
436 ||
437 (diffStrand == true && strands_are_same == false)
438 )
439 {
440 // 3. did the user request reciprocal overlap
441 // (i.e. sufficient overlap w.r.t. both A and B?)
442 if (!reciprocal)
443 return true;
444 else if (bOverlap >= overlapFraction)
445 return true;
446 }
447 }
448 }
449 }
450 startBin >>= _binNextShift;
451 endBin >>= _binNextShift;
452 }
453 return false;
454 }
455
456
countHits(const BED & a,bool sameStrand,bool diffStrand,bool countsOnly)457 void BedFile::countHits(const BED &a, bool sameStrand, bool diffStrand, bool countsOnly) {
458
459 BIN startBin, endBin;
460 startBin = (a.start >> _binFirstShift);
461 endBin = ((a.end-1) >> _binFirstShift);
462
463 // loop through each bin "level" in the binning hierarchy
464 for (BINLEVEL i = 0; i < _binLevels; ++i) {
465
466 // loop through each bin at this level of the hierarchy
467 BIN offset = _binOffsetsExtended[i];
468 for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
469 // loop through each feature in this chrom/bin and
470 // see if it overlaps with the feature that was passed in.
471 // if so, add the feature to the list of hits.
472 vector<BEDCOV>::iterator bedItr = bedCovMap[a.chrom][j].begin();
473 vector<BEDCOV>::iterator bedEnd = bedCovMap[a.chrom][j].end();
474 for (; bedItr != bedEnd; ++bedItr) {
475
476 bool strands_are_same = (a.strand == bedItr->strand);
477 // skip the hit if not on the same strand (and we care)
478 if ((sameStrand == true && strands_are_same == false) ||
479 (diffStrand == true && strands_are_same == true)
480 )
481 {
482 continue;
483 }
484 else if (overlaps(bedItr->start, bedItr->end, a.start, a.end)
485 > 0)
486 {
487 bedItr->count++;
488 if (countsOnly == false) {
489 if (a.zeroLength == false) {
490 bedItr->depthMap[a.start+1].starts++;
491 bedItr->depthMap[a.end].ends++;
492 }
493 else {
494 // correct for the fact that we artificially
495 // expanded the zeroLength feature
496 bedItr->depthMap[a.start+2].starts++;
497 bedItr->depthMap[a.end-1].ends++;
498 }
499
500 if (a.start < bedItr->minOverlapStart) {
501 bedItr->minOverlapStart = a.start;
502 }
503 }
504 }
505 }
506 }
507 startBin >>= _binNextShift;
508 endBin >>= _binNextShift;
509 }
510 }
511
512
countSplitHits(const vector<BED> & bedBlocks,bool sameStrand,bool diffStrand,bool countsOnly)513 void BedFile::countSplitHits(const vector<BED> &bedBlocks, bool sameStrand, bool diffStrand, bool countsOnly) {
514
515 // set to track the distinct B features that had coverage.
516 // we'll update the counts of coverage for these features by one
517 // at the end of this function to avoid over-counting.
518 set< vector<BEDCOV>::iterator > validHits;
519
520 vector<BED>::const_iterator blockItr = bedBlocks.begin();
521 vector<BED>::const_iterator blockEnd = bedBlocks.end();
522 for (; blockItr != blockEnd; ++blockItr) {
523
524 BIN startBin, endBin;
525 startBin = (blockItr->start >> _binFirstShift);
526 endBin = ((blockItr->end-1) >> _binFirstShift);
527
528 // loop through each bin "level" in the binning hierarchy
529 for (BINLEVEL i = 0; i < _binLevels; ++i) {
530
531 // loop through each bin at this level of the hierarchy
532 BIN offset = _binOffsetsExtended[i];
533 for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
534 // loop through each feature in this chrom/bin and see if it
535 // overlaps with the feature that was passed in.
536 // if so, add the feature to the list of hits.
537 vector<BEDCOV>::iterator
538 bedItr = bedCovMap[blockItr->chrom][j].begin();
539 vector<BEDCOV>::iterator
540 bedEnd = bedCovMap[blockItr->chrom][j].end();
541 for (; bedItr != bedEnd; ++bedItr) {
542 bool strands_are_same =
543 (blockItr->strand == bedItr->strand);
544 // skip the hit if not on the same strand (and we care)
545 if ((sameStrand == true && strands_are_same == false) ||
546 (diffStrand == true && strands_are_same == true)
547 )
548 {
549 continue;
550 }
551 else if (overlaps(bedItr->start, bedItr->end,
552 blockItr->start, blockItr->end) > 0)
553 {
554 if (countsOnly == false) {
555 if (blockItr->zeroLength == false) {
556 bedItr->depthMap[blockItr->start+1].starts++;
557 bedItr->depthMap[blockItr->end].ends++;
558 }
559 else {
560 // correct for the fact that we artificially
561 // expanded the zeroLength feature
562 bedItr->depthMap[blockItr->start+2].starts++;
563 bedItr->depthMap[blockItr->end-1].ends++;
564 }
565 }
566
567 validHits.insert(bedItr);
568 if (blockItr->start < bedItr->minOverlapStart)
569 bedItr->minOverlapStart = blockItr->start;
570 }
571 }
572 }
573 startBin >>= _binNextShift;
574 endBin >>= _binNextShift;
575 }
576 }
577 // incrment the count of overlap by one for each B feature that overlapped
578 // the current passed hit. This is necessary to prevent over-counting for
579 // each "split"" of a single read.
580 set< vector<BEDCOV>::iterator >::iterator validHitsItr = validHits.begin();
581 set< vector<BEDCOV>::iterator >::iterator validHitsEnd = validHits.end();
582 for (; validHitsItr != validHitsEnd; ++validHitsItr)
583 // the validHitsItr points to another itr, hence
584 // the (*itr)-> dereferencing. ugly, but that's C++.
585 (*validHitsItr)->count++;
586 }
587
588
countListHits(const BED & a,int index,bool sameStrand,bool diffStrand)589 void BedFile::countListHits(const BED &a, int index, bool sameStrand, bool diffStrand) {
590
591 BIN startBin, endBin;
592 startBin = (a.start >> _binFirstShift);
593 endBin = ((a.end-1) >> _binFirstShift);
594
595 // loop through each bin "level" in the binning hierarchy
596 for (BINLEVEL i = 0; i < _binLevels; ++i) {
597
598 // loop through each bin at this level of the hierarchy
599 BIN offset = _binOffsetsExtended[i];
600 for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
601
602 // loop through each feature in this chrom/bin and see if it
603 // overlaps with the feature that was passed in. if so,
604 // add the feature tothe list of hits.
605 vector<BEDCOVLIST>::iterator
606 bedItr = bedCovListMap[a.chrom][j].begin();
607 vector<BEDCOVLIST>::iterator
608 bedEnd = bedCovListMap[a.chrom][j].end();
609 for (; bedItr != bedEnd; ++bedItr) {
610
611 bool strands_are_same = (a.strand == bedItr->strand);
612 // skip the hit if not on the same strand (and we care)
613 if ((sameStrand == true && strands_are_same == false) ||
614 (diffStrand == true && strands_are_same == true)
615 )
616 {
617 continue;
618 }
619 else if (overlaps(bedItr->start, bedItr->end,
620 a.start, a.end) > 0)
621 {
622 bedItr->counts[index]++;
623 if (a.zeroLength == false) {
624 bedItr->depthMapList[index][a.start+1].starts++;
625 bedItr->depthMapList[index][a.end].ends++;
626 }
627 else {
628 // correct for the fact that we artificially expanded
629 // the zeroLength feature
630 bedItr->depthMapList[index][a.start+2].starts++;
631 bedItr->depthMapList[index][a.end-1].ends++;
632 }
633
634 if (a.start < bedItr->minOverlapStarts[index]) {
635 bedItr->minOverlapStarts[index] = a.start;
636 }
637 }
638 }
639 }
640 startBin >>= _binNextShift;
641 endBin >>= _binNextShift;
642 }
643 }
644
setZeroBased(bool zeroBased)645 void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; }
646
setGff(bool gff)647 void BedFile::setGff (bool gff) { this->_isGff = gff; }
648
setVcf(bool vcf)649 void BedFile::setVcf (bool vcf) { this->_isVcf = vcf; }
650
651
setFileType(FileType type)652 void BedFile::setFileType (FileType type) {
653 _fileType = type;
654 _typeIsKnown = true;
655 }
656
657
setBedType(int colNums)658 void BedFile::setBedType (int colNums) { bedType = colNums; }
setBed12(bool isBed12)659 void BedFile::setBed12 (bool isBed12) { this->isBed12 = isBed12; }
660
loadBedFileIntoMap()661 void BedFile::loadBedFileIntoMap() {
662
663 BED bedEntry;
664
665 Open();
666 while (GetNextBed(bedEntry)) {
667 if (_status == BED_VALID) {
668 addBEDIntoMap(bedEntry);
669 }
670 }
671 Close();
672 }
673
loadBedFileIntoMergedMap()674 void BedFile::loadBedFileIntoMergedMap() {
675
676 BED bedEntry;
677
678 Open();
679 while (GetNextMergedBed(bedEntry)) {
680 if (_status == BED_VALID) {
681 addBEDIntoMap(bedEntry);
682 }
683 }
684 Close();
685 }
686
addBEDIntoMap(BED bedEntry)687 void BedFile::addBEDIntoMap(BED bedEntry) {
688 BIN bin = getBin(bedEntry.start, bedEntry.end);
689 bedMap[bedEntry.chrom][bin].push_back(bedEntry);
690 }
691
692
loadBedCovFileIntoMap()693 void BedFile::loadBedCovFileIntoMap() {
694
695 BED bedEntry;
696 Open();
697 while (GetNextBed(bedEntry)) {
698 if (_status == BED_VALID) {
699 BIN bin = getBin(bedEntry.start, bedEntry.end);
700
701 BEDCOV bedCov;
702 bedCov.chrom = bedEntry.chrom;
703 bedCov.start = bedEntry.start;
704 bedCov.end = bedEntry.end;
705 bedCov.name = bedEntry.name;
706 bedCov.score = bedEntry.score;
707 bedCov.strand = bedEntry.strand;
708 bedCov.fields = bedEntry.fields;
709 bedCov.other_idxs = bedEntry.other_idxs;
710 bedCov.zeroLength = bedEntry.zeroLength;
711 bedCov.count = 0;
712 bedCov.minOverlapStart = INT_MAX;
713
714 bedCovMap[bedEntry.chrom][bin].push_back(bedCov);
715 }
716 }
717 Close();
718 }
719
loadBedCovListFileIntoMap()720 void BedFile::loadBedCovListFileIntoMap() {
721
722 BED bedEntry;
723 Open();
724 while (GetNextBed(bedEntry)) {
725 if (_status == BED_VALID) {
726 BIN bin = getBin(bedEntry.start, bedEntry.end);
727
728 BEDCOVLIST bedCovList;
729 bedCovList.chrom = bedEntry.chrom;
730 bedCovList.start = bedEntry.start;
731 bedCovList.end = bedEntry.end;
732 bedCovList.name = bedEntry.name;
733 bedCovList.score = bedEntry.score;
734 bedCovList.strand = bedEntry.strand;
735 bedCovList.fields = bedEntry.fields;
736 bedCovList.other_idxs = bedEntry.other_idxs;
737 bedCovList.zeroLength = bedEntry.zeroLength;
738
739 bedCovListMap[bedEntry.chrom][bin].push_back(bedCovList);
740 }
741 }
742 Close();
743 }
744
745
loadBedFileIntoMapNoBin()746 void BedFile::loadBedFileIntoMapNoBin() {
747
748 BED bedEntry;
749
750 Open();
751 while (GetNextBed(bedEntry)) {
752 if (_status == BED_VALID) {
753 bedMapNoBin[bedEntry.chrom].push_back(bedEntry);
754 }
755 }
756 Close();
757
758 // sort the BED entries for each chromosome
759 // in ascending order of start position
760 for (masterBedMapNoBin::iterator m = this->bedMapNoBin.begin();
761 m != this->bedMapNoBin.end();
762 ++m)
763 {
764 sort(m->second.begin(), m->second.end(), sortByStart);
765 }
766 }
767
loadBedFileIntoVector()768 void BedFile::loadBedFileIntoVector() {
769
770 BED bedEntry;
771
772 Open();
773 while (GetNextBed(bedEntry)) {
774 if (_status == BED_VALID) {
775 bedList.push_back(bedEntry);
776 }
777 }
778 Close();
779 }
780
assignWeightsBasedOnSize()781 void BedFile::assignWeightsBasedOnSize() {
782 // sort by size
783 sort(bedList.begin(), bedList.end(), sortBySizeAsc);
784 // then assign a weight to each interval based on the
785 // proportion of the total interval length of the file
786 size_t totalSize = 0;
787 for (unsigned int i = 0; i < bedList.size(); ++i)
788 {
789 totalSize += bedList[i].size();
790 }
791 double totalWeight = 0.0;
792 for (unsigned int i = 0; i < bedList.size(); ++i)
793 {
794 totalWeight += (double) bedList[i].size() / (double) totalSize;
795 bedList[i].weight = totalWeight;
796 }
797 }
798
799 struct CompareByWeight {
operator ()CompareByWeight800 bool operator()(double const val, BED const& bed) const
801 {
802 return bed.weight > val;
803 }
804 };
805
sizeWeightedSearch(double val)806 BED * BedFile::sizeWeightedSearch(double val) {
807 // binary search for first interval with weight greater than search val
808 vector<BED>::iterator up = upper_bound(bedList.begin(), bedList.end(), val, CompareByWeight());
809 return &(*up);
810 }
811
812
813