1 /*****************************************************************************
2   bedFile.cpp
3 
4   (c) 2009 - Aaron Quinlan
5   Hall Laboratory
6   Department of Biochemistry and Molecular Genetics
7   University of Virginia
8   aaronquinlan@gmail.com
9 
10   Licensed under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "bedFile.h"
13 
14 
15 /***********************************************
16 Sorting comparison functions
17 ************************************************/
sortByChrom(BED const & a,BED const & b)18 bool sortByChrom(BED const &a, BED const &b) {
19     if (a.chrom < b.chrom) return true;
20     else return false;
21 };
22 
sortByStart(const BED & a,const BED & b)23 bool sortByStart(const BED &a, const BED &b) {
24     CHRPOS a_corrected = a.start;
25     if(a.zeroLength)
26         a_corrected++;
27     CHRPOS b_corrected = b.start;
28     if(b.zeroLength)
29         b_corrected++;
30 
31     if (a_corrected < b_corrected) return true;
32     else return false;
33 };
34 
sortBySizeAsc(const BED & a,const BED & b)35 bool sortBySizeAsc(const BED &a, const BED &b) {
36 
37     CHRPOS aLen = a.end - a.start;
38     CHRPOS bLen = b.end - b.start;
39 
40     if (aLen < bLen) return true;
41     else if (aLen > bLen) return false;
42     // If they're the same size, sort by position (as expected by tests)
43     else return byChromThenStart(a, b);
44 };
45 
sortBySizeDesc(const BED & a,const BED & b)46 bool sortBySizeDesc(const BED &a, const BED &b) {
47 
48     CHRPOS aLen = a.end - a.start;
49     CHRPOS bLen = b.end - b.start;
50 
51     if (aLen > bLen) return true;
52     else return false;
53 };
54 
sortByScoreAsc(const BED & a,const BED & b)55 bool sortByScoreAsc(const BED &a, const BED &b) {
56     if (a.score < b.score) return true;
57     else return false;
58 };
59 
sortByScoreDesc(const BED & a,const BED & b)60 bool sortByScoreDesc(const BED &a, const BED &b) {
61     if (a.score > b.score) return true;
62     else return false;
63 };
64 
byChromThenStart(BED const & a,BED const & b)65 bool byChromThenStart(BED const &a, BED const &b) {
66 
67     if (a.chrom < b.chrom) return true;
68     else if (a.chrom > b.chrom) return false;
69 
70     if (a.start < b.start) return true;
71     else if (a.start >= b.start) return false;
72 
73     return false;
74 };
75 
sortByWeight(const BED & a,const BED & b)76 bool sortByWeight(const BED &a, const BED &b) {
77     if (a.weight > b.weight) return true;
78     else return false;
79 };
80 /*******************************************
81 Class methods
82 *******************************************/
83 
84 // Constructor
BedFile(string & bedFile)85 BedFile::BedFile(string &bedFile)
86 : bedFile(bedFile),
87   _isGff(false),
88   _isVcf(false),
89   _typeIsKnown(false),
90   _merged_start(-1),
91   _merged_end(-1),
92   _merged_chrom(""),
93   _prev_start(-1),
94   _prev_chrom(""),
95   _total_length(0)
96 {}
97 
BedFile(void)98 BedFile::BedFile(void)
99 : _isGff(false),
100   _isVcf(false),
101   _typeIsKnown(false),
102   _merged_start(-1),
103   _merged_end(-1),
104   _merged_chrom(""),
105   _prev_start(-1),
106   _prev_chrom(""),
107   _total_length(0)
108 {}
109 
110 // Destructor
~BedFile(void)111 BedFile::~BedFile(void) {
112 }
113 
114 
Open(void)115 void BedFile::Open(void) {
116 
117     _bedFields.reserve(12);
118 
119     if (bedFile == "stdin" || bedFile == "-") {
120         _bedStream = &cin;
121     }
122     else {
123         _bedStream = new ifstream(bedFile.c_str(), ios::in);
124 
125         if( isGzipFile(_bedStream) ) {
126             delete _bedStream;
127             _bedStream = new igzstream(bedFile.c_str(), ios::in);
128         }
129         if ( _bedStream->fail() ) {
130             cerr << "Error: The requested file ("
131                  << bedFile
132                  << ") "
133                  << "could not be opened. "
134                  << "Error message: ("
135                  << strerror(errno)
136                  << "). Exiting!" << endl;
137             exit (1);
138         }
139     }
140     // save the file's header (if there is one)
141     GetHeader();
142 }
143 
144 // Rewind the pointer back to the beginning of the file
Rewind(void)145 void BedFile::Rewind(void) {
146     _bedStream->seekg(0, ios::beg);
147 
148     _prev_start = -1;
149     _prev_chrom = "";
150 }
151 
152 // Jump to a specific byte in the file
Seek(unsigned long offset)153 void BedFile::Seek(unsigned long offset) {
154     _bedStream->seekg(offset);
155 }
156 
157 // are the any intervals left in the file?
Empty(void)158 bool BedFile::Empty(void) {
159     return _status == BED_INVALID || _status == BED_BLANK;
160 }
161 
162 // Close the BED file
Close(void)163 void BedFile::Close(void) {
164     if (bedFile != "stdin" && bedFile != "-")
165         delete _bedStream;
166 }
167 
168 // Extract and store the header for the file.
GetHeader(void)169 void BedFile::GetHeader(void) {
170     while(getline(*_bedStream, _bedLine))
171     {
172         _lineNum++;
173         // look for header lines.  ^# headers can span multiple lines,
174         // but ^[browser|track|chrom] headers must occur on the 1st line.
175         if ( (_bedLine.find("#")       == 0) ||
176              (_bedLine.find("browser") == 0) ||
177              (_bedLine.find("track")   == 0)
178            )
179         {
180             _header += _bedLine + '\n';
181 
182             if (_bedLine.find("##fileformat=VCF") == 0) {
183                 _typeIsKnown = true;
184                 setFileType(VCF_FILETYPE);
185                 setGff(false);
186                 setVcf(true);
187             }
188         }
189         // we are done with the header. stop looking
190         // and indicate that the first data line has been read
191         // (i.e., _bedLine now houses the first data line)
192         else
193         {
194             _firstLine = true;
195             break;
196         }
197     }
198 }
199 
200 // Dump the header
PrintHeader(void)201 void BedFile::PrintHeader(void) {
202     cout << _header;
203 }
204 
205 
GetNextBed(BED & bed,bool forceSorted)206 bool BedFile::GetNextBed(BED &bed, bool forceSorted) {
207 
208     // make sure there are still lines to process.
209     // if so, tokenize, validate and return the BED entry.
210     _bedFields.clear();
211     // clear out the previous bed's data
212 
213     // read the next line in the file (unless this is the first line,
214     // which has already been read by GetHeader()).
215     if (!_firstLine) {
216 	if (!getline(*_bedStream, _bedLine)) {
217 	    _status = BED_INVALID;
218 	    return false;
219 	}
220 	_lineNum++;
221     }
222     // ditch \r for Windows if necessary.
223     if (_bedLine.size() && _bedLine[_bedLine.size()-1] == '\r') {
224 	_bedLine.resize(_bedLine.size()-1);
225     }
226 
227     // split into a string vector.
228     Tokenize(_bedLine, _bedFields);
229 
230     if (_firstLine) {
231 	_firstLine = false;
232 	setBedType(_bedFields.size());
233     }
234 
235     // load the BED struct as long as it's a valid BED entry.
236 
237     _numFields = _bedFields.size();
238     _status = parseLine(bed, _bedFields);
239 
240     if (_status == BED_VALID) {
241 	if (bed.chrom == _prev_chrom) {
242 	    if (bed.start >= _prev_start) {
243 		_prev_chrom = bed.chrom;
244 		_prev_start = bed.start;
245 	    }
246 	    else if (forceSorted) {
247 		cerr << "ERROR: input file: (" << bedFile
248 		     << ") is not sorted by chrom then start." << endl
249 		     << "       The start coordinate at line " << _lineNum
250 		     << " is less than the start at line " << _lineNum-1
251 		     << endl;
252 		exit(1);
253 	    }
254 	}
255 	else if (bed.chrom != _prev_chrom) {
256 	    _prev_chrom = bed.chrom;
257 	    _prev_start = bed.start;
258 	}
259 	_total_length += (bed.end - bed.start);
260 	return true;
261     }
262     else if (_status == BED_HEADER || _status == BED_BLANK)
263     {
264 	return true;
265     }
266     else
267     {
268 	_status = BED_INVALID;
269 	return false;
270     }
271 }
272 
273 
GetNextMergedBed(BED & merged_bed)274 bool BedFile::GetNextMergedBed(BED &merged_bed) {
275 
276     if (_bedStream->good()) {
277         BED bed;
278         // force sorting; hence third param = true
279         while (GetNextBed(bed, true)) {
280             if (_status == BED_VALID) {
281                 if ((bed.start - _merged_end > 0) ||
282                    (_merged_end < 0) ||
283                    (bed.chrom != _merged_chrom))
284                 {
285                     if (_merged_start >= 0) {
286                         merged_bed.chrom = _merged_chrom;
287                         merged_bed.start = _merged_start;
288                         merged_bed.end   = _merged_end;
289 
290                         _merged_chrom = bed.chrom;
291                         _merged_start = bed.start;
292                         _merged_end   = bed.end;
293 
294                         _total_flattened_length += \
295                             (merged_bed.end - merged_bed.start);
296                         return true;
297                     }
298                     else {
299                         _merged_start = bed.start;
300                         _merged_chrom = bed.chrom;
301                         _merged_end = bed.end;
302                     }
303                 }
304                 else if (bed.end > _merged_end)
305                 {
306                     _merged_end = bed.end;
307                 }
308             }
309         }
310 
311         // handle the last merged block in the file.
312         if (_status == BED_INVALID)
313         {
314             _status = BED_VALID;
315             merged_bed.chrom = _merged_chrom;
316             merged_bed.start = _merged_start;
317             merged_bed.end   = _merged_end;
318 
319             _total_flattened_length += \
320                 (merged_bed.end - merged_bed.start);
321             return true;
322         }
323     }
324     _status = BED_INVALID;
325     return false;
326 }
327 
328 
getTotalLength(void)329 unsigned long BedFile::getTotalLength(void) {
330     return _total_length;
331 }
332 
getTotalFlattenedLength(void)333 unsigned long BedFile::getTotalFlattenedLength(void) {
334     return _total_flattened_length;
335 }
336 
allHits(string chrom,CHRPOS start,CHRPOS end,string strand,vector<BED> & hits,bool sameStrand,bool diffStrand,float overlapFraction,bool reciprocal)337 void BedFile::allHits(string chrom, CHRPOS start,
338                       CHRPOS end, string strand,
339                       vector<BED> &hits, bool sameStrand,
340                       bool diffStrand, float overlapFraction,
341                       bool reciprocal)
342 {
343 
344     BIN startBin, endBin;
345     startBin = (start >> _binFirstShift);
346     endBin = ((end-1) >> _binFirstShift);
347     CHRPOS aLength = (end - start);
348 
349     /* SYNOPSIS:
350          1. We loop through each UCSC BIN level for feature A's chrom.
351          2. For each BIN, we loop through each B feature and add it to
352             hits if it meets all of the user's requests, which include:
353                (a) overlap fractio, (b) strandedness, (c) reciprocal overlap
354     */
355     for (BINLEVEL i = 0; i < _binLevels; ++i) {
356         BIN offset = _binOffsetsExtended[i];
357         for (BIN j = (startBin+offset); j <= (endBin+offset); ++j)  {
358             // move to the next bin if this one is empty
359             if (bedMap[chrom][j].empty()) continue;
360             vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin();
361             vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end();
362             for (; bedItr != bedEnd; ++bedItr) {
363                 CHRPOS s = max(start, bedItr->start);
364                 CHRPOS e = min(end, bedItr->end);
365                 int overlapBases = (int)(e - s);
366                 // 1. is there sufficient overlap w.r.t A?
367                 if ( (float) overlapBases
368                       /
369                      (float) aLength  >= overlapFraction)
370                 {
371                     CHRPOS bLength = (bedItr->end - bedItr->start);
372                     float bOverlap = ( (float) overlapBases / (float) bLength );
373                     bool strands_are_same = (strand == bedItr->strand);
374                     // 2. does the overlap meet the user's strand repuirements?
375                     if ( (sameStrand == false && diffStrand == false)
376                          ||
377                          (sameStrand == true && strands_are_same == true)
378                          ||
379                          (diffStrand == true && strands_are_same == false)
380                        )
381                     {
382                         // 3. did the user request reciprocal overlap
383                         // (i.e. sufficient overlap w.r.t. both A and B?)
384                         if (!reciprocal)
385                             hits.push_back(*bedItr);
386                         else if (bOverlap >= overlapFraction)
387                             hits.push_back(*bedItr);
388                     }
389                 }
390             }
391         }
392         startBin >>= _binNextShift;
393         endBin >>= _binNextShift;
394     }
395 }
396 
397 
anyHits(string chrom,CHRPOS start,CHRPOS end,string strand,bool sameStrand,bool diffStrand,float overlapFraction,bool reciprocal)398 bool BedFile::anyHits(string chrom, CHRPOS start, CHRPOS end, string strand,
399                      bool sameStrand, bool diffStrand, float overlapFraction, bool reciprocal) {
400 
401     BIN startBin, endBin;
402     startBin = (start >> _binFirstShift);
403     endBin = ((end-1) >> _binFirstShift);
404     CHRPOS aLength = (end - start);
405 
406     /* SYNOPSIS:
407     1. We loop through each UCSC BIN level for feature A's chrom.
408     2. For each BIN, we loop through each B feature and return true
409        if it meets all of the user's requests, which include:
410        (a) overlap fractio, (b) strandedness, (c) reciprocal overlap.
411        Otherwise, return false.
412     */
413     for (BINLEVEL i = 0; i < _binLevels; ++i) {
414         BIN offset = _binOffsetsExtended[i];
415         for (BIN j = (startBin+offset); j <= (endBin+offset); ++j)  {
416             // move to the next bin if this one is empty
417             if (bedMap[chrom][j].empty()) continue;
418             vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin();
419             vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end();
420             for (; bedItr != bedEnd; ++bedItr) {
421                 CHRPOS s = max(start, bedItr->start);
422                 CHRPOS e = min(end, bedItr->end);
423                 int overlapBases = (int)(e - s);
424                 // 1. is there sufficient overlap w.r.t A?
425                 if ( (float) overlapBases
426                       /
427                      (float) aLength  >= overlapFraction)
428                 {
429                     CHRPOS bLength = (bedItr->end - bedItr->start);
430                     float bOverlap = ( (float) overlapBases / (float) bLength );
431                     bool strands_are_same = (strand == bedItr->strand);
432                     // 2. does the overlap meet the user's strand repuirements?
433                     if ( (sameStrand == false && diffStrand == false)
434                         ||
435                         (sameStrand == true && strands_are_same == true)
436                         ||
437                         (diffStrand == true && strands_are_same == false)
438                         )
439                     {
440                         // 3. did the user request reciprocal overlap
441                         // (i.e. sufficient overlap w.r.t. both A and B?)
442                         if (!reciprocal)
443                             return true;
444                         else if (bOverlap >= overlapFraction)
445                             return true;
446                     }
447                 }
448             }
449         }
450         startBin >>= _binNextShift;
451         endBin >>= _binNextShift;
452     }
453     return false;
454 }
455 
456 
countHits(const BED & a,bool sameStrand,bool diffStrand,bool countsOnly)457 void BedFile::countHits(const BED &a, bool sameStrand, bool diffStrand, bool countsOnly) {
458 
459     BIN startBin, endBin;
460     startBin = (a.start >> _binFirstShift);
461     endBin = ((a.end-1) >> _binFirstShift);
462 
463     // loop through each bin "level" in the binning hierarchy
464     for (BINLEVEL i = 0; i < _binLevels; ++i) {
465 
466         // loop through each bin at this level of the hierarchy
467         BIN offset = _binOffsetsExtended[i];
468         for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
469             // loop through each feature in this chrom/bin and
470             // see if it overlaps with the feature that was passed in.
471             // if so, add the feature to the list of hits.
472             vector<BEDCOV>::iterator bedItr = bedCovMap[a.chrom][j].begin();
473             vector<BEDCOV>::iterator bedEnd = bedCovMap[a.chrom][j].end();
474             for (; bedItr != bedEnd; ++bedItr) {
475 
476                 bool strands_are_same = (a.strand == bedItr->strand);
477                 // skip the hit if not on the same strand (and we care)
478                 if ((sameStrand == true && strands_are_same == false) ||
479                     (diffStrand == true && strands_are_same == true)
480                    )
481                 {
482                     continue;
483                 }
484                 else if (overlaps(bedItr->start, bedItr->end, a.start, a.end)
485                          > 0)
486                 {
487                     bedItr->count++;
488                     if (countsOnly == false) {
489                         if (a.zeroLength == false) {
490                             bedItr->depthMap[a.start+1].starts++;
491                             bedItr->depthMap[a.end].ends++;
492                         }
493                         else {
494                             // correct for the fact that we artificially
495                             // expanded the zeroLength feature
496                             bedItr->depthMap[a.start+2].starts++;
497                             bedItr->depthMap[a.end-1].ends++;
498                         }
499 
500                         if (a.start < bedItr->minOverlapStart) {
501                             bedItr->minOverlapStart = a.start;
502                         }
503                     }
504                 }
505             }
506         }
507         startBin >>= _binNextShift;
508         endBin >>= _binNextShift;
509     }
510 }
511 
512 
countSplitHits(const vector<BED> & bedBlocks,bool sameStrand,bool diffStrand,bool countsOnly)513 void BedFile::countSplitHits(const vector<BED> &bedBlocks, bool sameStrand, bool diffStrand, bool countsOnly) {
514 
515     // set to track the distinct B features that had coverage.
516     // we'll update the counts of coverage for these features by one
517     // at the end of this function to avoid over-counting.
518     set< vector<BEDCOV>::iterator > validHits;
519 
520     vector<BED>::const_iterator blockItr  = bedBlocks.begin();
521     vector<BED>::const_iterator blockEnd  = bedBlocks.end();
522     for (; blockItr != blockEnd; ++blockItr) {
523 
524         BIN startBin, endBin;
525         startBin = (blockItr->start >> _binFirstShift);
526         endBin = ((blockItr->end-1) >> _binFirstShift);
527 
528         // loop through each bin "level" in the binning hierarchy
529         for (BINLEVEL i = 0; i < _binLevels; ++i) {
530 
531             // loop through each bin at this level of the hierarchy
532             BIN offset = _binOffsetsExtended[i];
533             for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
534                 // loop through each feature in this chrom/bin and see if it
535                 // overlaps with the feature that was passed in.
536                 // if so, add the feature to the list of hits.
537                 vector<BEDCOV>::iterator
538                     bedItr = bedCovMap[blockItr->chrom][j].begin();
539                 vector<BEDCOV>::iterator
540                     bedEnd = bedCovMap[blockItr->chrom][j].end();
541                 for (; bedItr != bedEnd; ++bedItr) {
542                     bool strands_are_same =
543                         (blockItr->strand == bedItr->strand);
544                     // skip the hit if not on the same strand (and we care)
545                     if ((sameStrand == true && strands_are_same == false) ||
546                         (diffStrand == true && strands_are_same == true)
547                        )
548                     {
549                         continue;
550                     }
551                     else if (overlaps(bedItr->start, bedItr->end,
552                                       blockItr->start, blockItr->end) > 0)
553                     {
554                         if (countsOnly == false) {
555                             if (blockItr->zeroLength == false) {
556                                 bedItr->depthMap[blockItr->start+1].starts++;
557                                 bedItr->depthMap[blockItr->end].ends++;
558                             }
559                             else {
560                                 // correct for the fact that we artificially
561                                 // expanded the zeroLength feature
562                                 bedItr->depthMap[blockItr->start+2].starts++;
563                                 bedItr->depthMap[blockItr->end-1].ends++;
564                             }
565                         }
566 
567                         validHits.insert(bedItr);
568                         if (blockItr->start < bedItr->minOverlapStart)
569                             bedItr->minOverlapStart = blockItr->start;
570                     }
571                 }
572             }
573             startBin >>= _binNextShift;
574             endBin >>= _binNextShift;
575         }
576     }
577     // incrment the count of overlap by one for each B feature that overlapped
578     // the current passed hit.  This is necessary to prevent over-counting for
579     // each "split"" of a single read.
580     set< vector<BEDCOV>::iterator >::iterator validHitsItr = validHits.begin();
581     set< vector<BEDCOV>::iterator >::iterator validHitsEnd = validHits.end();
582     for (; validHitsItr != validHitsEnd; ++validHitsItr)
583         // the validHitsItr points to another itr, hence
584         // the (*itr)-> dereferencing. ugly, but that's C++.
585         (*validHitsItr)->count++;
586 }
587 
588 
countListHits(const BED & a,int index,bool sameStrand,bool diffStrand)589 void BedFile::countListHits(const BED &a, int index, bool sameStrand, bool diffStrand) {
590 
591     BIN startBin, endBin;
592     startBin = (a.start >> _binFirstShift);
593     endBin = ((a.end-1) >> _binFirstShift);
594 
595     // loop through each bin "level" in the binning hierarchy
596     for (BINLEVEL i = 0; i < _binLevels; ++i) {
597 
598         // loop through each bin at this level of the hierarchy
599         BIN offset = _binOffsetsExtended[i];
600         for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
601 
602             // loop through each feature in this chrom/bin and see if it
603             // overlaps with the feature that was passed in.  if so,
604             // add the feature tothe list of hits.
605             vector<BEDCOVLIST>::iterator
606                 bedItr = bedCovListMap[a.chrom][j].begin();
607             vector<BEDCOVLIST>::iterator
608                 bedEnd = bedCovListMap[a.chrom][j].end();
609             for (; bedItr != bedEnd; ++bedItr) {
610 
611                 bool strands_are_same = (a.strand == bedItr->strand);
612                 // skip the hit if not on the same strand (and we care)
613                 if ((sameStrand == true && strands_are_same == false) ||
614                     (diffStrand == true && strands_are_same == true)
615                    )
616                 {
617                     continue;
618                 }
619                 else if (overlaps(bedItr->start, bedItr->end,
620                                   a.start, a.end) > 0)
621                 {
622                     bedItr->counts[index]++;
623                     if (a.zeroLength == false) {
624                         bedItr->depthMapList[index][a.start+1].starts++;
625                         bedItr->depthMapList[index][a.end].ends++;
626                     }
627                     else {
628                         // correct for the fact that we artificially expanded
629                         // the zeroLength feature
630                         bedItr->depthMapList[index][a.start+2].starts++;
631                         bedItr->depthMapList[index][a.end-1].ends++;
632                     }
633 
634                     if (a.start < bedItr->minOverlapStarts[index]) {
635                         bedItr->minOverlapStarts[index] = a.start;
636                     }
637                 }
638             }
639         }
640         startBin >>= _binNextShift;
641         endBin >>= _binNextShift;
642     }
643 }
644 
setZeroBased(bool zeroBased)645 void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; }
646 
setGff(bool gff)647 void BedFile::setGff (bool gff) { this->_isGff = gff; }
648 
setVcf(bool vcf)649 void BedFile::setVcf (bool vcf) { this->_isVcf = vcf; }
650 
651 
setFileType(FileType type)652 void BedFile::setFileType (FileType type) {
653     _fileType    = type;
654     _typeIsKnown = true;
655 }
656 
657 
setBedType(int colNums)658 void BedFile::setBedType (int colNums) { bedType = colNums; }
setBed12(bool isBed12)659 void BedFile::setBed12 (bool isBed12) { this->isBed12 = isBed12; }
660 
loadBedFileIntoMap()661 void BedFile::loadBedFileIntoMap() {
662 
663     BED bedEntry;
664 
665     Open();
666     while (GetNextBed(bedEntry)) {
667         if (_status == BED_VALID) {
668             addBEDIntoMap(bedEntry);
669         }
670     }
671     Close();
672 }
673 
loadBedFileIntoMergedMap()674 void BedFile::loadBedFileIntoMergedMap() {
675 
676     BED bedEntry;
677 
678     Open();
679     while (GetNextMergedBed(bedEntry)) {
680         if (_status == BED_VALID) {
681             addBEDIntoMap(bedEntry);
682         }
683     }
684     Close();
685 }
686 
addBEDIntoMap(BED bedEntry)687 void BedFile::addBEDIntoMap(BED bedEntry) {
688     BIN bin = getBin(bedEntry.start, bedEntry.end);
689     bedMap[bedEntry.chrom][bin].push_back(bedEntry);
690 }
691 
692 
loadBedCovFileIntoMap()693 void BedFile::loadBedCovFileIntoMap() {
694 
695     BED bedEntry;
696     Open();
697     while (GetNextBed(bedEntry)) {
698         if (_status == BED_VALID) {
699             BIN bin = getBin(bedEntry.start, bedEntry.end);
700 
701             BEDCOV bedCov;
702             bedCov.chrom        = bedEntry.chrom;
703             bedCov.start        = bedEntry.start;
704             bedCov.end          = bedEntry.end;
705             bedCov.name         = bedEntry.name;
706             bedCov.score        = bedEntry.score;
707             bedCov.strand       = bedEntry.strand;
708             bedCov.fields       = bedEntry.fields;
709             bedCov.other_idxs   = bedEntry.other_idxs;
710             bedCov.zeroLength   = bedEntry.zeroLength;
711             bedCov.count = 0;
712             bedCov.minOverlapStart = INT_MAX;
713 
714             bedCovMap[bedEntry.chrom][bin].push_back(bedCov);
715         }
716     }
717     Close();
718 }
719 
loadBedCovListFileIntoMap()720 void BedFile::loadBedCovListFileIntoMap() {
721 
722     BED bedEntry;
723     Open();
724     while (GetNextBed(bedEntry)) {
725         if (_status == BED_VALID) {
726             BIN bin = getBin(bedEntry.start, bedEntry.end);
727 
728             BEDCOVLIST bedCovList;
729             bedCovList.chrom        = bedEntry.chrom;
730             bedCovList.start        = bedEntry.start;
731             bedCovList.end          = bedEntry.end;
732             bedCovList.name         = bedEntry.name;
733             bedCovList.score        = bedEntry.score;
734             bedCovList.strand       = bedEntry.strand;
735             bedCovList.fields       = bedEntry.fields;
736             bedCovList.other_idxs   = bedEntry.other_idxs;
737             bedCovList.zeroLength   = bedEntry.zeroLength;
738 
739             bedCovListMap[bedEntry.chrom][bin].push_back(bedCovList);
740         }
741     }
742     Close();
743 }
744 
745 
loadBedFileIntoMapNoBin()746 void BedFile::loadBedFileIntoMapNoBin() {
747 
748     BED bedEntry;
749 
750     Open();
751     while (GetNextBed(bedEntry)) {
752         if (_status == BED_VALID) {
753             bedMapNoBin[bedEntry.chrom].push_back(bedEntry);
754         }
755     }
756     Close();
757 
758     // sort the BED entries for each chromosome
759     // in ascending order of start position
760     for (masterBedMapNoBin::iterator m = this->bedMapNoBin.begin();
761          m != this->bedMapNoBin.end();
762          ++m)
763     {
764         sort(m->second.begin(), m->second.end(), sortByStart);
765     }
766 }
767 
loadBedFileIntoVector()768 void BedFile::loadBedFileIntoVector() {
769 
770     BED bedEntry;
771 
772     Open();
773     while (GetNextBed(bedEntry)) {
774         if (_status == BED_VALID) {
775             bedList.push_back(bedEntry);
776         }
777     }
778     Close();
779 }
780 
assignWeightsBasedOnSize()781 void BedFile::assignWeightsBasedOnSize() {
782     // sort by size
783     sort(bedList.begin(), bedList.end(), sortBySizeAsc);
784     // then assign a weight to each interval based on the
785     // proportion of the total interval length of the file
786     size_t totalSize = 0;
787     for (unsigned int i = 0; i < bedList.size(); ++i)
788     {
789         totalSize += bedList[i].size();
790     }
791     double totalWeight = 0.0;
792     for (unsigned int i = 0; i < bedList.size(); ++i)
793     {
794         totalWeight += (double) bedList[i].size() / (double) totalSize;
795         bedList[i].weight = totalWeight;
796     }
797 }
798 
799 struct CompareByWeight {
operator ()CompareByWeight800     bool operator()(double const val, BED const& bed) const
801     {
802         return bed.weight > val;
803     }
804 };
805 
sizeWeightedSearch(double val)806 BED * BedFile::sizeWeightedSearch(double val) {
807     // binary search for first interval with weight greater than search val
808     vector<BED>::iterator up = upper_bound(bedList.begin(), bedList.end(), val, CompareByWeight());
809     return &(*up);
810 }
811 
812 
813