1 //
2 //  bedFilePE.cpp
3 //  BEDTools
4 //
5 //  Created by Aaron Quinlan Spring 2009.
6 //  Copyright 2009 Aaron Quinlan. All rights reserved.
7 //
8 //  Summary:  Contains common functions for finding BED overlaps.
9 //
10 //  Acknowledgments: Much of the code herein is taken from Jim Kent's
11 //                   BED processing code.  I am grateful for his elegant
12 //                   genome binning algorithm and therefore use it extensively.
13 
14 
15 #include "bedFilePE.h"
16 
17 using namespace std;
18 
19 // Constructor
BedFilePE(string & bedFile)20 BedFilePE::BedFilePE(string &bedFile) {
21     this->bedFile = bedFile;
22 }
23 
24 // Destructor
~BedFilePE(void)25 BedFilePE::~BedFilePE(void) {
26 }
27 
Open(void)28 void BedFilePE::Open(void) {
29     if (bedFile == "stdin" || bedFile == "-") {
30         _bedStream = &cin;
31     }
32     else {
33         _bedStream = new ifstream(bedFile.c_str(), ios::in);
34 
35         if (isGzipFile(_bedStream) == true) {
36             delete _bedStream;
37             _bedStream = new igzstream(bedFile.c_str(), ios::in);
38         }
39         // can we open the file?
40         if ( _bedStream->fail() ) {
41             cerr << "Error: The requested BEDPE file ("
42                  << bedFile
43                  << ") "
44                  << "could not be opened. "
45                  << "Error message: ("
46                  << strerror(errno)
47                  << "). Exiting!" << endl;
48             exit (1);
49         }
50     }
51 }
52 
53 
54 
55 // Close the BEDPE file
Close(void)56 void BedFilePE::Close(void) {
57     if (bedFile != "stdin" && bedFile != "-") delete _bedStream;
58 }
59 
60 
GetNextBedPE(BEDPE & bedpe,int & lineNum)61 BedLineStatus BedFilePE::GetNextBedPE (BEDPE &bedpe, int &lineNum) {
62 
63     // make sure there are still lines to process.
64     // if so, tokenize, validate and return the BEDPE entry.
65     if (_bedStream->good()) {
66         string bedPELine;
67         vector<string> bedPEFields;
68         bedPEFields.reserve(10);
69 
70         // parse the bedStream pointer
71         getline(*_bedStream, bedPELine);
72         lineNum++;
73 
74         // split into a string vector.
75         Tokenize(bedPELine,bedPEFields);
76 
77         // load the BEDPE struct as long as it's a valid BEDPE entry.
78         return parseLine(bedpe, bedPEFields, lineNum);
79     }
80     // default if file is closed or EOF
81     return BED_INVALID;
82 }
83 
84 
85 /*
86     reportBedPETab
87 
88     Writes the _original_ BED entry for A.
89     Works for BEDPE only.
90 */
reportBedPETab(const BEDPE & a)91 void BedFilePE::reportBedPETab(const BEDPE &a) {
92 
93     if (this->bedType == 6) {
94         printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t",
95                                             a.chrom1.c_str(), a.start1, a.end1,
96                                             a.chrom2.c_str(), a.start2, a.end2);
97     }
98     else if (this->bedType == 7) {
99         printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t",
100                                             a.chrom1.c_str(), a.start1, a.end1,
101                                             a.chrom2.c_str(), a.start2, a.end2,
102                                             a.name.c_str());
103     }
104     else if (this->bedType == 8) {
105         printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t",
106                                             a.chrom1.c_str(), a.start1, a.end1,
107                                             a.chrom2.c_str(), a.start2, a.end2,
108                                             a.name.c_str(), a.score.c_str());
109     }
110     else if (this->bedType == 10) {
111         printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\t",
112                                             a.chrom1.c_str(), a.start1, a.end1,
113                                             a.chrom2.c_str(), a.start2, a.end2,
114                                             a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
115     }
116     else if (this->bedType > 10) {
117         printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s",
118                                             a.chrom1.c_str(), a.start1, a.end1,
119                                             a.chrom2.c_str(), a.start2, a.end2,
120                                             a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
121         vector<uint16_t>::const_iterator othIt  = a.other_idxs.begin();
122         vector<uint16_t>::const_iterator othEnd = a.other_idxs.end();
123         for ( ; othIt != othEnd; ++othIt) {
124             printf("\t%s", a.fields[*othIt].c_str());
125         }
126         printf("\t");
127     }
128 }
129 
130 
131 
132 /*
133     reportBedPENewLine
134 
135     Writes the _original_ BED entry for A.
136     Works for BEDPE only.
137 */
reportBedPENewLine(const BEDPE & a)138 void BedFilePE::reportBedPENewLine(const BEDPE &a) {
139 
140     if (this->bedType == 6) {
141         printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\n",
142                                             a.chrom1.c_str(), a.start1, a.end1,
143                                             a.chrom2.c_str(), a.start2, a.end2);
144     }
145     else if (this->bedType == 7) {
146         printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\n",
147                                             a.chrom1.c_str(), a.start1, a.end1,
148                                             a.chrom2.c_str(), a.start2, a.end2,
149                                             a.name.c_str());
150     }
151     else if (this->bedType == 8) {
152         printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\n",
153                                             a.chrom1.c_str(), a.start1, a.end1,
154                                             a.chrom2.c_str(), a.start2, a.end2,
155                                             a.name.c_str(), a.score.c_str());
156     }
157     else if (this->bedType == 10) {
158         printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\n",
159                                             a.chrom1.c_str(), a.start1, a.end1,
160                                             a.chrom2.c_str(), a.start2, a.end2,
161                                             a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
162     }
163     else if (this->bedType > 10) {
164         printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s",
165                                             a.chrom1.c_str(), a.start1, a.end1,
166                                             a.chrom2.c_str(), a.start2, a.end2,
167                                             a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
168         vector<uint16_t>::const_iterator othIt  = a.other_idxs.begin();
169         vector<uint16_t>::const_iterator othEnd = a.other_idxs.end();
170         for ( ; othIt != othEnd; ++othIt) {
171             printf("\t%s", a.fields[*othIt].c_str());
172         }
173         printf("\n");
174     }
175 }
176 
177 
parseLine(BEDPE & bedpe,const vector<string> & lineVector,int & lineNum)178 BedLineStatus BedFilePE::parseLine (BEDPE &bedpe, const vector<string> &lineVector, int &lineNum) {
179 
180     // bail out if we have a blank line
181     if (lineVector.empty())
182         return BED_BLANK;
183 
184     if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) {
185         // we need at least 6 columns
186         if (lineVector.size() >= 6) {
187             if (parseBedPELine(bedpe, lineVector, lineNum) == true)
188                 return BED_VALID;
189             else return BED_INVALID;
190         }
191         else  {
192             cerr << "It looks as though you have less than 6 columns.  Are you sure your files are tab-delimited?" << endl;
193             exit(1);
194         }
195     }
196     else {
197         lineNum--;
198         return BED_HEADER;
199     }
200 
201     // default
202     return BED_INVALID;
203 }
204 
205 
parseBedPELine(BEDPE & bed,const vector<string> & lineVector,const int & lineNum)206 bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum) {
207 
208     if ((lineNum == 1) && (lineVector.size() >= 6)) {
209 
210         this->bedType = lineVector.size();
211         bed.fields = lineVector;
212         if (this->bedType == 6) {
213             bed.chrom1 = lineVector[0];
214             bed.start1 = stoll(lineVector[1].c_str());
215             bed.end1 = stoll(lineVector[2].c_str());
216 
217             bed.chrom2 = lineVector[3];
218             bed.start2 = stoll(lineVector[4].c_str());
219             bed.end2 = stoll(lineVector[5].c_str());
220 
221             return true;
222         }
223         else if (this->bedType == 7) {
224             bed.chrom1 = lineVector[0];
225             bed.start1 = stoll(lineVector[1].c_str());
226             bed.end1 = stoll(lineVector[2].c_str());
227 
228             bed.chrom2 = lineVector[3];
229             bed.start2 = stoll(lineVector[4].c_str());
230             bed.end2 = stoll(lineVector[5].c_str());
231 
232             bed.name = lineVector[6];
233             return true;
234         }
235         else if (this->bedType == 8) {
236             bed.chrom1 = lineVector[0];
237             bed.start1 = stoll(lineVector[1].c_str());
238             bed.end1 = stoll(lineVector[2].c_str());
239 
240             bed.chrom2 = lineVector[3];
241             bed.start2 = stoll(lineVector[4].c_str());
242             bed.end2 = stoll(lineVector[5].c_str());
243 
244             bed.name = lineVector[6];
245             bed.score = lineVector[7].c_str();
246             return true;
247         }
248         else if (this->bedType == 10) {
249             bed.chrom1 = lineVector[0];
250             bed.start1 = stoll(lineVector[1].c_str());
251             bed.end1 = stoll(lineVector[2].c_str());
252 
253             bed.chrom2 = lineVector[3];
254             bed.start2 = stoll(lineVector[4].c_str());
255             bed.end2 = stoll(lineVector[5].c_str());
256 
257             bed.name = lineVector[6];
258             bed.score = lineVector[7].c_str();
259 
260             bed.strand1 = lineVector[8];
261             bed.strand2 = lineVector[9];
262 
263             return true;
264         }
265         else if (this->bedType > 10) {
266             bed.chrom1 = lineVector[0];
267             bed.start1 = stoll(lineVector[1].c_str());
268             bed.end1 = stoll(lineVector[2].c_str());
269 
270             bed.chrom2 = lineVector[3];
271             bed.start2 = stoll(lineVector[4].c_str());
272             bed.end2 = stoll(lineVector[5].c_str());
273 
274             bed.name = lineVector[6];
275             bed.score = lineVector[7].c_str();
276 
277             bed.strand1 = lineVector[8];
278             bed.strand2 = lineVector[9];
279 
280             for (unsigned int i = 10; i < lineVector.size(); ++i) {
281                 bed.other_idxs.push_back(i);
282             }
283             return true;
284         }
285         else {
286             cerr << "Unexpected number of fields: " << lineNum << ".  Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields.  Exiting..." << endl;
287             exit(1);
288         }
289 
290         if (bed.start1 > bed.end1) {
291             cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl;
292             return false;
293         }
294         else if (bed.start2 > bed.end2) {
295             cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl;
296             return false;
297         }
298     }
299     else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) {
300 
301         bed.fields = lineVector;
302 
303         if (this->bedType == 6) {
304             bed.chrom1 = lineVector[0];
305             bed.start1 = stoll(lineVector[1].c_str());
306             bed.end1 = stoll(lineVector[2].c_str());
307 
308             bed.chrom2 = lineVector[3];
309             bed.start2 = stoll(lineVector[4].c_str());
310             bed.end2 = stoll(lineVector[5].c_str());
311 
312             return true;
313         }
314         else if (this->bedType == 7) {
315             bed.chrom1 = lineVector[0];
316             bed.start1 = stoll(lineVector[1].c_str());
317             bed.end1 = stoll(lineVector[2].c_str());
318 
319             bed.chrom2 = lineVector[3];
320             bed.start2 = stoll(lineVector[4].c_str());
321             bed.end2 = stoll(lineVector[5].c_str());
322 
323             bed.name = lineVector[6];
324             return true;
325         }
326         else if (this->bedType == 8) {
327             bed.chrom1 = lineVector[0];
328             bed.start1 = stoll(lineVector[1].c_str());
329             bed.end1 = stoll(lineVector[2].c_str());
330 
331             bed.chrom2 = lineVector[3];
332             bed.start2 = stoll(lineVector[4].c_str());
333             bed.end2 = stoll(lineVector[5].c_str());
334 
335             bed.name = lineVector[6];
336             bed.score = lineVector[7].c_str();
337             return true;
338         }
339         else if (this->bedType == 10) {
340             bed.chrom1 = lineVector[0];
341             bed.start1 = stoll(lineVector[1].c_str());
342             bed.end1 = stoll(lineVector[2].c_str());
343 
344             bed.chrom2 = lineVector[3];
345             bed.start2 = stoll(lineVector[4].c_str());
346             bed.end2 = stoll(lineVector[5].c_str());
347 
348             bed.name = lineVector[6];
349             bed.score = lineVector[7].c_str();
350 
351             bed.strand1 = lineVector[8];
352             bed.strand2 = lineVector[9];
353 
354             return true;
355         }
356         else if (this->bedType > 10) {
357             bed.chrom1 = lineVector[0];
358             bed.start1 = stoll(lineVector[1].c_str());
359             bed.end1 = stoll(lineVector[2].c_str());
360 
361             bed.chrom2 = lineVector[3];
362             bed.start2 = stoll(lineVector[4].c_str());
363             bed.end2 = stoll(lineVector[5].c_str());
364 
365             bed.name = lineVector[6];
366             bed.score = lineVector[7].c_str();
367 
368             bed.strand1 = lineVector[8];
369             bed.strand2 = lineVector[9];
370             for (unsigned int i = 10; i < lineVector.size(); ++i) {
371                 bed.other_idxs.push_back(i);
372             }
373             return true;
374         }
375         else {
376             cerr << "Unexpected number of fields: " << lineNum << ".  Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields.  Exiting..." << endl;
377             exit(1);
378         }
379 
380         if (bed.start1 > bed.end1) {
381             cerr << "Error: malformed BED entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl;
382             return false;
383         }
384         else if (bed.start2 > bed.end2) {
385             cerr << "Error: malformed BED entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl;
386             return false;
387         }
388     }
389     else if (lineVector.size() == 1) {
390         cerr << "Only one BED field detected: " << lineNum << ".  Verify that your files are TAB-delimited.  Exiting..." << endl;
391         exit(1);
392     }
393     else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) {
394         cerr << "Differing number of BEDPE fields encountered at line: " << lineNum << ".  Exiting..." << endl;
395         exit(1);
396     }
397     else if ((lineVector.size() < 6) && (lineVector.size() != 0)) {
398         cerr << "TAB delimited BEDPE file with at least 6 fields (chrom1, start1, end1, chrom2, start2, end2) is required at line: "<< lineNum << ".  Exiting..." << endl;
399         exit(1);
400     }
401     return false;
402 }
403 
404 
405 /*
406     Adapted from kent source "binKeeperFind"
407 */
FindOverlapsPerBin(int bEnd,string chrom,CHRPOS start,CHRPOS end,string name,string strand,vector<MATE> & hits,float overlapFraction,bool forceStrand,bool enforceDiffNames)408 void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string name, string strand,
409                                    vector<MATE> &hits, float overlapFraction, bool forceStrand, bool enforceDiffNames) {
410 
411     CHRPOS startBin, endBin;
412     startBin = (start >> _binFirstShift);
413     endBin = ((end-1) >> _binFirstShift);
414 
415     // loop through each bin "level" in the binning hierarchy
416     for (int i = 0; i < _binLevels; ++i) {
417 
418         // loop through each bin at this level of the hierarchy
419         CHRPOS offset = _binOffsetsExtended[i];
420         for (CHRPOS j = (startBin+offset); j <= (endBin+offset); ++j)  {
421 
422             // loop through each feature in this chrom/bin and see if it overlaps
423             // with the feature that was passed in.  if so, add the feature to
424             // the list of hits.
425             vector<MATE>::const_iterator bedItr;
426             vector<MATE>::const_iterator bedEnd;
427             if (bEnd == 1) {
428                 bedItr = bedMapEnd1[chrom][j].begin();
429                 bedEnd = bedMapEnd1[chrom][j].end();
430             }
431             else if (bEnd == 2) {
432                 bedItr = bedMapEnd2[chrom][j].begin();
433                 bedEnd = bedMapEnd2[chrom][j].end();
434             }
435             else {
436                 cerr << "Unexpected end of B requested" << endl;
437             }
438             for (; bedItr != bedEnd; ++bedItr) {
439                 float overlap = overlaps(bedItr->bed.start, bedItr->bed.end, start, end);
440                 float size    = (float)(end - start);
441 
442                 if ( (overlap / size) >= overlapFraction ) {
443 
444                     // skip the hit if not on the same strand (and we care)
445                     if ((forceStrand == false) && (enforceDiffNames == false)) {
446                         hits.push_back(*bedItr);    // it's a hit, add it.
447                     }
448                     else if ((forceStrand == true) && (enforceDiffNames == false)) {
449                         if (strand == bedItr->bed.strand)
450                             hits.push_back(*bedItr);    // it's a hit, add it.
451                     }
452                     else if ((forceStrand == true) && (enforceDiffNames == true)) {
453                         if ((strand == bedItr->bed.strand) && (name != bedItr->bed.name))
454                             hits.push_back(*bedItr);    // it's a hit, add it.
455                     }
456                     else if ((forceStrand == false) && (enforceDiffNames == true)) {
457                         if (name != bedItr->bed.name)
458                             hits.push_back(*bedItr);    // it's a hit, add it.
459                     }
460                 }
461 
462             }
463         }
464         startBin >>= _binNextShift;
465         endBin >>= _binNextShift;
466     }
467 }
468 
469 
loadBedPEFileIntoMap()470 void BedFilePE::loadBedPEFileIntoMap() {
471 
472     int lineNum = 0;
473     int bin1, bin2;
474     BedLineStatus bedStatus;
475     BEDPE bedpeEntry, nullBedPE;
476 
477     Open();
478     bedStatus = this->GetNextBedPE(bedpeEntry, lineNum);
479     while (bedStatus != BED_INVALID) {
480 
481         if (bedStatus == BED_VALID) {
482             MATE *bedEntry1 = new MATE();
483             MATE *bedEntry2 = new MATE();
484             // separate the BEDPE entry into separate
485             // BED entries
486             splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2);
487 
488             // load end1 into a UCSC bin map
489             bin1 = getBin(bedEntry1->bed.start, bedEntry1->bed.end);
490             this->bedMapEnd1[bedEntry1->bed.chrom][bin1].push_back(*bedEntry1);
491 
492             // load end2 into a UCSC bin map
493             bin2 = getBin(bedEntry2->bed.start, bedEntry2->bed.end);
494             this->bedMapEnd2[bedEntry2->bed.chrom][bin2].push_back(*bedEntry2);
495 
496             bedpeEntry = nullBedPE;
497         }
498         bedStatus = this->GetNextBedPE(bedpeEntry, lineNum);
499     }
500     Close();
501 }
502 
503 
splitBedPEIntoBeds(const BEDPE & bedpeEntry,const int & lineNum,MATE * bedEntry1,MATE * bedEntry2)504 void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2) {
505 
506     /*
507        Split the BEDPE entry into separate BED entries
508 
509        NOTE: I am using a trick here where I store
510        the lineNum of the BEDPE from the original file
511        in the "count" column.  This allows me to later
512        resolve whether the hits found on both ends of BEDPE A
513        came from the same entry in BEDPE B.  Tracking by "name"
514        alone with fail when there are multiple mappings for a given
515        read-pair.
516     */
517 
518     bedEntry1->bed.chrom           = bedpeEntry.chrom1;
519     bedEntry1->bed.start           = bedpeEntry.start1;
520     bedEntry1->bed.end             = bedpeEntry.end1;
521     bedEntry1->bed.name            = bedpeEntry.name;
522     bedEntry1->bed.score           = bedpeEntry.score;        // only store the score in end1 to save memory
523     bedEntry1->bed.strand          = bedpeEntry.strand1;
524     bedEntry1->bed.fields          = bedpeEntry.fields;       // only store the fields in end1 to save memory
525     bedEntry1->bed.other_idxs      = bedpeEntry.other_idxs;   // only store the other_idxs in end1 to save memory
526     bedEntry1->lineNum             = lineNum;
527     bedEntry1->mate                = bedEntry2;               // keep a pointer to end2
528 
529     bedEntry2->bed.chrom           = bedpeEntry.chrom2;
530     bedEntry2->bed.start           = bedpeEntry.start2;
531     bedEntry2->bed.end             = bedpeEntry.end2;
532     bedEntry2->bed.name            = bedpeEntry.name;
533     bedEntry2->bed.strand          = bedpeEntry.strand2;
534     bedEntry2->lineNum             = lineNum;
535     bedEntry2->mate                = bedEntry1;               // keep a pointer to end1
536 }
537 
538 
539 
540