1 //
2 // bedFilePE.cpp
3 // BEDTools
4 //
5 // Created by Aaron Quinlan Spring 2009.
6 // Copyright 2009 Aaron Quinlan. All rights reserved.
7 //
8 // Summary: Contains common functions for finding BED overlaps.
9 //
10 // Acknowledgments: Much of the code herein is taken from Jim Kent's
11 // BED processing code. I am grateful for his elegant
12 // genome binning algorithm and therefore use it extensively.
13
14
15 #include "bedFilePE.h"
16
17 using namespace std;
18
19 // Constructor
BedFilePE(string & bedFile)20 BedFilePE::BedFilePE(string &bedFile) {
21 this->bedFile = bedFile;
22 }
23
24 // Destructor
~BedFilePE(void)25 BedFilePE::~BedFilePE(void) {
26 }
27
Open(void)28 void BedFilePE::Open(void) {
29 if (bedFile == "stdin" || bedFile == "-") {
30 _bedStream = &cin;
31 }
32 else {
33 _bedStream = new ifstream(bedFile.c_str(), ios::in);
34
35 if (isGzipFile(_bedStream) == true) {
36 delete _bedStream;
37 _bedStream = new igzstream(bedFile.c_str(), ios::in);
38 }
39 // can we open the file?
40 if ( _bedStream->fail() ) {
41 cerr << "Error: The requested BEDPE file ("
42 << bedFile
43 << ") "
44 << "could not be opened. "
45 << "Error message: ("
46 << strerror(errno)
47 << "). Exiting!" << endl;
48 exit (1);
49 }
50 }
51 }
52
53
54
55 // Close the BEDPE file
Close(void)56 void BedFilePE::Close(void) {
57 if (bedFile != "stdin" && bedFile != "-") delete _bedStream;
58 }
59
60
GetNextBedPE(BEDPE & bedpe,int & lineNum)61 BedLineStatus BedFilePE::GetNextBedPE (BEDPE &bedpe, int &lineNum) {
62
63 // make sure there are still lines to process.
64 // if so, tokenize, validate and return the BEDPE entry.
65 if (_bedStream->good()) {
66 string bedPELine;
67 vector<string> bedPEFields;
68 bedPEFields.reserve(10);
69
70 // parse the bedStream pointer
71 getline(*_bedStream, bedPELine);
72 lineNum++;
73
74 // split into a string vector.
75 Tokenize(bedPELine,bedPEFields);
76
77 // load the BEDPE struct as long as it's a valid BEDPE entry.
78 return parseLine(bedpe, bedPEFields, lineNum);
79 }
80 // default if file is closed or EOF
81 return BED_INVALID;
82 }
83
84
85 /*
86 reportBedPETab
87
88 Writes the _original_ BED entry for A.
89 Works for BEDPE only.
90 */
reportBedPETab(const BEDPE & a)91 void BedFilePE::reportBedPETab(const BEDPE &a) {
92
93 if (this->bedType == 6) {
94 printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t",
95 a.chrom1.c_str(), a.start1, a.end1,
96 a.chrom2.c_str(), a.start2, a.end2);
97 }
98 else if (this->bedType == 7) {
99 printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t",
100 a.chrom1.c_str(), a.start1, a.end1,
101 a.chrom2.c_str(), a.start2, a.end2,
102 a.name.c_str());
103 }
104 else if (this->bedType == 8) {
105 printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t",
106 a.chrom1.c_str(), a.start1, a.end1,
107 a.chrom2.c_str(), a.start2, a.end2,
108 a.name.c_str(), a.score.c_str());
109 }
110 else if (this->bedType == 10) {
111 printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\t",
112 a.chrom1.c_str(), a.start1, a.end1,
113 a.chrom2.c_str(), a.start2, a.end2,
114 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
115 }
116 else if (this->bedType > 10) {
117 printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s",
118 a.chrom1.c_str(), a.start1, a.end1,
119 a.chrom2.c_str(), a.start2, a.end2,
120 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
121 vector<uint16_t>::const_iterator othIt = a.other_idxs.begin();
122 vector<uint16_t>::const_iterator othEnd = a.other_idxs.end();
123 for ( ; othIt != othEnd; ++othIt) {
124 printf("\t%s", a.fields[*othIt].c_str());
125 }
126 printf("\t");
127 }
128 }
129
130
131
132 /*
133 reportBedPENewLine
134
135 Writes the _original_ BED entry for A.
136 Works for BEDPE only.
137 */
reportBedPENewLine(const BEDPE & a)138 void BedFilePE::reportBedPENewLine(const BEDPE &a) {
139
140 if (this->bedType == 6) {
141 printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\n",
142 a.chrom1.c_str(), a.start1, a.end1,
143 a.chrom2.c_str(), a.start2, a.end2);
144 }
145 else if (this->bedType == 7) {
146 printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\n",
147 a.chrom1.c_str(), a.start1, a.end1,
148 a.chrom2.c_str(), a.start2, a.end2,
149 a.name.c_str());
150 }
151 else if (this->bedType == 8) {
152 printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\n",
153 a.chrom1.c_str(), a.start1, a.end1,
154 a.chrom2.c_str(), a.start2, a.end2,
155 a.name.c_str(), a.score.c_str());
156 }
157 else if (this->bedType == 10) {
158 printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\n",
159 a.chrom1.c_str(), a.start1, a.end1,
160 a.chrom2.c_str(), a.start2, a.end2,
161 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
162 }
163 else if (this->bedType > 10) {
164 printf("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s",
165 a.chrom1.c_str(), a.start1, a.end1,
166 a.chrom2.c_str(), a.start2, a.end2,
167 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
168 vector<uint16_t>::const_iterator othIt = a.other_idxs.begin();
169 vector<uint16_t>::const_iterator othEnd = a.other_idxs.end();
170 for ( ; othIt != othEnd; ++othIt) {
171 printf("\t%s", a.fields[*othIt].c_str());
172 }
173 printf("\n");
174 }
175 }
176
177
parseLine(BEDPE & bedpe,const vector<string> & lineVector,int & lineNum)178 BedLineStatus BedFilePE::parseLine (BEDPE &bedpe, const vector<string> &lineVector, int &lineNum) {
179
180 // bail out if we have a blank line
181 if (lineVector.empty())
182 return BED_BLANK;
183
184 if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) {
185 // we need at least 6 columns
186 if (lineVector.size() >= 6) {
187 if (parseBedPELine(bedpe, lineVector, lineNum) == true)
188 return BED_VALID;
189 else return BED_INVALID;
190 }
191 else {
192 cerr << "It looks as though you have less than 6 columns. Are you sure your files are tab-delimited?" << endl;
193 exit(1);
194 }
195 }
196 else {
197 lineNum--;
198 return BED_HEADER;
199 }
200
201 // default
202 return BED_INVALID;
203 }
204
205
parseBedPELine(BEDPE & bed,const vector<string> & lineVector,const int & lineNum)206 bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum) {
207
208 if ((lineNum == 1) && (lineVector.size() >= 6)) {
209
210 this->bedType = lineVector.size();
211 bed.fields = lineVector;
212 if (this->bedType == 6) {
213 bed.chrom1 = lineVector[0];
214 bed.start1 = stoll(lineVector[1].c_str());
215 bed.end1 = stoll(lineVector[2].c_str());
216
217 bed.chrom2 = lineVector[3];
218 bed.start2 = stoll(lineVector[4].c_str());
219 bed.end2 = stoll(lineVector[5].c_str());
220
221 return true;
222 }
223 else if (this->bedType == 7) {
224 bed.chrom1 = lineVector[0];
225 bed.start1 = stoll(lineVector[1].c_str());
226 bed.end1 = stoll(lineVector[2].c_str());
227
228 bed.chrom2 = lineVector[3];
229 bed.start2 = stoll(lineVector[4].c_str());
230 bed.end2 = stoll(lineVector[5].c_str());
231
232 bed.name = lineVector[6];
233 return true;
234 }
235 else if (this->bedType == 8) {
236 bed.chrom1 = lineVector[0];
237 bed.start1 = stoll(lineVector[1].c_str());
238 bed.end1 = stoll(lineVector[2].c_str());
239
240 bed.chrom2 = lineVector[3];
241 bed.start2 = stoll(lineVector[4].c_str());
242 bed.end2 = stoll(lineVector[5].c_str());
243
244 bed.name = lineVector[6];
245 bed.score = lineVector[7].c_str();
246 return true;
247 }
248 else if (this->bedType == 10) {
249 bed.chrom1 = lineVector[0];
250 bed.start1 = stoll(lineVector[1].c_str());
251 bed.end1 = stoll(lineVector[2].c_str());
252
253 bed.chrom2 = lineVector[3];
254 bed.start2 = stoll(lineVector[4].c_str());
255 bed.end2 = stoll(lineVector[5].c_str());
256
257 bed.name = lineVector[6];
258 bed.score = lineVector[7].c_str();
259
260 bed.strand1 = lineVector[8];
261 bed.strand2 = lineVector[9];
262
263 return true;
264 }
265 else if (this->bedType > 10) {
266 bed.chrom1 = lineVector[0];
267 bed.start1 = stoll(lineVector[1].c_str());
268 bed.end1 = stoll(lineVector[2].c_str());
269
270 bed.chrom2 = lineVector[3];
271 bed.start2 = stoll(lineVector[4].c_str());
272 bed.end2 = stoll(lineVector[5].c_str());
273
274 bed.name = lineVector[6];
275 bed.score = lineVector[7].c_str();
276
277 bed.strand1 = lineVector[8];
278 bed.strand2 = lineVector[9];
279
280 for (unsigned int i = 10; i < lineVector.size(); ++i) {
281 bed.other_idxs.push_back(i);
282 }
283 return true;
284 }
285 else {
286 cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl;
287 exit(1);
288 }
289
290 if (bed.start1 > bed.end1) {
291 cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl;
292 return false;
293 }
294 else if (bed.start2 > bed.end2) {
295 cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl;
296 return false;
297 }
298 }
299 else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) {
300
301 bed.fields = lineVector;
302
303 if (this->bedType == 6) {
304 bed.chrom1 = lineVector[0];
305 bed.start1 = stoll(lineVector[1].c_str());
306 bed.end1 = stoll(lineVector[2].c_str());
307
308 bed.chrom2 = lineVector[3];
309 bed.start2 = stoll(lineVector[4].c_str());
310 bed.end2 = stoll(lineVector[5].c_str());
311
312 return true;
313 }
314 else if (this->bedType == 7) {
315 bed.chrom1 = lineVector[0];
316 bed.start1 = stoll(lineVector[1].c_str());
317 bed.end1 = stoll(lineVector[2].c_str());
318
319 bed.chrom2 = lineVector[3];
320 bed.start2 = stoll(lineVector[4].c_str());
321 bed.end2 = stoll(lineVector[5].c_str());
322
323 bed.name = lineVector[6];
324 return true;
325 }
326 else if (this->bedType == 8) {
327 bed.chrom1 = lineVector[0];
328 bed.start1 = stoll(lineVector[1].c_str());
329 bed.end1 = stoll(lineVector[2].c_str());
330
331 bed.chrom2 = lineVector[3];
332 bed.start2 = stoll(lineVector[4].c_str());
333 bed.end2 = stoll(lineVector[5].c_str());
334
335 bed.name = lineVector[6];
336 bed.score = lineVector[7].c_str();
337 return true;
338 }
339 else if (this->bedType == 10) {
340 bed.chrom1 = lineVector[0];
341 bed.start1 = stoll(lineVector[1].c_str());
342 bed.end1 = stoll(lineVector[2].c_str());
343
344 bed.chrom2 = lineVector[3];
345 bed.start2 = stoll(lineVector[4].c_str());
346 bed.end2 = stoll(lineVector[5].c_str());
347
348 bed.name = lineVector[6];
349 bed.score = lineVector[7].c_str();
350
351 bed.strand1 = lineVector[8];
352 bed.strand2 = lineVector[9];
353
354 return true;
355 }
356 else if (this->bedType > 10) {
357 bed.chrom1 = lineVector[0];
358 bed.start1 = stoll(lineVector[1].c_str());
359 bed.end1 = stoll(lineVector[2].c_str());
360
361 bed.chrom2 = lineVector[3];
362 bed.start2 = stoll(lineVector[4].c_str());
363 bed.end2 = stoll(lineVector[5].c_str());
364
365 bed.name = lineVector[6];
366 bed.score = lineVector[7].c_str();
367
368 bed.strand1 = lineVector[8];
369 bed.strand2 = lineVector[9];
370 for (unsigned int i = 10; i < lineVector.size(); ++i) {
371 bed.other_idxs.push_back(i);
372 }
373 return true;
374 }
375 else {
376 cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl;
377 exit(1);
378 }
379
380 if (bed.start1 > bed.end1) {
381 cerr << "Error: malformed BED entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl;
382 return false;
383 }
384 else if (bed.start2 > bed.end2) {
385 cerr << "Error: malformed BED entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl;
386 return false;
387 }
388 }
389 else if (lineVector.size() == 1) {
390 cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl;
391 exit(1);
392 }
393 else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) {
394 cerr << "Differing number of BEDPE fields encountered at line: " << lineNum << ". Exiting..." << endl;
395 exit(1);
396 }
397 else if ((lineVector.size() < 6) && (lineVector.size() != 0)) {
398 cerr << "TAB delimited BEDPE file with at least 6 fields (chrom1, start1, end1, chrom2, start2, end2) is required at line: "<< lineNum << ". Exiting..." << endl;
399 exit(1);
400 }
401 return false;
402 }
403
404
405 /*
406 Adapted from kent source "binKeeperFind"
407 */
FindOverlapsPerBin(int bEnd,string chrom,CHRPOS start,CHRPOS end,string name,string strand,vector<MATE> & hits,float overlapFraction,bool forceStrand,bool enforceDiffNames)408 void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string name, string strand,
409 vector<MATE> &hits, float overlapFraction, bool forceStrand, bool enforceDiffNames) {
410
411 CHRPOS startBin, endBin;
412 startBin = (start >> _binFirstShift);
413 endBin = ((end-1) >> _binFirstShift);
414
415 // loop through each bin "level" in the binning hierarchy
416 for (int i = 0; i < _binLevels; ++i) {
417
418 // loop through each bin at this level of the hierarchy
419 CHRPOS offset = _binOffsetsExtended[i];
420 for (CHRPOS j = (startBin+offset); j <= (endBin+offset); ++j) {
421
422 // loop through each feature in this chrom/bin and see if it overlaps
423 // with the feature that was passed in. if so, add the feature to
424 // the list of hits.
425 vector<MATE>::const_iterator bedItr;
426 vector<MATE>::const_iterator bedEnd;
427 if (bEnd == 1) {
428 bedItr = bedMapEnd1[chrom][j].begin();
429 bedEnd = bedMapEnd1[chrom][j].end();
430 }
431 else if (bEnd == 2) {
432 bedItr = bedMapEnd2[chrom][j].begin();
433 bedEnd = bedMapEnd2[chrom][j].end();
434 }
435 else {
436 cerr << "Unexpected end of B requested" << endl;
437 }
438 for (; bedItr != bedEnd; ++bedItr) {
439 float overlap = overlaps(bedItr->bed.start, bedItr->bed.end, start, end);
440 float size = (float)(end - start);
441
442 if ( (overlap / size) >= overlapFraction ) {
443
444 // skip the hit if not on the same strand (and we care)
445 if ((forceStrand == false) && (enforceDiffNames == false)) {
446 hits.push_back(*bedItr); // it's a hit, add it.
447 }
448 else if ((forceStrand == true) && (enforceDiffNames == false)) {
449 if (strand == bedItr->bed.strand)
450 hits.push_back(*bedItr); // it's a hit, add it.
451 }
452 else if ((forceStrand == true) && (enforceDiffNames == true)) {
453 if ((strand == bedItr->bed.strand) && (name != bedItr->bed.name))
454 hits.push_back(*bedItr); // it's a hit, add it.
455 }
456 else if ((forceStrand == false) && (enforceDiffNames == true)) {
457 if (name != bedItr->bed.name)
458 hits.push_back(*bedItr); // it's a hit, add it.
459 }
460 }
461
462 }
463 }
464 startBin >>= _binNextShift;
465 endBin >>= _binNextShift;
466 }
467 }
468
469
loadBedPEFileIntoMap()470 void BedFilePE::loadBedPEFileIntoMap() {
471
472 int lineNum = 0;
473 int bin1, bin2;
474 BedLineStatus bedStatus;
475 BEDPE bedpeEntry, nullBedPE;
476
477 Open();
478 bedStatus = this->GetNextBedPE(bedpeEntry, lineNum);
479 while (bedStatus != BED_INVALID) {
480
481 if (bedStatus == BED_VALID) {
482 MATE *bedEntry1 = new MATE();
483 MATE *bedEntry2 = new MATE();
484 // separate the BEDPE entry into separate
485 // BED entries
486 splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2);
487
488 // load end1 into a UCSC bin map
489 bin1 = getBin(bedEntry1->bed.start, bedEntry1->bed.end);
490 this->bedMapEnd1[bedEntry1->bed.chrom][bin1].push_back(*bedEntry1);
491
492 // load end2 into a UCSC bin map
493 bin2 = getBin(bedEntry2->bed.start, bedEntry2->bed.end);
494 this->bedMapEnd2[bedEntry2->bed.chrom][bin2].push_back(*bedEntry2);
495
496 bedpeEntry = nullBedPE;
497 }
498 bedStatus = this->GetNextBedPE(bedpeEntry, lineNum);
499 }
500 Close();
501 }
502
503
splitBedPEIntoBeds(const BEDPE & bedpeEntry,const int & lineNum,MATE * bedEntry1,MATE * bedEntry2)504 void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2) {
505
506 /*
507 Split the BEDPE entry into separate BED entries
508
509 NOTE: I am using a trick here where I store
510 the lineNum of the BEDPE from the original file
511 in the "count" column. This allows me to later
512 resolve whether the hits found on both ends of BEDPE A
513 came from the same entry in BEDPE B. Tracking by "name"
514 alone with fail when there are multiple mappings for a given
515 read-pair.
516 */
517
518 bedEntry1->bed.chrom = bedpeEntry.chrom1;
519 bedEntry1->bed.start = bedpeEntry.start1;
520 bedEntry1->bed.end = bedpeEntry.end1;
521 bedEntry1->bed.name = bedpeEntry.name;
522 bedEntry1->bed.score = bedpeEntry.score; // only store the score in end1 to save memory
523 bedEntry1->bed.strand = bedpeEntry.strand1;
524 bedEntry1->bed.fields = bedpeEntry.fields; // only store the fields in end1 to save memory
525 bedEntry1->bed.other_idxs = bedpeEntry.other_idxs; // only store the other_idxs in end1 to save memory
526 bedEntry1->lineNum = lineNum;
527 bedEntry1->mate = bedEntry2; // keep a pointer to end2
528
529 bedEntry2->bed.chrom = bedpeEntry.chrom2;
530 bedEntry2->bed.start = bedpeEntry.start2;
531 bedEntry2->bed.end = bedpeEntry.end2;
532 bedEntry2->bed.name = bedpeEntry.name;
533 bedEntry2->bed.strand = bedpeEntry.strand2;
534 bedEntry2->lineNum = lineNum;
535 bedEntry2->mate = bedEntry1; // keep a pointer to end1
536 }
537
538
539
540