1 /*****************************************************************************
2 bedFile.h
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licensed under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #ifndef BEDFILE_H
13 #define BEDFILE_H
14
15 // "local" includes
16 #include "BedtoolsTypes.h"
17 #include "gzstream.h"
18 #include "lineFileUtilities.h"
19 #include "fileType.h"
20
21 // standard includes
22 #include <vector>
23 #include <map>
24 #include <set>
25 #include <string>
26 #include <iostream>
27 #include <fstream>
28 #include <sstream>
29 #include <cstring>
30 #include <algorithm>
31 #include <limits.h>
32 #include <stdint.h>
33 #include <cstdio>
34 //#include <tr1/unordered_map> // Experimental.
35 using namespace std;
36
37
38 //*************************************************
39 // Data type tydedef
40 //*************************************************
41 typedef uint16_t BINLEVEL;
42 typedef uint32_t BIN;
43 typedef uint16_t USHORT;
44 typedef uint32_t UINT;
45
46 //*************************************************
47 // Genome binning constants
48 //*************************************************
49 const BINLEVEL _binLevels = 8;
50
51 const BIN _binOffsetsExtended[] = {262144+32678+4096+512+64+8+1, 32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};
52
53 const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */
54 const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */
55
56 const BIN _numBins = (1 << (_binNextShift * _binLevels)) / ((1 << _binNextShift) - 1);
57
58
59 //*************************************************
60 // Common data structures
61 //*************************************************
62
63 struct DEPTH {
64 UINT starts;
65 UINT ends;
66 };
67
68
69 /*
70 Structure for regular BED records
71 */
72 struct BED {
73
74 // Regular BED fields
75 string chrom;
76 CHRPOS start;
77 CHRPOS end;
78 string name;
79 string score;
80 string strand;
81 // all of the original fields in the record
82 vector<string> fields;
83 // indices of the "other" fields
84 vector<uint16_t> other_idxs;
85 // is this a zero length feature: i.e., start == end
86 bool zeroLength;
87 double weight;
88
89 public:
90 // constructors
91
92 // Null
BEDBED93 BED()
94 : chrom(""),
95 start(0),
96 end(0),
97 name(""),
98 score(""),
99 strand(""),
100 fields(),
101 other_idxs(),
102 zeroLength(false),
103 weight(0.0)
104 {}
105
106 // BED3
BEDBED107 BED(string chrom, CHRPOS start, CHRPOS end)
108 : chrom(chrom),
109 start(start),
110 end(end),
111 name(""),
112 score(""),
113 strand(""),
114 fields(),
115 other_idxs(),
116 zeroLength(false),
117 weight(0.0)
118 {}
119
120 // BED4
BEDBED121 BED(string chrom, CHRPOS start, CHRPOS end, string strand)
122 : chrom(chrom),
123 start(start),
124 end(end),
125 name(""),
126 score(""),
127 strand(strand),
128 fields(),
129 other_idxs(),
130 zeroLength(false),
131 weight(0.0)
132 {}
133
134 // BED6
BEDBED135 BED(string chrom, CHRPOS start, CHRPOS end, string name,
136 string score, string strand)
137 : chrom(chrom),
138 start(start),
139 end(end),
140 name(name),
141 score(score),
142 strand(strand),
143 fields(),
144 other_idxs(),
145 zeroLength(false),
146 weight(0.0)
147 {}
148
149 // BEDALL
BEDBED150 BED(string chrom, CHRPOS start, CHRPOS end, string name,
151 string score, string strand, vector<string> fields,
152 vector<uint16_t> other_idxs)
153 : chrom(chrom),
154 start(start),
155 end(end),
156 name(name),
157 score(score),
158 strand(strand),
159 fields(fields),
160 other_idxs(other_idxs),
161 zeroLength(false),
162 weight(0.0)
163 {}
164
sizeBED165 CHRPOS size() const {
166 return end-start;
167 }
168
169 }; // BED
170
171
172 /*
173 Structure for each end of a paired BED record
174 mate points to the other end.
175 */
176 struct MATE {
177 BED bed;
178 int lineNum;
179 MATE *mate;
180 };
181
182
183 /*
184 Structure for regular BED COVERAGE records
185 */
186 struct BEDCOV {
187
188 string chrom;
189
190 // Regular BED fields
191 CHRPOS start;
192 CHRPOS end;
193 string name;
194 string score;
195 string strand;
196
197 // all of the original fields in the record
198 vector<string> fields;
199 // indices of the "other" fields
200 vector<uint16_t> other_idxs;
201 // is this a zero length feature: i.e., start == end
202 bool zeroLength;
203
204 // Additional fields specific to computing coverage
205 map<unsigned int, DEPTH> depthMap;
206 unsigned int count;
207 CHRPOS minOverlapStart;
208
209
210 public:
211 // constructors
212 // Null
BEDCOVBEDCOV213 BEDCOV()
214 : chrom(""),
215 start(0),
216 end(0),
217 name(""),
218 score(""),
219 strand(""),
220 fields(),
221 other_idxs(),
222 zeroLength(false),
223 depthMap(),
224 count(0),
225 minOverlapStart(0)
226 {}
227 };
228
229
230 /*
231 Structure for BED COVERAGE records having lists of
232 multiple coverages
233 */
234 struct BEDCOVLIST {
235
236 // Regular BED fields
237 string chrom;
238 CHRPOS start;
239 CHRPOS end;
240 string name;
241 string score;
242 string strand;
243
244 // all of the original fields in the record
245 vector<string> fields;
246 // indices of the "other" fields
247 vector<uint16_t> other_idxs;
248 // is this a zero length feature: i.e., start == end
249 bool zeroLength;
250
251 // Additional fields specific to computing coverage
252 vector< map<CHRPOS, DEPTH> > depthMapList;
253 vector<unsigned int> counts;
254 vector<CHRPOS> minOverlapStarts;
255
256
257 public:
258 // constructors
259 // Null
BEDCOVLISTBEDCOVLIST260 BEDCOVLIST()
261 : chrom(""),
262 start(0),
263 end(0),
264 name(""),
265 score(""),
266 strand(""),
267 fields(),
268 other_idxs(),
269 zeroLength(false),
270 depthMapList(),
271 counts(0),
272 minOverlapStarts(0)
273 {}
274 };
275
276
277 // enum to flag the state of a given line in a BED file.
278 enum BedLineStatus
279 {
280 BED_INVALID = -1,
281 BED_HEADER = 0,
282 BED_BLANK = 1,
283 BED_VALID = 2
284 };
285
286 // enum to indicate the type of file we are dealing with
287 enum FileType
288 {
289 BED_FILETYPE,
290 GFF_FILETYPE,
291 VCF_FILETYPE
292 };
293
294 //*************************************************
295 // Data structure typedefs
296 //*************************************************
297 typedef vector<BED> bedVector;
298 typedef vector<BEDCOV> bedCovVector;
299 typedef vector<MATE> mateVector;
300 typedef vector<BEDCOVLIST> bedCovListVector;
301
302 typedef map<BIN, bedVector> binsToBeds;
303 typedef map<BIN, bedCovVector> binsToBedCovs;
304 typedef map<BIN, mateVector> binsToMates;
305 typedef map<BIN, bedCovListVector> binsToBedCovLists;
306
307 typedef map<string, binsToBeds> masterBedMap;
308 typedef map<string, binsToBedCovs> masterBedCovMap;
309 typedef map<string, binsToMates> masterMateMap;
310 typedef map<string, binsToBedCovLists> masterBedCovListMap;
311 typedef map<string, bedVector> masterBedMapNoBin;
312
313
314 // return the genome "bin" for a feature with this start and end
315 inline
getBin(CHRPOS start,CHRPOS end)316 BIN getBin(CHRPOS start, CHRPOS end) {
317 --end;
318 start >>= _binFirstShift;
319 end >>= _binFirstShift;
320
321 for (short i = 0; i < _binLevels; ++i) {
322 if (start == end) return _binOffsetsExtended[i] + start;
323 start >>= _binNextShift;
324 end >>= _binNextShift;
325 }
326 cerr << "start " << start << ", end " << end
327 << " out of range in findBin (max is 512M)"
328 << endl;
329 return 0;
330 }
331
332 /****************************************************
333 // isInteger(s): Tests if string s is a valid integer
334 *****************************************************/
isInteger(const std::string & s)335 inline bool isInteger(const std::string& s) {
336 int len = s.length();
337 for (int i = 0; i < len; i++) {
338 if (!std::isdigit(s[i])) return false;
339 }
340 return true;
341 }
342
343
344 // return the amount of overlap between two features. Negative if none and the
345 // number of negative bases is the distance between the two.
346 inline
overlaps(CHRPOS aS,CHRPOS aE,CHRPOS bS,CHRPOS bE)347 CHRPOS overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) {
348 return min(aE, bE) - max(aS, bS);
349 }
350
351 // is A after (to the right of) B?
352 inline
after(const BED & a,const BED & b)353 bool after(const BED &a, const BED &b) {
354 return (a.start >= b.end);
355 }
356
357
358 // Ancillary functions
359 void splitBedIntoBlocks(const BED &bed, bedVector &bedBlocks);
360
361
362 // BED Sorting Methods
363 bool sortByChrom(const BED &a, const BED &b);
364 bool sortByStart(const BED &a, const BED &b);
365 bool sortBySizeAsc(const BED &a, const BED &b);
366 bool sortBySizeDesc(const BED &a, const BED &b);
367 bool sortByScoreAsc(const BED &a, const BED &b);
368 bool sortByScoreDesc(const BED &a, const BED &b);
369 bool byChromThenStart(BED const &a, BED const &b);
370
371
372
373 //************************************************
374 // BedFile Class methods and elements
375 //************************************************
376 class BedFile {
377
378 public:
379
380 // Constructor
381 BedFile(string &);
382 BedFile(void);
383
384 // Destructor
385 ~BedFile(void);
386
387 /********* File management ********/
388 // Open a BED file for reading (creates an istream pointer)
389 void Open(void);
390
391 // Close an opened BED file.
392 void Close(void);
393
394 // are the any intervals left in the file?
395 bool Empty(void);
396
397 // Rewind the pointer back to the beginning of the file
398 void Rewind(void);
399
400 // Jump to a specific byte in the file
401 void Seek(unsigned long offset);
402
403 // dump the header, which is collected as part of Open()
404 void PrintHeader(void);
405
406 // Get the next BED entry in an opened BED file.
407 bool GetNextBed (BED &bed, bool forceSorted = false);
408
409 // Returns the next MERGED (i.e., non-overlapping) interval in
410 // an opened BED file
411 // NOTE: assumes input file is sorted by chrom then start
412 bool GetNextMergedBed(BED &merged_bed);
413
414 // load a BED file into a map keyed by chrom, then bin. value is
415 // vector of BEDs
416 void loadBedFileIntoMap();
417 void loadBedFileIntoMergedMap();
418
419 // load a BED entry into and existing map
420 void addBEDIntoMap(BED bedEntry);
421
422 // load a BED file into a map keyed by chrom, then bin. value is
423 // vector of BEDCOVs
424 void loadBedCovFileIntoMap();
425
426 // load a BED file into a map keyed by chrom, then bin. value is
427 // vector of BEDCOVLISTs
428 void loadBedCovListFileIntoMap();
429
430 // load a BED file into a map keyed by chrom. value is vector of BEDs
431 void loadBedFileIntoMapNoBin();
432
433 // load a BED file into a vector of BEDs
434 void loadBedFileIntoVector();
435
436 // load a BED file into a vector ordered in decreasing order by size
437 void assignWeightsBasedOnSize();
438
439 BED * sizeWeightedSearch(double val);
440
441 // Given a chrom, start, end and strand for a single feature,
442 // search for all overlapping features in another BED file.
443 // Searches through each relevant genome bin on the same chromosome
444 // as the single feature. Note: Adapted from kent source "binKeeperFind"
445 void allHits(string chrom, CHRPOS start, CHRPOS end, string strand,
446 vector<BED> &hits, bool sameStrand, bool diffStrand,
447 float overlapFraction, bool reciprocal);
448
449 // return true if at least one overlap was found. otherwise, return false.
450 bool anyHits(string chrom, CHRPOS start, CHRPOS end, string strand,
451 bool sameStrand, bool diffStrand, float overlapFraction, bool reciprocal);
452
453
454 // Given a chrom, start, end and strand for a single feature,
455 // increment a the number of hits for each feature in B file
456 // that the feature overlaps
457 void countHits(const BED &a, bool sameStrand = false,
458 bool diffStrand = false, bool countsOnly = false);
459
460 // same as above, but has special logic that processes a set of
461 // BED "blocks" from a single entry so as to avoid over-counting
462 // each "block" of a single BAM/BED12 as distinct coverage. That is,
463 // if one read has four block, we only want to count the coverage as
464 // coming from one read, not four.
465 void countSplitHits(const vector<BED> &bedBlock, bool sameStrand = false,
466 bool diffStrand = false, bool countsOnly = false);
467
468 // Given a chrom, start, end and strand for a single feature,
469 // increment a the number of hits for each feature in B file
470 // that the feature overlaps
471 void countListHits(const BED &a, int index,
472 bool sameStrand, bool diffStrand);
473
474
475 // return the total length of all the intervals in the file.
476 // use with GetNextBed()
477 unsigned long getTotalLength(void);
478
479 // return the total _flattened_ length of all the intervals in the file.
480 // use with GetNextMergedBed()
481 unsigned long getTotalFlattenedLength(void);
482
483
484 // the bedfile with which this instance is associated
485 string bedFile;
486 unsigned int bedType; // 3-6, 12 for BED
487 // 9 for GFF
488 bool isBed12; // is it file of true blocked BED12 records?
489 bool isZeroBased;
490
491 // Main data structures used by BEDTools
492 masterBedCovMap bedCovMap;
493 masterBedCovListMap bedCovListMap;
494 masterBedMap bedMap;
495 bedVector bedList;
496 masterBedMapNoBin bedMapNoBin;
497
498 BedLineStatus _status;
499 int _lineNum;
500
501 private:
502
503 // data
504 bool _isGff;
505 bool _isVcf;
506 bool _typeIsKnown; // do we know the type? (i.e., BED, GFF, VCF)
507 FileType _fileType; // what is the file type? (BED? GFF? VCF?)
508 istream *_bedStream;
509 string _bedLine;
510
511 BED _nullBed;
512 string _header;
513 bool _firstLine;
514 vector<string> _bedFields;
515 unsigned int _numFields;
516 CHRPOS _merged_start;
517 CHRPOS _merged_end;
518 string _merged_chrom;
519 CHRPOS _prev_start;
520 string _prev_chrom;
521 unsigned long _total_length;
522 unsigned long _total_flattened_length;
523
524 void setZeroBased(bool zeroBased);
525 void setGff (bool isGff);
526 void setVcf (bool isVcf);
527 void setFileType (FileType type);
528 void setBedType (int colNums);
529 void setBed12 (bool isBed12);
530
531 /************ Private utilities ***********************/
532 void GetHeader(void);
533
534 /******************************************************
535 Private definitions to circumvent linker issues with
536 templated member functions.
537 *******************************************************/
538
539 /*
540 parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
541 */
542 template <typename T>
parseLine(T & bed,const vector<string> & fields)543 inline BedLineStatus parseLine (T &bed, const vector<string> &fields) {
544
545 // clear out the data from the last line.
546 bed = _nullBed;
547 // bail out if we have a blank line
548 if (_numFields == 0) {
549 return BED_BLANK;
550 }
551 // bail out if we have a comment line
552 if ( (fields[0].find("#") == 0) ||
553 (fields[0].find("browser") == 0) ||
554 (fields[0].find("track") == 0)
555 )
556 {
557 return BED_HEADER;
558 }
559
560 if (_numFields >= 3) {
561 // line parsing for all lines after the first non-header line
562 if (_typeIsKnown == true) {
563 switch(_fileType) {
564 case BED_FILETYPE:
565 if (parseBedLine(bed, fields) == true)
566 return BED_VALID;
567 case VCF_FILETYPE:
568 if (parseVcfLine(bed, fields) == true)
569 {
570 return BED_VALID;
571 }
572 case GFF_FILETYPE:
573 if (parseGffLine(bed, fields) == true)
574 return BED_VALID;
575 default:
576 printf("ERROR: file type encountered. Exiting\n");
577 exit(1);
578 }
579 }
580 // line parsing for first non-header line: figure out file contents
581 else {
582 // it's BED format if columns 2 and 3 are integers
583 if (isInteger(fields[1]) && isInteger(fields[2])) {
584 setGff(false);
585 setZeroBased(true);
586 setFileType(BED_FILETYPE);
587 // we now expect numFields columns in each line
588 setBedType(_numFields);
589
590 // test to see if the file has true blocked BED12 records
591 if (_numFields == 12) {
592 CHRPOS cdsStart = stoll(fields[6].c_str());
593 CHRPOS cdsEnd = stoll(fields[7].c_str());
594 int numExons = atoi(fields[9].c_str());
595
596 if (cdsStart > 0 && cdsEnd > 0&& numExons > 0 &&
597 fields[10].find(",") == 0 &&
598 fields[11].find(",") == 0)
599 {
600 setBed12(true);
601 }
602 else setBed12(false);
603 }
604 if (parseBedLine(bed, fields) == true)
605 return BED_VALID;
606 }
607 // it's VCF, assuming the second column is numeric and
608 // there are at least 8 fields.
609 else if (isInteger(fields[1]) && _numFields >= 8) {
610 setGff(false);
611 setVcf(true);
612 setZeroBased(false);
613 setFileType(VCF_FILETYPE);
614 // we now expect numFields columns in each line
615 setBedType(_numFields);
616 if (parseVcfLine(bed, fields) == true)
617 return BED_VALID;
618 }
619 // it's GFF, assuming columns columns 4 and 5 are numeric
620 // and we have 9 fields total.
621 else if ((_numFields >= 8) &&
622 isInteger(fields[3]) &&
623 isInteger(fields[4]))
624 {
625 setGff(true);
626 setZeroBased(false);
627 setFileType(GFF_FILETYPE);
628 // we now expect numFields columns in each line
629 setBedType(_numFields);
630 if (parseGffLine(bed, fields) == true)
631 {
632 return BED_VALID;
633 }
634 }
635 else {
636 cerr << "Unexpected file format. "
637 << "Please use tab-delimited BED, GFF, or VCF. "
638 << "Perhaps you have non-integer starts or ends "
639 << "at line "
640 << _lineNum
641 << "?"
642 << endl;
643 exit(1);
644 }
645 }
646 }
647 else {
648 cerr << "It looks as though you have less than 3 columns at line "
649 << _lineNum
650 << " in file " << bedFile << ". Are you sure your files are tab-delimited?"
651 << endl;
652 exit(1);
653 }
654 // default
655 return BED_INVALID;
656 }
657
658
659 /*
660 parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
661 */
662 template <typename T>
parseBedLine(T & bed,const vector<string> & fields)663 inline bool parseBedLine (T &bed, const vector<string> &fields)
664 {
665 // process as long as the number of fields in this
666 // line matches what we expect for this file.
667 if (_numFields == this->bedType) {
668 bed.fields = fields;
669 bed.chrom = fields[0];
670
671 CHRPOS i;
672 i = stoll(fields[1].c_str());
673 if (i<0) {
674 cerr << "Error: malformed BED entry at line "
675 << _lineNum
676 << ". Start Coordinate detected that is < 0. Exiting."
677 << endl;
678 exit(1);
679 }
680 bed.start = i;
681 i = stoll(fields[2].c_str());
682 if (i<0) {
683 cerr << "Error: malformed BED entry at line "
684 << _lineNum
685 << ". End Coordinate detected that is < 0. Exiting."
686 << endl;
687 exit(1);
688 }
689 bed.end = i;
690
691 // handle starts == end (e.g., insertions in reference genome)
692 if (bed.start == bed.end) {
693 bed.start--;
694 bed.end++;
695 bed.zeroLength = true;
696 }
697
698 if (this->bedType == 4) {
699 bed.name = fields[3];
700 }
701 else if (this->bedType == 5) {
702 bed.name = fields[3];
703 bed.score = fields[4];
704 }
705 else if (this->bedType == 6) {
706 bed.name = fields[3];
707 bed.score = fields[4];
708 bed.strand = fields[5];
709 }
710 else if (this->bedType > 6) {
711 bed.name = fields[3];
712 bed.score = fields[4];
713 bed.strand = fields[5];
714 for (unsigned int i = 6; i < fields.size(); ++i) {
715 bed.other_idxs.push_back(i);
716 }
717 }
718 else if (this->bedType != 3) {
719 cerr << "Error: unexpected number of fields at line: "
720 << _lineNum
721 << ". Verify that your files are TAB-delimited. "
722 << "Exiting..."
723 << endl;
724 exit(1);
725 }
726
727 // sanity checks.
728 if (bed.start <= bed.end) {
729 return true;
730 }
731 else {
732 cerr << "Error: malformed BED entry at line "
733 << _lineNum
734 << ". Start was greater than end. Exiting."
735 << endl;
736 exit(1);
737 }
738 }
739 else if (_numFields == 1) {
740 cerr << "Only one BED field detected: "
741 << _lineNum
742 << ". Verify that your files are TAB-delimited. Exiting..."
743 << endl;
744 exit(1);
745 }
746 else if ((_numFields != this->bedType) && (_numFields != 0)) {
747 cerr << "Differing number of BED fields encountered at line: "
748 << _lineNum
749 << ". Exiting..."
750 << endl;
751 exit(1);
752 }
753 else if ((_numFields < 3) && (_numFields != 0)) {
754 cerr << "TAB delimited BED file with at least 3 fields"
755 << " (chrom, start, end) is required at line: "
756 << _lineNum
757 << ". Exiting..."
758 << endl;
759 exit(1);
760 }
761 return false;
762 }
763
764
765 /*
766 parseVcfLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
767 */
768 template <typename T>
parseVcfLine(T & bed,const vector<string> & fields)769 inline bool parseVcfLine (T &bed, const vector<string> &fields)
770 {
771 if (_numFields >= this->bedType) {
772 bed.fields = fields;
773 bed.chrom = fields[0];
774 // VCF is one-based
775 bed.start = stoll(fields[1].c_str()) - 1;
776 // VCF 4.0 stores the size of the affected REF allele.
777 bed.end = bed.start + fields[3].size();
778 bed.strand = "+";
779 // construct the name from the ref and alt alleles.
780 // if it's an annotated variant, add the rsId as well.
781 bed.name = fields[3] + "/" + fields[4];
782 if (fields[2] != ".") {
783 bed.name += "_" + fields[2];
784 }
785
786 if (this->bedType > 2) {
787 for (unsigned int i = 2; i < _numFields; ++i)
788 bed.other_idxs.push_back(i);
789 }
790
791 if ((bed.start <= bed.end) && (bed.start >= 0) && ((bed.end) >= 0)) {
792 return true;
793 }
794 else if (bed.start > bed.end) {
795 cerr << "Error: malformed VCF entry at line "
796 << _lineNum
797 << ". Start was greater than end. Exiting."
798 << endl;
799 exit(1);
800 }
801 else if ( (bed.start < 0) || ((bed.end) < 0) ) {
802 cerr << "Error: malformed VCF entry at line "
803 << _lineNum << ". Coordinate detected that is < 0. "
804 << "Exiting."
805 << endl;
806 exit(1);
807 }
808 }
809 else if (_numFields == 1) {
810 cerr << "Only one VCF field detected: "
811 << _lineNum
812 << ". Verify that your files are TAB-delimited. "
813 << "Exiting..."
814 << endl;
815 exit(1);
816 }
817 else if ((_numFields != this->bedType) && (_numFields != 0)) {
818 cerr << "Differing number of VCF fields encountered at line: "
819 << _lineNum
820 << ". Exiting..."
821 << endl;
822 exit(1);
823 }
824 else if ((_numFields < 2) && (_numFields != 0)) {
825 cerr << "TAB delimited VCF file with at least 2 fields "
826 << "(chrom, pos) is required at line: "
827 << _lineNum
828 << ". Exiting..."
829 << endl;
830 exit(1);
831 }
832 return false;
833 }
834
835
836
837 /*
838 parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
839 */
840 template <typename T>
parseGffLine(T & bed,const vector<string> & fields)841 inline bool parseGffLine (T &bed, const vector<string> &fields)
842 {
843 if (_numFields == this->bedType) {
844 bed.fields = fields;
845 if (this->bedType >= 8 && _isGff) {
846 bed.chrom = fields[0];
847 if (isInteger(fields[3]))
848 bed.start = stoll(fields[3].c_str());
849 if (isInteger(fields[4]))
850 bed.end = stoll(fields[4].c_str());
851 bed.name = fields[2];
852 bed.score = fields[5];
853 bed.strand = fields[6].c_str();
854 // add GFF "source". unused in BED
855 bed.other_idxs.push_back(1);
856 // add GFF "fname". unused in BED
857 bed.other_idxs.push_back(7);
858 // handle the optional 9th field.
859 if (this->bedType == 9)
860 // add GFF "group". unused in BED
861 bed.other_idxs.push_back(8);
862 bed.start--;
863 }
864 else {
865 cerr << "Error: unexpected number of fields at line: "
866 << _lineNum
867 << ". Verify that your files are TAB-delimited and that "
868 << "your GFF file has 8 or 9 fields. Exiting..."
869 << endl;
870 exit(1);
871 }
872 if (bed.start > bed.end) {
873 cerr << "Error: malformed GFF entry at line "
874 << _lineNum
875 << ". Start was greater than end. Exiting."
876 << endl;
877 exit(1);
878 }
879 if ( (bed.start < 0) || ((bed.end) < 0) ) {
880 cerr << "Error: malformed GFF entry at line "
881 << _lineNum
882 << ". Coordinate detected that is < 1. Exiting."
883 << endl;
884 exit(1);
885 }
886 return true;
887 }
888 else if (_numFields == 1) {
889 cerr << "Only one GFF field detected: "
890 << _lineNum
891 << ". Verify that your files are TAB-delimited. Exiting..."
892 << endl;
893 exit(1);
894 }
895 else if ((_numFields != this->bedType) && (_numFields != 0)) {
896 cerr << "Differing number of GFF fields encountered at line: "
897 << _lineNum
898 << ". Exiting..."
899 << endl;
900 exit(1);
901 }
902 else if ((_numFields < 8) && (_numFields != 0)) {
903 cerr << "TAB delimited GFF file with 8 or 9 fields is required"
904 << " at line: "
905 << _lineNum
906 << ". Exiting..."
907 << endl;
908 exit(1);
909 }
910 return false;
911 }
912
913
914 public:
915
916 /*
917 reportBedTab
918
919 Writes the _original_ BED entry with a TAB
920 at the end of the line.
921 Works for BED3 - BED6.
922 */
923 template <typename T>
reportBedTab(const T & bed)924 inline void reportBedTab(const T &bed) {
925 // if it is azeroLength feature, we need to
926 // correct the start and end coords to what they were
927 // in the original file
928 CHRPOS start = bed.start;
929 CHRPOS end = bed.end;
930 if (bed.zeroLength) {
931 if (_isGff == false)
932 start++;
933 end--;
934 }
935
936 // BED
937 if (_isGff == false && _isVcf == false) {
938 if (this->bedType == 3) {
939 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t", bed.chrom.c_str(), start, end);
940 }
941 else if (this->bedType == 4) {
942 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t",
943 bed.chrom.c_str(), start, end, bed.name.c_str());
944 }
945 else if (this->bedType == 5) {
946 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t",
947 bed.chrom.c_str(), start, end,
948 bed.name.c_str(), bed.score.c_str());
949 }
950 else if (this->bedType == 6) {
951 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
952 bed.chrom.c_str(), start, end,
953 bed.name.c_str(), bed.score.c_str(), bed.strand.c_str());
954 }
955 else if (this->bedType > 6) {
956 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
957 bed.chrom.c_str(), start, end, bed.name.c_str(),
958 bed.score.c_str(), bed.strand.c_str());
959
960 vector<uint16_t>::const_iterator
961 othIt = bed.other_idxs.begin();
962 vector<uint16_t>::const_iterator
963 othEnd = bed.other_idxs.end();
964 for ( ; othIt != othEnd; ++othIt) {
965 printf("%s\t", bed.fields[*othIt].c_str());
966 }
967 }
968 }
969 // VCF
970 else if (_isGff == false && _isVcf == true) {
971 printf ("%s\t%" PRId_CHRPOS "\t", bed.chrom.c_str(), start+1);
972
973 vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin();
974 vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end();
975 for ( ; othIt != othEnd; ++othIt) {
976 printf("%s\t", bed.fields[*othIt].c_str());
977 }
978 }
979 // GFF
980 else if (_isGff == true) {
981 // "GFF-8"
982 if (this->bedType == 8) {
983 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
984 bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
985 bed.name.c_str(), start+1, end, bed.score.c_str(),
986 bed.strand.c_str(), bed.fields[bed.other_idxs[1]].c_str());
987 }
988 // "GFF-9"
989 else if (this->bedType == 9) {
990 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\t",
991 bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
992 bed.name.c_str(), start+1, end,
993 bed.score.c_str(), bed.strand.c_str(),
994 bed.fields[bed.other_idxs[1]].c_str(),
995 bed.fields[bed.other_idxs[2]].c_str());
996 }
997 }
998 }
999
1000
1001
1002 /*
1003 reportToFileBedNewLine
1004
1005 Writes the _original_ BED entry with a NEWLINE
1006 at the end of the line.
1007 Works for BED3 - BED6.
1008 */
1009 template <typename T>
reportToFileBedNewLine(FILE * out,const T & bed)1010 inline void reportToFileBedNewLine(FILE* out,const T &bed) {
1011
1012 // if it is azeroLength feature, we need to
1013 // correct the start and end coords to what they were
1014 // in the original file
1015 CHRPOS start = bed.start;
1016 CHRPOS end = bed.end;
1017 if (bed.zeroLength) {
1018 if (_isGff == false)
1019 start++;
1020 end--;
1021 }
1022 //BED
1023 if (_isGff == false && _isVcf == false) {
1024 if (this->bedType == 3) {
1025 fprintf(out,"%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\n", bed.chrom.c_str(), start, end);
1026 }
1027 else if (this->bedType == 4) {
1028 fprintf(out,"%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\n",
1029 bed.chrom.c_str(), start, end, bed.name.c_str());
1030 }
1031 else if (this->bedType == 5) {
1032 fprintf(out,"%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\n",
1033 bed.chrom.c_str(), start, end,
1034 bed.name.c_str(), bed.score.c_str());
1035 }
1036 else if (this->bedType == 6) {
1037 fprintf(out,"%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\n",
1038 bed.chrom.c_str(), start, end, bed.name.c_str(),
1039 bed.score.c_str(), bed.strand.c_str());
1040 }
1041 else if (this->bedType > 6) {
1042 fprintf(out,"%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s",
1043 bed.chrom.c_str(), start, end, bed.name.c_str(),
1044 bed.score.c_str(), bed.strand.c_str());
1045
1046 vector<uint16_t>::const_iterator
1047 othIt = bed.other_idxs.begin();
1048 vector<uint16_t>::const_iterator
1049 othEnd = bed.other_idxs.end();
1050 for ( ; othIt != othEnd; ++othIt) {
1051 fprintf(out,"\t%s", bed.fields[*othIt].c_str());
1052 }
1053 fprintf(out,"\n");
1054 }
1055 }
1056 // VCF
1057 else if (_isGff == false && _isVcf == true) {
1058 fprintf(out,"%s\t%" PRId_CHRPOS, bed.chrom.c_str(), start+1);
1059
1060 vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin();
1061 vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end();
1062 for ( ; othIt != othEnd; ++othIt) {
1063 fprintf(out,"\t%s", bed.fields[*othIt].c_str());
1064 }
1065 fprintf(out,"\n");
1066 }
1067 // GFF
1068 else if (_isGff == true) {
1069 // "GFF-8"
1070 if (this->bedType == 8) {
1071 fprintf (out,"%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\n",
1072 bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1073 bed.name.c_str(), start+1, end,
1074 bed.score.c_str(), bed.strand.c_str(),
1075 bed.fields[bed.other_idxs[1]].c_str());
1076 }
1077 // "GFF-9"
1078 else if (this->bedType == 9) {
1079 fprintf (out,"%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\n",
1080 bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1081 bed.name.c_str(), start+1, end,
1082 bed.score.c_str(), bed.strand.c_str(),
1083 bed.fields[bed.other_idxs[1]].c_str(),
1084 bed.fields[bed.other_idxs[2]].c_str());
1085 }
1086 }
1087 }
1088
1089 /* specialized version of reportBedNewLine printing to stdout */
1090 template <typename T>
reportBedNewLine(const T & bed)1091 inline void reportBedNewLine(const T &bed) {
1092 reportToFileBedNewLine(stdout,bed);
1093 }
1094
1095 /*
1096 reportBedRangeNewLine
1097
1098 Writes a custom start->end for a BED entry
1099 with a NEWLINE at the end of the line.
1100
1101 Works for BED3 - BED6.
1102 */
1103 template <typename T>
reportBedRangeTab(const T & bed,CHRPOS start,CHRPOS end)1104 inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) {
1105 // if it is azeroLength feature, we need to
1106 // correct the start and end coords to what they were
1107 // in the original file
1108 if (bed.zeroLength) {
1109 start = bed.start + 1;
1110 end = bed.end - 1;
1111 }
1112 // BED
1113 if (_isGff == false && _isVcf == false) {
1114 if (this->bedType == 3) {
1115 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t", bed.chrom.c_str(), start, end);
1116 }
1117 else if (this->bedType == 4) {
1118 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t",
1119 bed.chrom.c_str(), start, end, bed.name.c_str());
1120 }
1121 else if (this->bedType == 5) {
1122 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t",
1123 bed.chrom.c_str(), start, end,
1124 bed.name.c_str(), bed.score.c_str());
1125 }
1126 else if (this->bedType == 6) {
1127 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
1128 bed.chrom.c_str(), start, end, bed.name.c_str(),
1129 bed.score.c_str(), bed.strand.c_str());
1130 }
1131 else if (this->bedType > 6) {
1132 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
1133 bed.chrom.c_str(), start, end, bed.name.c_str(),
1134 bed.score.c_str(), bed.strand.c_str());
1135
1136 vector<uint16_t>::const_iterator
1137 othIt = bed.other_idxs.begin();
1138 vector<uint16_t>::const_iterator
1139 othEnd = bed.other_idxs.end();
1140 for ( ; othIt != othEnd; ++othIt) {
1141 printf("%s\t", bed.fields[*othIt].c_str());
1142 }
1143 }
1144 }
1145 // VCF
1146 else if (_isGff == false && _isVcf == true) {
1147 printf ("%s\t%" PRId_CHRPOS "\t", bed.chrom.c_str(), bed.start+1);
1148 vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin();
1149 vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end();
1150 for ( ; othIt != othEnd; ++othIt) {
1151 printf("%s\t", bed.fields[*othIt].c_str());
1152 }
1153 }
1154 // GFF
1155 else if (_isGff == true) {
1156 // "GFF-8"
1157 if (this->bedType == 8) {
1158 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
1159 bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1160 bed.name.c_str(), start+1, end,
1161 bed.score.c_str(), bed.strand.c_str(),
1162 bed.fields[bed.other_idxs[1]].c_str());
1163 }
1164 // "GFF-9"
1165 else if (this->bedType == 9) {
1166 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\t",
1167 bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1168 bed.name.c_str(), start+1, end,
1169 bed.score.c_str(), bed.strand.c_str(),
1170 bed.fields[bed.other_idxs[1]].c_str(),
1171 bed.fields[bed.other_idxs[2]].c_str());
1172 }
1173 }
1174 }
1175
1176
1177
1178 /*
1179 reportBedRangeTab
1180
1181 Writes a custom start->end for a BED entry
1182 with a TAB at the end of the line.
1183
1184 Works for BED3 - BED6.
1185 */
1186 template <typename T>
reportBedRangeNewLine(const T & bed,CHRPOS start,CHRPOS end)1187 inline void reportBedRangeNewLine(const T &bed, CHRPOS start, CHRPOS end) {
1188
1189 // if it is azeroLength feature, we need to
1190 // correct the start and end coords to what they were
1191 // in the original file
1192 if (bed.zeroLength) {
1193 start = bed.start + 1;
1194 end = bed.end - 1;
1195 }
1196 // BED
1197 if (_isGff == false && _isVcf == false) {
1198 if (this->bedType == 3) {
1199 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\n", bed.chrom.c_str(), start, end);
1200 }
1201 else if (this->bedType == 4) {
1202 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\n",
1203 bed.chrom.c_str(), start, end, bed.name.c_str());
1204 }
1205 else if (this->bedType == 5) {
1206 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\n",
1207 bed.chrom.c_str(), start, end,
1208 bed.name.c_str(), bed.score.c_str());
1209 }
1210 else if (this->bedType == 6) {
1211 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\n",
1212 bed.chrom.c_str(), start, end, bed.name.c_str(),
1213 bed.score.c_str(), bed.strand.c_str());
1214 }
1215 else if (this->bedType > 6) {
1216 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s",
1217 bed.chrom.c_str(), start, end, bed.name.c_str(),
1218 bed.score.c_str(), bed.strand.c_str());
1219
1220 vector<uint16_t>::const_iterator
1221 othIt = bed.other_idxs.begin();
1222 vector<uint16_t>::const_iterator
1223 othEnd = bed.other_idxs.end();
1224 for ( ; othIt != othEnd; ++othIt) {
1225 printf("\t%s", bed.fields[*othIt].c_str());
1226 }
1227 printf("\n");
1228 }
1229 }
1230 // VCF
1231 else if (_isGff == false && _isVcf == true) {
1232 printf ("%s\t%" PRId_CHRPOS, bed.chrom.c_str(), bed.start+1);
1233 vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin();
1234 vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end();
1235 for ( ; othIt != othEnd; ++othIt) {
1236 printf("\t%s", bed.fields[*othIt].c_str());
1237 }
1238 printf("\n");
1239 }
1240 // GFF
1241 else if (_isGff == true) {
1242 // "GFF-8"
1243 if (this->bedType == 8) {
1244 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\n",
1245 bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1246 bed.name.c_str(), start+1, end,
1247 bed.score.c_str(), bed.strand.c_str(), bed.fields[bed.other_idxs[1]].c_str());
1248 }
1249 // "GFF-9"
1250 else if (this->bedType == 9) {
1251 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\n",
1252 bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1253 bed.name.c_str(), start+1, end,
1254 bed.score.c_str(), bed.strand.c_str(),
1255 bed.fields[bed.other_idxs[1]].c_str(),
1256 bed.fields[bed.other_idxs[2]].c_str());
1257 }
1258 }
1259 }
1260
1261
1262 /*
1263 reportNullBedTab
1264 */
reportNullBedTab()1265 void reportNullBedTab() {
1266
1267 if (_isGff == false && _isVcf == false) {
1268 if (this->bedType == 3) {
1269 printf (".\t-1\t-1\t");
1270 }
1271 else if (this->bedType == 4) {
1272 printf (".\t-1\t-1\t.\t");
1273 }
1274 else if (this->bedType == 5) {
1275 printf (".\t-1\t-1\t.\t-1\t");
1276 }
1277 else if (this->bedType == 6) {
1278 printf (".\t-1\t-1\t.\t-1\t.\t");
1279 }
1280 else if (this->bedType > 6) {
1281 printf (".\t-1\t-1\t.\t-1\t.\t");
1282 for (unsigned int i = 6; i < this->bedType; ++i) {
1283 printf(".\t");
1284 }
1285 }
1286 }
1287 else if (_isGff == true && _isVcf == false) {
1288 if (this->bedType == 8) {
1289 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t");
1290 }
1291 else if (this->bedType == 9) {
1292 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t");
1293 }
1294 }
1295 }
1296
1297
1298 /*
1299 reportNullBedTab
1300 */
reportNullBedNewLine()1301 void reportNullBedNewLine() {
1302
1303 if (_isGff == false && _isVcf == false) {
1304 if (this->bedType == 3) {
1305 printf (".\t-1\t-1\n");
1306 }
1307 else if (this->bedType == 4) {
1308 printf (".\t-1\t-1\t.\n");
1309 }
1310 else if (this->bedType == 5) {
1311 printf (".\t-1\t-1\t.\t-1\n");
1312 }
1313 else if (this->bedType == 6) {
1314 printf (".\t-1\t-1\t.\t-1\t.\n");
1315 }
1316 else if (this->bedType > 6) {
1317 printf (".\t-1\t-1\t.\t-1\t.");
1318 for (unsigned int i = 6; i < this->bedType; ++i) {
1319 printf("\t.");
1320 }
1321 printf("\n");
1322 }
1323 }
1324 else if (_isGff == true && _isVcf == false) {
1325 if (this->bedType == 8) {
1326 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\n");
1327 }
1328 else if (this->bedType == 9) {
1329 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n");
1330 }
1331 }
1332 }
1333
1334
1335 };
1336
1337 #endif /* BEDFILE_H */
1338