1 /*****************************************************************************
2   bedFile.h
3 
4   (c) 2009 - Aaron Quinlan
5   Hall Laboratory
6   Department of Biochemistry and Molecular Genetics
7   University of Virginia
8   aaronquinlan@gmail.com
9 
10   Licensed under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #ifndef BEDFILE_H
13 #define BEDFILE_H
14 
15 // "local" includes
16 #include "BedtoolsTypes.h"
17 #include "gzstream.h"
18 #include "lineFileUtilities.h"
19 #include "fileType.h"
20 
21 // standard includes
22 #include <vector>
23 #include <map>
24 #include <set>
25 #include <string>
26 #include <iostream>
27 #include <fstream>
28 #include <sstream>
29 #include <cstring>
30 #include <algorithm>
31 #include <limits.h>
32 #include <stdint.h>
33 #include <cstdio>
34 //#include <tr1/unordered_map>  // Experimental.
35 using namespace std;
36 
37 
38 //*************************************************
39 // Data type tydedef
40 //*************************************************
41 typedef uint16_t BINLEVEL;
42 typedef uint32_t BIN;
43 typedef uint16_t USHORT;
44 typedef uint32_t UINT;
45 
46 //*************************************************
47 // Genome binning constants
48 //*************************************************
49 const BINLEVEL _binLevels = 8;
50 
51 const BIN _binOffsetsExtended[] = {262144+32678+4096+512+64+8+1, 32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};
52 
53 const USHORT _binFirstShift = 14;       /* How much to shift to get to finest bin. */
54 const USHORT _binNextShift  = 3;        /* How much to shift to get to next larger bin. */
55 
56 const BIN _numBins = (1 << (_binNextShift * _binLevels)) / ((1 << _binNextShift) - 1);
57 
58 
59 //*************************************************
60 // Common data structures
61 //*************************************************
62 
63 struct DEPTH {
64     UINT starts;
65     UINT ends;
66 };
67 
68 
69 /*
70     Structure for regular BED records
71 */
72 struct BED {
73 
74     // Regular BED fields
75     string chrom;
76     CHRPOS start;
77     CHRPOS end;
78     string name;
79     string score;
80     string strand;
81     // all of the original fields in the record
82     vector<string> fields;
83     // indices of the "other" fields
84     vector<uint16_t> other_idxs;
85     // is this a zero length feature: i.e., start == end
86     bool   zeroLength;
87     double weight;
88 
89 public:
90     // constructors
91 
92     // Null
BEDBED93     BED()
94     : chrom(""),
95       start(0),
96       end(0),
97       name(""),
98       score(""),
99       strand(""),
100       fields(),
101       other_idxs(),
102       zeroLength(false),
103       weight(0.0)
104     {}
105 
106     // BED3
BEDBED107     BED(string chrom, CHRPOS start, CHRPOS end)
108     : chrom(chrom),
109       start(start),
110       end(end),
111       name(""),
112       score(""),
113       strand(""),
114       fields(),
115       other_idxs(),
116       zeroLength(false),
117       weight(0.0)
118     {}
119 
120     // BED4
BEDBED121     BED(string chrom, CHRPOS start, CHRPOS end, string strand)
122     : chrom(chrom),
123       start(start),
124       end(end),
125       name(""),
126       score(""),
127       strand(strand),
128       fields(),
129       other_idxs(),
130       zeroLength(false),
131       weight(0.0)
132     {}
133 
134     // BED6
BEDBED135     BED(string chrom, CHRPOS start, CHRPOS end, string name,
136         string score, string strand)
137     : chrom(chrom),
138       start(start),
139       end(end),
140       name(name),
141       score(score),
142       strand(strand),
143       fields(),
144       other_idxs(),
145       zeroLength(false),
146       weight(0.0)
147     {}
148 
149     // BEDALL
BEDBED150     BED(string chrom, CHRPOS start, CHRPOS end, string name,
151         string score, string strand, vector<string> fields,
152         vector<uint16_t> other_idxs)
153     : chrom(chrom),
154       start(start),
155       end(end),
156       name(name),
157       score(score),
158       strand(strand),
159       fields(fields),
160       other_idxs(other_idxs),
161       zeroLength(false),
162       weight(0.0)
163     {}
164 
sizeBED165     CHRPOS size() const {
166         return end-start;
167     }
168 
169 }; // BED
170 
171 
172 /*
173     Structure for each end of a paired BED record
174     mate points to the other end.
175 */
176 struct MATE {
177     BED bed;
178     int lineNum;
179     MATE *mate;
180 };
181 
182 
183 /*
184     Structure for regular BED COVERAGE records
185 */
186 struct BEDCOV {
187 
188     string chrom;
189 
190     // Regular BED fields
191     CHRPOS start;
192     CHRPOS end;
193     string name;
194     string score;
195     string strand;
196 
197     // all of the original fields in the record
198     vector<string> fields;
199     // indices of the "other" fields
200     vector<uint16_t> other_idxs;
201     // is this a zero length feature: i.e., start == end
202     bool   zeroLength;
203 
204     // Additional fields specific to computing coverage
205     map<unsigned int, DEPTH> depthMap;
206     unsigned int count;
207     CHRPOS minOverlapStart;
208 
209 
210     public:
211     // constructors
212     // Null
BEDCOVBEDCOV213     BEDCOV()
214     : chrom(""),
215       start(0),
216       end(0),
217       name(""),
218       score(""),
219       strand(""),
220       fields(),
221       other_idxs(),
222       zeroLength(false),
223       depthMap(),
224       count(0),
225       minOverlapStart(0)
226     {}
227 };
228 
229 
230 /*
231     Structure for BED COVERAGE records having lists of
232     multiple coverages
233 */
234 struct BEDCOVLIST {
235 
236     // Regular BED fields
237     string chrom;
238     CHRPOS start;
239     CHRPOS end;
240     string name;
241     string score;
242     string strand;
243 
244     // all of the original fields in the record
245     vector<string> fields;
246     // indices of the "other" fields
247     vector<uint16_t> other_idxs;
248     // is this a zero length feature: i.e., start == end
249     bool   zeroLength;
250 
251     // Additional fields specific to computing coverage
252     vector< map<CHRPOS, DEPTH> > depthMapList;
253     vector<unsigned int> counts;
254     vector<CHRPOS> minOverlapStarts;
255 
256 
257     public:
258     // constructors
259     // Null
BEDCOVLISTBEDCOVLIST260     BEDCOVLIST()
261     : chrom(""),
262       start(0),
263       end(0),
264       name(""),
265       score(""),
266       strand(""),
267       fields(),
268       other_idxs(),
269       zeroLength(false),
270       depthMapList(),
271       counts(0),
272       minOverlapStarts(0)
273     {}
274 };
275 
276 
277 // enum to flag the state of a given line in a BED file.
278 enum BedLineStatus
279 {
280     BED_INVALID = -1,
281     BED_HEADER  = 0,
282     BED_BLANK   = 1,
283     BED_VALID   = 2
284 };
285 
286 // enum to indicate the type of file we are dealing with
287 enum FileType
288 {
289     BED_FILETYPE,
290     GFF_FILETYPE,
291     VCF_FILETYPE
292 };
293 
294 //*************************************************
295 // Data structure typedefs
296 //*************************************************
297 typedef vector<BED>    bedVector;
298 typedef vector<BEDCOV> bedCovVector;
299 typedef vector<MATE> mateVector;
300 typedef vector<BEDCOVLIST> bedCovListVector;
301 
302 typedef map<BIN, bedVector> binsToBeds;
303 typedef map<BIN, bedCovVector> binsToBedCovs;
304 typedef map<BIN, mateVector> binsToMates;
305 typedef map<BIN, bedCovListVector> binsToBedCovLists;
306 
307 typedef map<string, binsToBeds>    masterBedMap;
308 typedef map<string, binsToBedCovs> masterBedCovMap;
309 typedef map<string, binsToMates> masterMateMap;
310 typedef map<string, binsToBedCovLists> masterBedCovListMap;
311 typedef map<string, bedVector>     masterBedMapNoBin;
312 
313 
314 // return the genome "bin" for a feature with this start and end
315 inline
getBin(CHRPOS start,CHRPOS end)316 BIN getBin(CHRPOS start, CHRPOS end) {
317     --end;
318     start >>= _binFirstShift;
319     end   >>= _binFirstShift;
320 
321     for (short i = 0; i < _binLevels; ++i) {
322         if (start == end) return _binOffsetsExtended[i] + start;
323         start >>= _binNextShift;
324         end   >>= _binNextShift;
325     }
326     cerr << "start " << start << ", end " << end
327          << " out of range in findBin (max is 512M)"
328          << endl;
329     return 0;
330 }
331 
332 /****************************************************
333 // isInteger(s): Tests if string s is a valid integer
334 *****************************************************/
isInteger(const std::string & s)335 inline bool isInteger(const std::string& s) {
336     int len = s.length();
337     for (int i = 0; i < len; i++) {
338         if (!std::isdigit(s[i])) return false;
339     }
340     return true;
341 }
342 
343 
344 // return the amount of overlap between two features.  Negative if none and the
345 // number of negative bases is the distance between the two.
346 inline
overlaps(CHRPOS aS,CHRPOS aE,CHRPOS bS,CHRPOS bE)347 CHRPOS overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) {
348     return min(aE, bE) - max(aS, bS);
349 }
350 
351 // is A after (to the right of) B?
352 inline
after(const BED & a,const BED & b)353 bool after(const BED &a, const BED &b) {
354     return (a.start >= b.end);
355 }
356 
357 
358 // Ancillary functions
359 void splitBedIntoBlocks(const BED &bed, bedVector &bedBlocks);
360 
361 
362 // BED Sorting Methods
363 bool sortByChrom(const BED &a, const BED &b);
364 bool sortByStart(const BED &a, const BED &b);
365 bool sortBySizeAsc(const BED &a, const BED &b);
366 bool sortBySizeDesc(const BED &a, const BED &b);
367 bool sortByScoreAsc(const BED &a, const BED &b);
368 bool sortByScoreDesc(const BED &a, const BED &b);
369 bool byChromThenStart(BED const &a, BED const &b);
370 
371 
372 
373 //************************************************
374 // BedFile Class methods and elements
375 //************************************************
376 class BedFile {
377 
378 public:
379 
380     // Constructor
381     BedFile(string &);
382     BedFile(void);
383 
384     // Destructor
385     ~BedFile(void);
386 
387     /********* File management ********/
388     // Open a BED file for reading (creates an istream pointer)
389     void Open(void);
390 
391     // Close an opened BED file.
392     void Close(void);
393 
394     // are the any intervals left in the file?
395     bool Empty(void);
396 
397     // Rewind the pointer back to the beginning of the file
398     void Rewind(void);
399 
400     // Jump to a specific byte in the file
401     void Seek(unsigned long offset);
402 
403     // dump the header, which is collected as part of Open()
404     void PrintHeader(void);
405 
406     // Get the next BED entry in an opened BED file.
407     bool GetNextBed (BED &bed, bool forceSorted = false);
408 
409     // Returns the next MERGED (i.e., non-overlapping) interval in
410     // an opened BED file
411     // NOTE: assumes input file is sorted by chrom then start
412     bool GetNextMergedBed(BED &merged_bed);
413 
414     // load a BED file into a map keyed by chrom, then bin. value is
415     // vector of BEDs
416     void loadBedFileIntoMap();
417     void loadBedFileIntoMergedMap();
418 
419     // load a BED entry into and existing map
420     void addBEDIntoMap(BED bedEntry);
421 
422     // load a BED file into a map keyed by chrom, then bin. value is
423     // vector of BEDCOVs
424     void loadBedCovFileIntoMap();
425 
426     // load a BED file into a map keyed by chrom, then bin. value is
427     // vector of BEDCOVLISTs
428     void loadBedCovListFileIntoMap();
429 
430     // load a BED file into a map keyed by chrom. value is vector of BEDs
431     void loadBedFileIntoMapNoBin();
432 
433     // load a BED file into a vector of BEDs
434     void loadBedFileIntoVector();
435 
436     // load a BED file into a vector ordered in decreasing order by size
437     void assignWeightsBasedOnSize();
438 
439     BED * sizeWeightedSearch(double val);
440 
441     // Given a chrom, start, end and strand for a single feature,
442     // search for all overlapping features in another BED file.
443     // Searches through each relevant genome bin on the same chromosome
444     // as the single feature. Note: Adapted from kent source "binKeeperFind"
445     void allHits(string chrom, CHRPOS start, CHRPOS end, string strand,
446                  vector<BED> &hits, bool sameStrand, bool diffStrand,
447                  float overlapFraction, bool reciprocal);
448 
449     // return true if at least one overlap was found.  otherwise, return false.
450     bool anyHits(string chrom, CHRPOS start, CHRPOS end, string strand,
451                 bool sameStrand, bool diffStrand, float overlapFraction, bool reciprocal);
452 
453 
454     // Given a chrom, start, end and strand for a single feature,
455     // increment a the number of hits for each feature in B file
456     // that the feature overlaps
457     void countHits(const BED &a, bool sameStrand = false,
458                    bool diffStrand = false, bool countsOnly = false);
459 
460     // same as above, but has special logic that processes a set of
461     // BED "blocks" from a single entry so as to avoid over-counting
462     // each "block" of a single BAM/BED12 as distinct coverage.  That is,
463     // if one read has four block, we only want to count the coverage as
464     // coming from one read, not four.
465     void countSplitHits(const vector<BED> &bedBlock, bool sameStrand = false,
466                         bool diffStrand = false, bool countsOnly = false);
467 
468     // Given a chrom, start, end and strand for a single feature,
469     // increment a the number of hits for each feature in B file
470     // that the feature overlaps
471     void countListHits(const BED &a, int index,
472                        bool sameStrand, bool diffStrand);
473 
474 
475     // return the total length of all the intervals in the file.
476     // use with GetNextBed()
477     unsigned long getTotalLength(void);
478 
479     // return the total _flattened_ length of all the intervals in the file.
480     // use with GetNextMergedBed()
481     unsigned long getTotalFlattenedLength(void);
482 
483 
484     // the bedfile with which this instance is associated
485     string bedFile;
486     unsigned int bedType;  // 3-6, 12 for BED
487                            // 9 for GFF
488     bool isBed12;          // is it file of true blocked BED12 records?
489     bool isZeroBased;
490 
491     // Main data structures used by BEDTools
492     masterBedCovMap      bedCovMap;
493     masterBedCovListMap  bedCovListMap;
494     masterBedMap         bedMap;
495     bedVector            bedList;
496     masterBedMapNoBin    bedMapNoBin;
497 
498     BedLineStatus _status;
499     int _lineNum;
500 
501 private:
502 
503     // data
504     bool _isGff;
505     bool _isVcf;
506     bool _typeIsKnown;        // do we know the type?   (i.e., BED, GFF, VCF)
507     FileType   _fileType;     // what is the file type? (BED? GFF? VCF?)
508     istream   *_bedStream;
509     string _bedLine;
510 
511     BED _nullBed;
512     string _header;
513     bool _firstLine;
514     vector<string> _bedFields;
515     unsigned int _numFields;
516     CHRPOS _merged_start;
517     CHRPOS _merged_end;
518     string _merged_chrom;
519     CHRPOS _prev_start;
520     string _prev_chrom;
521     unsigned long _total_length;
522     unsigned long _total_flattened_length;
523 
524     void setZeroBased(bool zeroBased);
525     void setGff (bool isGff);
526     void setVcf (bool isVcf);
527     void setFileType (FileType type);
528     void setBedType (int colNums);
529     void setBed12 (bool isBed12);
530 
531     /************ Private utilities ***********************/
532     void GetHeader(void);
533 
534     /******************************************************
535     Private definitions to circumvent linker issues with
536     templated member functions.
537     *******************************************************/
538 
539     /*
540         parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
541     */
542     template <typename T>
parseLine(T & bed,const vector<string> & fields)543     inline BedLineStatus parseLine (T &bed, const vector<string> &fields) {
544 
545         // clear out the data from the last line.
546         bed = _nullBed;
547         // bail out if we have a blank line
548         if (_numFields == 0) {
549             return BED_BLANK;
550         }
551         // bail out if we have a comment line
552         if ( (fields[0].find("#")       == 0) ||
553              (fields[0].find("browser") == 0) ||
554              (fields[0].find("track")   == 0)
555            )
556         {
557             return BED_HEADER;
558         }
559 
560         if (_numFields >= 3) {
561             // line parsing for all lines after the first non-header line
562             if (_typeIsKnown == true) {
563                 switch(_fileType) {
564                     case BED_FILETYPE:
565                         if (parseBedLine(bed, fields) == true)
566                             return BED_VALID;
567                     case VCF_FILETYPE:
568                         if (parseVcfLine(bed, fields) == true)
569                         {
570                             return BED_VALID;
571                         }
572                     case GFF_FILETYPE:
573                         if (parseGffLine(bed, fields) == true)
574                                 return BED_VALID;
575                     default:
576                         printf("ERROR: file type encountered. Exiting\n");
577                         exit(1);
578                 }
579             }
580             // line parsing for first non-header line: figure out file contents
581             else {
582                 // it's BED format if columns 2 and 3 are integers
583                 if (isInteger(fields[1]) && isInteger(fields[2])) {
584                     setGff(false);
585                     setZeroBased(true);
586                     setFileType(BED_FILETYPE);
587                     // we now expect numFields columns in each line
588                     setBedType(_numFields);
589 
590                     // test to see if the file has true blocked BED12 records
591                     if (_numFields == 12) {
592                         CHRPOS cdsStart = stoll(fields[6].c_str());
593                         CHRPOS cdsEnd   = stoll(fields[7].c_str());
594                         int numExons = atoi(fields[9].c_str());
595 
596                         if (cdsStart > 0 && cdsEnd > 0&& numExons > 0 &&
597                             fields[10].find(",") == 0 &&
598                             fields[11].find(",") == 0)
599                         {
600                             setBed12(true);
601                         }
602                         else setBed12(false);
603                     }
604                     if (parseBedLine(bed, fields) == true)
605                         return BED_VALID;
606                 }
607                 // it's VCF, assuming the second column is numeric and
608                 // there are at least 8 fields.
609                 else if (isInteger(fields[1]) && _numFields >= 8) {
610                     setGff(false);
611                     setVcf(true);
612                     setZeroBased(false);
613                     setFileType(VCF_FILETYPE);
614                     // we now expect numFields columns in each line
615                     setBedType(_numFields);
616                     if (parseVcfLine(bed, fields) == true)
617                         return BED_VALID;
618                 }
619                 // it's GFF, assuming columns columns 4 and 5 are numeric
620                 // and we have 9 fields total.
621                 else if ((_numFields >= 8) &&
622                           isInteger(fields[3]) &&
623                           isInteger(fields[4]))
624                 {
625                     setGff(true);
626                     setZeroBased(false);
627                     setFileType(GFF_FILETYPE);
628                     // we now expect numFields columns in each line
629                     setBedType(_numFields);
630                     if (parseGffLine(bed, fields) == true)
631                     {
632                         return BED_VALID;
633                     }
634                 }
635                 else {
636                     cerr << "Unexpected file format.  "
637                          << "Please use tab-delimited BED, GFF, or VCF. "
638                          << "Perhaps you have non-integer starts or ends "
639                          << "at line "
640                          << _lineNum
641                          << "?"
642                          << endl;
643                     exit(1);
644                 }
645             }
646         }
647         else {
648             cerr << "It looks as though you have less than 3 columns at line "
649                  << _lineNum
650                  << " in file " << bedFile << ".  Are you sure your files are tab-delimited?"
651                  << endl;
652             exit(1);
653         }
654         // default
655         return BED_INVALID;
656     }
657 
658 
659     /*
660         parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
661     */
662     template <typename T>
parseBedLine(T & bed,const vector<string> & fields)663     inline bool parseBedLine (T &bed, const vector<string> &fields)
664     {
665         // process as long as the number of fields in this
666         // line matches what we expect for this file.
667         if (_numFields == this->bedType) {
668             bed.fields = fields;
669             bed.chrom = fields[0];
670 
671             CHRPOS i;
672             i = stoll(fields[1].c_str());
673             if (i<0) {
674                  cerr << "Error: malformed BED entry at line "
675                       << _lineNum
676                       << ". Start Coordinate detected that is < 0. Exiting."
677                       << endl;
678                  exit(1);
679             }
680             bed.start = i;
681             i = stoll(fields[2].c_str());
682             if (i<0) {
683                 cerr << "Error: malformed BED entry at line "
684                      << _lineNum
685                      << ". End Coordinate detected that is < 0. Exiting."
686                      << endl;
687                 exit(1);
688             }
689             bed.end = i;
690 
691             // handle starts == end (e.g., insertions in reference genome)
692             if (bed.start == bed.end) {
693                 bed.start--;
694                 bed.end++;
695                 bed.zeroLength = true;
696             }
697 
698             if (this->bedType == 4) {
699                 bed.name = fields[3];
700             }
701             else if (this->bedType == 5) {
702                 bed.name = fields[3];
703                 bed.score = fields[4];
704             }
705             else if (this->bedType == 6) {
706                 bed.name = fields[3];
707                 bed.score = fields[4];
708                 bed.strand = fields[5];
709             }
710             else if (this->bedType > 6) {
711                 bed.name = fields[3];
712                 bed.score = fields[4];
713                 bed.strand = fields[5];
714                 for (unsigned int i = 6; i < fields.size(); ++i) {
715                     bed.other_idxs.push_back(i);
716                 }
717             }
718             else if (this->bedType != 3) {
719                 cerr << "Error: unexpected number of fields at line: "
720                      << _lineNum
721                      << ".  Verify that your files are TAB-delimited. "
722                      << "Exiting..."
723                      << endl;
724                 exit(1);
725             }
726 
727             // sanity checks.
728             if (bed.start <= bed.end) {
729                 return true;
730             }
731             else {
732                 cerr << "Error: malformed BED entry at line "
733                      << _lineNum
734                      << ". Start was greater than end. Exiting."
735                      << endl;
736                 exit(1);
737             }
738         }
739         else if (_numFields == 1) {
740             cerr << "Only one BED field detected: "
741                  << _lineNum
742                  << ".  Verify that your files are TAB-delimited.  Exiting..."
743                  << endl;
744             exit(1);
745         }
746         else if ((_numFields != this->bedType) && (_numFields != 0)) {
747             cerr << "Differing number of BED fields encountered at line: "
748                  << _lineNum
749                  << ".  Exiting..."
750                  << endl;
751             exit(1);
752         }
753         else if ((_numFields < 3) && (_numFields != 0)) {
754             cerr << "TAB delimited BED file with at least 3 fields"
755                  << " (chrom, start, end) is required at line: "
756                  << _lineNum
757                  << ".  Exiting..."
758                  << endl;
759             exit(1);
760         }
761         return false;
762     }
763 
764 
765     /*
766         parseVcfLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
767     */
768     template <typename T>
parseVcfLine(T & bed,const vector<string> & fields)769     inline bool parseVcfLine (T &bed, const vector<string> &fields)
770     {
771         if (_numFields >= this->bedType) {
772             bed.fields = fields;
773             bed.chrom  = fields[0];
774             // VCF is one-based
775             bed.start  = stoll(fields[1].c_str()) - 1;
776             // VCF 4.0 stores the size of the affected REF allele.
777             bed.end    = bed.start + fields[3].size();
778             bed.strand = "+";
779             // construct the name from the ref and alt alleles.
780             // if it's an annotated variant, add the rsId as well.
781             bed.name   = fields[3] + "/" + fields[4];
782             if (fields[2] != ".") {
783                 bed.name += "_" + fields[2];
784             }
785 
786             if (this->bedType > 2) {
787                 for (unsigned int i = 2; i < _numFields; ++i)
788                     bed.other_idxs.push_back(i);
789             }
790 
791             if ((bed.start <= bed.end) && (bed.start >= 0) && ((bed.end) >= 0)) {
792                 return true;
793             }
794             else if (bed.start > bed.end) {
795                 cerr << "Error: malformed VCF entry at line "
796                     << _lineNum
797                     << ". Start was greater than end. Exiting."
798                     << endl;
799                 exit(1);
800             }
801             else if ( (bed.start < 0) || ((bed.end) < 0) ) {
802                 cerr << "Error: malformed VCF entry at line "
803                      << _lineNum << ". Coordinate detected that is < 0. "
804                      << "Exiting."
805                      << endl;
806                 exit(1);
807             }
808         }
809         else if (_numFields == 1) {
810             cerr << "Only one VCF field detected: "
811                  << _lineNum
812                  << ".  Verify that your files are TAB-delimited. "
813                  << "Exiting..."
814                  << endl;
815             exit(1);
816         }
817         else if ((_numFields != this->bedType) && (_numFields != 0)) {
818             cerr << "Differing number of VCF fields encountered at line: "
819                  << _lineNum
820                  << ".  Exiting..."
821                  << endl;
822             exit(1);
823         }
824         else if ((_numFields < 2) && (_numFields != 0)) {
825             cerr << "TAB delimited VCF file with at least 2 fields "
826                  << "(chrom, pos) is required at line: "
827                  << _lineNum
828                  << ".  Exiting..."
829                  << endl;
830             exit(1);
831         }
832         return false;
833     }
834 
835 
836 
837     /*
838         parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
839     */
840     template <typename T>
parseGffLine(T & bed,const vector<string> & fields)841     inline bool parseGffLine (T &bed, const vector<string> &fields)
842     {
843         if (_numFields == this->bedType) {
844             bed.fields = fields;
845             if (this->bedType >= 8 && _isGff) {
846                 bed.chrom = fields[0];
847                 if (isInteger(fields[3]))
848                     bed.start  = stoll(fields[3].c_str());
849                 if (isInteger(fields[4]))
850                     bed.end  = stoll(fields[4].c_str());
851                 bed.name   = fields[2];
852                 bed.score  = fields[5];
853                 bed.strand = fields[6].c_str();
854                 // add GFF "source". unused in BED
855                 bed.other_idxs.push_back(1);
856                 // add GFF "fname". unused in BED
857                 bed.other_idxs.push_back(7);
858                 // handle the optional 9th field.
859                 if (this->bedType == 9)
860                     // add GFF "group". unused in BED
861                     bed.other_idxs.push_back(8);
862                 bed.start--;
863             }
864             else {
865                 cerr << "Error: unexpected number of fields at line: "
866                     << _lineNum
867                     << ".  Verify that your files are TAB-delimited and that "
868                     << "your GFF file has 8 or 9 fields.  Exiting..."
869                     << endl;
870                 exit(1);
871             }
872             if (bed.start > bed.end) {
873                 cerr << "Error: malformed GFF entry at line "
874                      << _lineNum
875                      << ". Start was greater than end. Exiting."
876                      << endl;
877                 exit(1);
878             }
879             if ( (bed.start < 0) || ((bed.end) < 0) ) {
880                 cerr << "Error: malformed GFF entry at line "
881                      << _lineNum
882                      << ". Coordinate detected that is < 1. Exiting."
883                      << endl;
884                 exit(1);
885             }
886             return true;
887         }
888         else if (_numFields == 1) {
889             cerr << "Only one GFF field detected: "
890                  << _lineNum
891                  << ".  Verify that your files are TAB-delimited.  Exiting..."
892                  << endl;
893             exit(1);
894         }
895         else if ((_numFields != this->bedType) && (_numFields != 0)) {
896             cerr << "Differing number of GFF fields encountered at line: "
897                  << _lineNum
898                  << ".  Exiting..."
899                  << endl;
900             exit(1);
901         }
902         else if ((_numFields < 8) && (_numFields != 0)) {
903             cerr << "TAB delimited GFF file with 8 or 9 fields is required"
904                  << " at line: "
905                  << _lineNum
906                  << ".  Exiting..."
907                  << endl;
908             exit(1);
909         }
910         return false;
911     }
912 
913 
914 public:
915 
916     /*
917         reportBedTab
918 
919         Writes the _original_ BED entry with a TAB
920         at the end of the line.
921         Works for BED3 - BED6.
922     */
923     template <typename T>
reportBedTab(const T & bed)924     inline void reportBedTab(const T &bed) {
925         // if it is azeroLength feature, we need to
926         // correct the start and end coords to what they were
927         // in the original file
928         CHRPOS start = bed.start;
929         CHRPOS end   = bed.end;
930         if (bed.zeroLength) {
931             if (_isGff == false)
932                 start++;
933             end--;
934         }
935 
936         // BED
937         if (_isGff == false && _isVcf == false) {
938             if (this->bedType == 3) {
939                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t", bed.chrom.c_str(), start, end);
940             }
941             else if (this->bedType == 4) {
942                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t",
943                     bed.chrom.c_str(), start, end, bed.name.c_str());
944             }
945             else if (this->bedType == 5) {
946                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t",
947                     bed.chrom.c_str(), start, end,
948                     bed.name.c_str(), bed.score.c_str());
949             }
950             else if (this->bedType == 6) {
951                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
952                     bed.chrom.c_str(), start, end,
953                     bed.name.c_str(), bed.score.c_str(), bed.strand.c_str());
954             }
955             else if (this->bedType > 6) {
956                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
957                     bed.chrom.c_str(), start, end, bed.name.c_str(),
958                     bed.score.c_str(), bed.strand.c_str());
959 
960                 vector<uint16_t>::const_iterator
961                     othIt  = bed.other_idxs.begin();
962                 vector<uint16_t>::const_iterator
963                     othEnd = bed.other_idxs.end();
964                 for ( ; othIt != othEnd; ++othIt) {
965                     printf("%s\t", bed.fields[*othIt].c_str());
966                 }
967             }
968         }
969         // VCF
970         else if (_isGff == false && _isVcf == true) {
971             printf ("%s\t%" PRId_CHRPOS "\t", bed.chrom.c_str(), start+1);
972 
973             vector<uint16_t>::const_iterator othIt  = bed.other_idxs.begin();
974             vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end();
975             for ( ; othIt != othEnd; ++othIt) {
976                 printf("%s\t", bed.fields[*othIt].c_str());
977             }
978         }
979         // GFF
980         else if (_isGff == true) {
981             // "GFF-8"
982             if (this->bedType == 8) {
983                 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
984                     bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
985                     bed.name.c_str(), start+1, end, bed.score.c_str(),
986                     bed.strand.c_str(), bed.fields[bed.other_idxs[1]].c_str());
987             }
988             // "GFF-9"
989             else if (this->bedType == 9) {
990                 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\t",
991                     bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
992                     bed.name.c_str(), start+1, end,
993                     bed.score.c_str(), bed.strand.c_str(),
994                     bed.fields[bed.other_idxs[1]].c_str(),
995                     bed.fields[bed.other_idxs[2]].c_str());
996             }
997         }
998     }
999 
1000 
1001 
1002     /*
1003         reportToFileBedNewLine
1004 
1005         Writes the _original_ BED entry with a NEWLINE
1006         at the end of the line.
1007         Works for BED3 - BED6.
1008     */
1009     template <typename T>
reportToFileBedNewLine(FILE * out,const T & bed)1010     inline void reportToFileBedNewLine(FILE* out,const T &bed) {
1011 
1012         // if it is azeroLength feature, we need to
1013         // correct the start and end coords to what they were
1014         // in the original file
1015         CHRPOS start = bed.start;
1016         CHRPOS end   = bed.end;
1017         if (bed.zeroLength) {
1018             if (_isGff == false)
1019                 start++;
1020             end--;
1021         }
1022         //BED
1023         if (_isGff == false && _isVcf == false) {
1024             if (this->bedType == 3) {
1025                 fprintf(out,"%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\n", bed.chrom.c_str(), start, end);
1026             }
1027             else if (this->bedType == 4) {
1028                 fprintf(out,"%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\n",
1029                     bed.chrom.c_str(), start, end, bed.name.c_str());
1030             }
1031             else if (this->bedType == 5) {
1032                 fprintf(out,"%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\n",
1033                     bed.chrom.c_str(), start, end,
1034                     bed.name.c_str(), bed.score.c_str());
1035             }
1036             else if (this->bedType == 6) {
1037                 fprintf(out,"%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\n",
1038                     bed.chrom.c_str(), start, end, bed.name.c_str(),
1039                     bed.score.c_str(), bed.strand.c_str());
1040             }
1041             else if (this->bedType > 6) {
1042                 fprintf(out,"%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s",
1043                     bed.chrom.c_str(), start, end, bed.name.c_str(),
1044                     bed.score.c_str(), bed.strand.c_str());
1045 
1046                 vector<uint16_t>::const_iterator
1047                     othIt  = bed.other_idxs.begin();
1048                 vector<uint16_t>::const_iterator
1049                     othEnd = bed.other_idxs.end();
1050                 for ( ; othIt != othEnd; ++othIt) {
1051                     fprintf(out,"\t%s", bed.fields[*othIt].c_str());
1052                 }
1053                 fprintf(out,"\n");
1054             }
1055         }
1056         // VCF
1057         else if (_isGff == false && _isVcf == true) {
1058             fprintf(out,"%s\t%" PRId_CHRPOS, bed.chrom.c_str(), start+1);
1059 
1060             vector<uint16_t>::const_iterator othIt  = bed.other_idxs.begin();
1061             vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end();
1062             for ( ; othIt != othEnd; ++othIt) {
1063                 fprintf(out,"\t%s", bed.fields[*othIt].c_str());
1064             }
1065             fprintf(out,"\n");
1066         }
1067         // GFF
1068         else if (_isGff == true) {
1069             // "GFF-8"
1070             if (this->bedType == 8) {
1071                 fprintf (out,"%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\n",
1072                     bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1073                     bed.name.c_str(), start+1, end,
1074                     bed.score.c_str(), bed.strand.c_str(),
1075                     bed.fields[bed.other_idxs[1]].c_str());
1076             }
1077             // "GFF-9"
1078             else if (this->bedType == 9) {
1079                 fprintf (out,"%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\n",
1080                     bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1081                     bed.name.c_str(), start+1, end,
1082                     bed.score.c_str(), bed.strand.c_str(),
1083                     bed.fields[bed.other_idxs[1]].c_str(),
1084                     bed.fields[bed.other_idxs[2]].c_str());
1085             }
1086         }
1087     }
1088 
1089     /* specialized version of reportBedNewLine printing to stdout */
1090      template <typename T>
reportBedNewLine(const T & bed)1091      inline void reportBedNewLine(const T &bed) {
1092      reportToFileBedNewLine(stdout,bed);
1093      }
1094 
1095     /*
1096         reportBedRangeNewLine
1097 
1098         Writes a custom start->end for a BED entry
1099         with a NEWLINE at the end of the line.
1100 
1101         Works for BED3 - BED6.
1102     */
1103     template <typename T>
reportBedRangeTab(const T & bed,CHRPOS start,CHRPOS end)1104     inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) {
1105         // if it is azeroLength feature, we need to
1106         // correct the start and end coords to what they were
1107         // in the original file
1108         if (bed.zeroLength) {
1109             start = bed.start + 1;
1110             end   = bed.end - 1;
1111         }
1112         // BED
1113         if (_isGff == false && _isVcf == false) {
1114             if (this->bedType == 3) {
1115                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t", bed.chrom.c_str(), start, end);
1116             }
1117             else if (this->bedType == 4) {
1118                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t",
1119                     bed.chrom.c_str(), start, end, bed.name.c_str());
1120             }
1121             else if (this->bedType == 5) {
1122                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t",
1123                     bed.chrom.c_str(), start, end,
1124                     bed.name.c_str(), bed.score.c_str());
1125             }
1126             else if (this->bedType == 6) {
1127                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
1128                     bed.chrom.c_str(), start, end, bed.name.c_str(),
1129                     bed.score.c_str(), bed.strand.c_str());
1130             }
1131             else if (this->bedType > 6) {
1132                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
1133                     bed.chrom.c_str(), start, end, bed.name.c_str(),
1134                     bed.score.c_str(), bed.strand.c_str());
1135 
1136                 vector<uint16_t>::const_iterator
1137                     othIt  = bed.other_idxs.begin();
1138                 vector<uint16_t>::const_iterator
1139                     othEnd = bed.other_idxs.end();
1140                 for ( ; othIt != othEnd; ++othIt) {
1141                     printf("%s\t", bed.fields[*othIt].c_str());
1142                 }
1143             }
1144         }
1145         // VCF
1146         else if (_isGff == false && _isVcf == true) {
1147             printf ("%s\t%" PRId_CHRPOS "\t", bed.chrom.c_str(), bed.start+1);
1148             vector<uint16_t>::const_iterator othIt  = bed.other_idxs.begin();
1149             vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end();
1150             for ( ; othIt != othEnd; ++othIt) {
1151                 printf("%s\t", bed.fields[*othIt].c_str());
1152             }
1153         }
1154         // GFF
1155         else if (_isGff == true) {
1156             // "GFF-8"
1157             if (this->bedType == 8) {
1158                 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t",
1159                     bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1160                     bed.name.c_str(), start+1, end,
1161                     bed.score.c_str(), bed.strand.c_str(),
1162                     bed.fields[bed.other_idxs[1]].c_str());
1163             }
1164             // "GFF-9"
1165             else if (this->bedType == 9) {
1166                 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\t",
1167                     bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1168                     bed.name.c_str(), start+1, end,
1169                     bed.score.c_str(), bed.strand.c_str(),
1170                     bed.fields[bed.other_idxs[1]].c_str(),
1171                     bed.fields[bed.other_idxs[2]].c_str());
1172             }
1173         }
1174     }
1175 
1176 
1177 
1178     /*
1179         reportBedRangeTab
1180 
1181         Writes a custom start->end for a BED entry
1182         with a TAB at the end of the line.
1183 
1184         Works for BED3 - BED6.
1185     */
1186     template <typename T>
reportBedRangeNewLine(const T & bed,CHRPOS start,CHRPOS end)1187     inline void reportBedRangeNewLine(const T &bed, CHRPOS start, CHRPOS end) {
1188 
1189         // if it is azeroLength feature, we need to
1190         // correct the start and end coords to what they were
1191         // in the original file
1192         if (bed.zeroLength) {
1193             start = bed.start + 1;
1194             end   = bed.end - 1;
1195         }
1196         // BED
1197         if (_isGff == false && _isVcf == false) {
1198             if (this->bedType == 3) {
1199                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\n", bed.chrom.c_str(), start, end);
1200             }
1201             else if (this->bedType == 4) {
1202                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\n",
1203                     bed.chrom.c_str(), start, end, bed.name.c_str());
1204             }
1205             else if (this->bedType == 5) {
1206                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\n",
1207                     bed.chrom.c_str(), start, end,
1208                     bed.name.c_str(), bed.score.c_str());
1209             }
1210             else if (this->bedType == 6) {
1211                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\n",
1212                     bed.chrom.c_str(), start, end, bed.name.c_str(),
1213                     bed.score.c_str(), bed.strand.c_str());
1214             }
1215             else if (this->bedType > 6) {
1216                 printf ("%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s",
1217                     bed.chrom.c_str(), start, end, bed.name.c_str(),
1218                     bed.score.c_str(), bed.strand.c_str());
1219 
1220                 vector<uint16_t>::const_iterator
1221                     othIt  = bed.other_idxs.begin();
1222                 vector<uint16_t>::const_iterator
1223                     othEnd = bed.other_idxs.end();
1224                 for ( ; othIt != othEnd; ++othIt) {
1225                     printf("\t%s", bed.fields[*othIt].c_str());
1226                 }
1227                 printf("\n");
1228             }
1229         }
1230         // VCF
1231         else if (_isGff == false && _isVcf == true) {
1232             printf ("%s\t%" PRId_CHRPOS, bed.chrom.c_str(), bed.start+1);
1233             vector<uint16_t>::const_iterator othIt  = bed.other_idxs.begin();
1234             vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end();
1235             for ( ; othIt != othEnd; ++othIt) {
1236                 printf("\t%s", bed.fields[*othIt].c_str());
1237             }
1238             printf("\n");
1239         }
1240         // GFF
1241         else if (_isGff == true) {
1242             // "GFF-8"
1243             if (this->bedType == 8) {
1244                 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\n",
1245                     bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1246                     bed.name.c_str(), start+1, end,
1247                     bed.score.c_str(), bed.strand.c_str(),                                          bed.fields[bed.other_idxs[1]].c_str());
1248             }
1249             // "GFF-9"
1250             else if (this->bedType == 9) {
1251                 printf ("%s\t%s\t%s\t%" PRId_CHRPOS "\t%" PRId_CHRPOS "\t%s\t%s\t%s\t%s\n",
1252                     bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
1253                     bed.name.c_str(), start+1, end,
1254                     bed.score.c_str(), bed.strand.c_str(),
1255                     bed.fields[bed.other_idxs[1]].c_str(),
1256                     bed.fields[bed.other_idxs[2]].c_str());
1257             }
1258         }
1259     }
1260 
1261 
1262     /*
1263         reportNullBedTab
1264     */
reportNullBedTab()1265     void reportNullBedTab() {
1266 
1267         if (_isGff == false && _isVcf == false) {
1268             if (this->bedType == 3) {
1269                 printf (".\t-1\t-1\t");
1270             }
1271             else if (this->bedType == 4) {
1272                 printf (".\t-1\t-1\t.\t");
1273             }
1274             else if (this->bedType == 5) {
1275                 printf (".\t-1\t-1\t.\t-1\t");
1276             }
1277             else if (this->bedType == 6) {
1278                 printf (".\t-1\t-1\t.\t-1\t.\t");
1279             }
1280             else if (this->bedType > 6) {
1281                 printf (".\t-1\t-1\t.\t-1\t.\t");
1282                 for (unsigned int i = 6; i < this->bedType; ++i) {
1283                     printf(".\t");
1284                 }
1285             }
1286         }
1287         else if (_isGff == true && _isVcf == false) {
1288             if (this->bedType == 8) {
1289                 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t");
1290             }
1291             else if (this->bedType == 9) {
1292                 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t");
1293             }
1294         }
1295     }
1296 
1297 
1298     /*
1299         reportNullBedTab
1300     */
reportNullBedNewLine()1301     void reportNullBedNewLine() {
1302 
1303         if (_isGff == false && _isVcf == false) {
1304             if (this->bedType == 3) {
1305                 printf (".\t-1\t-1\n");
1306             }
1307             else if (this->bedType == 4) {
1308                 printf (".\t-1\t-1\t.\n");
1309             }
1310             else if (this->bedType == 5) {
1311                 printf (".\t-1\t-1\t.\t-1\n");
1312             }
1313             else if (this->bedType == 6) {
1314                 printf (".\t-1\t-1\t.\t-1\t.\n");
1315             }
1316             else if (this->bedType > 6) {
1317                 printf (".\t-1\t-1\t.\t-1\t.");
1318                 for (unsigned int i = 6; i < this->bedType; ++i) {
1319                     printf("\t.");
1320                 }
1321                 printf("\n");
1322             }
1323         }
1324         else if (_isGff == true && _isVcf == false) {
1325             if (this->bedType == 8) {
1326                 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\n");
1327             }
1328             else if (this->bedType == 9) {
1329                 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n");
1330             }
1331         }
1332     }
1333 
1334 
1335 };
1336 
1337 #endif /* BEDFILE_H */
1338