1 /*****************************************************************************
2   bedpeToBam.cpp
3 
4   (c) 2009 - Royden Clark, Aaron Quinlan
5   Hall Laboratory
6   Department of Biochemistry and Molecular Genetics
7   University of Virginia
8   aaronquinlan@gmail.com
9 
10   Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "bedFilePE.h"
14 #include "GenomeFile.h"
15 #include "version.h"
16 
17 
18 #include "api/BamReader.h"
19 #include "api/BamAux.h"
20 #include "api/BamWriter.h"
21 using namespace BamTools;
22 
23 #include <vector>
24 #include <iostream>
25 #include <fstream>
26 #include <stdlib.h>
27 
28 using namespace std;
29 
30 
31 // define our program name
32 #define PROGRAM_NAME "bedpetobam"
33 
34 // define our parameter checking macro
35 #define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
36 
37 
38 // function declarations
39 void bedpetobam_help(void);
40 void ProcessBedPE(BedFilePE *bedpe, GenomeFile *genome,  int mapQual, bool uncompressedBam);
41 void ConvertBedPEToBam(const BEDPE &bedpe, BamAlignment &bam1,BamAlignment &bam2, map<string, int> &chromToId, int mapQual, int lineNum);
42 
43 void bedpetobam_MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, map<string, int> &chromToInt);
44 int  bedpetobam_reg2bin(int beg, int end);
45 
46 
47 
bedpetobam_main(int argc,char * argv[])48 int bedpetobam_main(int argc, char* argv[]) {
49 
50     // our configuration variables
51     bool showHelp = false;
52 
53     // input files
54     string bedpeFile = "stdin";
55     string genomeFile;
56 
57     int mapQual = 255;
58 
59     bool haveBedPE         = true;
60     bool haveGenome      = false;
61     bool haveMapQual     = false;
62    // bool isBED12         = false;
63     bool uncompressedBam = false;
64 
65     for(int i = 1; i < argc; i++) {
66         int parameterLength = (int)strlen(argv[i]);
67 
68         if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
69         (PARAMETER_CHECK("--help", 5, parameterLength))) {
70             showHelp = true;
71         }
72     }
73 
74     if(showHelp) bedpetobam_help();
75 
76     // do some parsing (all of these parameters require 2 strings)
77     for(int i = 1; i < argc; i++) {
78 
79         int parameterLength = (int)strlen(argv[i]);
80 
81         if(PARAMETER_CHECK("-i", 2, parameterLength)) {
82             if ((i+1) < argc) {
83                 bedpeFile = argv[i + 1];
84                 i++;
85             }
86         }
87         else if(PARAMETER_CHECK("-g", 2, parameterLength)) {
88             if ((i+1) < argc) {
89                 haveGenome = true;
90                 genomeFile = argv[i + 1];
91                 i++;
92             }
93         }
94         else if(PARAMETER_CHECK("-mapq", 5, parameterLength)) {
95             haveMapQual = true;
96             if ((i+1) < argc) {
97                 mapQual = atoi(argv[i + 1]);
98                 i++;
99             }
100         }
101         else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) {
102             uncompressedBam = true;
103         }
104         else {
105             cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
106             showHelp = true;
107         }
108     }
109 
110     // make sure we have an input files
111     if (!haveBedPE ) {
112         cerr << endl << "*****" << endl << "*****ERROR: Need -i (BEDPE) file. " << endl << "*****" << endl;
113         showHelp = true;
114     }
115     if (!haveGenome ) {
116         cerr << endl << "*****" << endl << "*****ERROR: Need -g (genome) file. " << endl << "*****" << endl;
117         showHelp = true;
118     }
119     if (mapQual < 0 || mapQual > 255) {
120         cerr << endl << "*****" << endl << "*****ERROR: MAPQ must be in range [0,255]. " << endl << "*****" << endl;
121         showHelp = true;
122     }
123 
124 
125     if (!showHelp) {
126         BedFilePE *bedpe= new BedFilePE(bedpeFile);
127         GenomeFile *genome = new GenomeFile(genomeFile);
128 
129        ProcessBedPE(bedpe, genome,  mapQual, uncompressedBam);
130     }
131     else {
132         bedpetobam_help();
133     }
134     return 0;
135 }
136 
137 
bedpetobam_help(void)138 void bedpetobam_help(void) {
139 
140     cerr << "\nTool:    bedtools bedpetobam (aka bedpeToBam)" << endl;
141     cerr << "Version: " << VERSION << "\n";
142     cerr << "Summary: Converts feature records to BAM format." << endl << endl;
143 
144     cerr << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl;
145 
146     cerr << "Options: " << endl;
147 
148     cerr << "\t-mapq\t" << "Set the mappinq quality for the BAM records." << endl;
149     cerr                    << "\t\t(INT) Default: 255" << endl << endl;
150 
151     cerr << "\t-ubam\t"     << "Write uncompressed BAM output. Default writes compressed BAM." << endl << endl;
152 
153     cerr << "Notes: " << endl;
154     cerr << "\t(1)  BED files must be at least BED4 to create BAM (needs name field)." << endl << endl;
155 
156 
157     // end the program here
158     exit(1);
159 }
160 
ProcessBedPE(BedFilePE * bedpe,GenomeFile * genome,int mapQual,bool uncompressedBam)161 void ProcessBedPE(BedFilePE *bedpe, GenomeFile *genome,  int mapQual, bool uncompressedBam) {
162 
163     BamWriter *writer = new BamWriter();
164 
165     // build a BAM header from the genomeFile
166     RefVector refs;
167     string    bamHeader;
168     map<string, int, std::less<string> > chromToId;
169     bedpetobam_MakeBamHeader(genome->getGenomeFileName(), refs, bamHeader, chromToId);
170 
171     // set compression mode
172     BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
173     if ( uncompressedBam ) compressionMode = BamWriter::Uncompressed;
174     writer->SetCompressionMode(compressionMode);
175     // open a BAM and add the reference headers to the BAM file
176     writer->Open("stdout", bamHeader, refs);
177 
178 
179     // process each BED entry and convert to BAM
180     BEDPE bedpeEntry, nullBedpe;
181     int lineNum = 0;
182     BedLineStatus bedpeStatus;
183     // open the BED file for reading.
184     bedpe->Open();
185     while ((bedpeStatus = bedpe->GetNextBedPE(bedpeEntry, lineNum)) != BED_INVALID) {
186         if (bedpeStatus == BED_VALID) {
187             BamAlignment bamEntry1;
188            	BamAlignment bamEntry2;
189 
190             if (bedpe->bedType >= 10) {
191                 ConvertBedPEToBam(bedpeEntry, bamEntry1, bamEntry2, chromToId,  mapQual, lineNum);
192                 writer->SaveAlignment(bamEntry1, true);
193                 writer->SaveAlignment(bamEntry2, true);
194 
195             }
196             else {
197                 cerr << "Error: BEDPE entry without name found at line: " << lineNum << ".  Exiting!" << endl;
198                 exit (1);
199             }
200             bedpeEntry = nullBedpe;
201         }
202     }
203     //close up
204     bedpe->Close();
205     writer->Close();
206 }
207 
208 
209 //void ConvertBedPEToBam(const BEDPE &bedpe, BamAlignment &bam1,BamAlignment &bam2, map<string, int, std::less<string> > &chromToId,
210  //                    bool isBED12, int mapQual, int lineNum) {
ConvertBedPEToBam(const BEDPE & bedpe,BamAlignment & bam1,BamAlignment & bam2,map<string,int,std::less<string>> & chromToId,int mapQual,int lineNum)211 void ConvertBedPEToBam(const BEDPE &bedpe, BamAlignment &bam1,BamAlignment &bam2, map<string, int, std::less<string> > &chromToId,
212 	                    int mapQual, int lineNum) {
213 
214     bam1.Name       = bedpe.name;
215     bam1.Position   = bedpe.start1;
216     bam1.Bin        = bedpetobam_reg2bin(bedpe.start1, bedpe.end1);
217     bam2.Name       = bedpe.name;
218     bam2.Position   = bedpe.start2;
219     bam2.Bin        = bedpetobam_reg2bin(bedpe.start2, bedpe.end2);
220 
221     // hard-code the sequence and qualities.
222     int bedpeLength1  = bedpe.end1 - bedpe.start1;
223     int bedpeLength2  = bedpe.end2 - bedpe.start2;
224 
225     // set dummy seq and qual strings.  the input is BED,
226     // so the sequence is inherently the same as it's
227     // reference genome.
228     // Thanks to James M. Ward for pointing this out.
229     bam1.QueryBases = "";
230     bam1.Qualities  = "";
231  	bam2.QueryBases = "";
232     bam2.Qualities  = "";
233 
234     // chrom and map quality
235     bam1.RefID      = chromToId[bedpe.chrom1];
236     bam1.MapQuality = mapQual;
237     bam2.RefID      = chromToId[bedpe.chrom2];
238     bam2.MapQuality = mapQual;
239 
240     // set the BAM FLAG
241     bam1.AlignmentFlag = 0;
242     bam2.AlignmentFlag = 0;
243 
244    if (bedpe.strand1 == "-"){
245         bam1.SetIsReverseStrand(true);
246 		bam2.SetIsMateReverseStrand(true);
247 
248 	}
249 
250    if (bedpe.strand2 == "-"){
251         bam2.SetIsReverseStrand(true);
252 		bam1.SetIsMateReverseStrand(true);
253 	}
254     bam1.MatePosition = bedpe.start2;
255 
256 	if(chromToId[bedpe.chrom1] == chromToId[bedpe.chrom2]){
257 		bam1.InsertSize   = bedpe.start2-bedpe.start1;
258 		bam2.InsertSize   = bedpe.start2-bedpe.start1;
259 		if((bedpe.strand1 == "+") && (bedpe.strand2 == "-")){
260 			bam1.SetIsProperPair(true);
261 			bam2.SetIsProperPair(true);
262 		}
263 	}
264 	else{
265 		bam1.InsertSize   = 0;
266 		bam2.InsertSize   = 0;
267 
268 	}
269     bam1.MateRefID    = chromToId[bedpe.chrom2];
270     bam2.MatePosition = bedpe.start1;
271     bam2.MateRefID    = chromToId[bedpe.chrom1];
272 
273 	bam1.SetIsFirstMate(true);
274 	bam2.SetIsSecondMate(true);
275 	bam1.SetIsPaired(true);
276 	bam2.SetIsPaired(true);
277 
278     bam1.CigarData.clear();
279 	bam2.CigarData.clear();
280 
281     CigarOp cOp1;
282     cOp1.Type = 'M';
283     cOp1.Length = bedpeLength1;
284     bam1.CigarData.push_back(cOp1);
285     CigarOp cOp2;
286     cOp2.Type = 'M';
287     cOp2.Length = bedpeLength2;
288     bam2.CigarData.push_back(cOp2);
289 }
290 
291 
bedpetobam_MakeBamHeader(const string & genomeFile,RefVector & refs,string & header,map<string,int,std::less<string>> & chromToId)292 void bedpetobam_MakeBamHeader(const string &genomeFile, RefVector &refs, string &header,
293                    map<string, int, std::less<string> > &chromToId) {
294 
295     // make a genome map of the genome file.
296     GenomeFile genome(genomeFile);
297 
298     header += "@HD\tVN:1.0\tSO:unsorted\n";
299     header += "@PG\tID:BEDTools_bedpeToBam\tVN:V";
300     header += VERSION;
301     header += "\n";
302 
303     int chromId = 0;
304     vector<string> chromList = genome.getChromList();
305     sort(chromList.begin(), chromList.end());
306 
307     // create a BAM header (@SQ) entry for each chrom in the BEDTools genome file.
308     vector<string>::const_iterator genomeItr  = chromList.begin();
309     vector<string>::const_iterator genomeEnd  = chromList.end();
310     for (; genomeItr != genomeEnd; ++genomeItr) {
311         chromToId[*genomeItr] = chromId;
312         chromId++;
313 
314         // add to the header text
315         int size = genome.getChromSize(*genomeItr);
316         string chromLine = "@SQ\tSN:" + *genomeItr + "\tAS:" + genomeFile + "\tLN:" + ToString(size) + "\n";
317         header += chromLine;
318 
319         // create a chrom entry and add it to
320         // the RefVector
321         RefData chrom;
322         chrom.RefName            = *genomeItr;
323         chrom.RefLength          = size;
324         refs.push_back(chrom);
325     }
326 }
327 
328 
329 /* Taken directly from the SAMTools spec
330 calculate bin given an alignment in [beg,end) (zero-based, half-close, half-open) */
bedpetobam_reg2bin(int beg,int end)331 int bedpetobam_reg2bin(int beg, int end) {
332     --end;
333     if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14);
334     if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17);
335     if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20);
336     if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23);
337     if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26);
338     return 0;
339 }
340 
341 
342