1 /*****************************************************************************
2 bedpeToBam.cpp
3
4 (c) 2009 - Royden Clark, Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "bedFilePE.h"
14 #include "GenomeFile.h"
15 #include "version.h"
16
17
18 #include "api/BamReader.h"
19 #include "api/BamAux.h"
20 #include "api/BamWriter.h"
21 using namespace BamTools;
22
23 #include <vector>
24 #include <iostream>
25 #include <fstream>
26 #include <stdlib.h>
27
28 using namespace std;
29
30
31 // define our program name
32 #define PROGRAM_NAME "bedpetobam"
33
34 // define our parameter checking macro
35 #define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
36
37
38 // function declarations
39 void bedpetobam_help(void);
40 void ProcessBedPE(BedFilePE *bedpe, GenomeFile *genome, int mapQual, bool uncompressedBam);
41 void ConvertBedPEToBam(const BEDPE &bedpe, BamAlignment &bam1,BamAlignment &bam2, map<string, int> &chromToId, int mapQual, int lineNum);
42
43 void bedpetobam_MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, map<string, int> &chromToInt);
44 int bedpetobam_reg2bin(int beg, int end);
45
46
47
bedpetobam_main(int argc,char * argv[])48 int bedpetobam_main(int argc, char* argv[]) {
49
50 // our configuration variables
51 bool showHelp = false;
52
53 // input files
54 string bedpeFile = "stdin";
55 string genomeFile;
56
57 int mapQual = 255;
58
59 bool haveBedPE = true;
60 bool haveGenome = false;
61 bool haveMapQual = false;
62 // bool isBED12 = false;
63 bool uncompressedBam = false;
64
65 for(int i = 1; i < argc; i++) {
66 int parameterLength = (int)strlen(argv[i]);
67
68 if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
69 (PARAMETER_CHECK("--help", 5, parameterLength))) {
70 showHelp = true;
71 }
72 }
73
74 if(showHelp) bedpetobam_help();
75
76 // do some parsing (all of these parameters require 2 strings)
77 for(int i = 1; i < argc; i++) {
78
79 int parameterLength = (int)strlen(argv[i]);
80
81 if(PARAMETER_CHECK("-i", 2, parameterLength)) {
82 if ((i+1) < argc) {
83 bedpeFile = argv[i + 1];
84 i++;
85 }
86 }
87 else if(PARAMETER_CHECK("-g", 2, parameterLength)) {
88 if ((i+1) < argc) {
89 haveGenome = true;
90 genomeFile = argv[i + 1];
91 i++;
92 }
93 }
94 else if(PARAMETER_CHECK("-mapq", 5, parameterLength)) {
95 haveMapQual = true;
96 if ((i+1) < argc) {
97 mapQual = atoi(argv[i + 1]);
98 i++;
99 }
100 }
101 else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) {
102 uncompressedBam = true;
103 }
104 else {
105 cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
106 showHelp = true;
107 }
108 }
109
110 // make sure we have an input files
111 if (!haveBedPE ) {
112 cerr << endl << "*****" << endl << "*****ERROR: Need -i (BEDPE) file. " << endl << "*****" << endl;
113 showHelp = true;
114 }
115 if (!haveGenome ) {
116 cerr << endl << "*****" << endl << "*****ERROR: Need -g (genome) file. " << endl << "*****" << endl;
117 showHelp = true;
118 }
119 if (mapQual < 0 || mapQual > 255) {
120 cerr << endl << "*****" << endl << "*****ERROR: MAPQ must be in range [0,255]. " << endl << "*****" << endl;
121 showHelp = true;
122 }
123
124
125 if (!showHelp) {
126 BedFilePE *bedpe= new BedFilePE(bedpeFile);
127 GenomeFile *genome = new GenomeFile(genomeFile);
128
129 ProcessBedPE(bedpe, genome, mapQual, uncompressedBam);
130 }
131 else {
132 bedpetobam_help();
133 }
134 return 0;
135 }
136
137
bedpetobam_help(void)138 void bedpetobam_help(void) {
139
140 cerr << "\nTool: bedtools bedpetobam (aka bedpeToBam)" << endl;
141 cerr << "Version: " << VERSION << "\n";
142 cerr << "Summary: Converts feature records to BAM format." << endl << endl;
143
144 cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl;
145
146 cerr << "Options: " << endl;
147
148 cerr << "\t-mapq\t" << "Set the mappinq quality for the BAM records." << endl;
149 cerr << "\t\t(INT) Default: 255" << endl << endl;
150
151 cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default writes compressed BAM." << endl << endl;
152
153 cerr << "Notes: " << endl;
154 cerr << "\t(1) BED files must be at least BED4 to create BAM (needs name field)." << endl << endl;
155
156
157 // end the program here
158 exit(1);
159 }
160
ProcessBedPE(BedFilePE * bedpe,GenomeFile * genome,int mapQual,bool uncompressedBam)161 void ProcessBedPE(BedFilePE *bedpe, GenomeFile *genome, int mapQual, bool uncompressedBam) {
162
163 BamWriter *writer = new BamWriter();
164
165 // build a BAM header from the genomeFile
166 RefVector refs;
167 string bamHeader;
168 map<string, int, std::less<string> > chromToId;
169 bedpetobam_MakeBamHeader(genome->getGenomeFileName(), refs, bamHeader, chromToId);
170
171 // set compression mode
172 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
173 if ( uncompressedBam ) compressionMode = BamWriter::Uncompressed;
174 writer->SetCompressionMode(compressionMode);
175 // open a BAM and add the reference headers to the BAM file
176 writer->Open("stdout", bamHeader, refs);
177
178
179 // process each BED entry and convert to BAM
180 BEDPE bedpeEntry, nullBedpe;
181 int lineNum = 0;
182 BedLineStatus bedpeStatus;
183 // open the BED file for reading.
184 bedpe->Open();
185 while ((bedpeStatus = bedpe->GetNextBedPE(bedpeEntry, lineNum)) != BED_INVALID) {
186 if (bedpeStatus == BED_VALID) {
187 BamAlignment bamEntry1;
188 BamAlignment bamEntry2;
189
190 if (bedpe->bedType >= 10) {
191 ConvertBedPEToBam(bedpeEntry, bamEntry1, bamEntry2, chromToId, mapQual, lineNum);
192 writer->SaveAlignment(bamEntry1, true);
193 writer->SaveAlignment(bamEntry2, true);
194
195 }
196 else {
197 cerr << "Error: BEDPE entry without name found at line: " << lineNum << ". Exiting!" << endl;
198 exit (1);
199 }
200 bedpeEntry = nullBedpe;
201 }
202 }
203 //close up
204 bedpe->Close();
205 writer->Close();
206 }
207
208
209 //void ConvertBedPEToBam(const BEDPE &bedpe, BamAlignment &bam1,BamAlignment &bam2, map<string, int, std::less<string> > &chromToId,
210 // bool isBED12, int mapQual, int lineNum) {
ConvertBedPEToBam(const BEDPE & bedpe,BamAlignment & bam1,BamAlignment & bam2,map<string,int,std::less<string>> & chromToId,int mapQual,int lineNum)211 void ConvertBedPEToBam(const BEDPE &bedpe, BamAlignment &bam1,BamAlignment &bam2, map<string, int, std::less<string> > &chromToId,
212 int mapQual, int lineNum) {
213
214 bam1.Name = bedpe.name;
215 bam1.Position = bedpe.start1;
216 bam1.Bin = bedpetobam_reg2bin(bedpe.start1, bedpe.end1);
217 bam2.Name = bedpe.name;
218 bam2.Position = bedpe.start2;
219 bam2.Bin = bedpetobam_reg2bin(bedpe.start2, bedpe.end2);
220
221 // hard-code the sequence and qualities.
222 int bedpeLength1 = bedpe.end1 - bedpe.start1;
223 int bedpeLength2 = bedpe.end2 - bedpe.start2;
224
225 // set dummy seq and qual strings. the input is BED,
226 // so the sequence is inherently the same as it's
227 // reference genome.
228 // Thanks to James M. Ward for pointing this out.
229 bam1.QueryBases = "";
230 bam1.Qualities = "";
231 bam2.QueryBases = "";
232 bam2.Qualities = "";
233
234 // chrom and map quality
235 bam1.RefID = chromToId[bedpe.chrom1];
236 bam1.MapQuality = mapQual;
237 bam2.RefID = chromToId[bedpe.chrom2];
238 bam2.MapQuality = mapQual;
239
240 // set the BAM FLAG
241 bam1.AlignmentFlag = 0;
242 bam2.AlignmentFlag = 0;
243
244 if (bedpe.strand1 == "-"){
245 bam1.SetIsReverseStrand(true);
246 bam2.SetIsMateReverseStrand(true);
247
248 }
249
250 if (bedpe.strand2 == "-"){
251 bam2.SetIsReverseStrand(true);
252 bam1.SetIsMateReverseStrand(true);
253 }
254 bam1.MatePosition = bedpe.start2;
255
256 if(chromToId[bedpe.chrom1] == chromToId[bedpe.chrom2]){
257 bam1.InsertSize = bedpe.start2-bedpe.start1;
258 bam2.InsertSize = bedpe.start2-bedpe.start1;
259 if((bedpe.strand1 == "+") && (bedpe.strand2 == "-")){
260 bam1.SetIsProperPair(true);
261 bam2.SetIsProperPair(true);
262 }
263 }
264 else{
265 bam1.InsertSize = 0;
266 bam2.InsertSize = 0;
267
268 }
269 bam1.MateRefID = chromToId[bedpe.chrom2];
270 bam2.MatePosition = bedpe.start1;
271 bam2.MateRefID = chromToId[bedpe.chrom1];
272
273 bam1.SetIsFirstMate(true);
274 bam2.SetIsSecondMate(true);
275 bam1.SetIsPaired(true);
276 bam2.SetIsPaired(true);
277
278 bam1.CigarData.clear();
279 bam2.CigarData.clear();
280
281 CigarOp cOp1;
282 cOp1.Type = 'M';
283 cOp1.Length = bedpeLength1;
284 bam1.CigarData.push_back(cOp1);
285 CigarOp cOp2;
286 cOp2.Type = 'M';
287 cOp2.Length = bedpeLength2;
288 bam2.CigarData.push_back(cOp2);
289 }
290
291
bedpetobam_MakeBamHeader(const string & genomeFile,RefVector & refs,string & header,map<string,int,std::less<string>> & chromToId)292 void bedpetobam_MakeBamHeader(const string &genomeFile, RefVector &refs, string &header,
293 map<string, int, std::less<string> > &chromToId) {
294
295 // make a genome map of the genome file.
296 GenomeFile genome(genomeFile);
297
298 header += "@HD\tVN:1.0\tSO:unsorted\n";
299 header += "@PG\tID:BEDTools_bedpeToBam\tVN:V";
300 header += VERSION;
301 header += "\n";
302
303 int chromId = 0;
304 vector<string> chromList = genome.getChromList();
305 sort(chromList.begin(), chromList.end());
306
307 // create a BAM header (@SQ) entry for each chrom in the BEDTools genome file.
308 vector<string>::const_iterator genomeItr = chromList.begin();
309 vector<string>::const_iterator genomeEnd = chromList.end();
310 for (; genomeItr != genomeEnd; ++genomeItr) {
311 chromToId[*genomeItr] = chromId;
312 chromId++;
313
314 // add to the header text
315 int size = genome.getChromSize(*genomeItr);
316 string chromLine = "@SQ\tSN:" + *genomeItr + "\tAS:" + genomeFile + "\tLN:" + ToString(size) + "\n";
317 header += chromLine;
318
319 // create a chrom entry and add it to
320 // the RefVector
321 RefData chrom;
322 chrom.RefName = *genomeItr;
323 chrom.RefLength = size;
324 refs.push_back(chrom);
325 }
326 }
327
328
329 /* Taken directly from the SAMTools spec
330 calculate bin given an alignment in [beg,end) (zero-based, half-close, half-open) */
bedpetobam_reg2bin(int beg,int end)331 int bedpetobam_reg2bin(int beg, int end) {
332 --end;
333 if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14);
334 if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17);
335 if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20);
336 if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23);
337 if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26);
338 return 0;
339 }
340
341
342