1 #include <cstring>
2 #include <string>
3 #include <map>
4
5 #include "constants.h"
6 #include "BamI.h"
7
8 using namespace std;
9
10 const uint32_t o = UINT32_MAX;
11 const uint32_t cigar_c2i[128] = {
12 o,o,o,o, o,o,o,o, o,o,o,o, o,o,o,o,
13 o,o,o,o, o,o,o,o, o,o,o,o, o,o,o,o,
14 o,o,o,o, o,o,o,o, o,o,o,o, o,o,o,o,
15 o,o,o,o, o,o,o,o, o,o,o,o, o,7,o,o,
16
17 o,o,9,o, 2,o,o,o, 5,1,o,o, o,0,3,o,
18 6,o,o,4, o,o,o,o, 8,o,o,o, o,o,o,o,
19 o,o,o,o, o,o,o,o, o,o,o,o, o,o,o,o,
20 o,o,o,o, o,o,o,o, o,o,o,o, o,o,o,o,
21 };
22
assign(const string & name,uint16_t flg,int32_t chr_index,int32_t aln_pos,const vector<pair<char,uint>> & cig,const DNASeq4 & seq,size_t read_group)23 void BamRecord::assign(
24 const string& name,
25 uint16_t flg,
26 int32_t chr_index,
27 int32_t aln_pos,
28 const vector<pair<char,uint>>& cig,
29 const DNASeq4& seq,
30 size_t read_group
31 ) {
32 if (empty())
33 reinit();
34
35 // bam1_t::core
36 r_->core.tid = chr_index;
37 r_->core.pos = aln_pos;
38 r_->core.bin = 0; // No idea
39 r_->core.qual = 255;
40 r_->core.l_qname = name.length() + 1; // `l_qname` includes the trailing \0.
41 r_->core.flag = flg;
42 r_->core.n_cigar = cig.size();
43 r_->core.l_qseq = seq.length();
44 r_->core.mtid = -1;
45 r_->core.mpos = -1;
46 r_->core.isize = -1; // No idea
47
48 // bam1_t::data
49 // Htslib says: "bam1_t::data -- all variable-length data, concatenated;
50 // structure: qname-cigar-seq-qual-aux, concatenated".
51
52 // Prepare the `aux` data.
53 string rg = string() + "RG" + "Z" + to_string(read_group);
54
55 // Determine the length of `data`.
56 size_t l_aux = rg.length() + 1;
57 r_->l_data = r_->core.l_qname + r_->core.n_cigar*sizeof(uint32_t) + seq.nbytes() + seq.length() + l_aux;
58 if ((uint)r_->l_data > r_->m_data) {
59 if (r_->data != NULL)
60 free(r_->data);
61 r_->m_data = r_->l_data;
62 r_->data = (uchar*) malloc(r_->m_data);
63 }
64
65 // Fill the data array.
66 uchar* p = r_->data;
67 //qname
68 strcpy((char*)p, name.c_str());
69 p += r_->core.l_qname;
70 //cigar
71 for (const pair<char, uint>& op : cig) {
72 // Cigars are uint32_t's with the length on the 28 high bits & op on the low 4 bits.
73 *(uint32_t*)p = (uint32_t(op.second) <<BAM_CIGAR_SHIFT) | cigar_c2i[size_t(op.first)];
74 p += sizeof(uint32_t);
75 }
76 //seq & qual
77 memcpy(p, seq.vdata(), seq.nbytes());
78 p += seq.nbytes();
79 memset(p, 0xFF, seq.length());
80 p += seq.length();
81 //aux
82 memcpy(p, rg.c_str(), rg.length()+1);
83
84 // bam1_t::core.bin
85 // c.f. `sam_parse1()`; I have no idea what this is.
86 uint32_t* cigar = (uint32_t*)(r_->data + r_->core.l_qname);
87 r_->core.bin = hts_reg2bin(r_->core.pos, r_->core.pos + bam_cigar2rlen(r_->core.n_cigar, cigar), 14, 5);
88 }
89
BamHeader(const char * text,size_t len)90 BamHeader::BamHeader(const char* text, size_t len) {
91 h_ = sam_hdr_parse(len+1, text); // null-terminated
92 if (h_ == NULL)
93 throw ios::failure("sam_hdr_parse");
94 h_->l_text = len+1;
95 h_->text = (char*) malloc(h_->l_text);
96 strcpy(h_->text, text);
97 }
98
check_same_ref_chroms(const BamHeader & h1,const BamHeader & h2)99 void BamHeader::check_same_ref_chroms(
100 const BamHeader& h1,
101 const BamHeader& h2
102 ) {
103 if (h1.n_ref_chroms() != h2.n_ref_chroms()) {
104 cerr << "Error: Headers have different number of chromosomes.\n";
105 throw exception();
106 }
107 for (size_t i=0; i<h1.n_ref_chroms(); ++i) {
108 if (strcmp(h1.chrom_str(i), h2.chrom_str(i)) != 0) {
109 cerr << "Error: Conflicting names for the " << i+1 << "th chromosome, '"
110 << h1.chrom_str(i) << "' and '" << h2.chrom_str(i) << "'.\n";
111 throw exception();
112 }
113 if (h1.chrom_len(i) != h2.chrom_len(i)) {
114 cerr << "Error: " << i+1 << "th chromosome has lengths "
115 << h1.chrom_len(i) << " and " << h2.chrom_len(i) << ".\n";
116 throw exception();
117 }
118 }
119 }
120
Bam(const char * path)121 Bam::Bam(const char *path)
122 :
123 Input(),
124 bam_fh(hts_open(path, "r")),
125 hdr(),
126 eof_(false),
127 n_records_read_(0),
128 prev_chrom_(0),
129 prev_pos_(0)
130 {
131 this->path = string(path);
132 check_open(bam_fh, path);
133 if (bam_fh->format.format != bam) {
134 cerr << "Error: '" << path << "':";
135 if (bam_fh->format.format == sam)
136 cerr << " this is a SAM file (and BAM was specified).\n";
137 else
138 cerr << " not a BAM file.\n";
139 throw exception();
140 }
141 hdr.reinit(bam_fh);
142 };
143
Bam(const string & path,BamHeader && header)144 Bam::Bam(const string& path, BamHeader&& header)
145 :
146 Input(),
147 bam_fh(hts_open(path.c_str(), "wb")),
148 hdr(move(header)),
149 eof_(false),
150 n_records_read_(0),
151 prev_chrom_(0),
152 prev_pos_(0)
153 {
154 this->path = path;
155 check_open(bam_fh, path);
156
157 // Write the header.
158 int rv = bam_hdr_write(bam_fh->fp.bgzf, hdr.hts());
159 if (rv != 0) {
160 cerr << "Error: Writing of BAM header failed (`bam_hdr_write()`returned " << rv << ").\n";
161 throw ios::failure("bam_hdr_write");
162 }
163 }
164
check_open(const htsFile * bam_f,const string & path)165 void Bam::check_open(const htsFile* bam_f, const string& path) {
166 if (bam_f == NULL) {
167 #pragma omp critical (bam_check_open)
168 {
169 cerr << "Error: Failed to open BAM file '" << path << "'.\n";
170 }
171 throw exception();
172 }
173 }
174
175 Seq *
next_seq()176 Bam::next_seq()
177 {
178 Seq* s = new Seq();
179 if(next_seq(*s) != 1) {
180 delete s;
181 s = NULL;
182 }
183 return s;
184 }
185
186 int
next_seq(Seq & s)187 Bam::next_seq(Seq& s)
188 {
189 //
190 // Read a record
191 //
192 BamRecord rec;
193 if (!next_record(rec))
194 return false;
195
196 //
197 // Fetch the sequence.
198 //
199 string seq;
200 seq.reserve(rec.hts()->core.l_qseq);
201 for (int i = 0; i < rec.hts()->core.l_qseq; i++) {
202 uint8_t j = bam_seqi(bam_get_seq(rec.hts()), i);
203 switch(j) {
204 case 1:
205 seq += 'A';
206 break;
207 case 2:
208 seq += 'C';
209 break;
210 case 4:
211 seq += 'G';
212 break;
213 case 8:
214 seq += 'T';
215 break;
216 case 15:
217 seq += 'N';
218 break;
219 default:
220 DOES_NOT_HAPPEN;
221 break;
222 }
223 }
224
225 //
226 // Fetch the quality score.
227 //
228 string qual;
229 uint8_t *q = bam_get_qual(rec.hts());
230 for (int i = 0; i < rec.hts()->core.l_qseq; i++) {
231 qual += char(int(q[i]) + 33);
232 }
233
234 AlnT aln_type;
235 if (rec.is_unmapped())
236 aln_type = AlnT::null;
237 else if (rec.is_secondary())
238 aln_type = AlnT::secondary;
239 else if (rec.is_supplementary())
240 aln_type = AlnT::supplementary;
241 else
242 aln_type = AlnT::primary;
243
244 if (aln_type == AlnT::null) {
245 s = Seq(rec.qname(), seq.c_str(), qual.c_str());
246 } else {
247 //
248 // Check which strand this is aligned to:
249 // SAM reference: FLAG bit 0x10 - sequence is reverse complemented
250 //
251 strand_type strand = rec.is_rev_compl() ? strand_minus : strand_plus;
252
253 //
254 // Parse the alignment CIGAR string.
255 // If aligned to the negative strand, sequence has been reverse complemented and
256 // CIGAR string should be interpreted in reverse.
257 //
258 Cigar cigar = rec.cigar();
259 if (strand == strand_minus)
260 std::reverse(cigar.begin(), cigar.end());
261
262 //
263 // If the read was aligned on the reverse strand (and is therefore reverse complemented)
264 // alter the start point of the alignment to reflect the right-side of the read, at the
265 // end of the RAD cut site.
266 //
267 uint bp = bam_find_start_bp(rec.pos(), strand, cigar);
268
269 //
270 // Calculate the percentage of the sequence that was aligned to the reference.
271 //
272 uint clipped = 0;
273 for (auto& op : cigar)
274 if (op.first == 'S')
275 clipped += op.second;
276 double pct_clipped = (double) clipped / seq.length();
277
278 string name = rec.qname();
279 if (rec.is_read1())
280 name += "/1";
281 else if (rec.is_read2())
282 name += "/2";
283 s = Seq(name.c_str(), seq.c_str(), qual.c_str(),
284 hdr.chrom_str(rec.chrom()), bp, strand,
285 aln_type, pct_clipped, rec.mapq());
286
287 if (cigar.size() > 0)
288 bam_edit_gaps(cigar, s.seq);
289 }
290
291 return true;
292 }
293
294 int
bam_find_start_bp(int aln_bp,strand_type strand,const Cigar & cigar)295 bam_find_start_bp(int aln_bp, strand_type strand, const Cigar& cigar)
296 {
297 if (strand == strand_plus) {
298 if (cigar.at(0).first == 'S')
299 aln_bp -= cigar.at(0).second;
300 } else {
301 // assert(strand == strand_minus);
302 for (uint i = 0; i < cigar.size(); i++) {
303 char op = cigar[i].first;
304 uint dist = cigar[i].second;
305
306 switch(op) {
307 case 'I':
308 case 'H':
309 break;
310 case 'S':
311 if (i < cigar.size() - 1)
312 aln_bp += dist;
313 break;
314 case 'M':
315 case '=':
316 case 'X':
317 case 'D':
318 case 'N':
319 aln_bp += dist;
320 break;
321 default:
322 break;
323 }
324 }
325 aln_bp -= 1;
326 }
327
328 return aln_bp;
329 }
330
331 int
bam_edit_gaps(const Cigar & cigar,char * seq)332 bam_edit_gaps(const Cigar& cigar, char *seq)
333 {
334 char *buf;
335 uint size = cigar.size();
336 char op;
337 uint dist, bp, len, buf_len, buf_size, j, k, stop;
338
339 len = strlen(seq);
340 bp = 0;
341
342 buf = new char[len + 1];
343 buf_size = len + 1;
344
345 for (uint i = 0; i < size; i++) {
346 op = cigar[i].first;
347 dist = cigar[i].second;
348
349 switch(op) {
350 case 'S':
351 stop = bp + dist;
352 stop = stop > len ? len : stop;
353 while (bp < stop) {
354 seq[bp] = 'N';
355 bp++;
356 }
357 break;
358 case 'D':
359 //
360 // A deletion has occured in the read relative to the reference genome.
361 // Pad the read with sufficent Ns to match the deletion, shifting the existing
362 // sequence down. Trim the final length to keep the read length consistent.
363 //
364 k = bp >= len ? len : bp;
365
366 strncpy(buf, seq + k, buf_size - 1);
367 buf[buf_size - 1] = '\0';
368 buf_len = strlen(buf);
369
370 stop = bp + dist;
371 stop = stop > len ? len : stop;
372 while (bp < stop) {
373 seq[bp] = 'N';
374 bp++;
375 }
376
377 j = bp;
378 k = 0;
379 while (j < len && k < buf_len) {
380 seq[j] = buf[k];
381 k++;
382 j++;
383 }
384 break;
385 case 'I':
386 //
387 // An insertion has occurred in the read relative to the reference genome. Delete the
388 // inserted bases and pad the end of the read with Ns.
389 //
390 if (bp >= len) break;
391
392 k = bp + dist > len ? len : bp + dist;
393 strncpy(buf, seq + k, buf_size - 1);
394 buf[buf_size - 1] = '\0';
395 buf_len = strlen(buf);
396
397 j = bp;
398 k = 0;
399 while (j < len && k < buf_len) {
400 seq[j] = buf[k];
401 k++;
402 j++;
403 }
404
405 stop = j + dist;
406 stop = stop > len ? len : stop;
407 while (j < stop) {
408 seq[j] = 'N';
409 j++;
410 }
411 break;
412 case 'M':
413 case '=':
414 case 'X':
415 bp += dist;
416 break;
417 default:
418 break;
419 }
420 }
421
422 delete [] buf;
423
424 return 0;
425 }
426