1 #include <cstring>
2 #include <string>
3 #include <map>
4 
5 #include "constants.h"
6 #include "BamI.h"
7 
8 using namespace std;
9 
10 const uint32_t o = UINT32_MAX;
11 const uint32_t cigar_c2i[128] = {
12     o,o,o,o, o,o,o,o, o,o,o,o, o,o,o,o,
13     o,o,o,o, o,o,o,o, o,o,o,o, o,o,o,o,
14     o,o,o,o, o,o,o,o, o,o,o,o, o,o,o,o,
15     o,o,o,o, o,o,o,o, o,o,o,o, o,7,o,o,
16 
17     o,o,9,o, 2,o,o,o, 5,1,o,o, o,0,3,o,
18     6,o,o,4, o,o,o,o, 8,o,o,o, o,o,o,o,
19     o,o,o,o, o,o,o,o, o,o,o,o, o,o,o,o,
20     o,o,o,o, o,o,o,o, o,o,o,o, o,o,o,o,
21 };
22 
assign(const string & name,uint16_t flg,int32_t chr_index,int32_t aln_pos,const vector<pair<char,uint>> & cig,const DNASeq4 & seq,size_t read_group)23 void BamRecord::assign(
24         const string& name,
25         uint16_t flg,
26         int32_t chr_index,
27         int32_t aln_pos,
28         const vector<pair<char,uint>>& cig,
29         const DNASeq4& seq,
30         size_t read_group
31         ) {
32     if (empty())
33         reinit();
34 
35     // bam1_t::core
36     r_->core.tid = chr_index;
37     r_->core.pos = aln_pos;
38     r_->core.bin = 0; // No idea
39     r_->core.qual = 255;
40     r_->core.l_qname = name.length() + 1; // `l_qname` includes the trailing \0.
41     r_->core.flag = flg;
42     r_->core.n_cigar = cig.size();
43     r_->core.l_qseq = seq.length();
44     r_->core.mtid = -1;
45     r_->core.mpos = -1;
46     r_->core.isize = -1; // No idea
47 
48     // bam1_t::data
49     // Htslib says: "bam1_t::data -- all variable-length data, concatenated;
50     // structure: qname-cigar-seq-qual-aux, concatenated".
51 
52     // Prepare the `aux` data.
53     string rg = string() + "RG" + "Z" + to_string(read_group);
54 
55     // Determine the length of `data`.
56     size_t l_aux = rg.length() + 1;
57     r_->l_data = r_->core.l_qname + r_->core.n_cigar*sizeof(uint32_t) + seq.nbytes() + seq.length() + l_aux;
58     if ((uint)r_->l_data > r_->m_data) {
59         if (r_->data != NULL)
60             free(r_->data);
61         r_->m_data = r_->l_data;
62         r_->data = (uchar*) malloc(r_->m_data);
63     }
64 
65     // Fill the data array.
66     uchar* p = r_->data;
67     //qname
68     strcpy((char*)p, name.c_str());
69     p += r_->core.l_qname;
70     //cigar
71     for (const pair<char, uint>& op : cig) {
72         // Cigars are uint32_t's with the length on the 28 high bits & op on the low 4 bits.
73         *(uint32_t*)p = (uint32_t(op.second) <<BAM_CIGAR_SHIFT) | cigar_c2i[size_t(op.first)];
74         p += sizeof(uint32_t);
75     }
76     //seq & qual
77     memcpy(p, seq.vdata(), seq.nbytes());
78     p += seq.nbytes();
79     memset(p, 0xFF, seq.length());
80     p += seq.length();
81     //aux
82     memcpy(p, rg.c_str(), rg.length()+1);
83 
84     // bam1_t::core.bin
85     // c.f. `sam_parse1()`; I have no idea what this is.
86     uint32_t* cigar = (uint32_t*)(r_->data + r_->core.l_qname);
87     r_->core.bin = hts_reg2bin(r_->core.pos, r_->core.pos + bam_cigar2rlen(r_->core.n_cigar, cigar), 14, 5);
88 }
89 
BamHeader(const char * text,size_t len)90 BamHeader::BamHeader(const char* text, size_t len) {
91     h_ = sam_hdr_parse(len+1, text); // null-terminated
92     if (h_ == NULL)
93         throw ios::failure("sam_hdr_parse");
94     h_->l_text = len+1;
95     h_->text = (char*) malloc(h_->l_text);
96     strcpy(h_->text, text);
97 }
98 
check_same_ref_chroms(const BamHeader & h1,const BamHeader & h2)99 void BamHeader::check_same_ref_chroms(
100         const BamHeader& h1,
101         const BamHeader& h2
102 ) {
103     if (h1.n_ref_chroms() != h2.n_ref_chroms()) {
104         cerr << "Error: Headers have different number of chromosomes.\n";
105         throw exception();
106     }
107     for (size_t i=0; i<h1.n_ref_chroms(); ++i) {
108         if (strcmp(h1.chrom_str(i), h2.chrom_str(i)) != 0) {
109             cerr << "Error: Conflicting names for the " << i+1 << "th chromosome, '"
110                  << h1.chrom_str(i) << "' and '" << h2.chrom_str(i) << "'.\n";
111             throw exception();
112         }
113         if (h1.chrom_len(i) != h2.chrom_len(i)) {
114             cerr << "Error: " << i+1 << "th chromosome has lengths "
115                  << h1.chrom_len(i) << " and " << h2.chrom_len(i) << ".\n";
116             throw exception();
117         }
118     }
119 }
120 
Bam(const char * path)121 Bam::Bam(const char *path)
122 :
123     Input(),
124     bam_fh(hts_open(path, "r")),
125     hdr(),
126     eof_(false),
127     n_records_read_(0),
128     prev_chrom_(0),
129     prev_pos_(0)
130 {
131     this->path   = string(path);
132     check_open(bam_fh, path);
133     if (bam_fh->format.format != bam) {
134         cerr << "Error: '" << path << "':";
135         if (bam_fh->format.format == sam)
136             cerr << " this is a SAM file (and BAM was specified).\n";
137         else
138             cerr << " not a BAM file.\n";
139         throw exception();
140     }
141     hdr.reinit(bam_fh);
142 };
143 
Bam(const string & path,BamHeader && header)144 Bam::Bam(const string& path, BamHeader&& header)
145 :
146     Input(),
147     bam_fh(hts_open(path.c_str(), "wb")),
148     hdr(move(header)),
149     eof_(false),
150     n_records_read_(0),
151     prev_chrom_(0),
152     prev_pos_(0)
153 {
154     this->path   = path;
155     check_open(bam_fh, path);
156 
157     // Write the header.
158     int rv = bam_hdr_write(bam_fh->fp.bgzf, hdr.hts());
159     if (rv != 0) {
160         cerr << "Error: Writing of BAM header failed (`bam_hdr_write()`returned " << rv << ").\n";
161         throw ios::failure("bam_hdr_write");
162     }
163 }
164 
check_open(const htsFile * bam_f,const string & path)165 void Bam::check_open(const htsFile* bam_f, const string& path) {
166     if (bam_f == NULL) {
167         #pragma omp critical (bam_check_open)
168         {
169             cerr << "Error: Failed to open BAM file '" << path << "'.\n";
170         }
171         throw exception();
172     }
173 }
174 
175 Seq *
next_seq()176 Bam::next_seq()
177 {
178     Seq* s = new Seq();
179     if(next_seq(*s) != 1) {
180         delete s;
181         s = NULL;
182     }
183     return s;
184 }
185 
186 int
next_seq(Seq & s)187 Bam::next_seq(Seq& s)
188 {
189     //
190     // Read a record
191     //
192     BamRecord rec;
193     if (!next_record(rec))
194         return false;
195 
196     //
197     // Fetch the sequence.
198     //
199     string  seq;
200     seq.reserve(rec.hts()->core.l_qseq);
201     for (int i = 0; i < rec.hts()->core.l_qseq; i++) {
202         uint8_t j = bam_seqi(bam_get_seq(rec.hts()), i);
203         switch(j) {
204         case 1:
205             seq += 'A';
206             break;
207         case 2:
208             seq += 'C';
209             break;
210         case 4:
211             seq += 'G';
212             break;
213         case 8:
214             seq += 'T';
215             break;
216         case 15:
217             seq += 'N';
218             break;
219         default:
220             DOES_NOT_HAPPEN;
221             break;
222         }
223     }
224 
225     //
226     // Fetch the quality score.
227     //
228     string   qual;
229     uint8_t *q = bam_get_qual(rec.hts());
230     for (int i = 0; i < rec.hts()->core.l_qseq; i++) {
231         qual += char(int(q[i]) + 33);
232     }
233 
234     AlnT aln_type;
235     if (rec.is_unmapped())
236         aln_type = AlnT::null;
237     else if (rec.is_secondary())
238         aln_type = AlnT::secondary;
239     else if (rec.is_supplementary())
240         aln_type = AlnT::supplementary;
241     else
242         aln_type = AlnT::primary;
243 
244     if (aln_type == AlnT::null) {
245         s = Seq(rec.qname(), seq.c_str(), qual.c_str());
246     } else {
247         //
248         // Check which strand this is aligned to:
249         //   SAM reference: FLAG bit 0x10 - sequence is reverse complemented
250         //
251         strand_type strand = rec.is_rev_compl() ? strand_minus : strand_plus;
252 
253         //
254         // Parse the alignment CIGAR string.
255         // If aligned to the negative strand, sequence has been reverse complemented and
256         // CIGAR string should be interpreted in reverse.
257         //
258         Cigar cigar = rec.cigar();
259         if (strand == strand_minus)
260             std::reverse(cigar.begin(), cigar.end());
261 
262         //
263         // If the read was aligned on the reverse strand (and is therefore reverse complemented)
264         // alter the start point of the alignment to reflect the right-side of the read, at the
265         // end of the RAD cut site.
266         //
267         uint bp = bam_find_start_bp(rec.pos(), strand, cigar);
268 
269         //
270         // Calculate the percentage of the sequence that was aligned to the reference.
271         //
272         uint clipped = 0;
273         for (auto& op : cigar)
274             if (op.first == 'S')
275                 clipped += op.second;
276         double pct_clipped = (double) clipped / seq.length();
277 
278         string name = rec.qname();
279         if (rec.is_read1())
280             name += "/1";
281         else if (rec.is_read2())
282             name += "/2";
283         s = Seq(name.c_str(), seq.c_str(), qual.c_str(),
284                 hdr.chrom_str(rec.chrom()), bp, strand,
285                 aln_type, pct_clipped, rec.mapq());
286 
287         if (cigar.size() > 0)
288             bam_edit_gaps(cigar, s.seq);
289     }
290 
291     return true;
292 }
293 
294 int
bam_find_start_bp(int aln_bp,strand_type strand,const Cigar & cigar)295 bam_find_start_bp(int aln_bp, strand_type strand, const Cigar& cigar)
296 {
297     if (strand == strand_plus) {
298         if (cigar.at(0).first == 'S')
299             aln_bp -= cigar.at(0).second;
300     } else {
301         // assert(strand == strand_minus);
302         for (uint i = 0; i < cigar.size(); i++)  {
303             char op   = cigar[i].first;
304             uint dist = cigar[i].second;
305 
306             switch(op) {
307             case 'I':
308             case 'H':
309                 break;
310             case 'S':
311                 if (i < cigar.size() - 1)
312                     aln_bp += dist;
313                 break;
314             case 'M':
315             case '=':
316             case 'X':
317             case 'D':
318             case 'N':
319                 aln_bp += dist;
320                 break;
321             default:
322                 break;
323             }
324         }
325         aln_bp -= 1;
326     }
327 
328     return aln_bp;
329 }
330 
331 int
bam_edit_gaps(const Cigar & cigar,char * seq)332 bam_edit_gaps(const Cigar& cigar, char *seq)
333 {
334     char *buf;
335     uint  size = cigar.size();
336     char  op;
337     uint  dist, bp, len, buf_len, buf_size, j, k, stop;
338 
339     len = strlen(seq);
340     bp  = 0;
341 
342     buf      = new char[len + 1];
343     buf_size = len + 1;
344 
345     for (uint i = 0; i < size; i++)  {
346         op   = cigar[i].first;
347         dist = cigar[i].second;
348 
349         switch(op) {
350         case 'S':
351             stop = bp + dist;
352             stop = stop > len ? len : stop;
353             while (bp < stop) {
354                 seq[bp] = 'N';
355                 bp++;
356             }
357             break;
358         case 'D':
359             //
360             // A deletion has occured in the read relative to the reference genome.
361             // Pad the read with sufficent Ns to match the deletion, shifting the existing
362             // sequence down. Trim the final length to keep the read length consistent.
363             //
364             k = bp >= len ? len : bp;
365 
366             strncpy(buf, seq + k, buf_size - 1);
367             buf[buf_size - 1] = '\0';
368             buf_len         = strlen(buf);
369 
370             stop = bp + dist;
371             stop = stop > len ? len : stop;
372             while (bp < stop) {
373                 seq[bp] = 'N';
374                 bp++;
375             }
376 
377             j = bp;
378             k = 0;
379             while (j < len && k < buf_len) {
380                 seq[j] = buf[k];
381                 k++;
382                 j++;
383             }
384             break;
385         case 'I':
386             //
387             // An insertion has occurred in the read relative to the reference genome. Delete the
388             // inserted bases and pad the end of the read with Ns.
389             //
390             if (bp >= len) break;
391 
392             k = bp + dist > len ? len : bp + dist;
393             strncpy(buf, seq + k, buf_size - 1);
394             buf[buf_size - 1] = '\0';
395             buf_len           = strlen(buf);
396 
397             j = bp;
398             k = 0;
399             while (j < len && k < buf_len) {
400                 seq[j] = buf[k];
401                 k++;
402                 j++;
403             }
404 
405             stop = j + dist;
406             stop = stop > len ? len : stop;
407             while (j < stop) {
408                 seq[j] = 'N';
409                 j++;
410             }
411             break;
412         case 'M':
413         case '=':
414         case 'X':
415             bp += dist;
416             break;
417         default:
418             break;
419         }
420     }
421 
422     delete [] buf;
423 
424     return 0;
425 }
426