1 /*  realn.c -- BAQ calculation and realignment.
2 
3     Copyright (C) 2009-2011, 2014-2016, 2018, 2021 Genome Research Ltd.
4     Portions copyright (C) 2009-2011 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27 #include <config.h>
28 
29 #include <stdlib.h>
30 #include <string.h>
31 #include <math.h>
32 #include <limits.h>
33 #include <stdint.h>
34 #include <errno.h>
35 #include <assert.h>
36 #include "htslib/hts.h"
37 #include "htslib/sam.h"
38 
sam_cap_mapq(bam1_t * b,const char * ref,hts_pos_t ref_len,int thres)39 int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres)
40 {
41     uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
42     uint32_t *cigar = bam_get_cigar(b);
43     bam1_core_t *c = &b->core;
44     int i, y, mm, q, len, clip_l, clip_q;
45     hts_pos_t x;
46     double t;
47     if (thres < 0) thres = 40; // set the default
48     mm = q = len = clip_l = clip_q = 0;
49     for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
50         int j, l = cigar[i]>>4, op = cigar[i]&0xf;
51         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
52             for (j = 0; j < l; ++j) {
53                 int c1, c2, z = y + j;
54                 if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
55                 c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(unsigned char)ref[x+j]];
56                 if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
57                     ++len;
58                     if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
59                         ++mm;
60                         q += qual[z] > 33? 33 : qual[z];
61                     }
62                 }
63             }
64             if (j < l) break;
65             x += l; y += l; len += l;
66         } else if (op == BAM_CDEL) {
67             for (j = 0; j < l; ++j)
68                 if (x+j >= ref_len || ref[x+j] == '\0') break;
69             if (j < l) break;
70             x += l;
71         } else if (op == BAM_CSOFT_CLIP) {
72             for (j = 0; j < l; ++j) clip_q += qual[y+j];
73             clip_l += l;
74             y += l;
75         } else if (op == BAM_CHARD_CLIP) {
76             clip_q += 13 * l;
77             clip_l += l;
78         } else if (op == BAM_CINS) y += l;
79         else if (op == BAM_CREF_SKIP) x += l;
80     }
81     for (i = 0, t = 1; i < mm; ++i)
82         t *= (double)len / (i+1);
83     t = q - 4.343 * log(t) + clip_q / 5.;
84     if (t > thres) return -1;
85     if (t < 0) t = 0;
86     t = sqrt((thres - t) / thres) * thres;
87     //fprintf(stderr, "%s %lf %d\n", bam_get_qname(b), t, q);
88     return (int)(t + .499);
89 }
90 
realn_check_tag(const uint8_t * tg,enum htsLogLevel severity,const char * type,const bam1_t * b)91 static int realn_check_tag(const uint8_t *tg, enum htsLogLevel severity,
92                            const char *type, const bam1_t *b) {
93     if (*tg != 'Z') {
94         hts_log(severity, "Incorrect %s tag type (%c) for read %s",
95                 type, *tg, bam_get_qname(b));
96         return -1;
97     }
98     if (b->core.l_qseq != strlen((const char *) tg + 1)) {
99         hts_log(severity, "Read %s %s tag is wrong length",
100                 bam_get_qname(b), type);
101         return -1;
102     }
103     return 0;
104 }
105 
sam_prob_realn(bam1_t * b,const char * ref,hts_pos_t ref_len,int flag)106 int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) {
107     int k, bw, y, yb, ye, xb, xe, fix_bq = 0, apply_baq = flag & BAQ_APPLY,
108         extend_baq = flag & BAQ_EXTEND, redo_baq = flag & BAQ_REDO;
109     enum htsRealnFlags system = flag & (0xff << 3);
110     hts_pos_t i, x;
111     uint32_t *cigar = bam_get_cigar(b);
112     bam1_core_t *c = &b->core;
113 
114     // d(I) e(M) band
115     probaln_par_t conf = { 0.001, 0.1, 10 }; // Illumina
116 
117     if (b->core.l_qseq > 1000 || system > BAQ_ILLUMINA) {
118         // Params that work well on PacBio CCS 15k.  Unknown if they
119         // help other long-read platforms yet, but likely better than
120         // the short-read tuned ones.
121         //
122         // This function has no access to the SAM header.
123         // Ideally the calling function would check for e.g.
124         // @RG PL = "PACBIO" and DS contains "READTYPE=CCS".
125         //
126         // In the absense of this, we simply auto-detect via a crude
127         // short vs long strategy.
128         conf.d = 1e-7;
129         conf.e = 1e-1;
130     }
131 
132     uint8_t *bq = NULL, *zq = NULL, *qual = bam_get_qual(b);
133     int *state = NULL;
134     if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1)
135         return -1; // do nothing
136 
137     // test if BQ or ZQ is present, and make sanity checks
138     if ((bq = bam_aux_get(b, "BQ")) != NULL) {
139         if (!redo_baq) {
140             if (realn_check_tag(bq, HTS_LOG_WARNING, "BQ", b) < 0)
141                 fix_bq = 1;
142         }
143         ++bq;
144     }
145     if ((zq = bam_aux_get(b, "ZQ")) != NULL) {
146         if (realn_check_tag(zq, HTS_LOG_ERROR, "ZQ", b) < 0)
147             return -4;
148         ++zq;
149     }
150     if (bq && redo_baq)
151     {
152         bam_aux_del(b, bq-1);
153         bq = 0;
154     }
155     if (bq && zq) { // remove the ZQ tag
156         bam_aux_del(b, zq-1);
157         zq = 0;
158     }
159     if (!zq && fix_bq) { // Need to fix invalid BQ tag (by realigning)
160         assert(bq != NULL);
161         bam_aux_del(b, bq-1);
162         bq = 0;
163     }
164 
165     if (bq || zq) {
166         if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
167         if (bq && apply_baq) { // then convert BQ to ZQ
168             for (i = 0; i < c->l_qseq; ++i)
169                 qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
170             *(bq - 3) = 'Z';
171         } else if (zq && !apply_baq) { // then convert ZQ to BQ
172             for (i = 0; i < c->l_qseq; ++i)
173                 qual[i] += (int)zq[i] - 64;
174             *(zq - 3) = 'B';
175         }
176         return 0;
177     }
178     // find the start and end of the alignment
179     x = c->pos, y = 0, yb = ye = xb = xe = -1;
180     for (k = 0; k < c->n_cigar; ++k) {
181         int op, l;
182         op = cigar[k]&0xf; l = cigar[k]>>4;
183         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
184             if (yb < 0) yb = y;
185             if (xb < 0) xb = x;
186             ye = y + l; xe = x + l;
187             x += l; y += l;
188         } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
189         else if (op == BAM_CDEL) x += l;
190         else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
191     }
192     if (xb == -1) // No matches in CIGAR.
193         return -1;
194     // set bandwidth and the start and the end
195     bw = 7;
196     if (abs((xe - xb) - (ye - yb)) > bw)
197         bw = abs((xe - xb) - (ye - yb)) + 3;
198     conf.bw = bw;
199 
200     xb -= yb + bw/2; if (xb < 0) xb = 0;
201     xe += c->l_qseq - ye + bw/2;
202     if (xe - xb - c->l_qseq > bw)
203         xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
204     { // glocal
205         uint8_t *seq = bam_get_seq(b);
206         uint8_t *tseq; // translated seq A=>0,C=>1,G=>2,T=>3,other=>4
207         uint8_t *tref; // translated ref
208         uint8_t *q; // Probability of incorrect alignment from probaln_glocal()
209         size_t lref = xe > xb ? xe - xb : 1;
210         size_t align_lqseq;
211         if (extend_baq && lref < c->l_qseq)
212             lref = c->l_qseq; // So we can recycle tseq,tref for left,rght below
213         // Try to make q,tref,tseq reasonably well aligned
214         align_lqseq = ((c->l_qseq + 1) | 0xf) + 1;
215         // Overflow check - 3 for *bq, sizeof(int) for *state
216         if ((SIZE_MAX - lref) / (3 + sizeof(int)) < align_lqseq) {
217             errno = ENOMEM;
218             goto fail;
219         }
220 
221         assert(bq == NULL); // bq was used above, but should now be NULL
222         bq = malloc(align_lqseq * 3 + lref);
223         if (!bq) goto fail;
224         q = bq + align_lqseq;
225         tseq = q + align_lqseq;
226         tref = tseq + align_lqseq;
227 
228         memcpy(bq, qual, c->l_qseq); bq[c->l_qseq] = 0;
229         for (i = 0; i < c->l_qseq; ++i)
230             tseq[i] = seq_nt16_int[bam_seqi(seq, i)];
231         for (i = xb; i < xe; ++i) {
232             if (i >= ref_len || ref[i] == '\0') { xe = i; break; }
233             tref[i-xb] = seq_nt16_int[seq_nt16_table[(unsigned char)ref[i]]];
234         }
235 
236         state = malloc(c->l_qseq * sizeof(int));
237         if (!state) goto fail;
238         if (probaln_glocal(tref, xe-xb, tseq, c->l_qseq, qual,
239                            &conf, state, q) == INT_MIN) {
240             goto fail;
241         }
242 
243         if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
244             for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
245                 int op = cigar[k]&0xf, l = cigar[k]>>4;
246                 if (l == 0) continue;
247                 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
248                     // Sanity check running off the end of the sequence
249                     // Can only happen if the alignment is broken
250                     if (l > c->l_qseq - y)
251                         l = c->l_qseq - y;
252                     for (i = y; i < y + l; ++i) {
253                         if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
254                         else bq[i] = bq[i] < q[i]? bq[i] : q[i];
255                     }
256                     x += l; y += l;
257                 } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) {
258                     // Need sanity check here too.
259                     if (l > c->l_qseq - y)
260                         l = c->l_qseq - y;
261                     y += l;
262                 } else if (op == BAM_CDEL) {
263                     x += l;
264                 }
265             }
266             for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
267         } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
268             // tseq,tref are no longer needed, so we can steal them to avoid mallocs
269             uint8_t *left = tseq;
270             uint8_t *rght = tref;
271             for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
272                 int op = cigar[k]&0xf, l = cigar[k]>>4;
273                 if (l == 0) continue;
274                 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
275                     // Sanity check running off the end of the sequence
276                     // Can only happen if the alignment is broken
277                     if (l > c->l_qseq - y)
278                         l = c->l_qseq - y;
279                     for (i = y; i < y + l; ++i)
280                         bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
281                     for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
282                         left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
283                     for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
284                         rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
285                     for (i = y; i < y + l; ++i)
286                         bq[i] = left[i] < rght[i]? left[i] : rght[i];
287                     x += l; y += l;
288                 } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) {
289                     // Need sanity check here too.
290                     if (l > c->l_qseq - y)
291                         l = c->l_qseq - y;
292                     y += l;
293                 } else if (op == BAM_CDEL) {
294                     x += l;
295                 }
296             }
297             for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
298         }
299         if (apply_baq) {
300             for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
301             bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
302         } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
303         free(bq); free(state);
304     }
305 
306     return 0;
307 
308  fail:
309     free(bq); free(state);
310     return -4;
311 }
312