1 /* realn.c -- BAQ calculation and realignment.
2
3 Copyright (C) 2009-2011, 2014-2016, 2018, 2021 Genome Research Ltd.
4 Portions copyright (C) 2009-2011 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27 #include <config.h>
28
29 #include <stdlib.h>
30 #include <string.h>
31 #include <math.h>
32 #include <limits.h>
33 #include <stdint.h>
34 #include <errno.h>
35 #include <assert.h>
36 #include "htslib/hts.h"
37 #include "htslib/sam.h"
38
sam_cap_mapq(bam1_t * b,const char * ref,hts_pos_t ref_len,int thres)39 int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres)
40 {
41 uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
42 uint32_t *cigar = bam_get_cigar(b);
43 bam1_core_t *c = &b->core;
44 int i, y, mm, q, len, clip_l, clip_q;
45 hts_pos_t x;
46 double t;
47 if (thres < 0) thres = 40; // set the default
48 mm = q = len = clip_l = clip_q = 0;
49 for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
50 int j, l = cigar[i]>>4, op = cigar[i]&0xf;
51 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
52 for (j = 0; j < l; ++j) {
53 int c1, c2, z = y + j;
54 if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
55 c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(unsigned char)ref[x+j]];
56 if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
57 ++len;
58 if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
59 ++mm;
60 q += qual[z] > 33? 33 : qual[z];
61 }
62 }
63 }
64 if (j < l) break;
65 x += l; y += l; len += l;
66 } else if (op == BAM_CDEL) {
67 for (j = 0; j < l; ++j)
68 if (x+j >= ref_len || ref[x+j] == '\0') break;
69 if (j < l) break;
70 x += l;
71 } else if (op == BAM_CSOFT_CLIP) {
72 for (j = 0; j < l; ++j) clip_q += qual[y+j];
73 clip_l += l;
74 y += l;
75 } else if (op == BAM_CHARD_CLIP) {
76 clip_q += 13 * l;
77 clip_l += l;
78 } else if (op == BAM_CINS) y += l;
79 else if (op == BAM_CREF_SKIP) x += l;
80 }
81 for (i = 0, t = 1; i < mm; ++i)
82 t *= (double)len / (i+1);
83 t = q - 4.343 * log(t) + clip_q / 5.;
84 if (t > thres) return -1;
85 if (t < 0) t = 0;
86 t = sqrt((thres - t) / thres) * thres;
87 //fprintf(stderr, "%s %lf %d\n", bam_get_qname(b), t, q);
88 return (int)(t + .499);
89 }
90
realn_check_tag(const uint8_t * tg,enum htsLogLevel severity,const char * type,const bam1_t * b)91 static int realn_check_tag(const uint8_t *tg, enum htsLogLevel severity,
92 const char *type, const bam1_t *b) {
93 if (*tg != 'Z') {
94 hts_log(severity, "Incorrect %s tag type (%c) for read %s",
95 type, *tg, bam_get_qname(b));
96 return -1;
97 }
98 if (b->core.l_qseq != strlen((const char *) tg + 1)) {
99 hts_log(severity, "Read %s %s tag is wrong length",
100 bam_get_qname(b), type);
101 return -1;
102 }
103 return 0;
104 }
105
sam_prob_realn(bam1_t * b,const char * ref,hts_pos_t ref_len,int flag)106 int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) {
107 int k, bw, y, yb, ye, xb, xe, fix_bq = 0, apply_baq = flag & BAQ_APPLY,
108 extend_baq = flag & BAQ_EXTEND, redo_baq = flag & BAQ_REDO;
109 enum htsRealnFlags system = flag & (0xff << 3);
110 hts_pos_t i, x;
111 uint32_t *cigar = bam_get_cigar(b);
112 bam1_core_t *c = &b->core;
113
114 // d(I) e(M) band
115 probaln_par_t conf = { 0.001, 0.1, 10 }; // Illumina
116
117 if (b->core.l_qseq > 1000 || system > BAQ_ILLUMINA) {
118 // Params that work well on PacBio CCS 15k. Unknown if they
119 // help other long-read platforms yet, but likely better than
120 // the short-read tuned ones.
121 //
122 // This function has no access to the SAM header.
123 // Ideally the calling function would check for e.g.
124 // @RG PL = "PACBIO" and DS contains "READTYPE=CCS".
125 //
126 // In the absense of this, we simply auto-detect via a crude
127 // short vs long strategy.
128 conf.d = 1e-7;
129 conf.e = 1e-1;
130 }
131
132 uint8_t *bq = NULL, *zq = NULL, *qual = bam_get_qual(b);
133 int *state = NULL;
134 if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1)
135 return -1; // do nothing
136
137 // test if BQ or ZQ is present, and make sanity checks
138 if ((bq = bam_aux_get(b, "BQ")) != NULL) {
139 if (!redo_baq) {
140 if (realn_check_tag(bq, HTS_LOG_WARNING, "BQ", b) < 0)
141 fix_bq = 1;
142 }
143 ++bq;
144 }
145 if ((zq = bam_aux_get(b, "ZQ")) != NULL) {
146 if (realn_check_tag(zq, HTS_LOG_ERROR, "ZQ", b) < 0)
147 return -4;
148 ++zq;
149 }
150 if (bq && redo_baq)
151 {
152 bam_aux_del(b, bq-1);
153 bq = 0;
154 }
155 if (bq && zq) { // remove the ZQ tag
156 bam_aux_del(b, zq-1);
157 zq = 0;
158 }
159 if (!zq && fix_bq) { // Need to fix invalid BQ tag (by realigning)
160 assert(bq != NULL);
161 bam_aux_del(b, bq-1);
162 bq = 0;
163 }
164
165 if (bq || zq) {
166 if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
167 if (bq && apply_baq) { // then convert BQ to ZQ
168 for (i = 0; i < c->l_qseq; ++i)
169 qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
170 *(bq - 3) = 'Z';
171 } else if (zq && !apply_baq) { // then convert ZQ to BQ
172 for (i = 0; i < c->l_qseq; ++i)
173 qual[i] += (int)zq[i] - 64;
174 *(zq - 3) = 'B';
175 }
176 return 0;
177 }
178 // find the start and end of the alignment
179 x = c->pos, y = 0, yb = ye = xb = xe = -1;
180 for (k = 0; k < c->n_cigar; ++k) {
181 int op, l;
182 op = cigar[k]&0xf; l = cigar[k]>>4;
183 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
184 if (yb < 0) yb = y;
185 if (xb < 0) xb = x;
186 ye = y + l; xe = x + l;
187 x += l; y += l;
188 } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
189 else if (op == BAM_CDEL) x += l;
190 else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
191 }
192 if (xb == -1) // No matches in CIGAR.
193 return -1;
194 // set bandwidth and the start and the end
195 bw = 7;
196 if (abs((xe - xb) - (ye - yb)) > bw)
197 bw = abs((xe - xb) - (ye - yb)) + 3;
198 conf.bw = bw;
199
200 xb -= yb + bw/2; if (xb < 0) xb = 0;
201 xe += c->l_qseq - ye + bw/2;
202 if (xe - xb - c->l_qseq > bw)
203 xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
204 { // glocal
205 uint8_t *seq = bam_get_seq(b);
206 uint8_t *tseq; // translated seq A=>0,C=>1,G=>2,T=>3,other=>4
207 uint8_t *tref; // translated ref
208 uint8_t *q; // Probability of incorrect alignment from probaln_glocal()
209 size_t lref = xe > xb ? xe - xb : 1;
210 size_t align_lqseq;
211 if (extend_baq && lref < c->l_qseq)
212 lref = c->l_qseq; // So we can recycle tseq,tref for left,rght below
213 // Try to make q,tref,tseq reasonably well aligned
214 align_lqseq = ((c->l_qseq + 1) | 0xf) + 1;
215 // Overflow check - 3 for *bq, sizeof(int) for *state
216 if ((SIZE_MAX - lref) / (3 + sizeof(int)) < align_lqseq) {
217 errno = ENOMEM;
218 goto fail;
219 }
220
221 assert(bq == NULL); // bq was used above, but should now be NULL
222 bq = malloc(align_lqseq * 3 + lref);
223 if (!bq) goto fail;
224 q = bq + align_lqseq;
225 tseq = q + align_lqseq;
226 tref = tseq + align_lqseq;
227
228 memcpy(bq, qual, c->l_qseq); bq[c->l_qseq] = 0;
229 for (i = 0; i < c->l_qseq; ++i)
230 tseq[i] = seq_nt16_int[bam_seqi(seq, i)];
231 for (i = xb; i < xe; ++i) {
232 if (i >= ref_len || ref[i] == '\0') { xe = i; break; }
233 tref[i-xb] = seq_nt16_int[seq_nt16_table[(unsigned char)ref[i]]];
234 }
235
236 state = malloc(c->l_qseq * sizeof(int));
237 if (!state) goto fail;
238 if (probaln_glocal(tref, xe-xb, tseq, c->l_qseq, qual,
239 &conf, state, q) == INT_MIN) {
240 goto fail;
241 }
242
243 if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
244 for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
245 int op = cigar[k]&0xf, l = cigar[k]>>4;
246 if (l == 0) continue;
247 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
248 // Sanity check running off the end of the sequence
249 // Can only happen if the alignment is broken
250 if (l > c->l_qseq - y)
251 l = c->l_qseq - y;
252 for (i = y; i < y + l; ++i) {
253 if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
254 else bq[i] = bq[i] < q[i]? bq[i] : q[i];
255 }
256 x += l; y += l;
257 } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) {
258 // Need sanity check here too.
259 if (l > c->l_qseq - y)
260 l = c->l_qseq - y;
261 y += l;
262 } else if (op == BAM_CDEL) {
263 x += l;
264 }
265 }
266 for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
267 } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
268 // tseq,tref are no longer needed, so we can steal them to avoid mallocs
269 uint8_t *left = tseq;
270 uint8_t *rght = tref;
271 for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
272 int op = cigar[k]&0xf, l = cigar[k]>>4;
273 if (l == 0) continue;
274 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
275 // Sanity check running off the end of the sequence
276 // Can only happen if the alignment is broken
277 if (l > c->l_qseq - y)
278 l = c->l_qseq - y;
279 for (i = y; i < y + l; ++i)
280 bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
281 for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
282 left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
283 for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
284 rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
285 for (i = y; i < y + l; ++i)
286 bq[i] = left[i] < rght[i]? left[i] : rght[i];
287 x += l; y += l;
288 } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) {
289 // Need sanity check here too.
290 if (l > c->l_qseq - y)
291 l = c->l_qseq - y;
292 y += l;
293 } else if (op == BAM_CDEL) {
294 x += l;
295 }
296 }
297 for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
298 }
299 if (apply_baq) {
300 for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
301 bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
302 } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
303 free(bq); free(state);
304 }
305
306 return 0;
307
308 fail:
309 free(bq); free(state);
310 return -4;
311 }
312