1 /*  realn.c -- BAQ calculation and realignment.
2 
3     Copyright (C) 2009-2011, 2014-2015 Genome Research Ltd.
4     Portions copyright (C) 2009-2011 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <stdlib.h>
29 #include <string.h>
30 #include <math.h>
31 #include "htslib/hts.h"
32 #include "htslib/sam.h"
33 
sam_cap_mapq(bam1_t * b,const char * ref,int ref_len,int thres)34 int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres)
35 {
36     uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
37     uint32_t *cigar = bam_get_cigar(b);
38     bam1_core_t *c = &b->core;
39     int i, x, y, mm, q, len, clip_l, clip_q;
40     double t;
41     if (thres < 0) thres = 40; // set the default
42     mm = q = len = clip_l = clip_q = 0;
43     for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
44         int j, l = cigar[i]>>4, op = cigar[i]&0xf;
45         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
46             for (j = 0; j < l; ++j) {
47                 int c1, c2, z = y + j;
48                 if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
49                 c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
50                 if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
51                     ++len;
52                     if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
53                         ++mm;
54                         q += qual[z] > 33? 33 : qual[z];
55                     }
56                 }
57             }
58             if (j < l) break;
59             x += l; y += l; len += l;
60         } else if (op == BAM_CDEL) {
61             for (j = 0; j < l; ++j)
62                 if (x+j >= ref_len || ref[x+j] == '\0') break;
63             if (j < l) break;
64             x += l;
65         } else if (op == BAM_CSOFT_CLIP) {
66             for (j = 0; j < l; ++j) clip_q += qual[y+j];
67             clip_l += l;
68             y += l;
69         } else if (op == BAM_CHARD_CLIP) {
70             clip_q += 13 * l;
71             clip_l += l;
72         } else if (op == BAM_CINS) y += l;
73         else if (op == BAM_CREF_SKIP) x += l;
74     }
75     for (i = 0, t = 1; i < mm; ++i)
76         t *= (double)len / (i+1);
77     t = q - 4.343 * log(t) + clip_q / 5.;
78     if (t > thres) return -1;
79     if (t < 0) t = 0;
80     t = sqrt((thres - t) / thres) * thres;
81 //  fprintf(stderr, "%s %lf %d\n", bam_get_qname(b), t, q);
82     return (int)(t + .499);
83 }
84 
sam_prob_realn(bam1_t * b,const char * ref,int ref_len,int flag)85 int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag)
86 {
87     int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
88     uint32_t *cigar = bam_get_cigar(b);
89     bam1_core_t *c = &b->core;
90     probaln_par_t conf = { 0.001, 0.1, 10 };
91     uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b);
92     if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1)
93         return -1; // do nothing
94 
95     // test if BQ or ZQ is present
96     if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
97     if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
98     if (bq && redo_baq)
99     {
100         bam_aux_del(b, bq-1);
101         bq = 0;
102     }
103     if (bq && zq) { // remove the ZQ tag
104         bam_aux_del(b, zq-1);
105         zq = 0;
106     }
107     if (bq || zq) {
108         if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
109         if (bq && apply_baq) { // then convert BQ to ZQ
110             for (i = 0; i < c->l_qseq; ++i)
111                 qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
112             *(bq - 3) = 'Z';
113         } else if (zq && !apply_baq) { // then convert ZQ to BQ
114             for (i = 0; i < c->l_qseq; ++i)
115                 qual[i] += (int)zq[i] - 64;
116             *(zq - 3) = 'B';
117         }
118         return 0;
119     }
120     // find the start and end of the alignment
121     x = c->pos, y = 0, yb = ye = xb = xe = -1;
122     for (k = 0; k < c->n_cigar; ++k) {
123         int op, l;
124         op = cigar[k]&0xf; l = cigar[k]>>4;
125         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
126             if (yb < 0) yb = y;
127             if (xb < 0) xb = x;
128             ye = y + l; xe = x + l;
129             x += l; y += l;
130         } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
131         else if (op == BAM_CDEL) x += l;
132         else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
133     }
134     // set bandwidth and the start and the end
135     bw = 7;
136     if (abs((xe - xb) - (ye - yb)) > bw)
137         bw = abs((xe - xb) - (ye - yb)) + 3;
138     conf.bw = bw;
139     xb -= yb + bw/2; if (xb < 0) xb = 0;
140     xe += c->l_qseq - ye + bw/2;
141     if (xe - xb - c->l_qseq > bw)
142         xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
143     { // glocal
144         uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq;
145         int *state;
146         bq = calloc(c->l_qseq + 1, 1);
147         memcpy(bq, qual, c->l_qseq);
148         s = calloc(c->l_qseq, 1);
149         for (i = 0; i < c->l_qseq; ++i) s[i] = seq_nt16_int[bam_seqi(seq, i)];
150         r = calloc(xe - xb, 1);
151         for (i = xb; i < xe; ++i) {
152             if (i >= ref_len || ref[i] == '\0') { xe = i; break; }
153             r[i-xb] = seq_nt16_int[seq_nt16_table[(int)ref[i]]];
154         }
155         state = calloc(c->l_qseq, sizeof(int));
156         q = calloc(c->l_qseq, 1);
157         probaln_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
158         if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
159             for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
160                 int op = cigar[k]&0xf, l = cigar[k]>>4;
161                 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
162                     for (i = y; i < y + l; ++i) {
163                         if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
164                         else bq[i] = bq[i] < q[i]? bq[i] : q[i];
165                     }
166                     x += l; y += l;
167                 } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
168                 else if (op == BAM_CDEL) x += l;
169             }
170             for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
171         } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
172             uint8_t *left, *rght;
173             left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
174             for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
175                 int op = cigar[k]&0xf, l = cigar[k]>>4;
176                 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
177                     for (i = y; i < y + l; ++i)
178                         bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
179                     for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
180                         left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
181                     for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
182                         rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
183                     for (i = y; i < y + l; ++i)
184                         bq[i] = left[i] < rght[i]? left[i] : rght[i];
185                     x += l; y += l;
186                 } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
187                 else if (op == BAM_CDEL) x += l;
188             }
189             for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
190             free(left); free(rght);
191         }
192         if (apply_baq) {
193             for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
194             bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
195         } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
196         free(bq); free(s); free(r); free(q); free(state);
197     }
198     return 0;
199 }
200