1 /*  bam2bcf_indel.c -- indel caller.
2 
3     Copyright (C) 2010, 2011 Broad Institute.
4     Copyright (C) 2012-2014, 2019 Genome Research Ltd.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <assert.h>
29 #include <ctype.h>
30 #include <string.h>
31 #include "htslib/hts.h"
32 #include "htslib/sam.h"
33 #include "bam2bcf.h"
34 #include "htslib/khash.h"
35 KHASH_SET_INIT_STR(rg)
36 
37 #include "htslib/ksort.h"
KSORT_INIT_GENERIC(uint32_t)38 KSORT_INIT_GENERIC(uint32_t)
39 
40 #define MINUS_CONST 0x10000000
41 #define INDEL_WINDOW_SIZE 50
42 
43 void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
44 {
45     const char *s, *p, *q, *r, *t;
46     khash_t(rg) *hash;
47     if (list == 0 || hdtext == 0) return _hash;
48     if (_hash == 0) _hash = kh_init(rg);
49     hash = (khash_t(rg)*)_hash;
50     if ((s = strstr(hdtext, "@RG\t")) == 0) return hash;
51     do {
52         t = strstr(s + 4, "@RG\t"); // the next @RG
53         if ((p = strstr(s, "\tID:")) != 0) p += 4;
54         if ((q = strstr(s, "\tPL:")) != 0) q += 4;
55         if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present
56             int lp, lq;
57             char *x;
58             for (r = p; *r && *r != '\t' && *r != '\n'; ++r) { }
59             lp = r - p;
60             for (r = q; *r && *r != '\t' && *r != '\n'; ++r) { }
61             lq = r - q;
62             x = calloc((lp > lq? lp : lq) + 1, 1);
63             for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r;
64             if (strstr(list, x)) { // insert ID to the hash table
65                 khint_t k;
66                 int ret;
67                 for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r;
68                 x[r-p] = 0;
69                 k = kh_get(rg, hash, x);
70                 if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret);
71                 else free(x);
72             } else free(x);
73         }
74         s = t;
75     } while (s);
76     return hash;
77 }
78 
bcf_call_del_rghash(void * _hash)79 void bcf_call_del_rghash(void *_hash)
80 {
81     khint_t k;
82     khash_t(rg) *hash = (khash_t(rg)*)_hash;
83     if (hash == 0) return;
84     for (k = kh_begin(hash); k < kh_end(hash); ++k)
85         if (kh_exist(hash, k))
86             free((char*)kh_key(hash, k));
87     kh_destroy(rg, hash);
88 }
89 
tpos2qpos(const bam1_core_t * c,const uint32_t * cigar,hts_pos_t tpos,hts_pos_t is_left,hts_pos_t * _tpos)90 static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, hts_pos_t tpos, hts_pos_t is_left, hts_pos_t *_tpos)
91 {
92     int k, y = 0, last_y = 0;
93     hts_pos_t x = c->pos;
94     *_tpos = c->pos;
95     for (k = 0; k < c->n_cigar; ++k) {
96         int op = cigar[k] & BAM_CIGAR_MASK;
97         int l = cigar[k] >> BAM_CIGAR_SHIFT;
98         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
99             if (c->pos > tpos) return y;
100             if (x + l > tpos) {
101                 *_tpos = tpos;
102                 return y + (tpos - x);
103             }
104             x += l; y += l;
105             last_y = y;
106         } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
107         else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
108             if (x + l > tpos) {
109                 *_tpos = is_left? x : x + l;
110                 return y;
111             }
112             x += l;
113         }
114     }
115     *_tpos = x;
116     return last_y;
117 }
118 // FIXME: check if the inserted sequence is consistent with the homopolymer run
119 // l is the relative gap length and l_run is the length of the homopolymer on the reference
est_seqQ(const bcf_callaux_t * bca,int l,int l_run)120 static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
121 {
122     int q, qh;
123     q = bca->openQ + bca->extQ * (abs(l) - 1);
124     qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000;
125     return q < qh? q : qh;
126 }
127 
est_indelreg(hts_pos_t pos,const char * ref,int l,char * ins4)128 static inline int est_indelreg(hts_pos_t pos, const char *ref, int l, char *ins4)
129 {
130     int j, max = 0, score = 0;
131     hts_pos_t i, max_i = pos;
132     l = abs(l);
133     for (i = pos + 1, j = 0; ref[i]; ++i, ++j) {
134         if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1;
135         else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1;
136         if (score < 0) break;
137         if (max < score) max = score, max_i = i;
138     }
139     return max_i - pos;
140 }
141 
142 /*
143     notes:
144         - n .. number of samples
145         - the routine sets bam_pileup1_t.aux of each read as follows:
146             - 6: unused
147             - 6: the call; index to bcf_callaux_t.indel_types   .. (aux>>16)&0x3f
148             - 8: estimated sequence quality                     .. (aux>>8)&0xff
149             - 8: indel quality                                  .. aux&0xff
150  */
bcf_call_gap_prep(int n,int * n_plp,bam_pileup1_t ** plp,hts_pos_t pos,bcf_callaux_t * bca,const char * ref,const void * rghash)151 int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref,
152                       const void *rghash)
153 {
154     int s, k, t, n_types, *types, max_rd_len, max_ins, *score1, *score2, max_ref2;
155     int N, K, l_run, ref_type, n_alt;
156     hts_pos_t i, j, left, right;
157     char *inscns = 0, *ref2, *query, **ref_sample;
158     khash_t(rg) *hash = (khash_t(rg)*)rghash;
159     if (ref == 0 || bca == 0) return -1;
160     // mark filtered reads
161     if (rghash) {
162         N = 0;
163         for (s = N = 0; s < n; ++s) {
164             for (i = 0; i < n_plp[s]; ++i) {
165                 bam_pileup1_t *p = plp[s] + i;
166                 const uint8_t *rg = bam_aux_get(p->b, "RG");
167                 p->aux = 1; // filtered by default
168                 if (rg) {
169                     khint_t k = kh_get(rg, hash, (const char*)(rg + 1));
170                     if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered
171                 }
172             }
173         }
174         if (N == 0) return -1; // no reads left
175     }
176     // determine if there is a gap
177     for (s = N = 0; s < n; ++s) {
178         for (i = 0; i < n_plp[s]; ++i)
179             if (plp[s][i].indel != 0) break;
180         if (i < n_plp[s]) break;
181     }
182     if (s == n) return -1; // there is no indel at this position.
183     for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
184     { // find out how many types of indels are present
185         bca->max_support = bca->max_frac = 0;
186         int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
187         uint32_t *aux;
188         aux = calloc(N + 1, 4);
189         m = max_rd_len = 0;
190         aux[m++] = MINUS_CONST; // zero indel is always a type
191         for (s = 0; s < n; ++s) {
192             int na = 0, nt = 0;
193             for (i = 0; i < n_plp[s]; ++i) {
194                 const bam_pileup1_t *p = plp[s] + i;
195                 if (rghash == 0 || p->aux == 0) {
196                     ++nt;
197                     if (p->indel != 0) {
198                         ++na;
199                         aux[m++] = MINUS_CONST + p->indel;
200                     }
201                 }
202                 j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
203                 if (j > max_rd_len) max_rd_len = j;
204             }
205             double frac = (double)na/nt;
206             if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
207                 indel_support_ok = 1;
208             if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
209             n_alt += na;
210             n_tot += nt;
211         }
212         // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
213         //  check the number of N's in the sequence and skip places where half or more reference bases are Ns.
214         int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
215         if ( nN*2>(i-pos) ) { free(aux); return -1; }
216 
217         ks_introsort(uint32_t, m, aux);
218         // squeeze out identical types
219         for (i = 1, n_types = 1; i < m; ++i)
220             if (aux[i] != aux[i-1]) ++n_types;
221         // Taking totals makes it hard to call rare indels
222         if ( !bca->per_sample_flt )
223             indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
224         if ( n_types == 1 || !indel_support_ok ) { // then skip
225             free(aux); return -1;
226         }
227         if (n_types >= 64) {
228             free(aux);
229             // TODO revisit how/whether to control printing this warning
230             if (hts_verbose >= 2)
231                 fprintf(stderr, "[%s] excessive INDEL alleles at position %"PRIhts_pos". Skip the position.\n", __func__, pos + 1);
232             return -1;
233         }
234         types = (int*)calloc(n_types, sizeof(int));
235         t = 0;
236         types[t++] = aux[0] - MINUS_CONST;
237         for (i = 1; i < m; ++i)
238             if (aux[i] != aux[i-1])
239                 types[t++] = aux[i] - MINUS_CONST;
240         free(aux);
241         for (t = 0; t < n_types; ++t)
242             if (types[t] == 0) break;
243         ref_type = t; // the index of the reference type (0)
244     }
245     { // calculate left and right boundary
246         left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
247         right = pos + INDEL_WINDOW_SIZE;
248         if (types[0] < 0) right -= types[0];
249         // in case the alignments stand out the reference
250         for (i = pos; i < right; ++i)
251             if (ref[i] == 0) break;
252         right = i;
253     }
254     /* The following block fixes a long-existing flaw in the INDEL
255      * calling model: the interference of nearby SNPs. However, it also
256      * reduces the power because sometimes, substitutions caused by
257      * indels are not distinguishable from true mutations. Multiple
258      * sequence realignment helps to increase the power.
259      *
260      * Masks mismatches present in at least 70% of the reads with 'N'.
261      */
262     { // construct per-sample consensus
263         int L = right - left + 1, max_i, max2_i;
264         uint32_t *cns, max, max2;
265         char *ref0, *r;
266         ref_sample = calloc(n, sizeof(char*));
267         cns = calloc(L, 4);
268         ref0 = calloc(L, 1);
269         for (i = 0; i < right - left; ++i)
270             ref0[i] = seq_nt16_table[(int)ref[i+left]];
271         for (s = 0; s < n; ++s) {
272             r = ref_sample[s] = calloc(L, 1);
273             memset(cns, 0, sizeof(int) * L);
274             // collect ref and non-ref counts
275             for (i = 0; i < n_plp[s]; ++i) {
276                 bam_pileup1_t *p = plp[s] + i;
277                 bam1_t *b = p->b;
278                 uint32_t *cigar = bam_get_cigar(b);
279                 uint8_t *seq = bam_get_seq(b);
280                 hts_pos_t x = b->core.pos, y = 0;
281                 for (k = 0; k < b->core.n_cigar; ++k) {
282                     int op = cigar[k]&0xf;
283                     int j, l = cigar[k]>>4;
284                     if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
285                         for (j = 0; j < l; ++j)
286                             if (x + j >= left && x + j < right)
287                                 cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
288                         x += l; y += l;
289                     } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
290                     else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
291                 }
292             }
293             // determine the consensus
294             for (i = 0; i < right - left; ++i) r[i] = ref0[i];
295             max = max2 = 0; max_i = max2_i = -1;
296             for (i = 0; i < right - left; ++i) {
297                 if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
298                 else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
299             }
300             if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
301             if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
302             if (max_i >= 0) r[max_i] = 15;
303             if (max2_i >= 0) r[max2_i] = 15;
304             //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr);
305         }
306         free(ref0); free(cns);
307     }
308     { // the length of the homopolymer run around the current position
309         int c = seq_nt16_table[(int)ref[pos + 1]];
310         if (c == 15) l_run = 1;
311         else {
312             for (i = pos + 2; ref[i]; ++i)
313                 if (seq_nt16_table[(int)ref[i]] != c) break;
314             l_run = i;
315             for (i = pos; i >= 0; --i)
316                 if (seq_nt16_table[(int)ref[i]] != c) break;
317             l_run -= i + 1;
318         }
319     }
320     // construct the consensus sequence
321     max_ins = types[n_types - 1];   // max_ins is at least 0
322     if (max_ins > 0) {
323         int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int));
324         // count the number of occurrences of each base at each position for each type of insertion
325         for (t = 0; t < n_types; ++t) {
326             if (types[t] > 0) {
327                 for (s = 0; s < n; ++s) {
328                     for (i = 0; i < n_plp[s]; ++i) {
329                         bam_pileup1_t *p = plp[s] + i;
330                         if (p->indel == types[t]) {
331                             uint8_t *seq = bam_get_seq(p->b);
332                             for (k = 1; k <= p->indel; ++k) {
333                                 int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
334                                 assert(c<5);
335                                 ++inscns_aux[(t*max_ins+(k-1))*5 + c];
336                             }
337                         }
338                     }
339                 }
340             }
341         }
342         // use the majority rule to construct the consensus
343         inscns = calloc(n_types * max_ins, 1);
344         for (t = 0; t < n_types; ++t) {
345             for (j = 0; j < types[t]; ++j) {
346                 int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
347                 for (k = 0; k < 5; ++k)
348                     if (ia[k] > max)
349                         max = ia[k], max_k = k;
350                 inscns[t*max_ins + j] = max? max_k : 4;
351                 if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
352             }
353         }
354         free(inscns_aux);
355     }
356     // compute the likelihood given each type of indel for each read
357     max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
358     ref2  = calloc(max_ref2, 1);
359     query = calloc(right - left + max_rd_len + max_ins + 2, 1);
360     score1 = calloc(N * n_types, sizeof(int));
361     score2 = calloc(N * n_types, sizeof(int));
362     bca->indelreg = 0;
363     for (t = 0; t < n_types; ++t) {
364         int l, ir;
365         probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
366         apf1.bw = apf2.bw = abs(types[t]) + 3;
367         // compute indelreg
368         if (types[t] == 0) ir = 0;
369         else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
370         else ir = est_indelreg(pos, ref, -types[t], 0);
371         if (ir > bca->indelreg) bca->indelreg = ir;
372 //      fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir);
373         // realignment
374         for (s = K = 0; s < n; ++s) {
375             // write ref2
376             for (k = 0, j = left; j <= pos; ++j)
377                 ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
378             if (types[t] <= 0) j += -types[t];
379             else for (l = 0; l < types[t]; ++l)
380                      ref2[k++] = inscns[t*max_ins + l];
381             for (; j < right && ref[j]; ++j)
382                 ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
383             for (; k < max_ref2; ++k) ref2[k] = 4;
384             if (j < right) right = j;
385             // align each read to ref2
386             for (i = 0; i < n_plp[s]; ++i, ++K) {
387                 bam_pileup1_t *p = plp[s] + i;
388                 int qbeg, qend, sc, kk;
389                 hts_pos_t tbeg, tend;
390                 uint8_t *seq = bam_get_seq(p->b);
391                 uint32_t *cigar = bam_get_cigar(p->b);
392                 if (p->b->core.flag&4) continue; // unmapped reads
393                 // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
394                 for (kk = 0; kk < p->b->core.n_cigar; ++kk)
395                     if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
396                 if (kk < p->b->core.n_cigar) continue;
397                 // FIXME: the following skips soft clips, but using them may be more sensitive.
398                 // determine the start and end of sequences for alignment
399                 qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left,  0, &tbeg);
400                 qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
401                 if (types[t] < 0) {
402                     int l = -types[t];
403                     tbeg = tbeg - l > left?  tbeg - l : left;
404                 }
405                 // write the query sequence
406                 for (l = qbeg; l < qend; ++l)
407                     query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
408                 { // do realignment; this is the bottleneck
409                     const uint8_t *qual = bam_get_qual(p->b), *bq;
410                     uint8_t *qq;
411                     if (qend < qbeg) {
412                         fprintf(stderr, "Impossible data in bcf_call_gap_prep\n");
413                         exit(1);
414                     }
415                     qq = calloc(qend - qbeg, 1);
416                     bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
417                     if (bq) ++bq; // skip type
418                     for (l = qbeg; l < qend; ++l) {
419                         qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
420                         if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
421                         if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
422                     }
423                     sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
424                                         (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
425                     l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
426                     if (l > 255) l = 255;
427                     score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
428                     if (sc > 5) {
429                         sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
430                                             (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
431                         l = (int)(100. * sc / (qend - qbeg) + .499);
432                         if (l > 255) l = 255;
433                         score2[K*n_types + t] = sc<<8 | l;
434                     }
435                     free(qq);
436                 }
437 /*
438                 for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
439                     fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr);
440                 fputc('\n', stderr);
441                 for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr);
442                 fputc('\n', stderr);
443                 fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
444 */
445             }
446         }
447     }
448     free(ref2); free(query);
449     { // compute indelQ
450         int sc_a[16], sumq_a[16];
451         int tmp, *sc = sc_a, *sumq = sumq_a;
452         if (n_types > 16) {
453             sc   = (int *)malloc(n_types * sizeof(int));
454             sumq = (int *)malloc(n_types * sizeof(int));
455         }
456         memset(sumq, 0, n_types * sizeof(int));
457         for (s = K = 0; s < n; ++s) {
458             for (i = 0; i < n_plp[s]; ++i, ++K) {
459                 bam_pileup1_t *p = plp[s] + i;
460                 int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
461                 for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
462                 for (t = 1; t < n_types; ++t) // insertion sort
463                     for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
464                         tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
465                 /* errmod_cal() assumes that if the call is wrong, the
466                  * likelihoods of other events are equal. This is about
467                  * right for substitutions, but is not desired for
468                  * indels. To reuse errmod_cal(), I have to make
469                  * compromise for multi-allelic indels.
470                  */
471                 if ((sc[0]&0x3f) == ref_type) {
472                     indelQ1 = (sc[1]>>14) - (sc[0]>>14);
473                     seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
474                 } else {
475                     for (t = 0; t < n_types; ++t) // look for the reference type
476                         if ((sc[t]&0x3f) == ref_type) break;
477                     indelQ1 = (sc[t]>>14) - (sc[0]>>14);
478                     seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
479                 }
480                 tmp = sc[0]>>6 & 0xff;
481                 indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
482                 sct = &score2[K*n_types];
483                 for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
484                 for (t = 1; t < n_types; ++t) // insertion sort
485                     for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
486                         tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
487                 if ((sc[0]&0x3f) == ref_type) {
488                     indelQ2 = (sc[1]>>14) - (sc[0]>>14);
489                 } else {
490                     for (t = 0; t < n_types; ++t) // look for the reference type
491                         if ((sc[t]&0x3f) == ref_type) break;
492                     indelQ2 = (sc[t]>>14) - (sc[0]>>14);
493                 }
494                 tmp = sc[0]>>6 & 0xff;
495                 indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
496                 // pick the smaller between indelQ1 and indelQ2
497                 indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
498                 if (indelQ > 255) indelQ = 255;
499                 if (seqQ > 255) seqQ = 255;
500                 p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
501                 sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
502 //              fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
503             }
504         }
505         // determine bca->indel_types[] and bca->inscns
506         bca->maxins = max_ins;
507         bca->inscns = realloc(bca->inscns, bca->maxins * 4);
508         for (t = 0; t < n_types; ++t)
509             sumq[t] = sumq[t]<<6 | t;
510         for (t = 1; t < n_types; ++t) // insertion sort
511             for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
512                 tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
513         for (t = 0; t < n_types; ++t) // look for the reference type
514             if ((sumq[t]&0x3f) == ref_type) break;
515         if (t) { // then move the reference type to the first
516             tmp = sumq[t];
517             for (; t > 0; --t) sumq[t] = sumq[t-1];
518             sumq[0] = tmp;
519         }
520         for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
521         for (t = 0; t < 4 && t < n_types; ++t) {
522             bca->indel_types[t] = types[sumq[t]&0x3f];
523             memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
524         }
525         // update p->aux
526         for (s = n_alt = 0; s < n; ++s) {
527             for (i = 0; i < n_plp[s]; ++i) {
528                 bam_pileup1_t *p = plp[s] + i;
529                 int x = types[p->aux>>16&0x3f];
530                 for (j = 0; j < 4; ++j)
531                     if (x == bca->indel_types[j]) break;
532                 p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
533                 if ((p->aux>>16&0x3f) > 0) ++n_alt;
534                 //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
535             }
536         }
537 
538         if (sc   != sc_a)   free(sc);
539         if (sumq != sumq_a) free(sumq);
540     }
541     free(score1); free(score2);
542     // free
543     for (i = 0; i < n; ++i) free(ref_sample[i]);
544     free(ref_sample);
545     free(types); free(inscns);
546     return n_alt > 0? 0 : -1;
547 }
548