1 #include "samtools.pysam.h"
2 
3 /*  bam2bcf_indel.c -- indel caller.
4 
5     Copyright (C) 2010, 2011 Broad Institute.
6     Copyright (C) 2012-2014, 2019 Genome Research Ltd.
7 
8     Author: Heng Li <lh3@sanger.ac.uk>
9 
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16 
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.  */
27 
28 #include <config.h>
29 
30 #include <assert.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include "htslib/hts.h"
34 #include "htslib/sam.h"
35 #include "bam2bcf.h"
36 #include "htslib/khash.h"
37 KHASH_SET_INIT_STR(rg)
38 
39 #include "htslib/ksort.h"
KSORT_INIT_GENERIC(uint32_t)40 KSORT_INIT_GENERIC(uint32_t)
41 
42 #define MINUS_CONST 0x10000000
43 #define INDEL_WINDOW_SIZE 50
44 
45 void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
46 {
47     const char *s, *p, *q, *r, *t;
48     khash_t(rg) *hash;
49     if (list == 0 || hdtext == 0) return _hash;
50     if (_hash == 0) _hash = kh_init(rg);
51     hash = (khash_t(rg)*)_hash;
52     if ((s = strstr(hdtext, "@RG\t")) == 0) return hash;
53     do {
54         t = strstr(s + 4, "@RG\t"); // the next @RG
55         if ((p = strstr(s, "\tID:")) != 0) p += 4;
56         if ((q = strstr(s, "\tPL:")) != 0) q += 4;
57         if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present
58             int lp, lq;
59             char *x;
60             for (r = p; *r && *r != '\t' && *r != '\n'; ++r) { }
61             lp = r - p;
62             for (r = q; *r && *r != '\t' && *r != '\n'; ++r) { }
63             lq = r - q;
64             x = calloc((lp > lq? lp : lq) + 1, 1);
65             for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r;
66             if (strstr(list, x)) { // insert ID to the hash table
67                 khint_t k;
68                 int ret;
69                 for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r;
70                 x[r-p] = 0;
71                 k = kh_get(rg, hash, x);
72                 if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret);
73                 else free(x);
74             } else free(x);
75         }
76         s = t;
77     } while (s);
78     return hash;
79 }
80 
bcf_call_del_rghash(void * _hash)81 void bcf_call_del_rghash(void *_hash)
82 {
83     khint_t k;
84     khash_t(rg) *hash = (khash_t(rg)*)_hash;
85     if (hash == 0) return;
86     for (k = kh_begin(hash); k < kh_end(hash); ++k)
87         if (kh_exist(hash, k))
88             free((char*)kh_key(hash, k));
89     kh_destroy(rg, hash);
90 }
91 
tpos2qpos(const bam1_core_t * c,const uint32_t * cigar,hts_pos_t tpos,hts_pos_t is_left,hts_pos_t * _tpos)92 static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, hts_pos_t tpos, hts_pos_t is_left, hts_pos_t *_tpos)
93 {
94     int k, y = 0, last_y = 0;
95     hts_pos_t x = c->pos;
96     *_tpos = c->pos;
97     for (k = 0; k < c->n_cigar; ++k) {
98         int op = cigar[k] & BAM_CIGAR_MASK;
99         int l = cigar[k] >> BAM_CIGAR_SHIFT;
100         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
101             if (c->pos > tpos) return y;
102             if (x + l > tpos) {
103                 *_tpos = tpos;
104                 return y + (tpos - x);
105             }
106             x += l; y += l;
107             last_y = y;
108         } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
109         else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
110             if (x + l > tpos) {
111                 *_tpos = is_left? x : x + l;
112                 return y;
113             }
114             x += l;
115         }
116     }
117     *_tpos = x;
118     return last_y;
119 }
120 // FIXME: check if the inserted sequence is consistent with the homopolymer run
121 // l is the relative gap length and l_run is the length of the homopolymer on the reference
est_seqQ(const bcf_callaux_t * bca,int l,int l_run)122 static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
123 {
124     int q, qh;
125     q = bca->openQ + bca->extQ * (abs(l) - 1);
126     qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000;
127     return q < qh? q : qh;
128 }
129 
est_indelreg(hts_pos_t pos,const char * ref,int l,char * ins4)130 static inline int est_indelreg(hts_pos_t pos, const char *ref, int l, char *ins4)
131 {
132     int j, max = 0, score = 0;
133     hts_pos_t i, max_i = pos;
134     l = abs(l);
135     for (i = pos + 1, j = 0; ref[i]; ++i, ++j) {
136         if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1;
137         else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1;
138         if (score < 0) break;
139         if (max < score) max = score, max_i = i;
140     }
141     return max_i - pos;
142 }
143 
144 /*
145     notes:
146         - n .. number of samples
147         - the routine sets bam_pileup1_t.aux of each read as follows:
148             - 6: unused
149             - 6: the call; index to bcf_callaux_t.indel_types   .. (aux>>16)&0x3f
150             - 8: estimated sequence quality                     .. (aux>>8)&0xff
151             - 8: indel quality                                  .. aux&0xff
152  */
bcf_call_gap_prep(int n,int * n_plp,bam_pileup1_t ** plp,hts_pos_t pos,bcf_callaux_t * bca,const char * ref,const void * rghash)153 int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref,
154                       const void *rghash)
155 {
156     int s, k, t, n_types, *types, max_rd_len, max_ins, *score1, *score2, max_ref2;
157     int N, K, l_run, ref_type, n_alt;
158     hts_pos_t i, j, left, right;
159     char *inscns = 0, *ref2, *query, **ref_sample;
160     khash_t(rg) *hash = (khash_t(rg)*)rghash;
161     if (ref == 0 || bca == 0) return -1;
162     // mark filtered reads
163     if (rghash) {
164         N = 0;
165         for (s = N = 0; s < n; ++s) {
166             for (i = 0; i < n_plp[s]; ++i) {
167                 bam_pileup1_t *p = plp[s] + i;
168                 const uint8_t *rg = bam_aux_get(p->b, "RG");
169                 p->aux = 1; // filtered by default
170                 if (rg) {
171                     khint_t k = kh_get(rg, hash, (const char*)(rg + 1));
172                     if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered
173                 }
174             }
175         }
176         if (N == 0) return -1; // no reads left
177     }
178     // determine if there is a gap
179     for (s = N = 0; s < n; ++s) {
180         for (i = 0; i < n_plp[s]; ++i)
181             if (plp[s][i].indel != 0) break;
182         if (i < n_plp[s]) break;
183     }
184     if (s == n) return -1; // there is no indel at this position.
185     for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
186     { // find out how many types of indels are present
187         bca->max_support = bca->max_frac = 0;
188         int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
189         uint32_t *aux;
190         aux = calloc(N + 1, 4);
191         m = max_rd_len = 0;
192         aux[m++] = MINUS_CONST; // zero indel is always a type
193         for (s = 0; s < n; ++s) {
194             int na = 0, nt = 0;
195             for (i = 0; i < n_plp[s]; ++i) {
196                 const bam_pileup1_t *p = plp[s] + i;
197                 if (rghash == 0 || p->aux == 0) {
198                     ++nt;
199                     if (p->indel != 0) {
200                         ++na;
201                         aux[m++] = MINUS_CONST + p->indel;
202                     }
203                 }
204                 j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
205                 if (j > max_rd_len) max_rd_len = j;
206             }
207             double frac = (double)na/nt;
208             if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
209                 indel_support_ok = 1;
210             if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
211             n_alt += na;
212             n_tot += nt;
213         }
214         // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
215         //  check the number of N's in the sequence and skip places where half or more reference bases are Ns.
216         int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
217         if ( nN*2>(i-pos) ) { free(aux); return -1; }
218 
219         ks_introsort(uint32_t, m, aux);
220         // squeeze out identical types
221         for (i = 1, n_types = 1; i < m; ++i)
222             if (aux[i] != aux[i-1]) ++n_types;
223         // Taking totals makes it hard to call rare indels
224         if ( !bca->per_sample_flt )
225             indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
226         if ( n_types == 1 || !indel_support_ok ) { // then skip
227             free(aux); return -1;
228         }
229         if (n_types >= 64) {
230             free(aux);
231             // TODO revisit how/whether to control printing this warning
232             if (hts_verbose >= 2)
233                 fprintf(samtools_stderr, "[%s] excessive INDEL alleles at position %"PRIhts_pos". Skip the position.\n", __func__, pos + 1);
234             return -1;
235         }
236         types = (int*)calloc(n_types, sizeof(int));
237         t = 0;
238         types[t++] = aux[0] - MINUS_CONST;
239         for (i = 1; i < m; ++i)
240             if (aux[i] != aux[i-1])
241                 types[t++] = aux[i] - MINUS_CONST;
242         free(aux);
243         for (t = 0; t < n_types; ++t)
244             if (types[t] == 0) break;
245         ref_type = t; // the index of the reference type (0)
246     }
247     { // calculate left and right boundary
248         left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
249         right = pos + INDEL_WINDOW_SIZE;
250         if (types[0] < 0) right -= types[0];
251         // in case the alignments stand out the reference
252         for (i = pos; i < right; ++i)
253             if (ref[i] == 0) break;
254         right = i;
255     }
256     /* The following block fixes a long-existing flaw in the INDEL
257      * calling model: the interference of nearby SNPs. However, it also
258      * reduces the power because sometimes, substitutions caused by
259      * indels are not distinguishable from true mutations. Multiple
260      * sequence realignment helps to increase the power.
261      *
262      * Masks mismatches present in at least 70% of the reads with 'N'.
263      */
264     { // construct per-sample consensus
265         int L = right - left + 1, max_i, max2_i;
266         uint32_t *cns, max, max2;
267         char *ref0, *r;
268         ref_sample = calloc(n, sizeof(char*));
269         cns = calloc(L, 4);
270         ref0 = calloc(L, 1);
271         for (i = 0; i < right - left; ++i)
272             ref0[i] = seq_nt16_table[(int)ref[i+left]];
273         for (s = 0; s < n; ++s) {
274             r = ref_sample[s] = calloc(L, 1);
275             memset(cns, 0, sizeof(int) * L);
276             // collect ref and non-ref counts
277             for (i = 0; i < n_plp[s]; ++i) {
278                 bam_pileup1_t *p = plp[s] + i;
279                 bam1_t *b = p->b;
280                 uint32_t *cigar = bam_get_cigar(b);
281                 uint8_t *seq = bam_get_seq(b);
282                 hts_pos_t x = b->core.pos, y = 0;
283                 for (k = 0; k < b->core.n_cigar; ++k) {
284                     int op = cigar[k]&0xf;
285                     int j, l = cigar[k]>>4;
286                     if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
287                         for (j = 0; j < l; ++j)
288                             if (x + j >= left && x + j < right)
289                                 cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
290                         x += l; y += l;
291                     } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
292                     else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
293                 }
294             }
295             // determine the consensus
296             for (i = 0; i < right - left; ++i) r[i] = ref0[i];
297             max = max2 = 0; max_i = max2_i = -1;
298             for (i = 0; i < right - left; ++i) {
299                 if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
300                 else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
301             }
302             if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
303             if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
304             if (max_i >= 0) r[max_i] = 15;
305             if (max2_i >= 0) r[max2_i] = 15;
306             //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], samtools_stderr); fputc('\n', samtools_stderr);
307         }
308         free(ref0); free(cns);
309     }
310     { // the length of the homopolymer run around the current position
311         int c = seq_nt16_table[(int)ref[pos + 1]];
312         if (c == 15) l_run = 1;
313         else {
314             for (i = pos + 2; ref[i]; ++i)
315                 if (seq_nt16_table[(int)ref[i]] != c) break;
316             l_run = i;
317             for (i = pos; i >= 0; --i)
318                 if (seq_nt16_table[(int)ref[i]] != c) break;
319             l_run -= i + 1;
320         }
321     }
322     // construct the consensus sequence
323     max_ins = types[n_types - 1];   // max_ins is at least 0
324     if (max_ins > 0) {
325         int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int));
326         // count the number of occurrences of each base at each position for each type of insertion
327         for (t = 0; t < n_types; ++t) {
328             if (types[t] > 0) {
329                 for (s = 0; s < n; ++s) {
330                     for (i = 0; i < n_plp[s]; ++i) {
331                         bam_pileup1_t *p = plp[s] + i;
332                         if (p->indel == types[t]) {
333                             uint8_t *seq = bam_get_seq(p->b);
334                             for (k = 1; k <= p->indel; ++k) {
335                                 int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
336                                 assert(c<5);
337                                 ++inscns_aux[(t*max_ins+(k-1))*5 + c];
338                             }
339                         }
340                     }
341                 }
342             }
343         }
344         // use the majority rule to construct the consensus
345         inscns = calloc(n_types * max_ins, 1);
346         for (t = 0; t < n_types; ++t) {
347             for (j = 0; j < types[t]; ++j) {
348                 int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
349                 for (k = 0; k < 5; ++k)
350                     if (ia[k] > max)
351                         max = ia[k], max_k = k;
352                 inscns[t*max_ins + j] = max? max_k : 4;
353                 if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
354             }
355         }
356         free(inscns_aux);
357     }
358     // compute the likelihood given each type of indel for each read
359     max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
360     ref2  = calloc(max_ref2, 1);
361     query = calloc(right - left + max_rd_len + max_ins + 2, 1);
362     score1 = calloc(N * n_types, sizeof(int));
363     score2 = calloc(N * n_types, sizeof(int));
364     bca->indelreg = 0;
365     for (t = 0; t < n_types; ++t) {
366         int l, ir;
367         probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
368         apf1.bw = apf2.bw = abs(types[t]) + 3;
369         // compute indelreg
370         if (types[t] == 0) ir = 0;
371         else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
372         else ir = est_indelreg(pos, ref, -types[t], 0);
373         if (ir > bca->indelreg) bca->indelreg = ir;
374 //      fprintf(samtools_stderr, "%d, %d, %d\n", pos, types[t], ir);
375         // realignment
376         for (s = K = 0; s < n; ++s) {
377             // write ref2
378             for (k = 0, j = left; j <= pos; ++j)
379                 ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
380             if (types[t] <= 0) j += -types[t];
381             else for (l = 0; l < types[t]; ++l)
382                      ref2[k++] = inscns[t*max_ins + l];
383             for (; j < right && ref[j]; ++j)
384                 ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
385             for (; k < max_ref2; ++k) ref2[k] = 4;
386             if (j < right) right = j;
387             // align each read to ref2
388             for (i = 0; i < n_plp[s]; ++i, ++K) {
389                 bam_pileup1_t *p = plp[s] + i;
390                 int qbeg, qend, sc, kk;
391                 hts_pos_t tbeg, tend;
392                 uint8_t *seq = bam_get_seq(p->b);
393                 uint32_t *cigar = bam_get_cigar(p->b);
394                 if (p->b->core.flag&4) continue; // unmapped reads
395                 // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
396                 for (kk = 0; kk < p->b->core.n_cigar; ++kk)
397                     if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
398                 if (kk < p->b->core.n_cigar) continue;
399                 // FIXME: the following skips soft clips, but using them may be more sensitive.
400                 // determine the start and end of sequences for alignment
401                 qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left,  0, &tbeg);
402                 qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
403                 if (types[t] < 0) {
404                     int l = -types[t];
405                     tbeg = tbeg - l > left?  tbeg - l : left;
406                 }
407                 // write the query sequence
408                 for (l = qbeg; l < qend; ++l)
409                     query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
410                 { // do realignment; this is the bottleneck
411                     const uint8_t *qual = bam_get_qual(p->b), *bq;
412                     uint8_t *qq;
413                     if (qend < qbeg) {
414                         fprintf(samtools_stderr, "Impossible data in bcf_call_gap_prep\n");
415                         samtools_exit(1);
416                     }
417                     qq = calloc(qend - qbeg, 1);
418                     bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
419                     if (bq) ++bq; // skip type
420                     for (l = qbeg; l < qend; ++l) {
421                         qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
422                         if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
423                         if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
424                     }
425                     sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
426                                         (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
427                     l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
428                     if (l > 255) l = 255;
429                     score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
430                     if (sc > 5) {
431                         sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
432                                             (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
433                         l = (int)(100. * sc / (qend - qbeg) + .499);
434                         if (l > 255) l = 255;
435                         score2[K*n_types + t] = sc<<8 | l;
436                     }
437                     free(qq);
438                 }
439 /*
440                 for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
441                     fputc("ACGTN"[(int)ref2[tbeg-left+l]], samtools_stderr);
442                 fputc('\n', samtools_stderr);
443                 for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], samtools_stderr);
444                 fputc('\n', samtools_stderr);
445                 fprintf(samtools_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
446 */
447             }
448         }
449     }
450     free(ref2); free(query);
451     { // compute indelQ
452         int sc_a[16], sumq_a[16];
453         int tmp, *sc = sc_a, *sumq = sumq_a;
454         if (n_types > 16) {
455             sc   = (int *)malloc(n_types * sizeof(int));
456             sumq = (int *)malloc(n_types * sizeof(int));
457         }
458         memset(sumq, 0, n_types * sizeof(int));
459         for (s = K = 0; s < n; ++s) {
460             for (i = 0; i < n_plp[s]; ++i, ++K) {
461                 bam_pileup1_t *p = plp[s] + i;
462                 int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
463                 for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
464                 for (t = 1; t < n_types; ++t) // insertion sort
465                     for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
466                         tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
467                 /* errmod_cal() assumes that if the call is wrong, the
468                  * likelihoods of other events are equal. This is about
469                  * right for substitutions, but is not desired for
470                  * indels. To reuse errmod_cal(), I have to make
471                  * compromise for multi-allelic indels.
472                  */
473                 if ((sc[0]&0x3f) == ref_type) {
474                     indelQ1 = (sc[1]>>14) - (sc[0]>>14);
475                     seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
476                 } else {
477                     for (t = 0; t < n_types; ++t) // look for the reference type
478                         if ((sc[t]&0x3f) == ref_type) break;
479                     indelQ1 = (sc[t]>>14) - (sc[0]>>14);
480                     seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
481                 }
482                 tmp = sc[0]>>6 & 0xff;
483                 indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
484                 sct = &score2[K*n_types];
485                 for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
486                 for (t = 1; t < n_types; ++t) // insertion sort
487                     for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
488                         tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
489                 if ((sc[0]&0x3f) == ref_type) {
490                     indelQ2 = (sc[1]>>14) - (sc[0]>>14);
491                 } else {
492                     for (t = 0; t < n_types; ++t) // look for the reference type
493                         if ((sc[t]&0x3f) == ref_type) break;
494                     indelQ2 = (sc[t]>>14) - (sc[0]>>14);
495                 }
496                 tmp = sc[0]>>6 & 0xff;
497                 indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
498                 // pick the smaller between indelQ1 and indelQ2
499                 indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
500                 if (indelQ > 255) indelQ = 255;
501                 if (seqQ > 255) seqQ = 255;
502                 p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
503                 sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
504 //              fprintf(samtools_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
505             }
506         }
507         // determine bca->indel_types[] and bca->inscns
508         bca->maxins = max_ins;
509         bca->inscns = realloc(bca->inscns, bca->maxins * 4);
510         for (t = 0; t < n_types; ++t)
511             sumq[t] = sumq[t]<<6 | t;
512         for (t = 1; t < n_types; ++t) // insertion sort
513             for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
514                 tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
515         for (t = 0; t < n_types; ++t) // look for the reference type
516             if ((sumq[t]&0x3f) == ref_type) break;
517         if (t) { // then move the reference type to the first
518             tmp = sumq[t];
519             for (; t > 0; --t) sumq[t] = sumq[t-1];
520             sumq[0] = tmp;
521         }
522         for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
523         for (t = 0; t < 4 && t < n_types; ++t) {
524             bca->indel_types[t] = types[sumq[t]&0x3f];
525             memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
526         }
527         // update p->aux
528         for (s = n_alt = 0; s < n; ++s) {
529             for (i = 0; i < n_plp[s]; ++i) {
530                 bam_pileup1_t *p = plp[s] + i;
531                 int x = types[p->aux>>16&0x3f];
532                 for (j = 0; j < 4; ++j)
533                     if (x == bca->indel_types[j]) break;
534                 p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
535                 if ((p->aux>>16&0x3f) > 0) ++n_alt;
536                 //fprintf(samtools_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
537             }
538         }
539 
540         if (sc   != sc_a)   free(sc);
541         if (sumq != sumq_a) free(sumq);
542     }
543     free(score1); free(score2);
544     // free
545     for (i = 0; i < n; ++i) free(ref_sample[i]);
546     free(ref_sample);
547     free(types); free(inscns);
548     return n_alt > 0? 0 : -1;
549 }
550