1 /* bam2bcf_indel.c -- indel caller.
2
3 Copyright (C) 2010, 2011 Broad Institute.
4 Copyright (C) 2012-2014, 2019 Genome Research Ltd.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <assert.h>
29 #include <ctype.h>
30 #include <string.h>
31 #include "htslib/hts.h"
32 #include "htslib/sam.h"
33 #include "bam2bcf.h"
34 #include "htslib/khash.h"
35 KHASH_SET_INIT_STR(rg)
36
37 #include "htslib/ksort.h"
KSORT_INIT_GENERIC(uint32_t)38 KSORT_INIT_GENERIC(uint32_t)
39
40 #define MINUS_CONST 0x10000000
41 #define INDEL_WINDOW_SIZE 50
42
43 void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
44 {
45 const char *s, *p, *q, *r, *t;
46 khash_t(rg) *hash;
47 if (list == 0 || hdtext == 0) return _hash;
48 if (_hash == 0) _hash = kh_init(rg);
49 hash = (khash_t(rg)*)_hash;
50 if ((s = strstr(hdtext, "@RG\t")) == 0) return hash;
51 do {
52 t = strstr(s + 4, "@RG\t"); // the next @RG
53 if ((p = strstr(s, "\tID:")) != 0) p += 4;
54 if ((q = strstr(s, "\tPL:")) != 0) q += 4;
55 if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present
56 int lp, lq;
57 char *x;
58 for (r = p; *r && *r != '\t' && *r != '\n'; ++r) { }
59 lp = r - p;
60 for (r = q; *r && *r != '\t' && *r != '\n'; ++r) { }
61 lq = r - q;
62 x = calloc((lp > lq? lp : lq) + 1, 1);
63 for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r;
64 if (strstr(list, x)) { // insert ID to the hash table
65 khint_t k;
66 int ret;
67 for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r;
68 x[r-p] = 0;
69 k = kh_get(rg, hash, x);
70 if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret);
71 else free(x);
72 } else free(x);
73 }
74 s = t;
75 } while (s);
76 return hash;
77 }
78
bcf_call_del_rghash(void * _hash)79 void bcf_call_del_rghash(void *_hash)
80 {
81 khint_t k;
82 khash_t(rg) *hash = (khash_t(rg)*)_hash;
83 if (hash == 0) return;
84 for (k = kh_begin(hash); k < kh_end(hash); ++k)
85 if (kh_exist(hash, k))
86 free((char*)kh_key(hash, k));
87 kh_destroy(rg, hash);
88 }
89
tpos2qpos(const bam1_core_t * c,const uint32_t * cigar,hts_pos_t tpos,hts_pos_t is_left,hts_pos_t * _tpos)90 static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, hts_pos_t tpos, hts_pos_t is_left, hts_pos_t *_tpos)
91 {
92 int k, y = 0, last_y = 0;
93 hts_pos_t x = c->pos;
94 *_tpos = c->pos;
95 for (k = 0; k < c->n_cigar; ++k) {
96 int op = cigar[k] & BAM_CIGAR_MASK;
97 int l = cigar[k] >> BAM_CIGAR_SHIFT;
98 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
99 if (c->pos > tpos) return y;
100 if (x + l > tpos) {
101 *_tpos = tpos;
102 return y + (tpos - x);
103 }
104 x += l; y += l;
105 last_y = y;
106 } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
107 else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
108 if (x + l > tpos) {
109 *_tpos = is_left? x : x + l;
110 return y;
111 }
112 x += l;
113 }
114 }
115 *_tpos = x;
116 return last_y;
117 }
118 // FIXME: check if the inserted sequence is consistent with the homopolymer run
119 // l is the relative gap length and l_run is the length of the homopolymer on the reference
est_seqQ(const bcf_callaux_t * bca,int l,int l_run)120 static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
121 {
122 int q, qh;
123 q = bca->openQ + bca->extQ * (abs(l) - 1);
124 qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000;
125 return q < qh? q : qh;
126 }
127
est_indelreg(hts_pos_t pos,const char * ref,int l,char * ins4)128 static inline int est_indelreg(hts_pos_t pos, const char *ref, int l, char *ins4)
129 {
130 int j, max = 0, score = 0;
131 hts_pos_t i, max_i = pos;
132 l = abs(l);
133 for (i = pos + 1, j = 0; ref[i]; ++i, ++j) {
134 if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1;
135 else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1;
136 if (score < 0) break;
137 if (max < score) max = score, max_i = i;
138 }
139 return max_i - pos;
140 }
141
142 /*
143 notes:
144 - n .. number of samples
145 - the routine sets bam_pileup1_t.aux of each read as follows:
146 - 6: unused
147 - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f
148 - 8: estimated sequence quality .. (aux>>8)&0xff
149 - 8: indel quality .. aux&0xff
150 */
bcf_call_gap_prep(int n,int * n_plp,bam_pileup1_t ** plp,hts_pos_t pos,bcf_callaux_t * bca,const char * ref,const void * rghash)151 int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref,
152 const void *rghash)
153 {
154 int s, k, t, n_types, *types, max_rd_len, max_ins, *score1, *score2, max_ref2;
155 int N, K, l_run, ref_type, n_alt;
156 hts_pos_t i, j, left, right;
157 char *inscns = 0, *ref2, *query, **ref_sample;
158 khash_t(rg) *hash = (khash_t(rg)*)rghash;
159 if (ref == 0 || bca == 0) return -1;
160 // mark filtered reads
161 if (rghash) {
162 N = 0;
163 for (s = N = 0; s < n; ++s) {
164 for (i = 0; i < n_plp[s]; ++i) {
165 bam_pileup1_t *p = plp[s] + i;
166 const uint8_t *rg = bam_aux_get(p->b, "RG");
167 p->aux = 1; // filtered by default
168 if (rg) {
169 khint_t k = kh_get(rg, hash, (const char*)(rg + 1));
170 if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered
171 }
172 }
173 }
174 if (N == 0) return -1; // no reads left
175 }
176 // determine if there is a gap
177 for (s = N = 0; s < n; ++s) {
178 for (i = 0; i < n_plp[s]; ++i)
179 if (plp[s][i].indel != 0) break;
180 if (i < n_plp[s]) break;
181 }
182 if (s == n) return -1; // there is no indel at this position.
183 for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
184 { // find out how many types of indels are present
185 bca->max_support = bca->max_frac = 0;
186 int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
187 uint32_t *aux;
188 aux = calloc(N + 1, 4);
189 m = max_rd_len = 0;
190 aux[m++] = MINUS_CONST; // zero indel is always a type
191 for (s = 0; s < n; ++s) {
192 int na = 0, nt = 0;
193 for (i = 0; i < n_plp[s]; ++i) {
194 const bam_pileup1_t *p = plp[s] + i;
195 if (rghash == 0 || p->aux == 0) {
196 ++nt;
197 if (p->indel != 0) {
198 ++na;
199 aux[m++] = MINUS_CONST + p->indel;
200 }
201 }
202 j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
203 if (j > max_rd_len) max_rd_len = j;
204 }
205 double frac = (double)na/nt;
206 if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
207 indel_support_ok = 1;
208 if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
209 n_alt += na;
210 n_tot += nt;
211 }
212 // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
213 // check the number of N's in the sequence and skip places where half or more reference bases are Ns.
214 int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
215 if ( nN*2>(i-pos) ) { free(aux); return -1; }
216
217 ks_introsort(uint32_t, m, aux);
218 // squeeze out identical types
219 for (i = 1, n_types = 1; i < m; ++i)
220 if (aux[i] != aux[i-1]) ++n_types;
221 // Taking totals makes it hard to call rare indels
222 if ( !bca->per_sample_flt )
223 indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
224 if ( n_types == 1 || !indel_support_ok ) { // then skip
225 free(aux); return -1;
226 }
227 if (n_types >= 64) {
228 free(aux);
229 // TODO revisit how/whether to control printing this warning
230 if (hts_verbose >= 2)
231 fprintf(stderr, "[%s] excessive INDEL alleles at position %"PRIhts_pos". Skip the position.\n", __func__, pos + 1);
232 return -1;
233 }
234 types = (int*)calloc(n_types, sizeof(int));
235 t = 0;
236 types[t++] = aux[0] - MINUS_CONST;
237 for (i = 1; i < m; ++i)
238 if (aux[i] != aux[i-1])
239 types[t++] = aux[i] - MINUS_CONST;
240 free(aux);
241 for (t = 0; t < n_types; ++t)
242 if (types[t] == 0) break;
243 ref_type = t; // the index of the reference type (0)
244 }
245 { // calculate left and right boundary
246 left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
247 right = pos + INDEL_WINDOW_SIZE;
248 if (types[0] < 0) right -= types[0];
249 // in case the alignments stand out the reference
250 for (i = pos; i < right; ++i)
251 if (ref[i] == 0) break;
252 right = i;
253 }
254 /* The following block fixes a long-existing flaw in the INDEL
255 * calling model: the interference of nearby SNPs. However, it also
256 * reduces the power because sometimes, substitutions caused by
257 * indels are not distinguishable from true mutations. Multiple
258 * sequence realignment helps to increase the power.
259 *
260 * Masks mismatches present in at least 70% of the reads with 'N'.
261 */
262 { // construct per-sample consensus
263 int L = right - left + 1, max_i, max2_i;
264 uint32_t *cns, max, max2;
265 char *ref0, *r;
266 ref_sample = calloc(n, sizeof(char*));
267 cns = calloc(L, 4);
268 ref0 = calloc(L, 1);
269 for (i = 0; i < right - left; ++i)
270 ref0[i] = seq_nt16_table[(int)ref[i+left]];
271 for (s = 0; s < n; ++s) {
272 r = ref_sample[s] = calloc(L, 1);
273 memset(cns, 0, sizeof(int) * L);
274 // collect ref and non-ref counts
275 for (i = 0; i < n_plp[s]; ++i) {
276 bam_pileup1_t *p = plp[s] + i;
277 bam1_t *b = p->b;
278 uint32_t *cigar = bam_get_cigar(b);
279 uint8_t *seq = bam_get_seq(b);
280 hts_pos_t x = b->core.pos, y = 0;
281 for (k = 0; k < b->core.n_cigar; ++k) {
282 int op = cigar[k]&0xf;
283 int j, l = cigar[k]>>4;
284 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
285 for (j = 0; j < l; ++j)
286 if (x + j >= left && x + j < right)
287 cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
288 x += l; y += l;
289 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
290 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
291 }
292 }
293 // determine the consensus
294 for (i = 0; i < right - left; ++i) r[i] = ref0[i];
295 max = max2 = 0; max_i = max2_i = -1;
296 for (i = 0; i < right - left; ++i) {
297 if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
298 else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
299 }
300 if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
301 if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
302 if (max_i >= 0) r[max_i] = 15;
303 if (max2_i >= 0) r[max2_i] = 15;
304 //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr);
305 }
306 free(ref0); free(cns);
307 }
308 { // the length of the homopolymer run around the current position
309 int c = seq_nt16_table[(int)ref[pos + 1]];
310 if (c == 15) l_run = 1;
311 else {
312 for (i = pos + 2; ref[i]; ++i)
313 if (seq_nt16_table[(int)ref[i]] != c) break;
314 l_run = i;
315 for (i = pos; i >= 0; --i)
316 if (seq_nt16_table[(int)ref[i]] != c) break;
317 l_run -= i + 1;
318 }
319 }
320 // construct the consensus sequence
321 max_ins = types[n_types - 1]; // max_ins is at least 0
322 if (max_ins > 0) {
323 int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int));
324 // count the number of occurrences of each base at each position for each type of insertion
325 for (t = 0; t < n_types; ++t) {
326 if (types[t] > 0) {
327 for (s = 0; s < n; ++s) {
328 for (i = 0; i < n_plp[s]; ++i) {
329 bam_pileup1_t *p = plp[s] + i;
330 if (p->indel == types[t]) {
331 uint8_t *seq = bam_get_seq(p->b);
332 for (k = 1; k <= p->indel; ++k) {
333 int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
334 assert(c<5);
335 ++inscns_aux[(t*max_ins+(k-1))*5 + c];
336 }
337 }
338 }
339 }
340 }
341 }
342 // use the majority rule to construct the consensus
343 inscns = calloc(n_types * max_ins, 1);
344 for (t = 0; t < n_types; ++t) {
345 for (j = 0; j < types[t]; ++j) {
346 int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
347 for (k = 0; k < 5; ++k)
348 if (ia[k] > max)
349 max = ia[k], max_k = k;
350 inscns[t*max_ins + j] = max? max_k : 4;
351 if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
352 }
353 }
354 free(inscns_aux);
355 }
356 // compute the likelihood given each type of indel for each read
357 max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
358 ref2 = calloc(max_ref2, 1);
359 query = calloc(right - left + max_rd_len + max_ins + 2, 1);
360 score1 = calloc(N * n_types, sizeof(int));
361 score2 = calloc(N * n_types, sizeof(int));
362 bca->indelreg = 0;
363 for (t = 0; t < n_types; ++t) {
364 int l, ir;
365 probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
366 apf1.bw = apf2.bw = abs(types[t]) + 3;
367 // compute indelreg
368 if (types[t] == 0) ir = 0;
369 else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
370 else ir = est_indelreg(pos, ref, -types[t], 0);
371 if (ir > bca->indelreg) bca->indelreg = ir;
372 // fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir);
373 // realignment
374 for (s = K = 0; s < n; ++s) {
375 // write ref2
376 for (k = 0, j = left; j <= pos; ++j)
377 ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
378 if (types[t] <= 0) j += -types[t];
379 else for (l = 0; l < types[t]; ++l)
380 ref2[k++] = inscns[t*max_ins + l];
381 for (; j < right && ref[j]; ++j)
382 ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
383 for (; k < max_ref2; ++k) ref2[k] = 4;
384 if (j < right) right = j;
385 // align each read to ref2
386 for (i = 0; i < n_plp[s]; ++i, ++K) {
387 bam_pileup1_t *p = plp[s] + i;
388 int qbeg, qend, sc, kk;
389 hts_pos_t tbeg, tend;
390 uint8_t *seq = bam_get_seq(p->b);
391 uint32_t *cigar = bam_get_cigar(p->b);
392 if (p->b->core.flag&4) continue; // unmapped reads
393 // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
394 for (kk = 0; kk < p->b->core.n_cigar; ++kk)
395 if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
396 if (kk < p->b->core.n_cigar) continue;
397 // FIXME: the following skips soft clips, but using them may be more sensitive.
398 // determine the start and end of sequences for alignment
399 qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg);
400 qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
401 if (types[t] < 0) {
402 int l = -types[t];
403 tbeg = tbeg - l > left? tbeg - l : left;
404 }
405 // write the query sequence
406 for (l = qbeg; l < qend; ++l)
407 query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
408 { // do realignment; this is the bottleneck
409 const uint8_t *qual = bam_get_qual(p->b), *bq;
410 uint8_t *qq;
411 if (qend < qbeg) {
412 fprintf(stderr, "Impossible data in bcf_call_gap_prep\n");
413 exit(1);
414 }
415 qq = calloc(qend - qbeg, 1);
416 bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
417 if (bq) ++bq; // skip type
418 for (l = qbeg; l < qend; ++l) {
419 qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
420 if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
421 if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
422 }
423 sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
424 (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
425 l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
426 if (l > 255) l = 255;
427 score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
428 if (sc > 5) {
429 sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
430 (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
431 l = (int)(100. * sc / (qend - qbeg) + .499);
432 if (l > 255) l = 255;
433 score2[K*n_types + t] = sc<<8 | l;
434 }
435 free(qq);
436 }
437 /*
438 for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
439 fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr);
440 fputc('\n', stderr);
441 for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr);
442 fputc('\n', stderr);
443 fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
444 */
445 }
446 }
447 }
448 free(ref2); free(query);
449 { // compute indelQ
450 int sc_a[16], sumq_a[16];
451 int tmp, *sc = sc_a, *sumq = sumq_a;
452 if (n_types > 16) {
453 sc = (int *)malloc(n_types * sizeof(int));
454 sumq = (int *)malloc(n_types * sizeof(int));
455 }
456 memset(sumq, 0, n_types * sizeof(int));
457 for (s = K = 0; s < n; ++s) {
458 for (i = 0; i < n_plp[s]; ++i, ++K) {
459 bam_pileup1_t *p = plp[s] + i;
460 int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
461 for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
462 for (t = 1; t < n_types; ++t) // insertion sort
463 for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
464 tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
465 /* errmod_cal() assumes that if the call is wrong, the
466 * likelihoods of other events are equal. This is about
467 * right for substitutions, but is not desired for
468 * indels. To reuse errmod_cal(), I have to make
469 * compromise for multi-allelic indels.
470 */
471 if ((sc[0]&0x3f) == ref_type) {
472 indelQ1 = (sc[1]>>14) - (sc[0]>>14);
473 seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
474 } else {
475 for (t = 0; t < n_types; ++t) // look for the reference type
476 if ((sc[t]&0x3f) == ref_type) break;
477 indelQ1 = (sc[t]>>14) - (sc[0]>>14);
478 seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
479 }
480 tmp = sc[0]>>6 & 0xff;
481 indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
482 sct = &score2[K*n_types];
483 for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
484 for (t = 1; t < n_types; ++t) // insertion sort
485 for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
486 tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
487 if ((sc[0]&0x3f) == ref_type) {
488 indelQ2 = (sc[1]>>14) - (sc[0]>>14);
489 } else {
490 for (t = 0; t < n_types; ++t) // look for the reference type
491 if ((sc[t]&0x3f) == ref_type) break;
492 indelQ2 = (sc[t]>>14) - (sc[0]>>14);
493 }
494 tmp = sc[0]>>6 & 0xff;
495 indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
496 // pick the smaller between indelQ1 and indelQ2
497 indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
498 if (indelQ > 255) indelQ = 255;
499 if (seqQ > 255) seqQ = 255;
500 p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
501 sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
502 // fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
503 }
504 }
505 // determine bca->indel_types[] and bca->inscns
506 bca->maxins = max_ins;
507 bca->inscns = realloc(bca->inscns, bca->maxins * 4);
508 for (t = 0; t < n_types; ++t)
509 sumq[t] = sumq[t]<<6 | t;
510 for (t = 1; t < n_types; ++t) // insertion sort
511 for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
512 tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
513 for (t = 0; t < n_types; ++t) // look for the reference type
514 if ((sumq[t]&0x3f) == ref_type) break;
515 if (t) { // then move the reference type to the first
516 tmp = sumq[t];
517 for (; t > 0; --t) sumq[t] = sumq[t-1];
518 sumq[0] = tmp;
519 }
520 for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
521 for (t = 0; t < 4 && t < n_types; ++t) {
522 bca->indel_types[t] = types[sumq[t]&0x3f];
523 memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
524 }
525 // update p->aux
526 for (s = n_alt = 0; s < n; ++s) {
527 for (i = 0; i < n_plp[s]; ++i) {
528 bam_pileup1_t *p = plp[s] + i;
529 int x = types[p->aux>>16&0x3f];
530 for (j = 0; j < 4; ++j)
531 if (x == bca->indel_types[j]) break;
532 p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
533 if ((p->aux>>16&0x3f) > 0) ++n_alt;
534 //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
535 }
536 }
537
538 if (sc != sc_a) free(sc);
539 if (sumq != sumq_a) free(sumq);
540 }
541 free(score1); free(score2);
542 // free
543 for (i = 0; i < n; ++i) free(ref_sample[i]);
544 free(ref_sample);
545 free(types); free(inscns);
546 return n_alt > 0? 0 : -1;
547 }
548