1 #include "samtools.pysam.h"
2
3 /* bam2bcf_indel.c -- indel caller.
4
5 Copyright (C) 2010, 2011 Broad Institute.
6 Copyright (C) 2012-2014, 2019 Genome Research Ltd.
7
8 Author: Heng Li <lh3@sanger.ac.uk>
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE. */
27
28 #include <config.h>
29
30 #include <assert.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include "htslib/hts.h"
34 #include "htslib/sam.h"
35 #include "bam2bcf.h"
36 #include "htslib/khash.h"
37 KHASH_SET_INIT_STR(rg)
38
39 #include "htslib/ksort.h"
KSORT_INIT_GENERIC(uint32_t)40 KSORT_INIT_GENERIC(uint32_t)
41
42 #define MINUS_CONST 0x10000000
43 #define INDEL_WINDOW_SIZE 50
44
45 void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
46 {
47 const char *s, *p, *q, *r, *t;
48 khash_t(rg) *hash;
49 if (list == 0 || hdtext == 0) return _hash;
50 if (_hash == 0) _hash = kh_init(rg);
51 hash = (khash_t(rg)*)_hash;
52 if ((s = strstr(hdtext, "@RG\t")) == 0) return hash;
53 do {
54 t = strstr(s + 4, "@RG\t"); // the next @RG
55 if ((p = strstr(s, "\tID:")) != 0) p += 4;
56 if ((q = strstr(s, "\tPL:")) != 0) q += 4;
57 if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present
58 int lp, lq;
59 char *x;
60 for (r = p; *r && *r != '\t' && *r != '\n'; ++r) { }
61 lp = r - p;
62 for (r = q; *r && *r != '\t' && *r != '\n'; ++r) { }
63 lq = r - q;
64 x = calloc((lp > lq? lp : lq) + 1, 1);
65 for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r;
66 if (strstr(list, x)) { // insert ID to the hash table
67 khint_t k;
68 int ret;
69 for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r;
70 x[r-p] = 0;
71 k = kh_get(rg, hash, x);
72 if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret);
73 else free(x);
74 } else free(x);
75 }
76 s = t;
77 } while (s);
78 return hash;
79 }
80
bcf_call_del_rghash(void * _hash)81 void bcf_call_del_rghash(void *_hash)
82 {
83 khint_t k;
84 khash_t(rg) *hash = (khash_t(rg)*)_hash;
85 if (hash == 0) return;
86 for (k = kh_begin(hash); k < kh_end(hash); ++k)
87 if (kh_exist(hash, k))
88 free((char*)kh_key(hash, k));
89 kh_destroy(rg, hash);
90 }
91
tpos2qpos(const bam1_core_t * c,const uint32_t * cigar,hts_pos_t tpos,hts_pos_t is_left,hts_pos_t * _tpos)92 static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, hts_pos_t tpos, hts_pos_t is_left, hts_pos_t *_tpos)
93 {
94 int k, y = 0, last_y = 0;
95 hts_pos_t x = c->pos;
96 *_tpos = c->pos;
97 for (k = 0; k < c->n_cigar; ++k) {
98 int op = cigar[k] & BAM_CIGAR_MASK;
99 int l = cigar[k] >> BAM_CIGAR_SHIFT;
100 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
101 if (c->pos > tpos) return y;
102 if (x + l > tpos) {
103 *_tpos = tpos;
104 return y + (tpos - x);
105 }
106 x += l; y += l;
107 last_y = y;
108 } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
109 else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
110 if (x + l > tpos) {
111 *_tpos = is_left? x : x + l;
112 return y;
113 }
114 x += l;
115 }
116 }
117 *_tpos = x;
118 return last_y;
119 }
120 // FIXME: check if the inserted sequence is consistent with the homopolymer run
121 // l is the relative gap length and l_run is the length of the homopolymer on the reference
est_seqQ(const bcf_callaux_t * bca,int l,int l_run)122 static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
123 {
124 int q, qh;
125 q = bca->openQ + bca->extQ * (abs(l) - 1);
126 qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000;
127 return q < qh? q : qh;
128 }
129
est_indelreg(hts_pos_t pos,const char * ref,int l,char * ins4)130 static inline int est_indelreg(hts_pos_t pos, const char *ref, int l, char *ins4)
131 {
132 int j, max = 0, score = 0;
133 hts_pos_t i, max_i = pos;
134 l = abs(l);
135 for (i = pos + 1, j = 0; ref[i]; ++i, ++j) {
136 if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1;
137 else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1;
138 if (score < 0) break;
139 if (max < score) max = score, max_i = i;
140 }
141 return max_i - pos;
142 }
143
144 /*
145 notes:
146 - n .. number of samples
147 - the routine sets bam_pileup1_t.aux of each read as follows:
148 - 6: unused
149 - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f
150 - 8: estimated sequence quality .. (aux>>8)&0xff
151 - 8: indel quality .. aux&0xff
152 */
bcf_call_gap_prep(int n,int * n_plp,bam_pileup1_t ** plp,hts_pos_t pos,bcf_callaux_t * bca,const char * ref,const void * rghash)153 int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref,
154 const void *rghash)
155 {
156 int s, k, t, n_types, *types, max_rd_len, max_ins, *score1, *score2, max_ref2;
157 int N, K, l_run, ref_type, n_alt;
158 hts_pos_t i, j, left, right;
159 char *inscns = 0, *ref2, *query, **ref_sample;
160 khash_t(rg) *hash = (khash_t(rg)*)rghash;
161 if (ref == 0 || bca == 0) return -1;
162 // mark filtered reads
163 if (rghash) {
164 N = 0;
165 for (s = N = 0; s < n; ++s) {
166 for (i = 0; i < n_plp[s]; ++i) {
167 bam_pileup1_t *p = plp[s] + i;
168 const uint8_t *rg = bam_aux_get(p->b, "RG");
169 p->aux = 1; // filtered by default
170 if (rg) {
171 khint_t k = kh_get(rg, hash, (const char*)(rg + 1));
172 if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered
173 }
174 }
175 }
176 if (N == 0) return -1; // no reads left
177 }
178 // determine if there is a gap
179 for (s = N = 0; s < n; ++s) {
180 for (i = 0; i < n_plp[s]; ++i)
181 if (plp[s][i].indel != 0) break;
182 if (i < n_plp[s]) break;
183 }
184 if (s == n) return -1; // there is no indel at this position.
185 for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
186 { // find out how many types of indels are present
187 bca->max_support = bca->max_frac = 0;
188 int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
189 uint32_t *aux;
190 aux = calloc(N + 1, 4);
191 m = max_rd_len = 0;
192 aux[m++] = MINUS_CONST; // zero indel is always a type
193 for (s = 0; s < n; ++s) {
194 int na = 0, nt = 0;
195 for (i = 0; i < n_plp[s]; ++i) {
196 const bam_pileup1_t *p = plp[s] + i;
197 if (rghash == 0 || p->aux == 0) {
198 ++nt;
199 if (p->indel != 0) {
200 ++na;
201 aux[m++] = MINUS_CONST + p->indel;
202 }
203 }
204 j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
205 if (j > max_rd_len) max_rd_len = j;
206 }
207 double frac = (double)na/nt;
208 if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
209 indel_support_ok = 1;
210 if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
211 n_alt += na;
212 n_tot += nt;
213 }
214 // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
215 // check the number of N's in the sequence and skip places where half or more reference bases are Ns.
216 int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
217 if ( nN*2>(i-pos) ) { free(aux); return -1; }
218
219 ks_introsort(uint32_t, m, aux);
220 // squeeze out identical types
221 for (i = 1, n_types = 1; i < m; ++i)
222 if (aux[i] != aux[i-1]) ++n_types;
223 // Taking totals makes it hard to call rare indels
224 if ( !bca->per_sample_flt )
225 indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
226 if ( n_types == 1 || !indel_support_ok ) { // then skip
227 free(aux); return -1;
228 }
229 if (n_types >= 64) {
230 free(aux);
231 // TODO revisit how/whether to control printing this warning
232 if (hts_verbose >= 2)
233 fprintf(samtools_stderr, "[%s] excessive INDEL alleles at position %"PRIhts_pos". Skip the position.\n", __func__, pos + 1);
234 return -1;
235 }
236 types = (int*)calloc(n_types, sizeof(int));
237 t = 0;
238 types[t++] = aux[0] - MINUS_CONST;
239 for (i = 1; i < m; ++i)
240 if (aux[i] != aux[i-1])
241 types[t++] = aux[i] - MINUS_CONST;
242 free(aux);
243 for (t = 0; t < n_types; ++t)
244 if (types[t] == 0) break;
245 ref_type = t; // the index of the reference type (0)
246 }
247 { // calculate left and right boundary
248 left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
249 right = pos + INDEL_WINDOW_SIZE;
250 if (types[0] < 0) right -= types[0];
251 // in case the alignments stand out the reference
252 for (i = pos; i < right; ++i)
253 if (ref[i] == 0) break;
254 right = i;
255 }
256 /* The following block fixes a long-existing flaw in the INDEL
257 * calling model: the interference of nearby SNPs. However, it also
258 * reduces the power because sometimes, substitutions caused by
259 * indels are not distinguishable from true mutations. Multiple
260 * sequence realignment helps to increase the power.
261 *
262 * Masks mismatches present in at least 70% of the reads with 'N'.
263 */
264 { // construct per-sample consensus
265 int L = right - left + 1, max_i, max2_i;
266 uint32_t *cns, max, max2;
267 char *ref0, *r;
268 ref_sample = calloc(n, sizeof(char*));
269 cns = calloc(L, 4);
270 ref0 = calloc(L, 1);
271 for (i = 0; i < right - left; ++i)
272 ref0[i] = seq_nt16_table[(int)ref[i+left]];
273 for (s = 0; s < n; ++s) {
274 r = ref_sample[s] = calloc(L, 1);
275 memset(cns, 0, sizeof(int) * L);
276 // collect ref and non-ref counts
277 for (i = 0; i < n_plp[s]; ++i) {
278 bam_pileup1_t *p = plp[s] + i;
279 bam1_t *b = p->b;
280 uint32_t *cigar = bam_get_cigar(b);
281 uint8_t *seq = bam_get_seq(b);
282 hts_pos_t x = b->core.pos, y = 0;
283 for (k = 0; k < b->core.n_cigar; ++k) {
284 int op = cigar[k]&0xf;
285 int j, l = cigar[k]>>4;
286 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
287 for (j = 0; j < l; ++j)
288 if (x + j >= left && x + j < right)
289 cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
290 x += l; y += l;
291 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
292 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
293 }
294 }
295 // determine the consensus
296 for (i = 0; i < right - left; ++i) r[i] = ref0[i];
297 max = max2 = 0; max_i = max2_i = -1;
298 for (i = 0; i < right - left; ++i) {
299 if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
300 else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
301 }
302 if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
303 if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
304 if (max_i >= 0) r[max_i] = 15;
305 if (max2_i >= 0) r[max2_i] = 15;
306 //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], samtools_stderr); fputc('\n', samtools_stderr);
307 }
308 free(ref0); free(cns);
309 }
310 { // the length of the homopolymer run around the current position
311 int c = seq_nt16_table[(int)ref[pos + 1]];
312 if (c == 15) l_run = 1;
313 else {
314 for (i = pos + 2; ref[i]; ++i)
315 if (seq_nt16_table[(int)ref[i]] != c) break;
316 l_run = i;
317 for (i = pos; i >= 0; --i)
318 if (seq_nt16_table[(int)ref[i]] != c) break;
319 l_run -= i + 1;
320 }
321 }
322 // construct the consensus sequence
323 max_ins = types[n_types - 1]; // max_ins is at least 0
324 if (max_ins > 0) {
325 int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int));
326 // count the number of occurrences of each base at each position for each type of insertion
327 for (t = 0; t < n_types; ++t) {
328 if (types[t] > 0) {
329 for (s = 0; s < n; ++s) {
330 for (i = 0; i < n_plp[s]; ++i) {
331 bam_pileup1_t *p = plp[s] + i;
332 if (p->indel == types[t]) {
333 uint8_t *seq = bam_get_seq(p->b);
334 for (k = 1; k <= p->indel; ++k) {
335 int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
336 assert(c<5);
337 ++inscns_aux[(t*max_ins+(k-1))*5 + c];
338 }
339 }
340 }
341 }
342 }
343 }
344 // use the majority rule to construct the consensus
345 inscns = calloc(n_types * max_ins, 1);
346 for (t = 0; t < n_types; ++t) {
347 for (j = 0; j < types[t]; ++j) {
348 int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
349 for (k = 0; k < 5; ++k)
350 if (ia[k] > max)
351 max = ia[k], max_k = k;
352 inscns[t*max_ins + j] = max? max_k : 4;
353 if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
354 }
355 }
356 free(inscns_aux);
357 }
358 // compute the likelihood given each type of indel for each read
359 max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
360 ref2 = calloc(max_ref2, 1);
361 query = calloc(right - left + max_rd_len + max_ins + 2, 1);
362 score1 = calloc(N * n_types, sizeof(int));
363 score2 = calloc(N * n_types, sizeof(int));
364 bca->indelreg = 0;
365 for (t = 0; t < n_types; ++t) {
366 int l, ir;
367 probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
368 apf1.bw = apf2.bw = abs(types[t]) + 3;
369 // compute indelreg
370 if (types[t] == 0) ir = 0;
371 else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
372 else ir = est_indelreg(pos, ref, -types[t], 0);
373 if (ir > bca->indelreg) bca->indelreg = ir;
374 // fprintf(samtools_stderr, "%d, %d, %d\n", pos, types[t], ir);
375 // realignment
376 for (s = K = 0; s < n; ++s) {
377 // write ref2
378 for (k = 0, j = left; j <= pos; ++j)
379 ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
380 if (types[t] <= 0) j += -types[t];
381 else for (l = 0; l < types[t]; ++l)
382 ref2[k++] = inscns[t*max_ins + l];
383 for (; j < right && ref[j]; ++j)
384 ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
385 for (; k < max_ref2; ++k) ref2[k] = 4;
386 if (j < right) right = j;
387 // align each read to ref2
388 for (i = 0; i < n_plp[s]; ++i, ++K) {
389 bam_pileup1_t *p = plp[s] + i;
390 int qbeg, qend, sc, kk;
391 hts_pos_t tbeg, tend;
392 uint8_t *seq = bam_get_seq(p->b);
393 uint32_t *cigar = bam_get_cigar(p->b);
394 if (p->b->core.flag&4) continue; // unmapped reads
395 // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
396 for (kk = 0; kk < p->b->core.n_cigar; ++kk)
397 if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
398 if (kk < p->b->core.n_cigar) continue;
399 // FIXME: the following skips soft clips, but using them may be more sensitive.
400 // determine the start and end of sequences for alignment
401 qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg);
402 qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
403 if (types[t] < 0) {
404 int l = -types[t];
405 tbeg = tbeg - l > left? tbeg - l : left;
406 }
407 // write the query sequence
408 for (l = qbeg; l < qend; ++l)
409 query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
410 { // do realignment; this is the bottleneck
411 const uint8_t *qual = bam_get_qual(p->b), *bq;
412 uint8_t *qq;
413 if (qend < qbeg) {
414 fprintf(samtools_stderr, "Impossible data in bcf_call_gap_prep\n");
415 samtools_exit(1);
416 }
417 qq = calloc(qend - qbeg, 1);
418 bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
419 if (bq) ++bq; // skip type
420 for (l = qbeg; l < qend; ++l) {
421 qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
422 if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
423 if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
424 }
425 sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
426 (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
427 l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
428 if (l > 255) l = 255;
429 score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
430 if (sc > 5) {
431 sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
432 (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
433 l = (int)(100. * sc / (qend - qbeg) + .499);
434 if (l > 255) l = 255;
435 score2[K*n_types + t] = sc<<8 | l;
436 }
437 free(qq);
438 }
439 /*
440 for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
441 fputc("ACGTN"[(int)ref2[tbeg-left+l]], samtools_stderr);
442 fputc('\n', samtools_stderr);
443 for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], samtools_stderr);
444 fputc('\n', samtools_stderr);
445 fprintf(samtools_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
446 */
447 }
448 }
449 }
450 free(ref2); free(query);
451 { // compute indelQ
452 int sc_a[16], sumq_a[16];
453 int tmp, *sc = sc_a, *sumq = sumq_a;
454 if (n_types > 16) {
455 sc = (int *)malloc(n_types * sizeof(int));
456 sumq = (int *)malloc(n_types * sizeof(int));
457 }
458 memset(sumq, 0, n_types * sizeof(int));
459 for (s = K = 0; s < n; ++s) {
460 for (i = 0; i < n_plp[s]; ++i, ++K) {
461 bam_pileup1_t *p = plp[s] + i;
462 int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
463 for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
464 for (t = 1; t < n_types; ++t) // insertion sort
465 for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
466 tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
467 /* errmod_cal() assumes that if the call is wrong, the
468 * likelihoods of other events are equal. This is about
469 * right for substitutions, but is not desired for
470 * indels. To reuse errmod_cal(), I have to make
471 * compromise for multi-allelic indels.
472 */
473 if ((sc[0]&0x3f) == ref_type) {
474 indelQ1 = (sc[1]>>14) - (sc[0]>>14);
475 seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
476 } else {
477 for (t = 0; t < n_types; ++t) // look for the reference type
478 if ((sc[t]&0x3f) == ref_type) break;
479 indelQ1 = (sc[t]>>14) - (sc[0]>>14);
480 seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
481 }
482 tmp = sc[0]>>6 & 0xff;
483 indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
484 sct = &score2[K*n_types];
485 for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
486 for (t = 1; t < n_types; ++t) // insertion sort
487 for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
488 tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
489 if ((sc[0]&0x3f) == ref_type) {
490 indelQ2 = (sc[1]>>14) - (sc[0]>>14);
491 } else {
492 for (t = 0; t < n_types; ++t) // look for the reference type
493 if ((sc[t]&0x3f) == ref_type) break;
494 indelQ2 = (sc[t]>>14) - (sc[0]>>14);
495 }
496 tmp = sc[0]>>6 & 0xff;
497 indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
498 // pick the smaller between indelQ1 and indelQ2
499 indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
500 if (indelQ > 255) indelQ = 255;
501 if (seqQ > 255) seqQ = 255;
502 p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
503 sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
504 // fprintf(samtools_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
505 }
506 }
507 // determine bca->indel_types[] and bca->inscns
508 bca->maxins = max_ins;
509 bca->inscns = realloc(bca->inscns, bca->maxins * 4);
510 for (t = 0; t < n_types; ++t)
511 sumq[t] = sumq[t]<<6 | t;
512 for (t = 1; t < n_types; ++t) // insertion sort
513 for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
514 tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
515 for (t = 0; t < n_types; ++t) // look for the reference type
516 if ((sumq[t]&0x3f) == ref_type) break;
517 if (t) { // then move the reference type to the first
518 tmp = sumq[t];
519 for (; t > 0; --t) sumq[t] = sumq[t-1];
520 sumq[0] = tmp;
521 }
522 for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
523 for (t = 0; t < 4 && t < n_types; ++t) {
524 bca->indel_types[t] = types[sumq[t]&0x3f];
525 memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
526 }
527 // update p->aux
528 for (s = n_alt = 0; s < n; ++s) {
529 for (i = 0; i < n_plp[s]; ++i) {
530 bam_pileup1_t *p = plp[s] + i;
531 int x = types[p->aux>>16&0x3f];
532 for (j = 0; j < 4; ++j)
533 if (x == bca->indel_types[j]) break;
534 p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
535 if ((p->aux>>16&0x3f) > 0) ++n_alt;
536 //fprintf(samtools_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
537 }
538 }
539
540 if (sc != sc_a) free(sc);
541 if (sumq != sumq_a) free(sumq);
542 }
543 free(score1); free(score2);
544 // free
545 for (i = 0; i < n; ++i) free(ref_sample[i]);
546 free(ref_sample);
547 free(types); free(inscns);
548 return n_alt > 0? 0 : -1;
549 }
550