1 /* The MIT License
2
3 Copyright (c) 2016 Hyun Min Kang <hmkang.umich.edu> and Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "complex_genotyping_record.h"
25
26 /**
27 * Constructor.
28 */
ComplexGenotypingRecord(bcf_hdr_t * h,bcf1_t * v,int32_t vtype,int32_t nsamples)29 ComplexGenotypingRecord::ComplexGenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t vtype, int32_t nsamples)
30 {
31 clear();
32
33 this->h = h;
34 //this->v = v;
35 this->rid = bcf_get_rid(v);
36 this->pos1 = bcf_get_pos1(v);
37 this->vtype = vtype;
38 this->nsamples = nsamples;
39
40 this->alleles = {0,0,0};
41 char** tmp_alleles = bcf_get_allele(v);
42 for (size_t i=0; i< bcf_get_n_allele(v); ++i) {
43 if (i) kputc(',', &this->alleles);
44 kputs(tmp_alleles[i], &this->alleles);
45 v_alleles.push_back(tmp_alleles[i]);
46 }
47
48 n_filter = 0;
49
50 if (vtype==VT_SNP && v_alleles.size()==2)
51 {
52 //rid = bcf_get_rid(v);
53 beg1 = bcf_get_pos1(v);
54 end1 = beg1;
55
56 if (bcf_has_filter(h, v, const_cast<char*>("overlap_snp"))==1)
57 n_filter |= FILTER_MASK_OVERLAP_SNP;
58
59 if (bcf_has_filter(h, v, const_cast<char*>("overlap_indel"))==1)
60 n_filter |= FILTER_MASK_OVERLAP_INDEL;
61
62 if (bcf_has_filter(h, v, const_cast<char*>("overlap_vntr"))==1)
63 n_filter |= FILTER_MASK_OVERLAP_VNTR;
64 }
65 else if (vtype==VT_INDEL && v_alleles.size()==2)
66 {
67 //rid = bcf_get_rid(v);
68 dlen = strlen(tmp_alleles[1])-strlen(tmp_alleles[0]);
69 len = abs(dlen);
70
71 int32_t *flanks_pos1 = NULL;
72 int32_t n = 0;
73
74 if (bcf_get_info_int32(h, v, "FLANKS", &flanks_pos1, &n)>0)
75 {
76 this->beg1 = flanks_pos1[0];
77 this->end1 = flanks_pos1[1];
78 free(flanks_pos1);
79 }
80 else
81 {
82 this->beg1 = bcf_get_pos1(v) - 3;
83 this->end1 = bcf_get_end1(v) + 3;
84 }
85
86 if (dlen>0)
87 {
88 indel.append(&tmp_alleles[1][1]);
89 }
90 else
91 {
92 indel.append(&tmp_alleles[0][1]);
93 }
94
95 if (bcf_has_filter(h, v, const_cast<char*>("overlap_snp"))==1)
96 n_filter |= FILTER_MASK_OVERLAP_SNP;
97
98 if (bcf_has_filter(h, v, const_cast<char*>("overlap_indel"))==1)
99 n_filter |= FILTER_MASK_OVERLAP_INDEL;
100
101 if (bcf_has_filter(h, v, const_cast<char*>("overlap_vntr"))==1)
102 n_filter |= FILTER_MASK_OVERLAP_VNTR;
103 }
104 else if (vtype==VT_VNTR)
105 {
106 rid = bcf_get_rid(v);
107 beg1 = bcf_get_pos1(v) - 1;
108 end1 = bcf_get_end1(v) + 1;
109
110 char *motif = NULL;
111 int32_t n = 0;
112
113 if (bcf_get_info_string(h, v, "MOTIF", &motif, &n)>0)
114 {
115 this->motif.assign(motif);
116 free(motif);
117 }
118 }
119 else
120 {
121 return;
122 }
123
124 pls = (uint8_t*)calloc( nsamples*3, sizeof(uint8_t) );
125 ads = (uint8_t*)calloc( nsamples*3, sizeof(uint8_t) );
126 }
127
128 /**
129 * Clears this record.
130 */
clearTemp()131 void ComplexGenotypingRecord::clearTemp()
132 {
133 tmp_dp_q20 = 0;
134 tmp_dp_ra = 0;
135 tmp_bq_s1 = tmp_bq_s2 = 0;
136 tmp_mq_s1 = tmp_mq_s2 = 0;
137 tmp_cy_s1 = tmp_cy_s2 = 0;
138 tmp_st_s1 = tmp_st_s2 = 0;
139 tmp_al_s1 = tmp_bq_al = tmp_mq_al = tmp_cy_al = tmp_st_al = tmp_nm_al = 0;
140 tmp_nm_s1 = tmp_nm_s2 = 0;
141 tmp_oth_exp_q20 = tmp_oth_obs_q20 = 0;
142 tmp_pls[0] = tmp_pls[1] = tmp_pls[2] = 1.;
143 tmp_ads[0] = tmp_ads[1] = tmp_ads[2] = 0;
144 }
145
clear()146 void ComplexGenotypingRecord::clear()
147 {
148 vtype = -1;
149
150 pls = ads = NULL;
151
152 bqr_num = bqr_den = 0;
153 mqr_num = mqr_den = 0;
154 cyr_num = cyr_den = 0;
155 str_num = str_den = 0;
156 nmr_num = nmr_den = 0;
157 ior_num = ior_den = 0;
158 nm0_num = nm0_den = 0;
159 nm1_num = nm1_den = 0;
160 abe_num = abe_den = 0;
161 abz_num = abz_den = 0;
162 ns_nref = dp_sum = max_gq = 0;
163 clearTemp();
164 }
165
166 /**
167 * Destructor.
168 */
~ComplexGenotypingRecord()169 ComplexGenotypingRecord::~ComplexGenotypingRecord()
170 {
171 //if (v) bcf_destroy(v);
172 if ( pls ) free(pls);
173 if ( ads ) free(ads);
174 if ( alleles.l > 0 ) free(alleles.s);
175 }
176
177 /**
178 * Destructor.
179 */
flush_variant(bcf_hdr_t * hdr)180 bcf1_t* ComplexGenotypingRecord::flush_variant(bcf_hdr_t* hdr)
181 {
182 bcf1_t *nv = bcf_init();
183 bcf_clear(nv);
184 bcf_set_n_sample(nv, nsamples);
185
186 bcf_set_rid(nv, rid);
187 bcf_set_pos1(nv, pos1);
188 bcf_update_alleles_str(hdr, nv, alleles.s);
189
190 int32_t* gt = (int32_t*) calloc ( nsamples * 2, sizeof(int32_t) );
191 int32_t* pl = (int32_t*) calloc ( nsamples * 3, sizeof(int32_t) );
192 int32_t* ad = (int32_t*) calloc ( nsamples * 2, sizeof(int32_t) );
193 int32_t* td = (int32_t*) calloc ( nsamples * 1, sizeof(int32_t) );
194 int32_t* gq = (int32_t*) calloc ( nsamples * 1, sizeof(int32_t) );
195 float MLE_HWE_AF[2];
196 float MLE_HWE_GF[3];
197 double gp, gp_sum, max_gp;
198 int32_t best_gt;
199 int32_t best_a1, best_a2;
200 int32_t* pls_i;
201 int32_t an = 0;
202 int32_t acs[2] = {0,0};
203 int32_t gcs[3] = {0,0,0};
204 float afs[3];
205 int32_t max_gq = 0;
206 int64_t dp_sum = 0;
207 int32_t ploidy = 2;
208
209 std::copy( pls, pls + (nsamples*3), pl );
210 for(int32_t i=0; i < nsamples; ++i)
211 {
212 dp_sum += ( td[i] = ( (ad[2*i] = ads[3*i]) + (ad[2*i+1] = ads[3*i+1]) + ads[3*i+2] ) );
213 }
214
215 int32_t n = 0;
216 int32_t adSumHet[2] = {0,0};
217 Estimator::compute_gl_af_hwe(pl, nsamples, ploidy, 2, MLE_HWE_AF, MLE_HWE_GF, n, 1e-20);
218
219 for(int32_t i=0; i < nsamples; ++i)
220 {
221 int32_t* pli = &pl[ i * 3 ];
222 max_gp = gp_sum = gp = ( LogTool::pl2prob(pli[0]) * MLE_HWE_AF[0] * MLE_HWE_AF[0] );
223 best_gt = 0; best_a1 = 0; best_a2 = 0;
224 for(size_t l=1; l < 2; ++l)
225 {
226 for(size_t m=0; m <= l; ++m)
227 {
228 gp = ( LogTool::pl2prob(pli[ l*(l+1)/2 + m]) * MLE_HWE_AF[l] * MLE_HWE_AF[m] * (l == m ? 1 : 2) );
229 gp_sum += gp;
230 if ( max_gp < gp )
231 {
232 max_gp = gp;
233 best_gt = l*(l+1)/2 + m;
234 best_a1 = m;
235 best_a2 = l;
236 }
237 }
238 }
239
240 if ( best_gt == 1 )
241 {
242 adSumHet[0] += ad[2*i];
243 adSumHet[1] += ad[2*i+1];
244 }
245
246 double prob = 1.-max_gp/gp_sum; // to calculate GQ
247 if ( prob <= 3.162278e-26 )
248 {
249 prob = 3.162278e-26;
250 }
251
252 if ( prob > 1 )
253 {
254 prob = 1;
255 }
256
257 gq[i] = (int32_t)LogTool::prob2pl(prob);
258
259 if ( ( best_gt > 0 ) && ( max_gq < gq[i] ) )
260 {
261 max_gq = gq[i];
262 }
263
264 gt[2*i] = ((best_a1 + 1) << 1);
265 gt[2*i+1] = ((best_a2 + 1) << 1);
266 an += 2; // still use diploid representation of chrX for now.
267 ++acs[best_a1];
268 ++acs[best_a2];
269 ++gcs[best_gt];
270 }
271
272 for(size_t i=0; i < 2; ++i)
273 {
274 afs[i] = acs[i]/(float)an;
275 }
276
277 bcf_update_format_int32(hdr, nv, "GT", gt, nsamples * 2);
278 bcf_update_format_int32(hdr, nv, "GQ", gq, nsamples );
279 bcf_update_format_int32(hdr, nv, "AD", ad, nsamples * 2);
280 bcf_update_format_int32(hdr, nv, "DP", td, nsamples );
281 bcf_update_format_int32(hdr, nv, "PL", pl, nsamples * 3);
282
283 float avgdp = (float)dp_sum / (float)nsamples;
284
285 nv->qual = (float) max_gq;
286 bcf_update_info_float(hdr, nv, "AVGDP", &avgdp, 1);
287 bcf_update_info_int32(hdr, nv, "AC", &acs[1], 1);
288 bcf_update_info_int32(hdr, nv, "AN", &an, 1);
289 bcf_update_info_float(hdr, nv, "AF", &afs[1], 1);
290 bcf_update_info_int32(hdr, nv, "GC", gcs, 3);
291 bcf_update_info_int32(hdr, nv, "GN", &nsamples, 1);
292
293 if (n)
294 {
295 float* MLE_HWE_AF_PTR = &MLE_HWE_AF[1];
296 bcf_update_info_float(hdr, nv, "HWEAF", MLE_HWE_AF_PTR, 1);
297 }
298
299 // calculate the allele frequencies under HWD
300 float MLE_AF[2];
301 float MLE_GF[3];
302 n = 0;
303 Estimator::compute_gl_af(pl, nsamples, ploidy, 2, MLE_AF, MLE_GF, n, 1e-20);
304 if (n)
305 {
306 //float* MLE_AF_PTR = &MLE_AF[1];
307 //bcf_update_info_float(odw->hdr, nv, "HWDAF", MLE_AF_PTR, n_alleles-1);
308 bcf_update_info_float(hdr, nv, "HWDGF", &MLE_GF, 3);
309 }
310
311 float fic = 0;
312 n = 0;
313 Estimator::compute_gl_fic(pl, nsamples, ploidy, MLE_HWE_AF, 2, MLE_GF, fic, n);
314 if ( std::isnan((double)fic) ) fic = 0;
315 if (n)
316 {
317 bcf_update_info_float(hdr, nv, "IBC", &fic, 1);
318 }
319
320 // calculate the LRT statistics related to HWE
321 float lrts;
322 float logp;
323 int32_t df;
324 n = 0;
325 Estimator::compute_hwe_lrt(pl, nsamples, ploidy, 2, MLE_HWE_GF, MLE_GF, n, lrts, logp, df);
326 if (n)
327 {
328 if ( fic < 0 ) logp = 0-logp;
329 {
330 bcf_update_info_float(hdr, nv, "HWE_SLP", &logp, 1);
331 }
332 }
333
334 float abe = (adSumHet[0] + 0.5)/(adSumHet[0] + adSumHet[1] + 1.0);
335 bcf_update_info_float(hdr, nv, "ABE", &abe, 1);
336
337 // update filter
338 if ( n_filter & FILTER_MASK_OVERLAP_SNP )
339 bcf_add_filter(hdr, nv, bcf_hdr_id2int(hdr, BCF_DT_ID, "overlap_snp"));
340 if ( n_filter & FILTER_MASK_OVERLAP_INDEL )
341 bcf_add_filter(hdr, nv, bcf_hdr_id2int(hdr, BCF_DT_ID, "overlap_indel"));
342 if ( n_filter & FILTER_MASK_OVERLAP_VNTR )
343 bcf_add_filter(hdr, nv, bcf_hdr_id2int(hdr, BCF_DT_ID, "overlap_vntr"));
344
345 //bcf_update_info_int32(odw->hdr, nv, "NS_NREF", &v_ns_nrefs[k], 1);
346 abe_num /= (abe_den+1e-6); bcf_update_info_float(hdr, nv, "ABE", &abe_num, 1);
347 abz_num /= sqrt(abz_den+1e-6); bcf_update_info_float(hdr, nv, "ABZ", &abz_num, 1);
348 bqr_num /= sqrt(bqr_den+1e-6); bcf_update_info_float(hdr, nv, "BQZ", &bqr_num, 1);
349 mqr_num /= sqrt(mqr_den+1e-6); bcf_update_info_float(hdr, nv, "MQZ", &mqr_num, 1);
350 cyr_num /= sqrt(cyr_den+1e-6); bcf_update_info_float(hdr, nv, "CYZ", &cyr_num, 1);
351 str_num /= sqrt(str_den+1e-6); bcf_update_info_float(hdr, nv, "STZ", &str_num, 1);
352 nmr_num /= sqrt(nmr_den+1e-6); bcf_update_info_float(hdr, nv, "NMZ", &nmr_num, 1);
353 ior_num = log(ior_num/ior_den+1e-6)/log(10.); bcf_update_info_float(hdr, nv, "IOR", &ior_num, 1);
354 nm1_num /= (nm1_den+1e-6); bcf_update_info_float(hdr, nv, "NM1", &nm1_num, 1);
355 nm0_num /= (nm0_den+1e-6); bcf_update_info_float(hdr, nv, "NM0", &nm0_num, 1);
356
357 //odw->write(nv);
358 //bcf_destroy(nv);
359 free(gt);
360 free(gq);
361 free(pl);
362 free(ad);
363 free(td);
364
365 free(pls); pls = NULL;
366 free(ads); ads = NULL;
367
368 free(alleles.s);
369
370 return nv;
371 }
372
flush_sample(int32_t sampleIndex)373 void ComplexGenotypingRecord::flush_sample(int32_t sampleIndex)
374 {
375 uint8_t* p_pls = &pls[sampleIndex*3];
376 uint8_t* p_ads = &ads[sampleIndex*3];
377
378 int32_t imax = ( tmp_pls[0] > tmp_pls[1] ) ? ( tmp_pls[0] > tmp_pls[2] ? 0 : 2 ) : ( tmp_pls[1] > tmp_pls[2] ? 1 : 2);
379 for(int32_t i=0; i < 3; ++i)
380 {
381 uint32_t l = LogTool::prob2pl(tmp_pls[i]/tmp_pls[imax]);
382 p_pls[i] = ((l > 255) ? 255 : l);
383 p_ads[i] = ((tmp_ads[i] > 255) ? 255 : (uint8_t)tmp_ads[i]);
384 }
385
386 float sqrt_dp_ra = sqrt((float)tmp_dp_ra);
387 float ior = (float)(tmp_oth_obs_q20 / (tmp_oth_exp_q20 + 1e-6));
388 float nm1 = tmp_al_s1 == 0 ? 0 : tmp_nm_al / (float)tmp_al_s1;
389 float nm0 = (tmp_dp_ra - tmp_al_s1) == 0 ? 0 : (tmp_nm_s1-tmp_nm_al) / (float)(tmp_dp_ra - tmp_al_s1);
390 float w_dp_ra = log(tmp_dp_ra+1.); //sqrt(dp_ra);
391 float w_dp_q20 = log(tmp_dp_q20+1.); //sqrt(dp_q20);
392 float w_al_s1 = log(tmp_al_s1+1.); //sqrt(al_s1);
393 float w_ref_s1 = log(tmp_dp_ra - tmp_al_s1+1.);
394
395 if ( p_pls[1] == 0 )
396 { // het genotypes
397 abe_num += (w_dp_ra * (tmp_dp_ra - tmp_al_s1 + 0.05) / (double)(tmp_dp_ra + 0.1));
398 abe_den += w_dp_ra;
399
400 // E(r) = 0.5(r+a) V(r) = 0.25(r+a)
401 abz_num += w_dp_ra * (tmp_dp_ra - tmp_al_s1 - tmp_dp_ra*0.5)/sqrt(0.25 * tmp_dp_ra + 1e-3);
402 abz_den += (w_dp_ra * w_dp_ra);
403
404 float bqr = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_bq_al, tmp_bq_s1, tmp_bq_s2, tmp_al_s1, tmp_al_s1, .1 );
405 float mqr = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_mq_al, tmp_mq_s1, tmp_mq_s2, tmp_al_s1, tmp_al_s1, .1 );
406 float cyr = sqrt_dp_ra * Estimator::compute_correlation_f( tmp_dp_ra, tmp_cy_al, tmp_cy_s1, tmp_cy_s2, (float)tmp_al_s1, (float)tmp_al_s1, .1 );
407 float str = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_st_al, tmp_st_s1, tmp_st_s1, tmp_al_s1, tmp_al_s1, .1 );
408 float nmr = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_nm_al, tmp_nm_s1, tmp_nm_s2, tmp_al_s1, tmp_al_s1, .1 );
409
410 // Use Stouffer's method to combine the z-scores, but weighted by log of sample size
411 bqr_num += (bqr * w_dp_ra); bqr_den += (w_dp_ra * w_dp_ra);
412 mqr_num += (mqr * w_dp_ra); mqr_den += (w_dp_ra * w_dp_ra);
413 cyr_num += (cyr * w_dp_ra); cyr_den += (w_dp_ra * w_dp_ra);
414 str_num += (str * w_dp_ra); str_den += (w_dp_ra * w_dp_ra);
415 nmr_num += (nmr * w_dp_ra); nmr_den += (w_dp_ra * w_dp_ra);
416 }
417
418 ior_num += (ior * w_dp_q20); ior_den += w_dp_q20;
419 nm1_num += (nm1 * w_al_s1); nm1_den += w_al_s1;
420 nm0_num += (nm0 * w_ref_s1); nm0_den += w_ref_s1;
421
422 clearTemp();
423 }
424
425 /**
426 * Adds an allele based with collected sufficient statistics.
427 */
add_allele(double contam,int32_t allele,uint8_t mapq,bool fwd,uint32_t q,int32_t cycle,uint32_t nm)428 void ComplexGenotypingRecord::add_allele(double contam, int32_t allele, uint8_t mapq, bool fwd, uint32_t q, int32_t cycle, uint32_t nm)
429 {
430 double pe = LogTool::pl2prob(q);
431 double pm = 1 - pe;
432
433 if (q>40)
434 {
435 q = 40;
436 }
437
438 if ( allele == 0 )
439 {
440 ++tmp_ads[0];
441 tmp_pls[0] *= ( pm * (1-contam) + pe * contam / 3 );
442 tmp_pls[1] *= ( pm / 2 + pe / 6 );
443 tmp_pls[2] *= ( pm * contam + pe * (1-contam) / 3 );
444 }
445 else if ( allele > 0 ) // currently, bi-allelic only
446 {
447 ++tmp_ads[1];
448 tmp_pls[0] *= ( pm * contam + pe * (1-contam) / 3 );
449 tmp_pls[1] *= ( pm / 2 + pe / 6 );
450 tmp_pls[2] *= ( pm * (1-contam) + pe * contam / 3 );
451 }
452 else
453 {
454 ++tmp_ads[2];
455 }
456 double sump = tmp_pls[0] + tmp_pls[1] + tmp_pls[2] + 1e-300;
457 tmp_pls[0] /= sump;
458 tmp_pls[1] /= sump;
459 tmp_pls[2] /= sump;
460
461 if ( allele >= 0 )
462 {
463 if ( q > 20 )
464 {
465 tmp_oth_exp_q20 += (LogTool::pl2prob(q) * 2. / 3.);
466 ++tmp_dp_q20;
467 }
468
469 float log_td = (cycle > 0) ? 0-logf((float)cycle) : 0;
470
471 ++tmp_dp_ra;
472 tmp_bq_s1 += q;
473 tmp_bq_s2 += (q*q);
474 tmp_mq_s1 += mapq;
475 tmp_mq_s2 += (mapq * mapq);
476 tmp_cy_s1 += log_td;
477 tmp_cy_s2 += (log_td * log_td);
478 tmp_st_s1 += fwd;
479
480 if ( allele > 0 )
481 {
482 ++tmp_al_s1;
483 tmp_bq_al += q;
484 tmp_mq_al += mapq;
485 tmp_cy_al += log_td;
486 tmp_st_al += fwd;
487 tmp_nm_al += (nm-1);
488 tmp_nm_s1 += (nm-1);
489 tmp_nm_s2 += (nm-1)*(nm-1);
490 }
491 else
492 {
493 tmp_nm_s1 += nm;
494 tmp_nm_s2 += (nm * nm);
495 }
496 }
497 else
498 {
499 if (q>20)
500 {
501 tmp_oth_exp_q20 += (LogTool::pl2prob(q) * 2. / 3.);
502 ++tmp_oth_obs_q20;
503 ++tmp_dp_q20;
504 }
505 }
506 }
507
508 /**
509 * Collects sufficient statistics from read for variants to be genotyped.
510 */
process_read(AugmentedBAMRecord & as,int32_t sampleIndex,double contam)511 void ComplexGenotypingRecord::process_read(AugmentedBAMRecord& as, int32_t sampleIndex, double contam)
512 {
513 if (v_alleles.size()==2)
514 {
515 if (as.beg1 <= beg1 && end1 <= as.end1)
516 {
517 bam1_t *s = as.s;
518
519 char strand = bam_is_rev(s) ? 'R' : 'F';
520 int32_t allele = 0;
521 //uint32_t bpos1 = bam_get_pos1(s);
522 uint8_t* seq = bam_get_seq(s);
523 uint8_t* qual = bam_get_qual(s);
524 uint32_t rlen = bam_get_l_qseq(s);
525 uint8_t mapq = bam_get_mapq(s);
526
527 uint32_t q = len*30;
528 uint32_t cycle = 10;
529
530 std::vector<uint32_t>& aug_cigar = as.aug_cigar;
531 std::vector<std::string>& aug_ref = as.aug_ref;
532 std::vector<std::string>& aug_alt = as.aug_alt;
533
534 int32_t vpos1 = pos1;
535
536 int32_t cpos1 = bam_get_pos1(s);
537 int32_t rpos0 = 0;
538
539 for (uint32_t i=0; i<aug_cigar.size(); ++i)
540 {
541 uint32_t oplen = bam_cigar_oplen(aug_cigar[i]);
542 char opchr = bam_cigar_opchr(aug_cigar[i]);
543
544 if (opchr=='S')
545 {
546 rpos0 += oplen;
547 }
548 else if (opchr=='=')
549 {
550 if (vpos1>=cpos1 && vpos1<=(cpos1+oplen-1))
551 {
552 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
553 }
554 cpos1 += oplen;
555 rpos0 += oplen;
556 }
557 else if (opchr=='X')
558 {
559 if (cpos1-1==vpos1)
560 {
561 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
562 }
563 ++cpos1;
564 ++rpos0;
565 }
566 else if (opchr=='I')
567 {
568 if (dlen>0 && cpos1-1==vpos1)
569 {
570 if (indel==aug_alt[i])
571 {
572 q = len*30;
573 allele = 1;
574 }
575 else
576 {
577 q = abs(len-(int32_t)aug_ref[i].size())*30;
578 allele = -1;
579 }
580
581 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
582 break;
583 }
584 else if (dlen<0 && cpos1-1==vpos1)
585 {
586 q = 30;
587 allele = -3;
588 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
589 break;
590 }
591
592 rpos0 += oplen;
593 }
594 else if (opchr=='D')
595 {
596 if (dlen<0 && cpos1-1==vpos1)
597 {
598 if (indel==aug_ref[i])
599 {
600 q = len*30;
601 allele = 1;
602 }
603 else
604 {
605 q = abs(len-(int32_t)aug_ref[i].size())*30;
606 allele = -1;
607 }
608
609 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
610 break;
611 }
612 else if (dlen>0 && cpos1-1==vpos1)
613 {
614 q = 30;
615 allele = -2;
616
617 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
618 break;
619 }
620
621 }
622 else
623 {
624 std::cerr << "unrecognized cigar state " << opchr << "\n";
625 }
626 }
627
628 add_allele( contam, allele, mapq, strand == 'F', q, rand() % 75, as.no_mismatches );
629 }
630 else
631 {
632 bam1_t *s = as.s;
633 uint8_t mapq = bam_get_mapq(s);
634
635 add_allele( contam, -1, mapq, bam_is_rev(as.s) ? false : true, 20, rand() % 75, as.no_mismatches );
636 }
637 }
638 else //multiallelic
639 {
640 if (as.beg1 <= beg1 && end1 <= as.end1)
641 {
642 bam1_t *s = as.s;
643
644 char strand = bam_is_rev(s) ? 'R' : 'F';
645 int32_t allele = 0;
646 //uint32_t bpos1 = bam_get_pos1(s);
647 uint8_t* seq = bam_get_seq(s);
648 uint8_t* qual = bam_get_qual(s);
649 uint32_t rlen = bam_get_l_qseq(s);
650 uint8_t mapq = bam_get_mapq(s);
651
652 uint32_t q = len*30;
653 uint32_t cycle = 10;
654
655 std::vector<uint32_t>& aug_cigar = as.aug_cigar;
656 std::vector<std::string>& aug_ref = as.aug_ref;
657 std::vector<std::string>& aug_alt = as.aug_alt;
658
659 int32_t vpos1 = pos1;
660
661 int32_t cpos1 = bam_get_pos1(s);
662 int32_t rpos0 = 0;
663
664 for (uint32_t i=0; i<aug_cigar.size(); ++i)
665 {
666 uint32_t oplen = bam_cigar_oplen(aug_cigar[i]);
667 char opchr = bam_cigar_opchr(aug_cigar[i]);
668
669 if (opchr=='S')
670 {
671 rpos0 += oplen;
672 }
673 else if (opchr=='=')
674 {
675 if (vpos1>=cpos1 && vpos1<=(cpos1+oplen-1))
676 {
677 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
678 }
679 cpos1 += oplen;
680 rpos0 += oplen;
681 }
682 else if (opchr=='X')
683 {
684 if (cpos1-1==vpos1)
685 {
686 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
687 }
688 ++cpos1;
689 ++rpos0;
690 }
691 else if (opchr=='I')
692 {
693 if (dlen>0 && cpos1-1==vpos1)
694 {
695 if (indel==aug_alt[i])
696 {
697 q = len*30;
698 allele = 1;
699 }
700 else
701 {
702 q = abs(len-(int32_t)aug_ref[i].size())*30;
703 allele = -1;
704 }
705
706 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
707 break;
708 }
709 else if (dlen<0 && cpos1-1==vpos1)
710 {
711 q = 30;
712 allele = -3;
713 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
714 break;
715 }
716
717 rpos0 += oplen;
718 }
719 else if (opchr=='D')
720 {
721 if (dlen<0 && cpos1-1==vpos1)
722 {
723 if (indel==aug_ref[i])
724 {
725 q = len*30;
726 allele = 1;
727 }
728 else
729 {
730 q = abs(len-(int32_t)aug_ref[i].size())*30;
731 allele = -1;
732 }
733
734 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
735 break;
736 }
737 else if (dlen>0 && cpos1-1==vpos1)
738 {
739 q = 30;
740 allele = -2;
741
742 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
743 break;
744 }
745
746 }
747 else
748 {
749 std::cerr << "unrecognized cigar state " << opchr << "\n";
750 }
751 }
752
753 add_allele( contam, allele, mapq, strand == 'F', q, rand() % 75, as.no_mismatches );
754 }
755 else
756 {
757 bam1_t *s = as.s;
758 uint8_t mapq = bam_get_mapq(s);
759
760 add_allele( contam, -1, mapq, bam_is_rev(as.s) ? false : true, 20, rand() % 75, as.no_mismatches );
761 }
762 }
763 }
764