1 /* The MIT License
2
3 Copyright (c) 2016 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "vntr_genotyping_record.h"
25
26 /**
27 * Constructor.
28 * @v - VCF record.
29 */
VNTRGenotypingRecord(bcf_hdr_t * h,bcf1_t * v,int32_t vtype,int32_t nsamples,int32_t ploidy,Estimator * est)30 VNTRGenotypingRecord::VNTRGenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t vtype, int32_t nsamples, int32_t ploidy, Estimator* est)
31 {
32 clear();
33
34 this->h = h;
35 //this->v = v;
36 this->rid = bcf_get_rid(v);
37 this->pos1 = bcf_get_pos1(v);
38 this->vtype = vtype;
39 this->nsamples = nsamples;
40
41 this->alleles = {0,0,0};
42 char** tmp_alleles = bcf_get_allele(v);
43 for (size_t i=0; i< bcf_get_n_allele(v); ++i) {
44 if (i) kputc(',', &this->alleles);
45 kputs(tmp_alleles[i], &this->alleles);
46 v_alleles.push_back(tmp_alleles[i]);
47 }
48
49 n_filter = 0;
50
51 //rid = bcf_get_rid(v);
52 beg1 = bcf_get_pos1(v);
53 end1 = beg1;
54
55 if (bcf_has_filter(h, v, const_cast<char*>("overlap_snp"))==1)
56 n_filter |= FILTER_MASK_OVERLAP_SNP;
57
58 if (bcf_has_filter(h, v, const_cast<char*>("overlap_indel"))==1)
59 n_filter |= FILTER_MASK_OVERLAP_INDEL;
60
61 if (bcf_has_filter(h, v, const_cast<char*>("overlap_vntr"))==1)
62 n_filter |= FILTER_MASK_OVERLAP_VNTR;
63
64 pls = (uint8_t*)calloc( nsamples*3, sizeof(uint8_t) );
65 ads = (uint8_t*)calloc( nsamples*3, sizeof(uint8_t) );
66 }
67
68 /**
69 * Clears the temporary variables in this record.
70 */
clearTemp()71 void VNTRGenotypingRecord::clearTemp()
72 {
73 tmp_dp_q20 = 0;
74 tmp_dp_ra = 0;
75 tmp_bq_s1 = tmp_bq_s2 = 0;
76 tmp_mq_s1 = tmp_mq_s2 = 0;
77 tmp_cy_s1 = tmp_cy_s2 = 0;
78 tmp_st_s1 = tmp_st_s2 = 0;
79 tmp_al_s1 = tmp_bq_al = tmp_mq_al = tmp_cy_al = tmp_st_al = tmp_nm_al = 0;
80 tmp_nm_s1 = tmp_nm_s2 = 0;
81 tmp_oth_exp_q20 = tmp_oth_obs_q20 = 0;
82 tmp_pls[0] = tmp_pls[1] = tmp_pls[2] = 1.;
83 tmp_ads[0] = tmp_ads[1] = tmp_ads[2] = 0;
84 }
85
86 /**
87 * Clears this record.
88 */
clear()89 void VNTRGenotypingRecord::clear()
90 {
91 vtype = -1;
92
93 pls = ads = NULL;
94
95 bqr_num = bqr_den = 0;
96 mqr_num = mqr_den = 0;
97 cyr_num = cyr_den = 0;
98 str_num = str_den = 0;
99 nmr_num = nmr_den = 0;
100 ior_num = ior_den = 0;
101 nm0_num = nm0_den = 0;
102 nm1_num = nm1_den = 0;
103 abe_num = abe_den = 0;
104 abz_num = abz_den = 0;
105 ns_nref = dp_sum = max_gq = 0;
106 clearTemp();
107 }
108
109 /**
110 * Destructor.
111 */
~VNTRGenotypingRecord()112 VNTRGenotypingRecord::~VNTRGenotypingRecord()
113 {
114 //if (v) bcf_destroy(v);
115 if ( pls ) free(pls);
116 if ( ads ) free(ads);
117 if ( alleles.l > 0 ) free(alleles.s);
118 if ( est ) delete est;
119 }
120
121 /**
122 * Prints out variant with collected features and genotype information to a bcf object.
123 */
flush_variant(bcf_hdr_t * hdr)124 bcf1_t* VNTRGenotypingRecord::flush_variant(bcf_hdr_t* hdr)
125 {
126 bcf1_t *nv = bcf_init();
127 bcf_clear(nv);
128 bcf_set_n_sample(nv, nsamples);
129
130 bcf_set_rid(nv, rid);
131 bcf_set_pos1(nv, pos1);
132 bcf_update_alleles_str(hdr, nv, alleles.s);
133
134 int32_t* gt = (int32_t*) calloc ( nsamples * 2, sizeof(int32_t) );
135 int32_t* pl = (int32_t*) calloc ( nsamples * 3, sizeof(int32_t) );
136 int32_t* ad = (int32_t*) calloc ( nsamples * 2, sizeof(int32_t) );
137 int32_t* td = (int32_t*) calloc ( nsamples * 1, sizeof(int32_t) );
138 int32_t* gq = (int32_t*) calloc ( nsamples * 1, sizeof(int32_t) );
139 float MLE_HWE_AF[2];
140 float MLE_HWE_GF[3];
141 double gp, gp_sum, max_gp;
142 int32_t best_gt;
143 int32_t best_a1, best_a2;
144 int32_t* pls_i;
145 int32_t an = 0;
146 int32_t acs[2] = {0,0};
147 int32_t gcs[3] = {0,0,0};
148 float afs[3];
149 int32_t max_gq = 0;
150 int64_t dp_sum = 0;
151 int32_t ploidy = 2;
152
153 std::copy( pls, pls + (nsamples*3), pl );
154 for(int32_t i=0; i < nsamples; ++i)
155 {
156 dp_sum += ( td[i] = ( (ad[2*i] = ads[3*i]) + (ad[2*i+1] = ads[3*i+1]) + ads[3*i+2] ) );
157 }
158
159 int32_t n = 0;
160 int32_t adSumHet[2] = {0,0};
161 est->compute_gl_af_hwe(pl, nsamples, ploidy, 2, MLE_HWE_AF, MLE_HWE_GF, n, 1e-20);
162
163 for(int32_t i=0; i < nsamples; ++i)
164 {
165 int32_t* pli = &pl[ i * 3 ];
166 max_gp = gp_sum = gp = ( LogTool::pl2prob(pli[0]) * MLE_HWE_AF[0] * MLE_HWE_AF[0] );
167 best_gt = 0; best_a1 = 0; best_a2 = 0;
168 for(size_t l=1; l < 2; ++l)
169 {
170 for(size_t m=0; m <= l; ++m)
171 {
172 gp = ( LogTool::pl2prob(pli[ l*(l+1)/2 + m]) * MLE_HWE_AF[l] * MLE_HWE_AF[m] * (l == m ? 1 : 2) );
173 gp_sum += gp;
174 if ( max_gp < gp )
175 {
176 max_gp = gp;
177 best_gt = l*(l+1)/2 + m;
178 best_a1 = m;
179 best_a2 = l;
180 }
181 }
182 }
183
184 if ( best_gt == 1 )
185 {
186 adSumHet[0] += ad[2*i];
187 adSumHet[1] += ad[2*i+1];
188 }
189
190 double prob = 1.-max_gp/gp_sum; // to calculate GQ
191 if ( prob <= 3.162278e-26 )
192 {
193 prob = 3.162278e-26;
194 }
195
196 if ( prob > 1 )
197 {
198 prob = 1;
199 }
200
201 gq[i] = (int32_t)LogTool::prob2pl(prob);
202
203 if ( ( best_gt > 0 ) && ( max_gq < gq[i] ) )
204 {
205 max_gq = gq[i];
206 }
207
208 gt[2*i] = ((best_a1 + 1) << 1);
209 gt[2*i+1] = ((best_a2 + 1) << 1);
210 an += 2; // still use diploid representation of chrX for now.
211 ++acs[best_a1];
212 ++acs[best_a2];
213 ++gcs[best_gt];
214 }
215
216 for(size_t i=0; i < 2; ++i)
217 {
218 afs[i] = acs[i]/(float)an;
219 }
220
221 bcf_update_format_int32(hdr, nv, "GT", gt, nsamples * 2);
222 bcf_update_format_int32(hdr, nv, "GQ", gq, nsamples );
223 bcf_update_format_int32(hdr, nv, "AD", ad, nsamples * 2);
224 bcf_update_format_int32(hdr, nv, "DP", td, nsamples );
225 bcf_update_format_int32(hdr, nv, "PL", pl, nsamples * 3);
226
227 float avgdp = (float)dp_sum / (float)nsamples;
228
229 nv->qual = (float) max_gq;
230 bcf_update_info_float(hdr, nv, "AVGDP", &avgdp, 1);
231 bcf_update_info_int32(hdr, nv, "AC", &acs[1], 1);
232 bcf_update_info_int32(hdr, nv, "AN", &an, 1);
233 bcf_update_info_float(hdr, nv, "AF", &afs[1], 1);
234 bcf_update_info_int32(hdr, nv, "GC", gcs, 3);
235 bcf_update_info_int32(hdr, nv, "GN", &nsamples, 1);
236
237 if (n)
238 {
239 float* MLE_HWE_AF_PTR = &MLE_HWE_AF[1];
240 bcf_update_info_float(hdr, nv, "HWEAF", MLE_HWE_AF_PTR, 1);
241 }
242
243 // calculate the allele frequencies under HWD
244 float MLE_AF[2];
245 float MLE_GF[3];
246 n = 0;
247 est->compute_gl_af(pl, nsamples, ploidy, 2, MLE_AF, MLE_GF, n, 1e-20);
248 if (n)
249 {
250 //float* MLE_AF_PTR = &MLE_AF[1];
251 //bcf_update_info_float(odw->hdr, nv, "HWDAF", MLE_AF_PTR, n_alleles-1);
252 bcf_update_info_float(hdr, nv, "HWDGF", &MLE_GF, 3);
253 }
254
255 float fic = 0;
256 n = 0;
257 est->compute_gl_fic(pl, nsamples, ploidy, MLE_HWE_AF, 2, MLE_GF, fic, n);
258 if ( std::isnan((double)fic) ) fic = 0;
259 if (n)
260 {
261 bcf_update_info_float(hdr, nv, "IBC", &fic, 1);
262 }
263
264 // calculate the LRT statistics related to HWE
265 float lrts;
266 float logp;
267 int32_t df;
268 n = 0;
269 est->compute_hwe_lrt(pl, nsamples, ploidy, 2, MLE_HWE_GF, MLE_GF, n, lrts, logp, df);
270 if (n)
271 {
272 if ( fic < 0 ) logp = 0-logp;
273 {
274 bcf_update_info_float(hdr, nv, "HWE_SLP", &logp, 1);
275 }
276 }
277
278 float abe = (adSumHet[0] + 0.5)/(adSumHet[0] + adSumHet[1] + 1.0);
279 bcf_update_info_float(hdr, nv, "ABE", &abe, 1);
280
281 // update filter
282 if ( n_filter & FILTER_MASK_OVERLAP_SNP )
283 bcf_add_filter(hdr, nv, bcf_hdr_id2int(hdr, BCF_DT_ID, "overlap_snp"));
284 if ( n_filter & FILTER_MASK_OVERLAP_INDEL )
285 bcf_add_filter(hdr, nv, bcf_hdr_id2int(hdr, BCF_DT_ID, "overlap_indel"));
286 if ( n_filter & FILTER_MASK_OVERLAP_VNTR )
287 bcf_add_filter(hdr, nv, bcf_hdr_id2int(hdr, BCF_DT_ID, "overlap_vntr"));
288
289 //bcf_update_info_int32(odw->hdr, nv, "NS_NREF", &v_ns_nrefs[k], 1);
290 abe_num /= (abe_den+1e-6); bcf_update_info_float(hdr, nv, "ABE", &abe_num, 1);
291 abz_num /= sqrt(abz_den+1e-6); bcf_update_info_float(hdr, nv, "ABZ", &abz_num, 1);
292 bqr_num /= sqrt(bqr_den+1e-6); bcf_update_info_float(hdr, nv, "BQZ", &bqr_num, 1);
293 mqr_num /= sqrt(mqr_den+1e-6); bcf_update_info_float(hdr, nv, "MQZ", &mqr_num, 1);
294 cyr_num /= sqrt(cyr_den+1e-6); bcf_update_info_float(hdr, nv, "CYZ", &cyr_num, 1);
295 str_num /= sqrt(str_den+1e-6); bcf_update_info_float(hdr, nv, "STZ", &str_num, 1);
296 nmr_num /= sqrt(nmr_den+1e-6); bcf_update_info_float(hdr, nv, "NMZ", &nmr_num, 1);
297 ior_num = log(ior_num/ior_den+1e-6)/log(10.); bcf_update_info_float(hdr, nv, "IOR", &ior_num, 1);
298 nm1_num /= (nm1_den+1e-6); bcf_update_info_float(hdr, nv, "NM1", &nm1_num, 1);
299 nm0_num /= (nm0_den+1e-6); bcf_update_info_float(hdr, nv, "NM0", &nm0_num, 1);
300
301 //odw->write(nv);
302 //bcf_destroy(nv);
303 free(gt);
304 free(gq);
305 free(pl);
306 free(ad);
307 free(td);
308
309 free(pls); pls = NULL;
310 free(ads); ads = NULL;
311
312 delete est;
313 free(alleles.s);
314
315 return nv;
316 }
317
318 /**
319 * Adds information from a sample to aggregated data.
320 */
flush_sample(int32_t sampleIndex)321 void VNTRGenotypingRecord::flush_sample(int32_t sampleIndex)
322 {
323 uint8_t* p_pls = &pls[sampleIndex*3];
324 uint8_t* p_ads = &ads[sampleIndex*3];
325
326 int32_t imax = ( tmp_pls[0] > tmp_pls[1] ) ? ( tmp_pls[0] > tmp_pls[2] ? 0 : 2 ) : ( tmp_pls[1] > tmp_pls[2] ? 1 : 2);
327 for(int32_t i=0; i < 3; ++i)
328 {
329 uint32_t l = LogTool::prob2pl(tmp_pls[i]/tmp_pls[imax]);
330 p_pls[i] = ((l > 255) ? 255 : l);
331 p_ads[i] = ((tmp_ads[i] > 255) ? 255 : (uint8_t)tmp_ads[i]);
332 }
333
334 float sqrt_dp_ra = sqrt((float)tmp_dp_ra);
335 float ior = (float)(tmp_oth_obs_q20 / (tmp_oth_exp_q20 + 1e-6));
336 float nm1 = tmp_al_s1 == 0 ? 0 : tmp_nm_al / (float)tmp_al_s1;
337 float nm0 = (tmp_dp_ra - tmp_al_s1) == 0 ? 0 : (tmp_nm_s1-tmp_nm_al) / (float)(tmp_dp_ra - tmp_al_s1);
338 float w_dp_ra = log(tmp_dp_ra+1.); //sqrt(dp_ra);
339 float w_dp_q20 = log(tmp_dp_q20+1.); //sqrt(dp_q20);
340 float w_al_s1 = log(tmp_al_s1+1.); //sqrt(al_s1);
341 float w_ref_s1 = log(tmp_dp_ra - tmp_al_s1+1.);
342
343 if ( p_pls[1] == 0 )
344 { // het genotypes
345 abe_num += (w_dp_ra * (tmp_dp_ra - tmp_al_s1 + 0.05) / (double)(tmp_dp_ra + 0.1));
346 abe_den += w_dp_ra;
347
348 // E(r) = 0.5(r+a) V(r) = 0.25(r+a)
349 abz_num += w_dp_ra * (tmp_dp_ra - tmp_al_s1 - tmp_dp_ra*0.5)/sqrt(0.25 * tmp_dp_ra + 1e-3);
350 abz_den += (w_dp_ra * w_dp_ra);
351
352 float bqr = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_bq_al, tmp_bq_s1, tmp_bq_s2, tmp_al_s1, tmp_al_s1, .1 );
353 float mqr = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_mq_al, tmp_mq_s1, tmp_mq_s2, tmp_al_s1, tmp_al_s1, .1 );
354 float cyr = sqrt_dp_ra * Estimator::compute_correlation_f( tmp_dp_ra, tmp_cy_al, tmp_cy_s1, tmp_cy_s2, (float)tmp_al_s1, (float)tmp_al_s1, .1 );
355 float str = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_st_al, tmp_st_s1, tmp_st_s1, tmp_al_s1, tmp_al_s1, .1 );
356 float nmr = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_nm_al, tmp_nm_s1, tmp_nm_s2, tmp_al_s1, tmp_al_s1, .1 );
357
358 // Use Stouffer's method to combine the z-scores, but weighted by log of sample size
359 bqr_num += (bqr * w_dp_ra); bqr_den += (w_dp_ra * w_dp_ra);
360 mqr_num += (mqr * w_dp_ra); mqr_den += (w_dp_ra * w_dp_ra);
361 cyr_num += (cyr * w_dp_ra); cyr_den += (w_dp_ra * w_dp_ra);
362 str_num += (str * w_dp_ra); str_den += (w_dp_ra * w_dp_ra);
363 nmr_num += (nmr * w_dp_ra); nmr_den += (w_dp_ra * w_dp_ra);
364 }
365
366 ior_num += (ior * w_dp_q20); ior_den += w_dp_q20;
367 nm1_num += (nm1 * w_al_s1); nm1_den += w_al_s1;
368 nm0_num += (nm0 * w_ref_s1); nm0_den += w_ref_s1;
369
370 clearTemp();
371 }
372
373 /**
374 * Adds an allele based with collected sufficient statistics.
375 */
add_allele(double contam,int32_t allele,uint8_t mapq,bool fwd,uint32_t q,int32_t cycle,uint32_t nm)376 void VNTRGenotypingRecord::add_allele(double contam, int32_t allele, uint8_t mapq, bool fwd, uint32_t q, int32_t cycle, uint32_t nm)
377 {
378 double pe = LogTool::pl2prob(q);
379 double pm = 1 - pe;
380
381 if (q>40)
382 {
383 q = 40;
384 }
385
386 if ( allele == 0 )
387 {
388 ++tmp_ads[0];
389 tmp_pls[0] *= ( pm * (1-contam) + pe * contam / 3 );
390 tmp_pls[1] *= ( pm / 2 + pe / 6 );
391 tmp_pls[2] *= ( pm * contam + pe * (1-contam) / 3 );
392 }
393 else if ( allele > 0 ) // currently, bi-allelic only
394 {
395 ++tmp_ads[1];
396 tmp_pls[0] *= ( pm * contam + pe * (1-contam) / 3 );
397 tmp_pls[1] *= ( pm / 2 + pe / 6 );
398 tmp_pls[2] *= ( pm * (1-contam) + pe * contam / 3 );
399 }
400 else
401 {
402 ++tmp_ads[2];
403 }
404 double sump = tmp_pls[0] + tmp_pls[1] + tmp_pls[2] + 1e-300;
405 tmp_pls[0] /= sump;
406 tmp_pls[1] /= sump;
407 tmp_pls[2] /= sump;
408
409 if ( allele >= 0 )
410 {
411 if ( q > 20 )
412 {
413 tmp_oth_exp_q20 += (LogTool::pl2prob(q) * 2. / 3.);
414 ++tmp_dp_q20;
415 }
416
417 float log_td = (cycle > 0) ? 0-logf((float)cycle) : 0;
418
419 ++tmp_dp_ra;
420 tmp_bq_s1 += q;
421 tmp_bq_s2 += (q*q);
422 tmp_mq_s1 += mapq;
423 tmp_mq_s2 += (mapq * mapq);
424 tmp_cy_s1 += log_td;
425 tmp_cy_s2 += (log_td * log_td);
426 tmp_st_s1 += fwd;
427
428 if ( allele > 0 )
429 {
430 ++tmp_al_s1;
431 tmp_bq_al += q;
432 tmp_mq_al += mapq;
433 tmp_cy_al += log_td;
434 tmp_st_al += fwd;
435 tmp_nm_al += (nm-1);
436 tmp_nm_s1 += (nm-1);
437 tmp_nm_s2 += (nm-1)*(nm-1);
438 }
439 else
440 {
441 tmp_nm_s1 += nm;
442 tmp_nm_s2 += (nm * nm);
443 }
444 }
445 else
446 {
447 if (q>20)
448 {
449 tmp_oth_exp_q20 += (LogTool::pl2prob(q) * 2. / 3.);
450 ++tmp_oth_obs_q20;
451 ++tmp_dp_q20;
452 }
453 }
454 }
455
456 /**
457 * Collects sufficient statistics from read for variants to be genotyped.
458 */
process_read(AugmentedBAMRecord & as,int32_t sampleIndex,double contam)459 void VNTRGenotypingRecord::process_read(AugmentedBAMRecord& as, int32_t sampleIndex, double contam)
460 {
461 if (vtype==VT_SNP)
462 {
463 if (v_alleles.size()==2)
464 {
465 bam1_t *s = as.s;
466
467 char strand = bam_is_rev(s) ? 'R' : 'F';
468 int32_t allele = 0;
469 //uint32_t bpos1 = bam_get_pos1(s);
470 uint8_t* seq = bam_get_seq(s);
471 uint8_t* qual = bam_get_qual(s);
472 int32_t rlen = bam_get_l_qseq(s);
473 uint8_t mapq = bam_get_mapq(s);
474 uint32_t q = 30;
475 int32_t cycle = 0;
476
477 std::vector<uint32_t>& aug_cigar = as.aug_cigar;
478 std::vector<std::string>& aug_ref = as.aug_ref;
479 std::vector<std::string>& aug_alt = as.aug_alt;
480
481 int32_t vpos1 = pos1;
482 int32_t cpos1 = bam_get_pos1(s);
483 int32_t rpos0 = 0;
484
485 for (uint32_t i=0; i<aug_cigar.size(); ++i)
486 {
487 uint32_t oplen = bam_cigar_oplen(aug_cigar[i]);
488 char opchr = bam_cigar_opchr(aug_cigar[i]);
489
490 if (opchr=='S')
491 {
492 rpos0 += oplen;
493 }
494 else if (opchr=='=')
495 {
496 if (vpos1>=cpos1 && vpos1<=(cpos1+oplen-1))
497 {
498 rpos0 += vpos1-cpos1;
499 allele = 0;
500 q = qual[rpos0];
501 cycle = rpos0<(rlen>>1) ? (rpos0+1) : -(rlen - rpos0 + 1);
502 break;
503 }
504 else
505 {
506 cpos1 += oplen;
507 rpos0 += oplen;
508 }
509 }
510 else if (opchr=='X')
511 {
512 if (vpos1==cpos1)
513 {
514 allele = (aug_alt[i].at(0) == v_alleles[1].at(0)) ? 1 : -1;
515 q = qual[rpos0];
516 cycle = rpos0<(rlen>>1) ? (rpos0+1) : -(rlen - rpos0 + 1);
517 break;
518 }
519
520 ++cpos1;
521 ++rpos0;
522 }
523 else if (opchr=='I')
524 {
525 rpos0 += oplen;
526 }
527 else if (opchr=='D')
528 {
529 cpos1 += oplen;
530 }
531 else
532 {
533 notice("unrecognized cigar state %d", opchr);
534 }
535 }
536
537 uint32_t no_mismatches = as.no_mismatches;
538 if (allele!=0 && q<20)
539 {
540 ++no_mismatches;
541 }
542
543 if (allele!=0 && no_mismatches==0)
544 {
545 //no_mismatches = 1;
546 std::cerr << "something wrong\n";
547 }
548
549 add_allele( contam, allele, mapq, strand == 'F', q, cycle, no_mismatches );
550 }
551 else //multiallelic
552 {
553 //abort();
554 }
555 }
556 else if (vtype==VT_INDEL)
557 {
558 if (v_alleles.size()==2)
559 {
560 if (as.beg1 <= beg1 && end1 <= as.end1)
561 {
562 bam1_t *s = as.s;
563
564 char strand = bam_is_rev(s) ? 'R' : 'F';
565 int32_t allele = 0;
566 //uint32_t bpos1 = bam_get_pos1(s);
567 uint8_t* seq = bam_get_seq(s);
568 uint8_t* qual = bam_get_qual(s);
569 uint32_t rlen = bam_get_l_qseq(s);
570 uint8_t mapq = bam_get_mapq(s);
571
572 uint32_t q = len*30;
573 uint32_t cycle = 10;
574
575 std::vector<uint32_t>& aug_cigar = as.aug_cigar;
576 std::vector<std::string>& aug_ref = as.aug_ref;
577 std::vector<std::string>& aug_alt = as.aug_alt;
578
579 int32_t vpos1 = pos1;
580
581 int32_t cpos1 = bam_get_pos1(s);
582 int32_t rpos0 = 0;
583
584 for (uint32_t i=0; i<aug_cigar.size(); ++i)
585 {
586 uint32_t oplen = bam_cigar_oplen(aug_cigar[i]);
587 char opchr = bam_cigar_opchr(aug_cigar[i]);
588
589 if (opchr=='S')
590 {
591 rpos0 += oplen;
592 }
593 else if (opchr=='=')
594 {
595 if (vpos1>=cpos1 && vpos1<=(cpos1+oplen-1))
596 {
597 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
598 }
599 cpos1 += oplen;
600 rpos0 += oplen;
601 }
602 else if (opchr=='X')
603 {
604 if (cpos1-1==vpos1)
605 {
606 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
607 }
608 ++cpos1;
609 ++rpos0;
610 }
611 else if (opchr=='I')
612 {
613 if (dlen>0 && cpos1-1==vpos1)
614 {
615 if (indel==aug_alt[i])
616 {
617 q = len*30;
618 allele = 1;
619 }
620 else
621 {
622 q = abs(len-(int32_t)aug_ref[i].size())*30;
623 allele = -1;
624 }
625
626 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
627 break;
628 }
629 else if (dlen<0 && cpos1-1==vpos1)
630 {
631 q = 30;
632 allele = -3;
633 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
634 break;
635 }
636
637 rpos0 += oplen;
638 }
639 else if (opchr=='D')
640 {
641 if (dlen<0 && cpos1-1==vpos1)
642 {
643 if (indel==aug_ref[i])
644 {
645 q = len*30;
646 allele = 1;
647 }
648 else
649 {
650 q = abs(len-(int32_t)aug_ref[i].size())*30;
651 allele = -1;
652 }
653
654 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
655 break;
656 }
657 else if (dlen>0 && cpos1-1==vpos1)
658 {
659 q = 30;
660 allele = -2;
661
662 cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
663 break;
664 }
665
666 }
667 else
668 {
669 std::cerr << "unrecognized cigar state " << opchr << "\n";
670 }
671 }
672
673 add_allele( contam, allele, mapq, strand == 'F', q, rand() % 75, as.no_mismatches );
674 }
675 else
676 {
677 bam1_t *s = as.s;
678 uint8_t mapq = bam_get_mapq(s);
679
680 add_allele( contam, -1, mapq, bam_is_rev(as.s) ? false : true, 20, rand() % 75, as.no_mismatches );
681 }
682 }
683 else //multiallelic
684 {
685 //abort();
686 }
687 }
688 else if (vtype==VT_VNTR)
689 {
690 //abort();
691 // bam1_t *s = as.s;
692
693 // char strand = bam_is_rev(s) ? 'R' : 'F';
694 // int32_t allele = 0;
695 // uint32_t pos1 = bam_get_pos1(s);
696 // uint8_t* seq = bam_get_seq(s);
697 // uint8_t* qual = bam_get_qual(s);
698 // uint32_t rlen = bam_get_l_qseq(s);
699 // uint8_t mapq = bam_get_mapq(s);
700
701 // std::vector<uint32_t>& aug_cigar = as.aug_cigar;
702 // std::vector<std::string>& aug_ref = as.aug_ref;
703 // std::vector<std::string>& aug_alt = as.aug_alt;
704
705 // //genomic bookend positions of VNTR
706 // int32_t vpos1 = g->beg1-1;
707 // int32_t vend1 = g->end1+1;
708
709 // //position with respect to read
710 // int32_t cpos1 = bam_get_pos1(s);
711 // int32_t rpos0 = 0;
712
713 // //genomic bookend positions of VNTR translated to read position
714 // int32_t pos0 = -1;
715 // int32_t end0 = -1;
716
717
718 // //locate repeat region on read.
719 // //translating genomic coordinates to read positions
720 // for (uint32_t i=0; i<aug_cigar.size(); ++i) {
721 // uint32_t oplen = bam_cigar_oplen(aug_cigar[i]);
722 // char opchr = bam_cigar_opchr(aug_cigar[i]);
723
724 // if (opchr=='S') {
725 // rpos0 += oplen;
726 // }
727 // else if (opchr=='=') {
728 // if (pos0==-1 && (cpos1<=vpos1 && vpos1<=(cpos1+oplen-1))) {
729 // pos0 = rpos0 + (vpos1-cpos1+1);
730 // }
731
732 // if (end0==-1 && (cpos1<=vend1 && vend1<=(cpos1+oplen-1))) {
733 // end0 = rpos0 + (vend1-cpos1+1);
734 // break;
735 // }
736
737 // cpos1 += oplen;
738 // rpos0 += oplen;
739 // }
740 // else if (opchr=='X') {
741 // if (pos0==-1 && (cpos1==vpos1)) {
742 // pos0 = rpos0;
743 // }
744
745 // if (end0==-1 && (cpos1==vend1)) {
746 // end0 = rpos0;
747 // break;
748 // }
749
750 // ++cpos1;
751 // ++rpos0;
752 // }
753 // else if (opchr=='I') {
754 // rpos0 += oplen;
755 // }
756 // else if (opchr=='D') {
757 // cpos1 += oplen;
758 // }
759 // else {
760 // std::cerr << "unrecognized cigar state " << opchr << "\n";
761 // }
762 // }
763
764 // //compute repeat tract units
765 // float counts = 0;
766
767 // // std::cerr << "pos0,end0 = " << pos0 << "," <<end0 << " (" << g->motif.size() <<")\n";
768
769 // if (pos0!=-1 && end0!=-1) {
770 // counts = ((float)(end0-pos0+1))/g->motif.size();
771 // }
772
773 // if (counts) {
774 // //update genotyping record
775 // ++g->depth;
776 // g->counts.push_back(counts);
777 // g->strands.append(1, strand);
778 // g->no_mismatches.push_back(as.no_mismatches);
779 }
780 }
781