1 /* The MIT License
2 
3    Copyright (c) 2016 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "vntr_genotyping_record.h"
25 
26 /**
27  * Constructor.
28  * @v - VCF record.
29  */
VNTRGenotypingRecord(bcf_hdr_t * h,bcf1_t * v,int32_t vtype,int32_t nsamples,int32_t ploidy,Estimator * est)30 VNTRGenotypingRecord::VNTRGenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t vtype, int32_t nsamples, int32_t ploidy, Estimator* est)
31 {
32     clear();
33 
34     this->h = h;
35     //this->v = v;
36     this->rid = bcf_get_rid(v);
37     this->pos1 = bcf_get_pos1(v);
38     this->vtype = vtype;
39     this->nsamples = nsamples;
40 
41     this->alleles = {0,0,0};
42     char** tmp_alleles = bcf_get_allele(v);
43     for (size_t i=0; i< bcf_get_n_allele(v); ++i) {
44       if (i) kputc(',', &this->alleles);
45       kputs(tmp_alleles[i], &this->alleles);
46       v_alleles.push_back(tmp_alleles[i]);
47     }
48 
49     n_filter = 0;
50 
51     //rid = bcf_get_rid(v);
52     beg1 = bcf_get_pos1(v);
53     end1 = beg1;
54 
55     if (bcf_has_filter(h, v, const_cast<char*>("overlap_snp"))==1)
56         n_filter |= FILTER_MASK_OVERLAP_SNP;
57 
58     if (bcf_has_filter(h, v, const_cast<char*>("overlap_indel"))==1)
59         n_filter |= FILTER_MASK_OVERLAP_INDEL;
60 
61     if (bcf_has_filter(h, v, const_cast<char*>("overlap_vntr"))==1)
62         n_filter |= FILTER_MASK_OVERLAP_VNTR;
63 
64     pls = (uint8_t*)calloc( nsamples*3, sizeof(uint8_t) );
65     ads = (uint8_t*)calloc( nsamples*3, sizeof(uint8_t) );
66 }
67 
68 /**
69  * Clears the temporary variables in this record.
70  */
clearTemp()71 void VNTRGenotypingRecord::clearTemp()
72 {
73     tmp_dp_q20 = 0;
74     tmp_dp_ra = 0;
75     tmp_bq_s1 = tmp_bq_s2 = 0;
76     tmp_mq_s1 = tmp_mq_s2 = 0;
77     tmp_cy_s1 = tmp_cy_s2 = 0;
78     tmp_st_s1 = tmp_st_s2 = 0;
79     tmp_al_s1 = tmp_bq_al = tmp_mq_al = tmp_cy_al = tmp_st_al = tmp_nm_al = 0;
80     tmp_nm_s1 = tmp_nm_s2 = 0;
81     tmp_oth_exp_q20 = tmp_oth_obs_q20 = 0;
82     tmp_pls[0] = tmp_pls[1] = tmp_pls[2] = 1.;
83     tmp_ads[0] = tmp_ads[1] = tmp_ads[2] = 0;
84 }
85 
86 /**
87  * Clears this record.
88  */
clear()89 void VNTRGenotypingRecord::clear()
90 {
91     vtype = -1;
92 
93     pls = ads = NULL;
94 
95     bqr_num = bqr_den = 0;
96     mqr_num = mqr_den = 0;
97     cyr_num = cyr_den = 0;
98     str_num = str_den = 0;
99     nmr_num = nmr_den = 0;
100     ior_num = ior_den = 0;
101     nm0_num = nm0_den = 0;
102     nm1_num = nm1_den = 0;
103     abe_num = abe_den = 0;
104     abz_num = abz_den = 0;
105     ns_nref = dp_sum = max_gq = 0;
106     clearTemp();
107 }
108 
109 /**
110  * Destructor.
111  */
~VNTRGenotypingRecord()112 VNTRGenotypingRecord::~VNTRGenotypingRecord()
113 {
114   //if (v) bcf_destroy(v);
115     if ( pls ) free(pls);
116     if ( ads ) free(ads);
117     if ( alleles.l > 0 ) free(alleles.s);
118     if ( est ) delete est;
119 }
120 
121 /**
122  * Prints out variant with collected features and genotype information to a bcf object.
123  */
flush_variant(bcf_hdr_t * hdr)124 bcf1_t* VNTRGenotypingRecord::flush_variant(bcf_hdr_t* hdr)
125 {
126     bcf1_t *nv = bcf_init();
127     bcf_clear(nv);
128     bcf_set_n_sample(nv, nsamples);
129 
130     bcf_set_rid(nv, rid);
131     bcf_set_pos1(nv, pos1);
132     bcf_update_alleles_str(hdr, nv, alleles.s);
133 
134     int32_t* gt = (int32_t*) calloc ( nsamples * 2, sizeof(int32_t) );
135     int32_t* pl = (int32_t*) calloc ( nsamples * 3, sizeof(int32_t) );
136     int32_t* ad = (int32_t*) calloc ( nsamples * 2, sizeof(int32_t) );
137     int32_t* td = (int32_t*) calloc ( nsamples * 1, sizeof(int32_t) );
138     int32_t* gq = (int32_t*) calloc ( nsamples * 1, sizeof(int32_t) );
139     float MLE_HWE_AF[2];
140     float MLE_HWE_GF[3];
141     double gp, gp_sum, max_gp;
142     int32_t best_gt;
143     int32_t best_a1, best_a2;
144     int32_t* pls_i;
145     int32_t an = 0;
146     int32_t acs[2] = {0,0};
147     int32_t gcs[3] = {0,0,0};
148     float afs[3];
149     int32_t max_gq = 0;
150     int64_t dp_sum = 0;
151     int32_t ploidy = 2;
152 
153     std::copy( pls, pls + (nsamples*3), pl );
154     for(int32_t i=0; i < nsamples; ++i)
155     {
156         dp_sum += ( td[i] = ( (ad[2*i] = ads[3*i]) + (ad[2*i+1] = ads[3*i+1]) + ads[3*i+2] ) );
157     }
158 
159     int32_t n = 0;
160     int32_t adSumHet[2] = {0,0};
161     est->compute_gl_af_hwe(pl, nsamples, ploidy, 2, MLE_HWE_AF, MLE_HWE_GF, n, 1e-20);
162 
163     for(int32_t i=0; i < nsamples; ++i)
164     {
165         int32_t* pli = &pl[ i * 3 ];
166         max_gp = gp_sum = gp = ( LogTool::pl2prob(pli[0]) * MLE_HWE_AF[0] * MLE_HWE_AF[0] );
167         best_gt = 0; best_a1 = 0; best_a2 = 0;
168         for(size_t l=1; l < 2; ++l)
169         {
170             for(size_t m=0; m <= l; ++m)
171             {
172                 gp = ( LogTool::pl2prob(pli[ l*(l+1)/2 + m]) * MLE_HWE_AF[l] * MLE_HWE_AF[m] * (l == m ? 1 : 2) );
173                 gp_sum += gp;
174                 if ( max_gp < gp )
175                 {
176                     max_gp = gp;
177                     best_gt = l*(l+1)/2 + m;
178                     best_a1 = m;
179                     best_a2 = l;
180                 }
181             }
182         }
183 
184         if ( best_gt == 1 )
185         {
186             adSumHet[0] += ad[2*i];
187             adSumHet[1] += ad[2*i+1];
188         }
189 
190         double prob = 1.-max_gp/gp_sum;  // to calculate GQ
191         if ( prob <= 3.162278e-26 )
192         {
193             prob = 3.162278e-26;
194         }
195 
196         if ( prob > 1 )
197         {
198             prob = 1;
199         }
200 
201         gq[i] = (int32_t)LogTool::prob2pl(prob);
202 
203         if ( ( best_gt > 0 ) && ( max_gq < gq[i] ) )
204         {
205             max_gq = gq[i];
206         }
207 
208         gt[2*i]   = ((best_a1 + 1) << 1);
209         gt[2*i+1] = ((best_a2 + 1) << 1);
210         an += 2;             // still use diploid representation of chrX for now.
211         ++acs[best_a1];
212         ++acs[best_a2];
213         ++gcs[best_gt];
214     }
215 
216     for(size_t i=0; i < 2; ++i)
217     {
218         afs[i] = acs[i]/(float)an;
219     }
220 
221     bcf_update_format_int32(hdr, nv, "GT", gt, nsamples * 2);
222     bcf_update_format_int32(hdr, nv, "GQ", gq, nsamples );
223     bcf_update_format_int32(hdr, nv, "AD", ad, nsamples * 2);
224     bcf_update_format_int32(hdr, nv, "DP", td, nsamples );
225     bcf_update_format_int32(hdr, nv, "PL", pl, nsamples * 3);
226 
227     float avgdp = (float)dp_sum / (float)nsamples;
228 
229     nv->qual = (float) max_gq;
230     bcf_update_info_float(hdr, nv, "AVGDP", &avgdp, 1);
231     bcf_update_info_int32(hdr, nv, "AC", &acs[1], 1);
232     bcf_update_info_int32(hdr, nv, "AN", &an, 1);
233     bcf_update_info_float(hdr, nv, "AF", &afs[1], 1);
234     bcf_update_info_int32(hdr, nv, "GC", gcs, 3);
235     bcf_update_info_int32(hdr, nv, "GN", &nsamples, 1);
236 
237     if (n)
238     {
239         float* MLE_HWE_AF_PTR = &MLE_HWE_AF[1];
240         bcf_update_info_float(hdr, nv, "HWEAF", MLE_HWE_AF_PTR, 1);
241     }
242 
243     // calculate the allele frequencies under HWD
244     float MLE_AF[2];
245     float MLE_GF[3];
246     n = 0;
247     est->compute_gl_af(pl, nsamples, ploidy, 2, MLE_AF, MLE_GF,  n, 1e-20);
248     if (n)
249     {
250         //float* MLE_AF_PTR = &MLE_AF[1];
251         //bcf_update_info_float(odw->hdr, nv, "HWDAF", MLE_AF_PTR, n_alleles-1);
252         bcf_update_info_float(hdr, nv, "HWDGF", &MLE_GF, 3);
253     }
254 
255     float fic = 0;
256     n = 0;
257     est->compute_gl_fic(pl, nsamples, ploidy, MLE_HWE_AF, 2, MLE_GF, fic, n);
258     if ( std::isnan((double)fic) ) fic = 0;
259     if (n)
260     {
261         bcf_update_info_float(hdr, nv, "IBC", &fic, 1);
262     }
263 
264     // calculate the LRT statistics related to HWE
265     float lrts;
266     float logp;
267     int32_t df;
268     n = 0;
269     est->compute_hwe_lrt(pl, nsamples, ploidy, 2, MLE_HWE_GF, MLE_GF, n, lrts, logp, df);
270     if (n)
271     {
272         if ( fic < 0 ) logp = 0-logp;
273         {
274             bcf_update_info_float(hdr, nv, "HWE_SLP", &logp, 1);
275         }
276     }
277 
278     float abe = (adSumHet[0] + 0.5)/(adSumHet[0] + adSumHet[1] + 1.0);
279     bcf_update_info_float(hdr, nv, "ABE", &abe, 1);
280 
281     // update filter
282     if ( n_filter & FILTER_MASK_OVERLAP_SNP )
283         bcf_add_filter(hdr, nv, bcf_hdr_id2int(hdr, BCF_DT_ID, "overlap_snp"));
284     if ( n_filter & FILTER_MASK_OVERLAP_INDEL )
285         bcf_add_filter(hdr, nv, bcf_hdr_id2int(hdr, BCF_DT_ID, "overlap_indel"));
286     if ( n_filter & FILTER_MASK_OVERLAP_VNTR )
287         bcf_add_filter(hdr, nv, bcf_hdr_id2int(hdr, BCF_DT_ID, "overlap_vntr"));
288 
289     //bcf_update_info_int32(odw->hdr, nv, "NS_NREF", &v_ns_nrefs[k], 1);
290     abe_num /= (abe_den+1e-6); bcf_update_info_float(hdr, nv, "ABE",  &abe_num, 1);
291     abz_num /= sqrt(abz_den+1e-6); bcf_update_info_float(hdr, nv, "ABZ",  &abz_num, 1);
292     bqr_num /= sqrt(bqr_den+1e-6); bcf_update_info_float(hdr, nv, "BQZ", &bqr_num, 1);
293     mqr_num /= sqrt(mqr_den+1e-6); bcf_update_info_float(hdr, nv, "MQZ", &mqr_num, 1);
294     cyr_num /= sqrt(cyr_den+1e-6); bcf_update_info_float(hdr, nv, "CYZ", &cyr_num, 1);
295     str_num /= sqrt(str_den+1e-6); bcf_update_info_float(hdr, nv, "STZ", &str_num, 1);
296     nmr_num /= sqrt(nmr_den+1e-6); bcf_update_info_float(hdr, nv, "NMZ", &nmr_num, 1);
297     ior_num = log(ior_num/ior_den+1e-6)/log(10.); bcf_update_info_float(hdr, nv, "IOR", &ior_num, 1);
298     nm1_num /= (nm1_den+1e-6); bcf_update_info_float(hdr, nv, "NM1", &nm1_num, 1);
299     nm0_num /= (nm0_den+1e-6); bcf_update_info_float(hdr, nv, "NM0", &nm0_num, 1);
300 
301     //odw->write(nv);
302     //bcf_destroy(nv);
303     free(gt);
304     free(gq);
305     free(pl);
306     free(ad);
307     free(td);
308 
309     free(pls); pls = NULL;
310     free(ads); ads = NULL;
311 
312     delete est;
313     free(alleles.s);
314 
315     return nv;
316 }
317 
318 /**
319  * Adds information from a sample to aggregated data.
320  */
flush_sample(int32_t sampleIndex)321 void VNTRGenotypingRecord::flush_sample(int32_t sampleIndex)
322 {
323     uint8_t* p_pls = &pls[sampleIndex*3];
324     uint8_t* p_ads = &ads[sampleIndex*3];
325 
326     int32_t imax = ( tmp_pls[0] > tmp_pls[1] ) ? ( tmp_pls[0] > tmp_pls[2] ? 0 : 2 ) : ( tmp_pls[1] > tmp_pls[2] ? 1 : 2);
327     for(int32_t i=0; i < 3; ++i)
328     {
329         uint32_t l = LogTool::prob2pl(tmp_pls[i]/tmp_pls[imax]);
330         p_pls[i] = ((l > 255) ? 255 : l);
331         p_ads[i] = ((tmp_ads[i] > 255) ? 255 : (uint8_t)tmp_ads[i]);
332     }
333 
334     float sqrt_dp_ra = sqrt((float)tmp_dp_ra);
335     float ior = (float)(tmp_oth_obs_q20 / (tmp_oth_exp_q20 + 1e-6));
336     float nm1 = tmp_al_s1 == 0 ? 0 : tmp_nm_al / (float)tmp_al_s1;
337     float nm0 = (tmp_dp_ra - tmp_al_s1) == 0 ? 0 : (tmp_nm_s1-tmp_nm_al) / (float)(tmp_dp_ra - tmp_al_s1);
338     float w_dp_ra  = log(tmp_dp_ra+1.); //sqrt(dp_ra);
339     float w_dp_q20 = log(tmp_dp_q20+1.); //sqrt(dp_q20);
340     float w_al_s1  = log(tmp_al_s1+1.); //sqrt(al_s1);
341     float w_ref_s1 = log(tmp_dp_ra - tmp_al_s1+1.);
342 
343     if ( p_pls[1] == 0 )
344     { // het genotypes
345         abe_num += (w_dp_ra * (tmp_dp_ra - tmp_al_s1 + 0.05) / (double)(tmp_dp_ra + 0.1));
346         abe_den += w_dp_ra;
347 
348         // E(r) = 0.5(r+a) V(r) = 0.25(r+a)
349         abz_num += w_dp_ra * (tmp_dp_ra - tmp_al_s1 - tmp_dp_ra*0.5)/sqrt(0.25 * tmp_dp_ra + 1e-3);
350         abz_den += (w_dp_ra * w_dp_ra);
351 
352         float bqr = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_bq_al, tmp_bq_s1, tmp_bq_s2, tmp_al_s1, tmp_al_s1, .1 );
353         float mqr = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_mq_al, tmp_mq_s1, tmp_mq_s2, tmp_al_s1, tmp_al_s1, .1 );
354         float cyr = sqrt_dp_ra * Estimator::compute_correlation_f( tmp_dp_ra, tmp_cy_al, tmp_cy_s1, tmp_cy_s2, (float)tmp_al_s1, (float)tmp_al_s1, .1 );
355         float str = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_st_al, tmp_st_s1, tmp_st_s1, tmp_al_s1, tmp_al_s1, .1 );
356         float nmr = sqrt_dp_ra * Estimator::compute_correlation( tmp_dp_ra, tmp_nm_al, tmp_nm_s1, tmp_nm_s2, tmp_al_s1, tmp_al_s1, .1 );
357 
358         // Use Stouffer's method to combine the z-scores, but weighted by log of sample size
359         bqr_num += (bqr * w_dp_ra); bqr_den += (w_dp_ra * w_dp_ra);
360         mqr_num += (mqr * w_dp_ra); mqr_den += (w_dp_ra * w_dp_ra);
361         cyr_num += (cyr * w_dp_ra); cyr_den += (w_dp_ra * w_dp_ra);
362         str_num += (str * w_dp_ra); str_den += (w_dp_ra * w_dp_ra);
363         nmr_num += (nmr * w_dp_ra); nmr_den += (w_dp_ra * w_dp_ra);
364     }
365 
366     ior_num += (ior * w_dp_q20); ior_den += w_dp_q20;
367     nm1_num += (nm1 * w_al_s1);  nm1_den += w_al_s1;
368     nm0_num += (nm0 * w_ref_s1); nm0_den += w_ref_s1;
369 
370     clearTemp();
371 }
372 
373 /**
374  * Adds an allele based with collected sufficient statistics.
375  */
add_allele(double contam,int32_t allele,uint8_t mapq,bool fwd,uint32_t q,int32_t cycle,uint32_t nm)376 void VNTRGenotypingRecord::add_allele(double contam, int32_t allele, uint8_t mapq, bool fwd, uint32_t q, int32_t cycle, uint32_t nm)
377 {
378     double pe = LogTool::pl2prob(q);
379     double pm = 1 - pe;
380 
381     if (q>40)
382     {
383         q = 40;
384     }
385 
386     if ( allele == 0 )
387     {
388         ++tmp_ads[0];
389         tmp_pls[0] *= ( pm * (1-contam) + pe * contam / 3 );
390         tmp_pls[1] *= ( pm / 2 + pe / 6 );
391         tmp_pls[2] *= ( pm * contam + pe * (1-contam) / 3 );
392     }
393     else if ( allele > 0 ) // currently, bi-allelic only
394     {
395         ++tmp_ads[1];
396         tmp_pls[0] *= ( pm * contam + pe * (1-contam) / 3 );
397         tmp_pls[1] *= ( pm / 2 + pe / 6 );
398         tmp_pls[2] *= ( pm * (1-contam) + pe * contam / 3 );
399     }
400     else
401     {
402         ++tmp_ads[2];
403     }
404     double sump = tmp_pls[0] + tmp_pls[1] + tmp_pls[2] + 1e-300;
405     tmp_pls[0] /= sump;
406     tmp_pls[1] /= sump;
407     tmp_pls[2] /= sump;
408 
409     if ( allele >= 0 )
410     {
411         if ( q > 20 )
412         {
413           tmp_oth_exp_q20 += (LogTool::pl2prob(q) * 2. / 3.);
414           ++tmp_dp_q20;
415         }
416 
417         float log_td = (cycle > 0) ? 0-logf((float)cycle) : 0;
418 
419         ++tmp_dp_ra;
420         tmp_bq_s1 += q;
421         tmp_bq_s2 += (q*q);
422         tmp_mq_s1 += mapq;
423         tmp_mq_s2 += (mapq * mapq);
424         tmp_cy_s1 += log_td;
425         tmp_cy_s2 += (log_td * log_td);
426         tmp_st_s1 += fwd;
427 
428         if ( allele > 0 )
429         {
430             ++tmp_al_s1;
431             tmp_bq_al += q;
432             tmp_mq_al += mapq;
433             tmp_cy_al += log_td;
434             tmp_st_al += fwd;
435             tmp_nm_al += (nm-1);
436             tmp_nm_s1 += (nm-1);
437             tmp_nm_s2 += (nm-1)*(nm-1);
438         }
439         else
440         {
441           tmp_nm_s1 += nm;
442           tmp_nm_s2 += (nm * nm);
443         }
444     }
445     else
446     {
447         if (q>20)
448         {
449           tmp_oth_exp_q20 += (LogTool::pl2prob(q) * 2. / 3.);
450           ++tmp_oth_obs_q20;
451           ++tmp_dp_q20;
452         }
453     }
454 }
455 
456 /**
457  * Collects sufficient statistics from read for variants to be genotyped.
458  */
process_read(AugmentedBAMRecord & as,int32_t sampleIndex,double contam)459 void VNTRGenotypingRecord::process_read(AugmentedBAMRecord& as, int32_t sampleIndex, double contam)
460 {
461     if (vtype==VT_SNP)
462     {
463         if (v_alleles.size()==2)
464         {
465             bam1_t *s = as.s;
466 
467             char strand = bam_is_rev(s) ? 'R' : 'F';
468             int32_t allele = 0;
469             //uint32_t bpos1 = bam_get_pos1(s);
470             uint8_t* seq = bam_get_seq(s);
471             uint8_t* qual = bam_get_qual(s);
472             int32_t rlen = bam_get_l_qseq(s);
473             uint8_t mapq = bam_get_mapq(s);
474             uint32_t q = 30;
475             int32_t cycle = 0;
476 
477             std::vector<uint32_t>& aug_cigar = as.aug_cigar;
478             std::vector<std::string>& aug_ref = as.aug_ref;
479             std::vector<std::string>& aug_alt = as.aug_alt;
480 
481             int32_t vpos1 = pos1;
482             int32_t cpos1 = bam_get_pos1(s);
483             int32_t rpos0 = 0;
484 
485             for (uint32_t i=0; i<aug_cigar.size(); ++i)
486             {
487                 uint32_t oplen = bam_cigar_oplen(aug_cigar[i]);
488                 char opchr = bam_cigar_opchr(aug_cigar[i]);
489 
490                 if (opchr=='S')
491                 {
492                     rpos0 += oplen;
493                 }
494                 else if (opchr=='=')
495                 {
496                     if (vpos1>=cpos1 && vpos1<=(cpos1+oplen-1))
497                     {
498                         rpos0 += vpos1-cpos1;
499                         allele = 0;
500                         q = qual[rpos0];
501                         cycle = rpos0<(rlen>>1) ? (rpos0+1) : -(rlen - rpos0 + 1);
502                         break;
503                     }
504                     else
505                     {
506                         cpos1 += oplen;
507                         rpos0 += oplen;
508                     }
509                 }
510                 else if (opchr=='X')
511                 {
512                     if (vpos1==cpos1)
513                     {
514                         allele = (aug_alt[i].at(0) == v_alleles[1].at(0)) ? 1 : -1;
515                         q = qual[rpos0];
516                         cycle = rpos0<(rlen>>1) ? (rpos0+1) : -(rlen - rpos0 + 1);
517                         break;
518                     }
519 
520                     ++cpos1;
521                     ++rpos0;
522                 }
523                 else if (opchr=='I')
524                 {
525                     rpos0 += oplen;
526                 }
527                 else if (opchr=='D')
528                 {
529                     cpos1 += oplen;
530                 }
531                 else
532                 {
533                     notice("unrecognized cigar state %d", opchr);
534                 }
535             }
536 
537             uint32_t no_mismatches = as.no_mismatches;
538             if (allele!=0 && q<20)
539             {
540                 ++no_mismatches;
541             }
542 
543             if (allele!=0 && no_mismatches==0)
544             {
545                 //no_mismatches = 1;
546                 std::cerr << "something wrong\n";
547             }
548 
549             add_allele( contam, allele, mapq, strand == 'F', q, cycle, no_mismatches );
550         }
551         else //multiallelic
552         {
553                   //abort();
554         }
555     }
556     else if (vtype==VT_INDEL)
557     {
558         if (v_alleles.size()==2)
559         {
560             if (as.beg1 <= beg1 && end1 <= as.end1)
561             {
562                 bam1_t *s = as.s;
563 
564                 char strand = bam_is_rev(s) ? 'R' : 'F';
565                 int32_t allele = 0;
566                 //uint32_t bpos1 = bam_get_pos1(s);
567                 uint8_t* seq = bam_get_seq(s);
568                 uint8_t* qual = bam_get_qual(s);
569                 uint32_t rlen = bam_get_l_qseq(s);
570                 uint8_t mapq = bam_get_mapq(s);
571 
572                 uint32_t q = len*30;
573                 uint32_t cycle = 10;
574 
575                 std::vector<uint32_t>& aug_cigar = as.aug_cigar;
576                 std::vector<std::string>& aug_ref = as.aug_ref;
577                 std::vector<std::string>& aug_alt = as.aug_alt;
578 
579                 int32_t vpos1 = pos1;
580 
581                 int32_t cpos1 = bam_get_pos1(s);
582                 int32_t rpos0 = 0;
583 
584                 for (uint32_t i=0; i<aug_cigar.size(); ++i)
585                 {
586                     uint32_t oplen = bam_cigar_oplen(aug_cigar[i]);
587                     char opchr = bam_cigar_opchr(aug_cigar[i]);
588 
589                     if (opchr=='S')
590                     {
591                         rpos0 += oplen;
592                     }
593                     else if (opchr=='=')
594                     {
595                         if (vpos1>=cpos1 && vpos1<=(cpos1+oplen-1))
596                         {
597                             cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
598                         }
599                         cpos1 += oplen;
600                         rpos0 += oplen;
601                     }
602                     else if (opchr=='X')
603                     {
604                         if (cpos1-1==vpos1)
605                         {
606                             cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
607                         }
608                         ++cpos1;
609                         ++rpos0;
610                     }
611                     else if (opchr=='I')
612                     {
613                         if (dlen>0 && cpos1-1==vpos1)
614                         {
615                             if (indel==aug_alt[i])
616                             {
617                                 q = len*30;
618                                 allele = 1;
619                             }
620                             else
621                             {
622                                 q = abs(len-(int32_t)aug_ref[i].size())*30;
623                                 allele = -1;
624                             }
625 
626                             cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
627                             break;
628                         }
629                         else if (dlen<0 && cpos1-1==vpos1)
630                         {
631                             q = 30;
632                             allele = -3;
633                             cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
634                             break;
635                         }
636 
637                         rpos0 += oplen;
638                     }
639                     else if (opchr=='D')
640                     {
641                         if (dlen<0 && cpos1-1==vpos1)
642                         {
643                             if (indel==aug_ref[i])
644                             {
645                                 q = len*30;
646                                 allele = 1;
647                             }
648                             else
649                             {
650                                 q = abs(len-(int32_t)aug_ref[i].size())*30;
651                                 allele = -1;
652                             }
653 
654                             cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
655                             break;
656                         }
657                         else if (dlen>0 && cpos1-1==vpos1)
658                         {
659                             q = 30;
660                             allele = -2;
661 
662                             cycle = strand == 'F' ? (rpos0+1) : (rlen - rpos0);
663                             break;
664                         }
665 
666                     }
667                     else
668                     {
669                         std::cerr << "unrecognized cigar state " << opchr << "\n";
670                     }
671                 }
672 
673                 add_allele( contam, allele, mapq, strand == 'F', q, rand() % 75, as.no_mismatches );
674             }
675             else
676             {
677                 bam1_t *s = as.s;
678                 uint8_t mapq = bam_get_mapq(s);
679 
680                 add_allele( contam, -1, mapq, bam_is_rev(as.s) ? false : true, 20, rand() % 75, as.no_mismatches );
681             }
682         }
683         else //multiallelic
684         {
685             //abort();
686         }
687     }
688     else if (vtype==VT_VNTR)
689     {
690     //abort();
691  //    bam1_t *s = as.s;
692 
693 //     char strand = bam_is_rev(s) ? 'R' : 'F';
694 //     int32_t allele = 0;
695 //     uint32_t pos1 = bam_get_pos1(s);
696 //     uint8_t* seq = bam_get_seq(s);
697 //     uint8_t* qual = bam_get_qual(s);
698 //     uint32_t rlen = bam_get_l_qseq(s);
699 //     uint8_t mapq = bam_get_mapq(s);
700 
701 //     std::vector<uint32_t>& aug_cigar = as.aug_cigar;
702 //     std::vector<std::string>& aug_ref = as.aug_ref;
703 //     std::vector<std::string>& aug_alt = as.aug_alt;
704 
705 //     //genomic bookend positions of VNTR
706 //     int32_t vpos1 = g->beg1-1;
707 //     int32_t vend1 = g->end1+1;
708 
709 //     //position with respect to read
710 //     int32_t cpos1 = bam_get_pos1(s);
711 //     int32_t rpos0 = 0;
712 
713 //     //genomic bookend positions of VNTR translated to read position
714 //     int32_t pos0 = -1;
715 //     int32_t end0 = -1;
716 
717 
718 //     //locate repeat region on read.
719 //     //translating genomic coordinates to read positions
720 //     for (uint32_t i=0; i<aug_cigar.size(); ++i) {
721 //       uint32_t oplen = bam_cigar_oplen(aug_cigar[i]);
722 //       char opchr = bam_cigar_opchr(aug_cigar[i]);
723 
724 //       if (opchr=='S') {
725 //  rpos0 += oplen;
726 //       }
727 //       else if (opchr=='=') {
728 //  if (pos0==-1 && (cpos1<=vpos1 && vpos1<=(cpos1+oplen-1))) {
729 //    pos0 = rpos0 + (vpos1-cpos1+1);
730 //  }
731 
732 //  if (end0==-1 && (cpos1<=vend1 && vend1<=(cpos1+oplen-1))) {
733 //    end0 = rpos0 + (vend1-cpos1+1);
734 //    break;
735 //  }
736 
737 //  cpos1 += oplen;
738 //  rpos0 += oplen;
739 //       }
740 //       else if (opchr=='X') {
741 //  if (pos0==-1 && (cpos1==vpos1)) {
742 //    pos0 = rpos0;
743 //  }
744 
745 //  if (end0==-1 && (cpos1==vend1)) {
746 //    end0 = rpos0;
747 //    break;
748 //  }
749 
750 //  ++cpos1;
751 //  ++rpos0;
752 //       }
753 //       else if (opchr=='I') {
754 //  rpos0 += oplen;
755 //       }
756 //       else if (opchr=='D') {
757 //  cpos1 += oplen;
758 //       }
759 //       else {
760 //  std::cerr << "unrecognized cigar state " << opchr << "\n";
761 //       }
762 //     }
763 
764 //     //compute repeat tract units
765 //     float counts = 0;
766 
767 // //        std::cerr << "pos0,end0 = " << pos0 << "," <<end0 <<  " (" << g->motif.size() <<")\n";
768 
769 //     if (pos0!=-1 && end0!=-1) {
770 //       counts = ((float)(end0-pos0+1))/g->motif.size();
771 //     }
772 
773 //     if (counts) {
774 //       //update genotyping record
775 //       ++g->depth;
776 //       g->counts.push_back(counts);
777 //       g->strands.append(1, strand);
778 //       g->no_mismatches.push_back(as.no_mismatches);
779     }
780 }
781