1 /* The MIT License
2 
3    Copyright (c)  2015 Hyun Min Kang and Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "paste_and_compute_features_sequential.h"
25 
26 namespace
27 {
28 
29 class Igor : Program
30 {
31     public:
32 
33     std::string version;
34 
35     ///////////
36     //options//
37     ///////////
38     std::vector<std::string> input_vcf_files;
39     std::string input_vcf_file_list;
40     std::string output_vcf_file;
41     bool print;
42     std::vector<GenomeInterval> intervals;
43     int32_t maxBQ;
44 
45     ///////
46     //i/o//
47     ///////
48     BCFOrderedReader *odr;
49     BCFOrderedWriter *odw;
50 
51     ///////////////
52     //general use//
53     ///////////////
54 
55     /////////
56     //stats//
57     /////////
58 
59     /////////
60     //tools//
61     /////////
62 
63     // The function assumes a particular list of features in the BCF files
64     //
65 
Igor(int argc,char ** argv)66     Igor(int argc, char ** argv)
67     {
68         //////////////////////////
69         //options initialization//
70         //////////////////////////
71         try
72         {
73             std::string desc = "Pastes VCF files like the unix paste functions.\n"
74                  "              This is used after the per sample genotyping step in vt.\n"
75                  "              Input requirements and assumptions:\n"
76                  "              1. Same variants are represented in the same order for each file (required)\n"
77                  "              2. Genotype fields are the same for corresponding records (required)\n"
78                  "              3. Sample names are different in all the files (warning will be given if not)\n"
79                  "              4. Headers (not including the samples) are the same for all the files (unchecked assumption, will fail if output is BCF)\n"
80                  "              Outputs:\n"
81                  "              1. INFO fields output will be that of the first file\n"
82                  "              2. Genotype fields are the same for corresponding records\n";
83 
84 
85             version = "0.1";
86             TCLAP::CmdLine cmd(desc, ' ', version);
87             VTOutput my; cmd.setOutput(&my);
88             TCLAP::SwitchArg arg_print("p", "p", "print options and summary []", cmd, false);
89             TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF file [-]", false, "-", "", cmd);
90             TCLAP::ValueArg<std::string> arg_input_vcf_file_list("L", "L", "file containing list of input VCF files", false, "", "str", cmd);
91             TCLAP::UnlabeledMultiArg<std::string> arg_input_vcf_files("<in1.vcf>...", "Multiple VCF files",false, "files", cmd);
92             TCLAP::ValueArg<std::string> arg_intervals("i","i","Intervals[]", false, "", "str", cmd);
93             TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "file", cmd);
94             TCLAP::ValueArg<int32_t> arg_max_bq("q", "q", "Maximum base quality to cap []", false, 30, "int", cmd);
95 
96             cmd.parse(argc, argv);
97 
98             input_vcf_file_list = arg_input_vcf_file_list.getValue();
99             output_vcf_file = arg_output_vcf_file.getValue();
100             parse_files(input_vcf_files, arg_input_vcf_files.getValue(), arg_input_vcf_file_list.getValue());
101             const std::vector<std::string>& v = arg_input_vcf_files.getValue();
102             print = arg_print.getValue();
103             maxBQ = arg_max_bq.getValue();
104             parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
105 
106             if (input_vcf_files.size()==0)
107             {
108                 fprintf(stderr, "[E:%s:%d %s] no input vcf files.\n", __FILE__, __LINE__, __FUNCTION__);
109                 exit(1);
110             }
111 
112         }
113         catch (TCLAP::ArgException &e)
114         {
115             std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
116             abort();
117         }
118     };
119 
initialize()120     void initialize()
121     {
122         //////////////////////
123         //i/o initialization//
124         //////////////////////
125         // read only the first file
126         odr = new BCFOrderedReader(input_vcf_files[0], intervals);
127         if ( intervals.size() > 1 )
128         {
129             fprintf(stderr, "[E:%s:%d %s] Multiple intervals are not allowed\n", __FILE__, __LINE__, __FUNCTION__);
130             exit(1);
131         }
132 
133         //for (size_t i=0; i<input_vcf_files.size(); ++i)
134         //{
135         //    odrs.push_back(new BCFOrderedReader(input_vcf_files[i], intervals));
136         //}
137         odw = new BCFOrderedWriter(output_vcf_file, 0);
138 
139         odw->set_hdr(odr->hdr);
140 
141         bcf_hdr_append(odw->hdr, "##INFO=<ID=AVGDP,Number=1,Type=Float,Description=\"Average Depth per Sample\">\n");
142         bcf_hdr_append(odw->hdr, "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate Allele Counts\">\n");
143         bcf_hdr_append(odw->hdr, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total Number Allele Counts\">\n");
144         //bcf_hdr_append(odw->hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Reads\">\n");
145         bcf_hdr_append(odw->hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Alternate Allele Frequency from Best-guess Genotypes\">\n");
146         bcf_hdr_append(odw->hdr, "##INFO=<ID=GC,Number=G,Type=Integer,Description=\"Genotype Counts\">\n");
147         bcf_hdr_append(odw->hdr, "##INFO=<ID=GN,Number=1,Type=Integer,Description=\"Total Number of Genotypes\">\n");
148         bcf_hdr_append(odw->hdr, "##INFO=<ID=HWEAF,Number=A,Type=Float,Description=\"Genotype likelihood based Allele Frequency assuming HWE\">\n");
149         bcf_hdr_append(odw->hdr, "##INFO=<ID=HWDGF,Number=G,Type=Float,Description=\"Genotype likelihood based Genotype Frequency ignoring HWE\">\n");
150         bcf_hdr_append(odw->hdr, "##INFO=<ID=IBC,Number=1,Type=Float,Description=\"Inbreeding Coefficients calculated from genotype likelihoods\">\n");
151         bcf_hdr_append(odw->hdr, "##INFO=<ID=HWE_SLP,Number=1,Type=Float,Description=\"Signed log p-values testing  statistics based Hardy Weinberg ln(Likelihood Ratio)\">\n");
152         bcf_hdr_append(odw->hdr, "##INFO=<ID=ABE,Number=1,Type=Float,Description=\"Expected allele Balance towards Reference Allele on Heterozygous Sites\">\n");
153         bcf_hdr_append(odw->hdr, "##INFO=<ID=ABZ,Number=1,Type=Float,Description=\"Average Z-scores of Allele Balance towards Reference Allele on Heterozygous Sites\">\n");
154         bcf_hdr_append(odw->hdr, "##INFO=<ID=NS_NREF,Number=1,Type=Integer,Description=\"Number of samples with non-reference reads\">\n");
155         bcf_hdr_append(odw->hdr, "##INFO=<ID=BQZ,Number=1,Type=Float,Description=\"Correlation between base quality and alleles\">\n");
156         //bcf_hdr_append(odw->hdr, "##INFO=<ID=MQZ,Number=1,Type=Float,Description=\"Correlation between mapping quality and alleles\">\n");
157         bcf_hdr_append(odw->hdr, "##INFO=<ID=CYZ,Number=1,Type=Float,Description=\"Correlation between cycle and alleles\">\n");
158         bcf_hdr_append(odw->hdr, "##INFO=<ID=STZ,Number=1,Type=Float,Description=\"Correlation between strand and alleles\">\n");
159         bcf_hdr_append(odw->hdr, "##INFO=<ID=NMZ,Number=1,Type=Float,Description=\"Correlation between mismatch counts per read and alleles\">\n");
160         bcf_hdr_append(odw->hdr, "##INFO=<ID=IOR,Number=1,Type=Float,Description=\"Inflated rate of observing of other alleles in log10 scale\">\n");
161         bcf_hdr_append(odw->hdr, "##INFO=<ID=NM0,Number=1,Type=Float,Description=\"Average number of mismatches in the reads with ref alleles\">\n");
162         bcf_hdr_append(odw->hdr, "##INFO=<ID=NM1,Number=1,Type=Float,Description=\"Average number of mismatches in the reads with non-ref alleles\">\n");
163         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
164         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">\n");
165         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=AD,Number=A,Type=Integer,Description=\"Allele Depth\">\n");
166         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=OD,Number=1,Type=Integer,Description=\"Other Allele Depth\">\n");
167         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"Phred-scale Genotype Likelihoods\">\n");
168 
169         ///////////////
170         //general use//
171         ///////////////
172 
173         ////////////////////////
174         //stats initialization//
175         ////////////////////////
176 
177         /////////
178         //tools//
179         /////////
180     }
181 
182 
compute_correlation(int32_t n,int32_t xy,int32_t x1,int32_t x2,int32_t y1,int32_t y2,float buffer)183     float compute_correlation(int32_t n, int32_t xy, int32_t x1, int32_t x2, int32_t y1, int32_t y2, float buffer) {
184       if ( n == 0 ) return 0;
185       float xsd = x2/(float)n - (x1/(float)n)*(x1/(float)n);
186       float ysd = y2/(float)n - (y1/(float)n)*(y1/(float)n);
187       return ( ( xy/(float)n - x1 * y1 / (float) n / (float) n ) / sqrt( xsd * ysd + buffer ) );
188     }
189 
compute_correlation_f(int32_t n,float xy,float x1,float x2,float y1,float y2,float buffer)190     float compute_correlation_f(int32_t n, float xy, float x1, float x2, float y1, float y2, float buffer) {
191       if ( n == 0 ) return 0;
192       float xsd = x2/(float)n - (x1/(float)n)*(x1/(float)n);
193       float ysd = y2/(float)n - (y1/(float)n)*(y1/(float)n);
194       return ( ( xy/(float)n - x1 * y1 / (float) n / (float) n ) / sqrt( xsd * ysd + buffer ) );
195     }
196 
paste_and_compute_features_sequential()197     void paste_and_compute_features_sequential()
198     {
199     // assume that the following features are available
200     // 1. BQSUM, DPF, DPR
201     // 2. GT, PL, ADF, ADR, DP, CY, ST, AL, NM
202     //
203     // calculate the following per-variant statistics
204     // DP - total depth -- \sum_i (DPF+DPR) + \sum_i DP
205     // MLEAF - allele frequency estimated by best guess genotypes
206     // HWEAF - allele frequency estimated by genotype likelihood under HWE
207     // HWDAF - allele frequency estimated by genotype likelihood under non-HWD
208     // SLRT - signed LRT test statistics for departure from HWE
209     // IBC - [Pr(Het|HWD)-Pr(Het|HWE)]/Pr(Het|HWE)
210     // AB - Allele balance -- \sum_i \sqrt{#A + #R} (#A)/(#A+#R) | GT = HET
211     // STR - Strand bias   -- \sum_i \sqrt{#A + #R} cor(FR,FA,RR,RA)
212     // TBR - Tail distance bias  -- \sum_i \sqrt{#A + #R} cor(TD,AL)
213     // MQR - Mapping quality bias -- \sum_i \sqrt{#A + #R} cor(MQ,AL)
214     // BQR - Base quality bias -- \sum_i \sqrt{#A + #R} cor(BQ,AL)
215     // NMR - Number of mismatch bias -- \sum \sqrt{#A + #R} cor(NM,AL - A?) | BQ > 20
216     // IOR - Inflated fraction of "other" biases compared to base quality -- \sum_i \sqrt{#A + #R} {Pr(O) - E[O]} | BQ > 20
217     //
218     // calculate the following per-sample statistics
219     // GT - best guess genotypes
220     // GQ - genotype quality
221     // AD - allele depth
222     // PL - likelihoods
223     //
224     // Assume that either
225     // BQSUM : DPF : DPR or
226     // GT, PL, DP, CY, ST, NM exists
227 
228     // store everything into memory
229     std::vector<int32_t> vn_alleles;
230     std::vector<int32_t> vn_genos;
231     std::vector<int32_t*> v_pls;
232     std::vector<int32_t*> v_gts;
233     std::vector<int32_t*> v_gqs;
234     std::vector<int32_t*> v_ads;
235     std::vector<int32_t*> v_ods;
236 
237     // check the type from FORMAT, first field must be either BQSUM or GT
238     int32_t* p_bqsum = NULL;
239     int32_t np_bqsum = 0;
240     int32_t* p_dp = NULL;
241     int32_t np_dp = 0;
242     int32_t* p_gt = NULL;
243     int32_t np_gt = 0;
244     int32_t* p_pl = NULL;
245     int32_t np_pl = 0;
246     int32_t* p_bq = NULL;
247     int32_t np_bq = 0;
248     int32_t* p_mq = NULL;
249     int32_t np_mq = 0;
250     int32_t* p_cy = NULL;
251     int32_t np_cy = 0;
252     char**   p_st = NULL;
253     int32_t np_st = 0;
254     int32_t*  p_al = NULL;
255     int32_t np_al = 0;
256     int32_t* p_nm = NULL;
257     int32_t np_nm = 0;
258 
259     // read the first genotype file to populate
260     int32_t nfiles = input_vcf_files.size();
261     std::vector<bool> v_skips;
262     std::vector<bool> v_is_snps;
263     std::vector<int32_t> v_rids;
264     std::vector<int32_t> v_poss;
265     std::vector<int32_t> v_rlens;
266     std::vector< std::vector<std::string> > v_d_alleles;
267     std::vector< std::vector<int32_t> > v_filts;
268 
269         bcf1_t* v = bcf_init();
270     while( odr->read(v) ) {
271       // skip multi-allelics
272       bool skip = false;
273       bcf_unpack(v, BCF_UN_ALL);
274 
275       if ( v->n_allele > 2 ) skip = true;
276       else if ( ( !intervals.empty() ) && ( ( v->pos < intervals[0].start1 ) || ( v->pos > intervals[0].end1 ) ) ) skip = true;
277       else {
278         bool is_vntr = false;
279         for(size_t i=0; i < v->n_allele; ++i) {
280           if ( strcmp(v->d.allele[i],"<VNTR>") == 0 )
281         is_vntr = true;
282         }
283         if ( is_vntr ) skip = true;
284       }
285 
286       // determine whether to skip the marker or not
287       v_skips.push_back(skip);
288       if ( skip ) continue;
289 
290       // populate marker information
291       v_is_snps.push_back(bcf_is_snp(v));
292       v_rids.push_back(v->rid);
293       v_poss.push_back(v->pos);
294       v_rlens.push_back(v->rlen);
295       v_d_alleles.push_back(std::vector<std::string>());
296       for(size_t i=0; i < v->n_allele; ++i) {
297         v_d_alleles.back().push_back(v->d.allele[i]);
298       }
299       vn_alleles.push_back(v->n_allele);
300       vn_genos.push_back(v->n_allele * (v->n_allele+1)/2);
301 
302       v_filts.push_back(std::vector<int32_t>());
303       for(size_t i=0; i < v->d.n_flt; ++i) {
304         v_filts.back().push_back(v->d.flt[i]);
305       }
306 
307       // initialize genotype fields
308       v_pls.push_back( (int32_t*)calloc( nfiles * vn_genos.back(), sizeof(int32_t) ) );
309       v_ads.push_back( (int32_t*)calloc( nfiles * vn_alleles.back(), sizeof(int32_t) ) );
310       v_gts.push_back( (int32_t*)calloc( nfiles * 2, sizeof(int32_t) ) );
311       v_gqs.push_back( (int32_t*)calloc( nfiles, sizeof(int32_t) ) );
312       v_ods.push_back( (int32_t*)calloc( nfiles, sizeof(int32_t) ) );
313     }
314     odr->close();
315     delete odr;
316 
317     std::vector<float> v_bqr_nums(v_rids.size(), 0);
318     std::vector<float> v_bqr_dens(v_rids.size(), 0);
319     std::vector<float> v_mqr_nums(v_rids.size(), 0);
320     std::vector<float> v_mqr_dens(v_rids.size(), 0);
321     std::vector<float> v_cyr_nums(v_rids.size(), 0);
322     std::vector<float> v_cyr_dens(v_rids.size(), 0);
323     std::vector<float> v_str_nums(v_rids.size(), 0);
324     std::vector<float> v_str_dens(v_rids.size(), 0);
325     std::vector<float> v_nmr_nums(v_rids.size(), 0);
326     std::vector<float> v_nmr_dens(v_rids.size(), 0);
327     std::vector<float> v_ior_nums(v_rids.size(), 0);
328     std::vector<float> v_ior_dens(v_rids.size(), 0);
329     std::vector<float> v_nm0_nums(v_rids.size(), 0);
330     std::vector<float> v_nm0_dens(v_rids.size(), 0);
331     std::vector<float> v_nm1_nums(v_rids.size(), 0);
332     std::vector<float> v_nm1_dens(v_rids.size(), 0);
333     std::vector<float> v_ab_nums(v_rids.size(), 0);
334     std::vector<float> v_ab_dens(v_rids.size(), 0);
335     std::vector<float> v_abz_nums(v_rids.size(), 0);
336     std::vector<float> v_abz_dens(v_rids.size(), 0);
337     std::vector<int32_t> v_ns_nrefs(v_rids.size(), 0);
338     std::vector<int32_t> v_dp_sums(v_rids.size(), 0);
339     std::vector<int32_t> v_max_gqs(v_rids.size(), 0);
340 
341     bcf_clear(v);
342 
343     for(size_t i=0; i < nfiles; ++i) {
344       fprintf(stderr,"Reading input file %s..\n", input_vcf_files[i].c_str());
345       // set odr and v
346       odr = new BCFOrderedReader(input_vcf_files[i], intervals);
347       v = bcf_init();
348 
349       if ( bcf_hdr_nsamples(odr->hdr) != 1 ) {
350           fprintf(stderr, "[E:%s:%d %s] The genotype file must contain exactly one sample", __FILE__, __LINE__, __FUNCTION__);
351           exit(1);
352       }
353       if ( i > 0 )
354         bcf_hdr_add_sample(odw->hdr, bcf_hdr_get_sample_name(odr->hdr, 0));
355 
356       for( size_t j=0, k=0; j < v_skips.size(); ++j) {
357         if ( ! odr->read(v) ) {
358           fprintf(stderr, "[E:%s:%d %s] Cannot read variant from genotype files. j=%zu, k=%zu, pos[k]=%d", __FILE__, __LINE__, __FUNCTION__, j, k, v_poss[k]);
359           exit(1);
360         }
361         if ( v_skips[j] ) continue;
362         bcf_unpack(v, BCF_UN_ALL);
363 
364         // check marker infor with anchor files
365         if ( ( v->rid != v_rids[k] ) || ( v->pos != v_poss[k] ) || ( v->rlen != v_rlens[k] ) || ( v->n_allele != vn_alleles[k] ) ) {
366           fprintf(stderr, "[E:%s:%d %s] Variant position or ref alleles does not match\n", __FILE__, __LINE__, __FUNCTION__);
367           exit(1);
368         }
369 
370         for(size_t l=0; l < vn_alleles[k]; ++l) {
371           if ( v_d_alleles[k][l].compare(v->d.allele[l]) ) {
372           fprintf(stderr, "[E:%s:%d %s] Variant alleles does not match\n", __FILE__, __LINE__, __FUNCTION__);
373           exit(1);
374           }
375         }
376 
377         // extract genotype fields and calculate summary statistics
378         if ( bcf_get_format_int32(odr->hdr, v, "BQSUM", &p_bqsum, &np_bqsum) >= 0 ) { // BQSUM observed - REF-ONLY
379           if ( bcf_get_format_int32(odr->hdr, v, "DP", &p_dp, &np_dp) < 0 ) {
380         fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- BQSUM, DP\n", __FILE__, __LINE__, __FUNCTION__);
381         exit(1);
382           }
383           if ( ( np_bqsum != np_dp ) || ( np_dp != 1 ) ) {
384         fprintf(stderr, "[E:%s:%d %s] FORMAT field does not have the samme number of fields -- BQSUM, DP\n", __FILE__, __LINE__, __FUNCTION__);
385         exit(1);
386           }
387 
388           for(size_t l=0; l < vn_alleles[k]; ++l){
389         v_ads[k][ i*vn_alleles[k] + l] = (l == 0 ? p_dp[0] : 0);
390         v_ods[k][ i ] = 0;
391         for(size_t m=0; m <= l; ++m)  {
392           if ( m == 0 ) {
393             if ( l == 0 )
394               v_pls[k][vn_genos[k] * i + l * (l+1) / 2 + m] = 0;
395             else
396               v_pls[k][vn_genos[k] * i + l * (l+1) / 2 + m] = (int32_t)floor(6.931472 * p_dp[0] + 0.5);
397           }
398           else
399             v_pls[k][vn_genos[k] * i + l * (l+1) / 2 + m] = (int32_t)floor(p_bqsum[0] + 10.98612 * p_dp[0] + 0.5);
400           }
401           }
402           v_dp_sums[k] += p_dp[0];
403         }
404         else if ( bcf_get_genotypes(odr->hdr, v, &p_gt, &np_gt) >= 0 ) {  // GT unobserved -- non-REF
405           // extract PL, DP, BQ, MQ, CY, ST, AL, NM
406           if ( bcf_get_format_int32(odr->hdr, v, "PL", &p_pl, &np_pl) < 0 ) {
407         fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- PL\n", __FILE__, __LINE__, __FUNCTION__);
408         exit(1);
409           }
410           if ( bcf_get_format_int32(odr->hdr, v, "DP", &p_dp, &np_dp) < 0 ) {
411         fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- DP\n", __FILE__, __LINE__, __FUNCTION__);
412         exit(1);
413           }
414           if ( bcf_get_format_int32(odr->hdr, v, "BQ", &p_bq, &np_bq) < 0 ) {
415         fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- BQ\n", __FILE__, __LINE__, __FUNCTION__);
416         exit(1);
417           }
418           if ( bcf_get_format_int32(odr->hdr, v, "MQ", &p_mq, &np_mq) < 0 ) {
419         fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- MQ\n", __FILE__, __LINE__, __FUNCTION__);
420         exit(1);
421           }
422           if ( bcf_get_format_int32(odr->hdr, v, "CY", &p_cy, &np_cy) < 0 ) {
423         fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- GT, CY\n", __FILE__, __LINE__, __FUNCTION__);
424         exit(1);
425           }
426           if ( bcf_get_format_string(odr->hdr, v, "ST", &p_st, &np_st) < 0 ) {
427         fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- GT, ST\n", __FILE__, __LINE__, __FUNCTION__);
428         exit(1);
429           }
430           if ( bcf_get_format_int32(odr->hdr, v, "AL", &p_al, &np_al) < 0 ) {
431         fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- GT, AL\n", __FILE__, __LINE__, __FUNCTION__);
432         exit(1);
433           }
434           if ( bcf_get_format_int32(odr->hdr, v, "NM", &p_nm, &np_nm) < 0 ) {
435         fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- GT, NM\n", __FILE__, __LINE__, __FUNCTION__);
436         exit(1);
437           }
438 
439           // sanity checking
440           if ( np_pl != vn_genos[k] ) {
441         fprintf(stderr, "[E:%s:%d %s] np_pl (%d) != n_genos (%d)\n", __FILE__, __LINE__, __FUNCTION__, np_pl, vn_genos[k]);
442         exit(1);
443           }
444 
445           if ( ( np_dp != 1 ) || ( np_gt != 2 ) ) {
446         fprintf(stderr, "[E:%s:%d %s] Assertion failed in np_dp (%d) == np_gt (%d),  np_st (%d) == np_al (%d)\n", __FILE__, __LINE__, __FUNCTION__, np_dp, np_gt, np_st, np_al);
447         exit(1);
448           }
449           if ( p_dp[0] != strlen(p_st[0]) ) {
450         fprintf(stderr, "[E:%s:%d %s] Assetion failed - p_dp[0] == strlen(p_st[0])\n", __FILE__, __LINE__, __FUNCTION__);
451         exit(1);
452           }
453 
454           // currently assume everything as diploid
455           int32_t a1 = bcf_gt_allele(p_gt[0]);
456           int32_t a2 = bcf_gt_allele(p_gt[1]);
457           int32_t gt = bcf_alleles2gt(a1, a2);
458           for(size_t l=0; l < vn_genos[k]; ++l) {
459         v_pls[k][vn_genos[k] * i + l] = p_pl[l];
460           }
461 
462           v_dp_sums[k] += p_dp[0];
463 
464           int32_t bq_s1 = 0;
465           int32_t bq_s2 = 0;
466           int32_t mq_s1 = 0;
467           int32_t mq_s2 = 0;
468           float   cy_s1 = 0;
469           float   cy_s2 = 0;
470           int32_t st_s1 = 0;
471           int32_t al_s1 = 0;
472           int32_t nm_s1 = 0;
473           int32_t nm_s2 = 0;
474           int32_t bq_al = 0;
475           int32_t mq_al = 0;
476           float   cy_al = 0;
477           int32_t st_al = 0;
478           int32_t nm_al = 0;
479           int32_t dp_ra = 0;
480           int32_t dp_q20 = 0;
481           double  oth_exp_q20 = 0;
482           double  oth_obs_q20 = 0;
483 
484           int32_t* ads_z = &v_ads[k][ i*vn_alleles[k] ];
485 
486           ++v_ns_nrefs[k];
487 
488           for(size_t l=0; l < p_dp[0]; ++l) {
489         if ( p_bq[l] > maxBQ ) p_bq[l] = maxBQ;
490 
491         if ( p_al[l] >= 0 ) {
492           if ( p_bq[l] > 20 ) {
493             oth_exp_q20 += ( LogTool::pl2prob(p_bq[l]) * 2.0 / 3.0 );
494             ++dp_q20;
495           }
496 
497           // calculate cycle-based tail distance
498           float log_td = 0-logf((float)abs(v_is_snps[k] ? p_cy[l] : (int)(rand() % 100))+1.); // temporarily ignore cycles
499 
500           ++dp_ra;
501           bq_s1 += p_bq[l];
502           bq_s2 += (p_bq[l] * p_bq[l]);
503           mq_s1 += p_mq[l];
504           mq_s2 += (p_mq[l] * p_mq[l]);
505           cy_s1 += log_td;
506           cy_s2 += (log_td * log_td);
507           st_s1 += ((p_st[0][l] == 'F') ? 0 : 1);
508           if ( p_al[l] > 0 ) {
509             ++al_s1;
510             bq_al += p_bq[l];
511             mq_al += p_mq[l];
512             cy_al += log_td;
513             st_al += ((p_st[0][l] == 'F') ? 0 : 1);
514             nm_al += (p_nm[l]-1);
515             nm_s1 += (p_nm[l]-1);
516             nm_s2 += ((p_nm[l]-1)*(p_nm[l]-1));
517             ++ads_z[1];
518           }
519           else {
520             nm_s1 += p_nm[l];
521             nm_s2 += ( p_nm[l] * p_nm[l] );
522             ++ads_z[0];
523           }
524         }
525         else {
526           if ( p_bq[l] > 20 ) {
527             oth_exp_q20 += ( LogTool::pl2prob(p_bq[l]) * 2.0 / 3.0 );
528             ++oth_obs_q20;
529             ++dp_q20;
530           }
531           ++v_ods[k][i];
532         }
533           }
534 
535           float sqrt_dp_ra = sqrt((float)dp_ra);
536           float ior = (float)(oth_obs_q20 / (oth_exp_q20 + 1e-6));
537           float nm1 = al_s1 == 0 ? 0 : nm_al / (float)al_s1;
538           float nm0 = (dp_ra-al_s1) == 0 ? 0 : (nm_s1-nm_al) / (float)(dp_ra-al_s1);
539           float w_dp_ra  = log(dp_ra+1.); //sqrt(dp_ra);
540           float w_dp_q20 = log(dp_q20+1.); //sqrt(dp_q20);
541           float w_al_s1  = log(al_s1+1.); //sqrt(al_s1);
542           float w_ref_s1 = log(dp_ra-al_s1+1.);
543 
544           if ( gt == 1 ) { // het genotypes
545         v_ab_nums[k] += (w_dp_ra * (dp_ra - al_s1 + 0.05) / (double)(dp_ra + 0.1));
546         v_ab_dens[k] += w_dp_ra;
547 
548         // E(r) = 0.5(r+a) V(r) = 0.25(r+a)
549         v_abz_nums[k] += w_dp_ra * (dp_ra - al_s1 - dp_ra*0.5)/sqrt(0.25 * dp_ra + 1e-3);
550         v_abz_dens[k] += (w_dp_ra * w_dp_ra);
551 
552         float bqr = sqrt_dp_ra * compute_correlation( dp_ra, bq_al, bq_s1, bq_s2, al_s1, al_s1, .1 );
553         float mqr = sqrt_dp_ra * compute_correlation( dp_ra, mq_al, mq_s1, mq_s2, al_s1, al_s1, .1 );
554         float cyr = sqrt_dp_ra * compute_correlation_f( dp_ra, cy_al, cy_s1, cy_s2, (float)al_s1, (float)al_s1, .1 );
555         float str = sqrt_dp_ra * compute_correlation( dp_ra, st_al, st_s1, st_s1, al_s1, al_s1, .1 );
556         float nmr = sqrt_dp_ra * compute_correlation( dp_ra, nm_al, nm_s1, nm_s2, al_s1, al_s1, .1 );
557 
558         // Use Stouffer's method to combine the z-scores, but weighted by log of sample size
559         v_bqr_nums[k] += (bqr * w_dp_ra); v_bqr_dens[k] += (w_dp_ra * w_dp_ra);
560         v_mqr_nums[k] += (mqr * w_dp_ra); v_mqr_dens[k] += (w_dp_ra * w_dp_ra);
561         v_cyr_nums[k] += (cyr * w_dp_ra); v_cyr_dens[k] += (w_dp_ra * w_dp_ra);
562         v_str_nums[k] += (str * w_dp_ra); v_str_dens[k] += (w_dp_ra * w_dp_ra);
563         v_nmr_nums[k] += (nmr * w_dp_ra); v_nmr_dens[k] += (w_dp_ra * w_dp_ra);
564           }
565 
566           v_ior_nums[k] += (ior * w_dp_q20); v_ior_dens[k] += w_dp_q20;
567           v_nm1_nums[k] += (nm1 * w_al_s1);  v_nm1_dens[k] += w_al_s1;
568           v_nm0_nums[k] += (nm0 * w_ref_s1); v_nm0_dens[k] += w_ref_s1;
569         }
570 
571         ++k;
572       }
573       odr->close();
574       delete odr;
575     }
576         bcf_hdr_add_sample(odw->hdr, NULL);
577 
578         odw->write_hdr();
579 
580     bcf1_t* nv = bcf_init();
581 
582     for(size_t k=0; k < v_rids.size(); ++k) {
583       bcf_clear(nv);
584       nv->rid = v_rids[k];
585       nv->pos = v_poss[k];
586       nv->rlen = v_rlens[k];
587       nv->n_sample = nfiles;
588 
589       const char* tmp_d_alleles[vn_alleles[k]];
590       for(int l=0; l < vn_alleles[k]; ++l)
591         tmp_d_alleles[l] = v_d_alleles[k][l].c_str();
592       bcf_update_alleles(odw->hdr, nv, tmp_d_alleles, vn_alleles[k]);
593 
594       if ( !v_filts[k].empty() ) {
595         int tmp_filts[v_filts[k].size()];
596         for(size_t l=0; l < v_filts[k].size(); ++l) {
597           tmp_filts[l] = v_filts[k][l];
598         }
599         bcf_update_filter(odw->hdr, nv, tmp_filts, (int32_t)v_filts[k].size());
600       }
601 
602       bcf_unpack(nv, BCF_UN_ALL);
603 
604       // calculate the allele frequencies under HWE
605       float MLE_HWE_AF[vn_alleles[k]];
606       float MLE_HWE_GF[vn_genos[k]];
607       int32_t ploidy = 2; // temporarily constant
608       int32_t n = 0;
609       Estimator::compute_gl_af_hwe(v_pls[k], nfiles, ploidy, vn_alleles[k], MLE_HWE_AF, MLE_HWE_GF,  n, 1e-20);
610 
611       // calculate the genotypes (diploid only)
612       double gp, gp_sum, max_gp;
613       int32_t best_gt;
614       int32_t best_a1, best_a2;
615       int32_t* pls_i;
616       int32_t an = 0;
617       int32_t acs[vn_alleles[k]];
618       int32_t gcs[vn_genos[k]];
619       float afs[vn_alleles[k]];
620 
621       memset(acs, 0, sizeof(int32_t)*vn_alleles[k]);
622       memset(gcs, 0, sizeof(int32_t)*vn_genos[k]);
623 
624       for(size_t i=0; i < nfiles; ++i) {
625         pls_i = &v_pls[k][ i * vn_genos[k] ];
626         max_gp = gp_sum = gp = ( LogTool::pl2prob(pls_i[0]) * MLE_HWE_AF[0] * MLE_HWE_AF[0] );
627         best_gt = 0; best_a1 = 0; best_a2 = 0;
628         for(size_t l=1; l < vn_alleles[k]; ++l) {
629           for(size_t m=0; m <= l; ++m) {
630         gp = ( LogTool::pl2prob(pls_i[ l*(l+1)/2 + m]) * MLE_HWE_AF[l] * MLE_HWE_AF[m] * (l == m ? 1 : 2) );
631         gp_sum += gp;
632         if ( max_gp < gp ) {
633           max_gp = gp;
634           best_gt = l*(l+1)/2 + m; best_a1 = m; best_a2 = l;
635         }
636           }
637         }
638 
639         double prob = 1.-max_gp/gp_sum;
640         if ( prob <= 3.162278e-26 )
641           prob = 3.162278e-26;
642         if ( prob > 1 )
643           prob = 1;
644 
645         v_gqs[k][i] = (int32_t)LogTool::prob2pl(prob);
646 
647         if ( ( best_gt > 0 ) && ( v_max_gqs[k] < v_gqs[k][i] ) )
648           v_max_gqs[k] = v_gqs[k][i];
649 
650         v_gts[k][2*i]   = ((best_a1 + 1) << 1);
651         v_gts[k][2*i+1] = ((best_a2 + 1) << 1);
652         an += 2;
653         ++acs[best_a1];
654         ++acs[best_a2];
655         ++gcs[best_gt];
656       }
657 
658       for(size_t i=0; i < vn_alleles[k]; ++i) {
659         afs[i] = acs[i]/(float)an;
660       }
661 
662       bcf_update_format_int32(odw->hdr, nv, "GT", v_gts[k], nfiles * 2);
663       bcf_update_format_int32(odw->hdr, nv, "GQ", v_gqs[k], nfiles );
664       bcf_update_format_int32(odw->hdr, nv, "AD", v_ads[k], nfiles * vn_alleles[k]);
665       bcf_update_format_int32(odw->hdr, nv, "OD", v_ods[k], nfiles );
666       bcf_update_format_int32(odw->hdr, nv, "PL", v_pls[k], nfiles * vn_genos[k]);
667 
668       float avgdp = (float)v_dp_sums[k]/(float)nfiles;
669 
670       nv->qual = (float) v_max_gqs[k];
671       bcf_update_info_float(odw->hdr, nv, "AVGDP", &avgdp, 1);
672       bcf_update_info_int32(odw->hdr, nv, "AC", &acs[1], vn_alleles[k]-1);
673       bcf_update_info_int32(odw->hdr, nv, "AN", &an, 1);
674       bcf_update_info_float(odw->hdr, nv, "AF", &afs[1], vn_alleles[k]-1);
675       bcf_update_info_int32(odw->hdr, nv, "GC", gcs, vn_genos[k]);
676       bcf_update_info_int32(odw->hdr, nv, "GN", &nfiles, 1);
677 
678       if (n) {
679         float* MLE_HWE_AF_PTR = &MLE_HWE_AF[1];
680         bcf_update_info_float(odw->hdr, nv, "HWEAF", MLE_HWE_AF_PTR, vn_alleles[k]-1);
681         //bcf_update_info_float(odw->hdr, nv, "HWEGF", &MLE_HWE_GF, n_genos);
682       }
683 
684       // calculate the allele frequencies under HWD
685       float MLE_AF[vn_alleles[k]];
686       float MLE_GF[vn_genos[k]];
687       n = 0;
688       Estimator::compute_gl_af(v_pls[k], nfiles, ploidy, vn_alleles[k], MLE_AF, MLE_GF,  n, 1e-20);
689       if (n) {
690         float* MLE_AF_PTR = &MLE_AF[1];
691         //bcf_update_info_float(odw->hdr, nv, "HWDAF", MLE_AF_PTR, n_alleles-1);
692         bcf_update_info_float(odw->hdr, nv, "HWDGF", &MLE_GF, vn_genos[k]);
693       }
694 
695       float fic = 0;
696       n = 0;
697       Estimator::compute_gl_fic(v_pls[k], nfiles, ploidy, MLE_HWE_AF, vn_alleles[k], MLE_GF, fic, n);
698       if ( std::isnan((double)fic) ) fic = 0;
699       if (n) {
700         bcf_update_info_float(odw->hdr, nv, "IBC", &fic, 1);
701       }
702 
703       // calculate the LRT statistics related to HWE
704       float lrts;
705       float logp;
706       int32_t df;
707       n = 0;
708       Estimator::compute_hwe_lrt(v_pls[k], nfiles, ploidy, vn_alleles[k], MLE_HWE_GF, MLE_GF, n, lrts, logp, df);
709       if (n) {
710         if ( fic > 0 ) logp = 0-logp;
711         bcf_update_info_float(odw->hdr, nv, "HWE_SLP", &logp, 1);
712       }
713 
714       // add additional annotations
715       v_ns_nrefs[k] -= (nfiles - gcs[0]);
716       bcf_update_info_int32(odw->hdr, nv, "NS_NREF", &v_ns_nrefs[k], 1);
717       v_ab_nums[k] /= (v_ab_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "ABE",  &v_ab_nums[k], 1);
718       v_abz_nums[k] /= sqrt(v_abz_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "ABZ",  &v_abz_nums[k], 1);
719       v_bqr_nums[k] /= sqrt(v_bqr_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "BQZ", &v_bqr_nums[k], 1);
720       v_mqr_nums[k] /= sqrt(v_mqr_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "MQZ", &v_mqr_nums[k], 1);
721       v_cyr_nums[k] /= sqrt(v_cyr_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "CYZ", &v_cyr_nums[k], 1);
722       v_str_nums[k] /= sqrt(v_str_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "STZ", &v_str_nums[k], 1);
723       v_nmr_nums[k] /= sqrt(v_nmr_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "NMZ", &v_nmr_nums[k], 1);
724       v_ior_nums[k] = log(v_ior_nums[k]/v_ior_dens[k]+1e-6)/log(10.); bcf_update_info_float(odw->hdr, nv, "IOR", &v_ior_nums[k], 1);
725       v_nm1_nums[k] /= (v_nm1_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "NM1", &v_nm1_nums[k], 1);
726       v_nm0_nums[k] /= (v_nm0_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "NM0", &v_nm0_nums[k], 1);
727 
728       //fprintf(stderr,"AC = %f, AN = %f, NS_NREF = %f, AB = %f, BQR = %f, MQR = %f, CYR = %f, STR = %f, NMR = %f, IOR = %f, NMA = %f\n", acs[1], an, ns_nref, ab_num, bqr_num, mqr_num, cyr_num, str_num, nmr_num, ior_num, nma_num);
729 
730       odw->write(nv);
731     }
732 
733         odw->close();
734     };
735 
print_options()736     void print_options()
737     {
738         if (!print) return;
739 
740         std::clog << "paste_and_comput_features v" << version << "\n\n";
741         print_ifiles("options:     input VCF file        ", input_vcf_files);
742         std::clog << "         [o] output VCF file       " << output_vcf_file << "\n";
743         std::clog << "\n";
744     }
745 
print_stats()746     void print_stats()
747     {
748         if (!print) return;
749 
750         std::clog << "\n";
751         std::cerr << "stats: Total number of files pasted  " << input_vcf_files.size() << "\n";
752         std::clog << "\n";
753     };
754 
~Igor()755     ~Igor() {};
756 
757     private:
758 };
759 
760 }
761 
paste_and_compute_features_sequential(int argc,char ** argv)762 bool paste_and_compute_features_sequential(int argc, char ** argv)
763 {
764     Igor igor(argc, argv);
765     igor.print_options();
766     igor.initialize();
767     igor.paste_and_compute_features_sequential();
768     igor.print_stats();
769     return igor.print;
770 }
771 
772