1 /* The MIT License
2
3 Copyright (c) 2015 Hyun Min Kang and Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "paste_and_compute_features_sequential.h"
25
26 namespace
27 {
28
29 class Igor : Program
30 {
31 public:
32
33 std::string version;
34
35 ///////////
36 //options//
37 ///////////
38 std::vector<std::string> input_vcf_files;
39 std::string input_vcf_file_list;
40 std::string output_vcf_file;
41 bool print;
42 std::vector<GenomeInterval> intervals;
43 int32_t maxBQ;
44
45 ///////
46 //i/o//
47 ///////
48 BCFOrderedReader *odr;
49 BCFOrderedWriter *odw;
50
51 ///////////////
52 //general use//
53 ///////////////
54
55 /////////
56 //stats//
57 /////////
58
59 /////////
60 //tools//
61 /////////
62
63 // The function assumes a particular list of features in the BCF files
64 //
65
Igor(int argc,char ** argv)66 Igor(int argc, char ** argv)
67 {
68 //////////////////////////
69 //options initialization//
70 //////////////////////////
71 try
72 {
73 std::string desc = "Pastes VCF files like the unix paste functions.\n"
74 " This is used after the per sample genotyping step in vt.\n"
75 " Input requirements and assumptions:\n"
76 " 1. Same variants are represented in the same order for each file (required)\n"
77 " 2. Genotype fields are the same for corresponding records (required)\n"
78 " 3. Sample names are different in all the files (warning will be given if not)\n"
79 " 4. Headers (not including the samples) are the same for all the files (unchecked assumption, will fail if output is BCF)\n"
80 " Outputs:\n"
81 " 1. INFO fields output will be that of the first file\n"
82 " 2. Genotype fields are the same for corresponding records\n";
83
84
85 version = "0.1";
86 TCLAP::CmdLine cmd(desc, ' ', version);
87 VTOutput my; cmd.setOutput(&my);
88 TCLAP::SwitchArg arg_print("p", "p", "print options and summary []", cmd, false);
89 TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF file [-]", false, "-", "", cmd);
90 TCLAP::ValueArg<std::string> arg_input_vcf_file_list("L", "L", "file containing list of input VCF files", false, "", "str", cmd);
91 TCLAP::UnlabeledMultiArg<std::string> arg_input_vcf_files("<in1.vcf>...", "Multiple VCF files",false, "files", cmd);
92 TCLAP::ValueArg<std::string> arg_intervals("i","i","Intervals[]", false, "", "str", cmd);
93 TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "file", cmd);
94 TCLAP::ValueArg<int32_t> arg_max_bq("q", "q", "Maximum base quality to cap []", false, 30, "int", cmd);
95
96 cmd.parse(argc, argv);
97
98 input_vcf_file_list = arg_input_vcf_file_list.getValue();
99 output_vcf_file = arg_output_vcf_file.getValue();
100 parse_files(input_vcf_files, arg_input_vcf_files.getValue(), arg_input_vcf_file_list.getValue());
101 const std::vector<std::string>& v = arg_input_vcf_files.getValue();
102 print = arg_print.getValue();
103 maxBQ = arg_max_bq.getValue();
104 parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
105
106 if (input_vcf_files.size()==0)
107 {
108 fprintf(stderr, "[E:%s:%d %s] no input vcf files.\n", __FILE__, __LINE__, __FUNCTION__);
109 exit(1);
110 }
111
112 }
113 catch (TCLAP::ArgException &e)
114 {
115 std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
116 abort();
117 }
118 };
119
initialize()120 void initialize()
121 {
122 //////////////////////
123 //i/o initialization//
124 //////////////////////
125 // read only the first file
126 odr = new BCFOrderedReader(input_vcf_files[0], intervals);
127 if ( intervals.size() > 1 )
128 {
129 fprintf(stderr, "[E:%s:%d %s] Multiple intervals are not allowed\n", __FILE__, __LINE__, __FUNCTION__);
130 exit(1);
131 }
132
133 //for (size_t i=0; i<input_vcf_files.size(); ++i)
134 //{
135 // odrs.push_back(new BCFOrderedReader(input_vcf_files[i], intervals));
136 //}
137 odw = new BCFOrderedWriter(output_vcf_file, 0);
138
139 odw->set_hdr(odr->hdr);
140
141 bcf_hdr_append(odw->hdr, "##INFO=<ID=AVGDP,Number=1,Type=Float,Description=\"Average Depth per Sample\">\n");
142 bcf_hdr_append(odw->hdr, "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate Allele Counts\">\n");
143 bcf_hdr_append(odw->hdr, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total Number Allele Counts\">\n");
144 //bcf_hdr_append(odw->hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Reads\">\n");
145 bcf_hdr_append(odw->hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Alternate Allele Frequency from Best-guess Genotypes\">\n");
146 bcf_hdr_append(odw->hdr, "##INFO=<ID=GC,Number=G,Type=Integer,Description=\"Genotype Counts\">\n");
147 bcf_hdr_append(odw->hdr, "##INFO=<ID=GN,Number=1,Type=Integer,Description=\"Total Number of Genotypes\">\n");
148 bcf_hdr_append(odw->hdr, "##INFO=<ID=HWEAF,Number=A,Type=Float,Description=\"Genotype likelihood based Allele Frequency assuming HWE\">\n");
149 bcf_hdr_append(odw->hdr, "##INFO=<ID=HWDGF,Number=G,Type=Float,Description=\"Genotype likelihood based Genotype Frequency ignoring HWE\">\n");
150 bcf_hdr_append(odw->hdr, "##INFO=<ID=IBC,Number=1,Type=Float,Description=\"Inbreeding Coefficients calculated from genotype likelihoods\">\n");
151 bcf_hdr_append(odw->hdr, "##INFO=<ID=HWE_SLP,Number=1,Type=Float,Description=\"Signed log p-values testing statistics based Hardy Weinberg ln(Likelihood Ratio)\">\n");
152 bcf_hdr_append(odw->hdr, "##INFO=<ID=ABE,Number=1,Type=Float,Description=\"Expected allele Balance towards Reference Allele on Heterozygous Sites\">\n");
153 bcf_hdr_append(odw->hdr, "##INFO=<ID=ABZ,Number=1,Type=Float,Description=\"Average Z-scores of Allele Balance towards Reference Allele on Heterozygous Sites\">\n");
154 bcf_hdr_append(odw->hdr, "##INFO=<ID=NS_NREF,Number=1,Type=Integer,Description=\"Number of samples with non-reference reads\">\n");
155 bcf_hdr_append(odw->hdr, "##INFO=<ID=BQZ,Number=1,Type=Float,Description=\"Correlation between base quality and alleles\">\n");
156 //bcf_hdr_append(odw->hdr, "##INFO=<ID=MQZ,Number=1,Type=Float,Description=\"Correlation between mapping quality and alleles\">\n");
157 bcf_hdr_append(odw->hdr, "##INFO=<ID=CYZ,Number=1,Type=Float,Description=\"Correlation between cycle and alleles\">\n");
158 bcf_hdr_append(odw->hdr, "##INFO=<ID=STZ,Number=1,Type=Float,Description=\"Correlation between strand and alleles\">\n");
159 bcf_hdr_append(odw->hdr, "##INFO=<ID=NMZ,Number=1,Type=Float,Description=\"Correlation between mismatch counts per read and alleles\">\n");
160 bcf_hdr_append(odw->hdr, "##INFO=<ID=IOR,Number=1,Type=Float,Description=\"Inflated rate of observing of other alleles in log10 scale\">\n");
161 bcf_hdr_append(odw->hdr, "##INFO=<ID=NM0,Number=1,Type=Float,Description=\"Average number of mismatches in the reads with ref alleles\">\n");
162 bcf_hdr_append(odw->hdr, "##INFO=<ID=NM1,Number=1,Type=Float,Description=\"Average number of mismatches in the reads with non-ref alleles\">\n");
163 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
164 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">\n");
165 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=AD,Number=A,Type=Integer,Description=\"Allele Depth\">\n");
166 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=OD,Number=1,Type=Integer,Description=\"Other Allele Depth\">\n");
167 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"Phred-scale Genotype Likelihoods\">\n");
168
169 ///////////////
170 //general use//
171 ///////////////
172
173 ////////////////////////
174 //stats initialization//
175 ////////////////////////
176
177 /////////
178 //tools//
179 /////////
180 }
181
182
compute_correlation(int32_t n,int32_t xy,int32_t x1,int32_t x2,int32_t y1,int32_t y2,float buffer)183 float compute_correlation(int32_t n, int32_t xy, int32_t x1, int32_t x2, int32_t y1, int32_t y2, float buffer) {
184 if ( n == 0 ) return 0;
185 float xsd = x2/(float)n - (x1/(float)n)*(x1/(float)n);
186 float ysd = y2/(float)n - (y1/(float)n)*(y1/(float)n);
187 return ( ( xy/(float)n - x1 * y1 / (float) n / (float) n ) / sqrt( xsd * ysd + buffer ) );
188 }
189
compute_correlation_f(int32_t n,float xy,float x1,float x2,float y1,float y2,float buffer)190 float compute_correlation_f(int32_t n, float xy, float x1, float x2, float y1, float y2, float buffer) {
191 if ( n == 0 ) return 0;
192 float xsd = x2/(float)n - (x1/(float)n)*(x1/(float)n);
193 float ysd = y2/(float)n - (y1/(float)n)*(y1/(float)n);
194 return ( ( xy/(float)n - x1 * y1 / (float) n / (float) n ) / sqrt( xsd * ysd + buffer ) );
195 }
196
paste_and_compute_features_sequential()197 void paste_and_compute_features_sequential()
198 {
199 // assume that the following features are available
200 // 1. BQSUM, DPF, DPR
201 // 2. GT, PL, ADF, ADR, DP, CY, ST, AL, NM
202 //
203 // calculate the following per-variant statistics
204 // DP - total depth -- \sum_i (DPF+DPR) + \sum_i DP
205 // MLEAF - allele frequency estimated by best guess genotypes
206 // HWEAF - allele frequency estimated by genotype likelihood under HWE
207 // HWDAF - allele frequency estimated by genotype likelihood under non-HWD
208 // SLRT - signed LRT test statistics for departure from HWE
209 // IBC - [Pr(Het|HWD)-Pr(Het|HWE)]/Pr(Het|HWE)
210 // AB - Allele balance -- \sum_i \sqrt{#A + #R} (#A)/(#A+#R) | GT = HET
211 // STR - Strand bias -- \sum_i \sqrt{#A + #R} cor(FR,FA,RR,RA)
212 // TBR - Tail distance bias -- \sum_i \sqrt{#A + #R} cor(TD,AL)
213 // MQR - Mapping quality bias -- \sum_i \sqrt{#A + #R} cor(MQ,AL)
214 // BQR - Base quality bias -- \sum_i \sqrt{#A + #R} cor(BQ,AL)
215 // NMR - Number of mismatch bias -- \sum \sqrt{#A + #R} cor(NM,AL - A?) | BQ > 20
216 // IOR - Inflated fraction of "other" biases compared to base quality -- \sum_i \sqrt{#A + #R} {Pr(O) - E[O]} | BQ > 20
217 //
218 // calculate the following per-sample statistics
219 // GT - best guess genotypes
220 // GQ - genotype quality
221 // AD - allele depth
222 // PL - likelihoods
223 //
224 // Assume that either
225 // BQSUM : DPF : DPR or
226 // GT, PL, DP, CY, ST, NM exists
227
228 // store everything into memory
229 std::vector<int32_t> vn_alleles;
230 std::vector<int32_t> vn_genos;
231 std::vector<int32_t*> v_pls;
232 std::vector<int32_t*> v_gts;
233 std::vector<int32_t*> v_gqs;
234 std::vector<int32_t*> v_ads;
235 std::vector<int32_t*> v_ods;
236
237 // check the type from FORMAT, first field must be either BQSUM or GT
238 int32_t* p_bqsum = NULL;
239 int32_t np_bqsum = 0;
240 int32_t* p_dp = NULL;
241 int32_t np_dp = 0;
242 int32_t* p_gt = NULL;
243 int32_t np_gt = 0;
244 int32_t* p_pl = NULL;
245 int32_t np_pl = 0;
246 int32_t* p_bq = NULL;
247 int32_t np_bq = 0;
248 int32_t* p_mq = NULL;
249 int32_t np_mq = 0;
250 int32_t* p_cy = NULL;
251 int32_t np_cy = 0;
252 char** p_st = NULL;
253 int32_t np_st = 0;
254 int32_t* p_al = NULL;
255 int32_t np_al = 0;
256 int32_t* p_nm = NULL;
257 int32_t np_nm = 0;
258
259 // read the first genotype file to populate
260 int32_t nfiles = input_vcf_files.size();
261 std::vector<bool> v_skips;
262 std::vector<bool> v_is_snps;
263 std::vector<int32_t> v_rids;
264 std::vector<int32_t> v_poss;
265 std::vector<int32_t> v_rlens;
266 std::vector< std::vector<std::string> > v_d_alleles;
267 std::vector< std::vector<int32_t> > v_filts;
268
269 bcf1_t* v = bcf_init();
270 while( odr->read(v) ) {
271 // skip multi-allelics
272 bool skip = false;
273 bcf_unpack(v, BCF_UN_ALL);
274
275 if ( v->n_allele > 2 ) skip = true;
276 else if ( ( !intervals.empty() ) && ( ( v->pos < intervals[0].start1 ) || ( v->pos > intervals[0].end1 ) ) ) skip = true;
277 else {
278 bool is_vntr = false;
279 for(size_t i=0; i < v->n_allele; ++i) {
280 if ( strcmp(v->d.allele[i],"<VNTR>") == 0 )
281 is_vntr = true;
282 }
283 if ( is_vntr ) skip = true;
284 }
285
286 // determine whether to skip the marker or not
287 v_skips.push_back(skip);
288 if ( skip ) continue;
289
290 // populate marker information
291 v_is_snps.push_back(bcf_is_snp(v));
292 v_rids.push_back(v->rid);
293 v_poss.push_back(v->pos);
294 v_rlens.push_back(v->rlen);
295 v_d_alleles.push_back(std::vector<std::string>());
296 for(size_t i=0; i < v->n_allele; ++i) {
297 v_d_alleles.back().push_back(v->d.allele[i]);
298 }
299 vn_alleles.push_back(v->n_allele);
300 vn_genos.push_back(v->n_allele * (v->n_allele+1)/2);
301
302 v_filts.push_back(std::vector<int32_t>());
303 for(size_t i=0; i < v->d.n_flt; ++i) {
304 v_filts.back().push_back(v->d.flt[i]);
305 }
306
307 // initialize genotype fields
308 v_pls.push_back( (int32_t*)calloc( nfiles * vn_genos.back(), sizeof(int32_t) ) );
309 v_ads.push_back( (int32_t*)calloc( nfiles * vn_alleles.back(), sizeof(int32_t) ) );
310 v_gts.push_back( (int32_t*)calloc( nfiles * 2, sizeof(int32_t) ) );
311 v_gqs.push_back( (int32_t*)calloc( nfiles, sizeof(int32_t) ) );
312 v_ods.push_back( (int32_t*)calloc( nfiles, sizeof(int32_t) ) );
313 }
314 odr->close();
315 delete odr;
316
317 std::vector<float> v_bqr_nums(v_rids.size(), 0);
318 std::vector<float> v_bqr_dens(v_rids.size(), 0);
319 std::vector<float> v_mqr_nums(v_rids.size(), 0);
320 std::vector<float> v_mqr_dens(v_rids.size(), 0);
321 std::vector<float> v_cyr_nums(v_rids.size(), 0);
322 std::vector<float> v_cyr_dens(v_rids.size(), 0);
323 std::vector<float> v_str_nums(v_rids.size(), 0);
324 std::vector<float> v_str_dens(v_rids.size(), 0);
325 std::vector<float> v_nmr_nums(v_rids.size(), 0);
326 std::vector<float> v_nmr_dens(v_rids.size(), 0);
327 std::vector<float> v_ior_nums(v_rids.size(), 0);
328 std::vector<float> v_ior_dens(v_rids.size(), 0);
329 std::vector<float> v_nm0_nums(v_rids.size(), 0);
330 std::vector<float> v_nm0_dens(v_rids.size(), 0);
331 std::vector<float> v_nm1_nums(v_rids.size(), 0);
332 std::vector<float> v_nm1_dens(v_rids.size(), 0);
333 std::vector<float> v_ab_nums(v_rids.size(), 0);
334 std::vector<float> v_ab_dens(v_rids.size(), 0);
335 std::vector<float> v_abz_nums(v_rids.size(), 0);
336 std::vector<float> v_abz_dens(v_rids.size(), 0);
337 std::vector<int32_t> v_ns_nrefs(v_rids.size(), 0);
338 std::vector<int32_t> v_dp_sums(v_rids.size(), 0);
339 std::vector<int32_t> v_max_gqs(v_rids.size(), 0);
340
341 bcf_clear(v);
342
343 for(size_t i=0; i < nfiles; ++i) {
344 fprintf(stderr,"Reading input file %s..\n", input_vcf_files[i].c_str());
345 // set odr and v
346 odr = new BCFOrderedReader(input_vcf_files[i], intervals);
347 v = bcf_init();
348
349 if ( bcf_hdr_nsamples(odr->hdr) != 1 ) {
350 fprintf(stderr, "[E:%s:%d %s] The genotype file must contain exactly one sample", __FILE__, __LINE__, __FUNCTION__);
351 exit(1);
352 }
353 if ( i > 0 )
354 bcf_hdr_add_sample(odw->hdr, bcf_hdr_get_sample_name(odr->hdr, 0));
355
356 for( size_t j=0, k=0; j < v_skips.size(); ++j) {
357 if ( ! odr->read(v) ) {
358 fprintf(stderr, "[E:%s:%d %s] Cannot read variant from genotype files. j=%zu, k=%zu, pos[k]=%d", __FILE__, __LINE__, __FUNCTION__, j, k, v_poss[k]);
359 exit(1);
360 }
361 if ( v_skips[j] ) continue;
362 bcf_unpack(v, BCF_UN_ALL);
363
364 // check marker infor with anchor files
365 if ( ( v->rid != v_rids[k] ) || ( v->pos != v_poss[k] ) || ( v->rlen != v_rlens[k] ) || ( v->n_allele != vn_alleles[k] ) ) {
366 fprintf(stderr, "[E:%s:%d %s] Variant position or ref alleles does not match\n", __FILE__, __LINE__, __FUNCTION__);
367 exit(1);
368 }
369
370 for(size_t l=0; l < vn_alleles[k]; ++l) {
371 if ( v_d_alleles[k][l].compare(v->d.allele[l]) ) {
372 fprintf(stderr, "[E:%s:%d %s] Variant alleles does not match\n", __FILE__, __LINE__, __FUNCTION__);
373 exit(1);
374 }
375 }
376
377 // extract genotype fields and calculate summary statistics
378 if ( bcf_get_format_int32(odr->hdr, v, "BQSUM", &p_bqsum, &np_bqsum) >= 0 ) { // BQSUM observed - REF-ONLY
379 if ( bcf_get_format_int32(odr->hdr, v, "DP", &p_dp, &np_dp) < 0 ) {
380 fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- BQSUM, DP\n", __FILE__, __LINE__, __FUNCTION__);
381 exit(1);
382 }
383 if ( ( np_bqsum != np_dp ) || ( np_dp != 1 ) ) {
384 fprintf(stderr, "[E:%s:%d %s] FORMAT field does not have the samme number of fields -- BQSUM, DP\n", __FILE__, __LINE__, __FUNCTION__);
385 exit(1);
386 }
387
388 for(size_t l=0; l < vn_alleles[k]; ++l){
389 v_ads[k][ i*vn_alleles[k] + l] = (l == 0 ? p_dp[0] : 0);
390 v_ods[k][ i ] = 0;
391 for(size_t m=0; m <= l; ++m) {
392 if ( m == 0 ) {
393 if ( l == 0 )
394 v_pls[k][vn_genos[k] * i + l * (l+1) / 2 + m] = 0;
395 else
396 v_pls[k][vn_genos[k] * i + l * (l+1) / 2 + m] = (int32_t)floor(6.931472 * p_dp[0] + 0.5);
397 }
398 else
399 v_pls[k][vn_genos[k] * i + l * (l+1) / 2 + m] = (int32_t)floor(p_bqsum[0] + 10.98612 * p_dp[0] + 0.5);
400 }
401 }
402 v_dp_sums[k] += p_dp[0];
403 }
404 else if ( bcf_get_genotypes(odr->hdr, v, &p_gt, &np_gt) >= 0 ) { // GT unobserved -- non-REF
405 // extract PL, DP, BQ, MQ, CY, ST, AL, NM
406 if ( bcf_get_format_int32(odr->hdr, v, "PL", &p_pl, &np_pl) < 0 ) {
407 fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- PL\n", __FILE__, __LINE__, __FUNCTION__);
408 exit(1);
409 }
410 if ( bcf_get_format_int32(odr->hdr, v, "DP", &p_dp, &np_dp) < 0 ) {
411 fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- DP\n", __FILE__, __LINE__, __FUNCTION__);
412 exit(1);
413 }
414 if ( bcf_get_format_int32(odr->hdr, v, "BQ", &p_bq, &np_bq) < 0 ) {
415 fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- BQ\n", __FILE__, __LINE__, __FUNCTION__);
416 exit(1);
417 }
418 if ( bcf_get_format_int32(odr->hdr, v, "MQ", &p_mq, &np_mq) < 0 ) {
419 fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- MQ\n", __FILE__, __LINE__, __FUNCTION__);
420 exit(1);
421 }
422 if ( bcf_get_format_int32(odr->hdr, v, "CY", &p_cy, &np_cy) < 0 ) {
423 fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- GT, CY\n", __FILE__, __LINE__, __FUNCTION__);
424 exit(1);
425 }
426 if ( bcf_get_format_string(odr->hdr, v, "ST", &p_st, &np_st) < 0 ) {
427 fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- GT, ST\n", __FILE__, __LINE__, __FUNCTION__);
428 exit(1);
429 }
430 if ( bcf_get_format_int32(odr->hdr, v, "AL", &p_al, &np_al) < 0 ) {
431 fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- GT, AL\n", __FILE__, __LINE__, __FUNCTION__);
432 exit(1);
433 }
434 if ( bcf_get_format_int32(odr->hdr, v, "NM", &p_nm, &np_nm) < 0 ) {
435 fprintf(stderr, "[E:%s:%d %s] FORMAT field does not contain expected fields -- GT, NM\n", __FILE__, __LINE__, __FUNCTION__);
436 exit(1);
437 }
438
439 // sanity checking
440 if ( np_pl != vn_genos[k] ) {
441 fprintf(stderr, "[E:%s:%d %s] np_pl (%d) != n_genos (%d)\n", __FILE__, __LINE__, __FUNCTION__, np_pl, vn_genos[k]);
442 exit(1);
443 }
444
445 if ( ( np_dp != 1 ) || ( np_gt != 2 ) ) {
446 fprintf(stderr, "[E:%s:%d %s] Assertion failed in np_dp (%d) == np_gt (%d), np_st (%d) == np_al (%d)\n", __FILE__, __LINE__, __FUNCTION__, np_dp, np_gt, np_st, np_al);
447 exit(1);
448 }
449 if ( p_dp[0] != strlen(p_st[0]) ) {
450 fprintf(stderr, "[E:%s:%d %s] Assetion failed - p_dp[0] == strlen(p_st[0])\n", __FILE__, __LINE__, __FUNCTION__);
451 exit(1);
452 }
453
454 // currently assume everything as diploid
455 int32_t a1 = bcf_gt_allele(p_gt[0]);
456 int32_t a2 = bcf_gt_allele(p_gt[1]);
457 int32_t gt = bcf_alleles2gt(a1, a2);
458 for(size_t l=0; l < vn_genos[k]; ++l) {
459 v_pls[k][vn_genos[k] * i + l] = p_pl[l];
460 }
461
462 v_dp_sums[k] += p_dp[0];
463
464 int32_t bq_s1 = 0;
465 int32_t bq_s2 = 0;
466 int32_t mq_s1 = 0;
467 int32_t mq_s2 = 0;
468 float cy_s1 = 0;
469 float cy_s2 = 0;
470 int32_t st_s1 = 0;
471 int32_t al_s1 = 0;
472 int32_t nm_s1 = 0;
473 int32_t nm_s2 = 0;
474 int32_t bq_al = 0;
475 int32_t mq_al = 0;
476 float cy_al = 0;
477 int32_t st_al = 0;
478 int32_t nm_al = 0;
479 int32_t dp_ra = 0;
480 int32_t dp_q20 = 0;
481 double oth_exp_q20 = 0;
482 double oth_obs_q20 = 0;
483
484 int32_t* ads_z = &v_ads[k][ i*vn_alleles[k] ];
485
486 ++v_ns_nrefs[k];
487
488 for(size_t l=0; l < p_dp[0]; ++l) {
489 if ( p_bq[l] > maxBQ ) p_bq[l] = maxBQ;
490
491 if ( p_al[l] >= 0 ) {
492 if ( p_bq[l] > 20 ) {
493 oth_exp_q20 += ( LogTool::pl2prob(p_bq[l]) * 2.0 / 3.0 );
494 ++dp_q20;
495 }
496
497 // calculate cycle-based tail distance
498 float log_td = 0-logf((float)abs(v_is_snps[k] ? p_cy[l] : (int)(rand() % 100))+1.); // temporarily ignore cycles
499
500 ++dp_ra;
501 bq_s1 += p_bq[l];
502 bq_s2 += (p_bq[l] * p_bq[l]);
503 mq_s1 += p_mq[l];
504 mq_s2 += (p_mq[l] * p_mq[l]);
505 cy_s1 += log_td;
506 cy_s2 += (log_td * log_td);
507 st_s1 += ((p_st[0][l] == 'F') ? 0 : 1);
508 if ( p_al[l] > 0 ) {
509 ++al_s1;
510 bq_al += p_bq[l];
511 mq_al += p_mq[l];
512 cy_al += log_td;
513 st_al += ((p_st[0][l] == 'F') ? 0 : 1);
514 nm_al += (p_nm[l]-1);
515 nm_s1 += (p_nm[l]-1);
516 nm_s2 += ((p_nm[l]-1)*(p_nm[l]-1));
517 ++ads_z[1];
518 }
519 else {
520 nm_s1 += p_nm[l];
521 nm_s2 += ( p_nm[l] * p_nm[l] );
522 ++ads_z[0];
523 }
524 }
525 else {
526 if ( p_bq[l] > 20 ) {
527 oth_exp_q20 += ( LogTool::pl2prob(p_bq[l]) * 2.0 / 3.0 );
528 ++oth_obs_q20;
529 ++dp_q20;
530 }
531 ++v_ods[k][i];
532 }
533 }
534
535 float sqrt_dp_ra = sqrt((float)dp_ra);
536 float ior = (float)(oth_obs_q20 / (oth_exp_q20 + 1e-6));
537 float nm1 = al_s1 == 0 ? 0 : nm_al / (float)al_s1;
538 float nm0 = (dp_ra-al_s1) == 0 ? 0 : (nm_s1-nm_al) / (float)(dp_ra-al_s1);
539 float w_dp_ra = log(dp_ra+1.); //sqrt(dp_ra);
540 float w_dp_q20 = log(dp_q20+1.); //sqrt(dp_q20);
541 float w_al_s1 = log(al_s1+1.); //sqrt(al_s1);
542 float w_ref_s1 = log(dp_ra-al_s1+1.);
543
544 if ( gt == 1 ) { // het genotypes
545 v_ab_nums[k] += (w_dp_ra * (dp_ra - al_s1 + 0.05) / (double)(dp_ra + 0.1));
546 v_ab_dens[k] += w_dp_ra;
547
548 // E(r) = 0.5(r+a) V(r) = 0.25(r+a)
549 v_abz_nums[k] += w_dp_ra * (dp_ra - al_s1 - dp_ra*0.5)/sqrt(0.25 * dp_ra + 1e-3);
550 v_abz_dens[k] += (w_dp_ra * w_dp_ra);
551
552 float bqr = sqrt_dp_ra * compute_correlation( dp_ra, bq_al, bq_s1, bq_s2, al_s1, al_s1, .1 );
553 float mqr = sqrt_dp_ra * compute_correlation( dp_ra, mq_al, mq_s1, mq_s2, al_s1, al_s1, .1 );
554 float cyr = sqrt_dp_ra * compute_correlation_f( dp_ra, cy_al, cy_s1, cy_s2, (float)al_s1, (float)al_s1, .1 );
555 float str = sqrt_dp_ra * compute_correlation( dp_ra, st_al, st_s1, st_s1, al_s1, al_s1, .1 );
556 float nmr = sqrt_dp_ra * compute_correlation( dp_ra, nm_al, nm_s1, nm_s2, al_s1, al_s1, .1 );
557
558 // Use Stouffer's method to combine the z-scores, but weighted by log of sample size
559 v_bqr_nums[k] += (bqr * w_dp_ra); v_bqr_dens[k] += (w_dp_ra * w_dp_ra);
560 v_mqr_nums[k] += (mqr * w_dp_ra); v_mqr_dens[k] += (w_dp_ra * w_dp_ra);
561 v_cyr_nums[k] += (cyr * w_dp_ra); v_cyr_dens[k] += (w_dp_ra * w_dp_ra);
562 v_str_nums[k] += (str * w_dp_ra); v_str_dens[k] += (w_dp_ra * w_dp_ra);
563 v_nmr_nums[k] += (nmr * w_dp_ra); v_nmr_dens[k] += (w_dp_ra * w_dp_ra);
564 }
565
566 v_ior_nums[k] += (ior * w_dp_q20); v_ior_dens[k] += w_dp_q20;
567 v_nm1_nums[k] += (nm1 * w_al_s1); v_nm1_dens[k] += w_al_s1;
568 v_nm0_nums[k] += (nm0 * w_ref_s1); v_nm0_dens[k] += w_ref_s1;
569 }
570
571 ++k;
572 }
573 odr->close();
574 delete odr;
575 }
576 bcf_hdr_add_sample(odw->hdr, NULL);
577
578 odw->write_hdr();
579
580 bcf1_t* nv = bcf_init();
581
582 for(size_t k=0; k < v_rids.size(); ++k) {
583 bcf_clear(nv);
584 nv->rid = v_rids[k];
585 nv->pos = v_poss[k];
586 nv->rlen = v_rlens[k];
587 nv->n_sample = nfiles;
588
589 const char* tmp_d_alleles[vn_alleles[k]];
590 for(int l=0; l < vn_alleles[k]; ++l)
591 tmp_d_alleles[l] = v_d_alleles[k][l].c_str();
592 bcf_update_alleles(odw->hdr, nv, tmp_d_alleles, vn_alleles[k]);
593
594 if ( !v_filts[k].empty() ) {
595 int tmp_filts[v_filts[k].size()];
596 for(size_t l=0; l < v_filts[k].size(); ++l) {
597 tmp_filts[l] = v_filts[k][l];
598 }
599 bcf_update_filter(odw->hdr, nv, tmp_filts, (int32_t)v_filts[k].size());
600 }
601
602 bcf_unpack(nv, BCF_UN_ALL);
603
604 // calculate the allele frequencies under HWE
605 float MLE_HWE_AF[vn_alleles[k]];
606 float MLE_HWE_GF[vn_genos[k]];
607 int32_t ploidy = 2; // temporarily constant
608 int32_t n = 0;
609 Estimator::compute_gl_af_hwe(v_pls[k], nfiles, ploidy, vn_alleles[k], MLE_HWE_AF, MLE_HWE_GF, n, 1e-20);
610
611 // calculate the genotypes (diploid only)
612 double gp, gp_sum, max_gp;
613 int32_t best_gt;
614 int32_t best_a1, best_a2;
615 int32_t* pls_i;
616 int32_t an = 0;
617 int32_t acs[vn_alleles[k]];
618 int32_t gcs[vn_genos[k]];
619 float afs[vn_alleles[k]];
620
621 memset(acs, 0, sizeof(int32_t)*vn_alleles[k]);
622 memset(gcs, 0, sizeof(int32_t)*vn_genos[k]);
623
624 for(size_t i=0; i < nfiles; ++i) {
625 pls_i = &v_pls[k][ i * vn_genos[k] ];
626 max_gp = gp_sum = gp = ( LogTool::pl2prob(pls_i[0]) * MLE_HWE_AF[0] * MLE_HWE_AF[0] );
627 best_gt = 0; best_a1 = 0; best_a2 = 0;
628 for(size_t l=1; l < vn_alleles[k]; ++l) {
629 for(size_t m=0; m <= l; ++m) {
630 gp = ( LogTool::pl2prob(pls_i[ l*(l+1)/2 + m]) * MLE_HWE_AF[l] * MLE_HWE_AF[m] * (l == m ? 1 : 2) );
631 gp_sum += gp;
632 if ( max_gp < gp ) {
633 max_gp = gp;
634 best_gt = l*(l+1)/2 + m; best_a1 = m; best_a2 = l;
635 }
636 }
637 }
638
639 double prob = 1.-max_gp/gp_sum;
640 if ( prob <= 3.162278e-26 )
641 prob = 3.162278e-26;
642 if ( prob > 1 )
643 prob = 1;
644
645 v_gqs[k][i] = (int32_t)LogTool::prob2pl(prob);
646
647 if ( ( best_gt > 0 ) && ( v_max_gqs[k] < v_gqs[k][i] ) )
648 v_max_gqs[k] = v_gqs[k][i];
649
650 v_gts[k][2*i] = ((best_a1 + 1) << 1);
651 v_gts[k][2*i+1] = ((best_a2 + 1) << 1);
652 an += 2;
653 ++acs[best_a1];
654 ++acs[best_a2];
655 ++gcs[best_gt];
656 }
657
658 for(size_t i=0; i < vn_alleles[k]; ++i) {
659 afs[i] = acs[i]/(float)an;
660 }
661
662 bcf_update_format_int32(odw->hdr, nv, "GT", v_gts[k], nfiles * 2);
663 bcf_update_format_int32(odw->hdr, nv, "GQ", v_gqs[k], nfiles );
664 bcf_update_format_int32(odw->hdr, nv, "AD", v_ads[k], nfiles * vn_alleles[k]);
665 bcf_update_format_int32(odw->hdr, nv, "OD", v_ods[k], nfiles );
666 bcf_update_format_int32(odw->hdr, nv, "PL", v_pls[k], nfiles * vn_genos[k]);
667
668 float avgdp = (float)v_dp_sums[k]/(float)nfiles;
669
670 nv->qual = (float) v_max_gqs[k];
671 bcf_update_info_float(odw->hdr, nv, "AVGDP", &avgdp, 1);
672 bcf_update_info_int32(odw->hdr, nv, "AC", &acs[1], vn_alleles[k]-1);
673 bcf_update_info_int32(odw->hdr, nv, "AN", &an, 1);
674 bcf_update_info_float(odw->hdr, nv, "AF", &afs[1], vn_alleles[k]-1);
675 bcf_update_info_int32(odw->hdr, nv, "GC", gcs, vn_genos[k]);
676 bcf_update_info_int32(odw->hdr, nv, "GN", &nfiles, 1);
677
678 if (n) {
679 float* MLE_HWE_AF_PTR = &MLE_HWE_AF[1];
680 bcf_update_info_float(odw->hdr, nv, "HWEAF", MLE_HWE_AF_PTR, vn_alleles[k]-1);
681 //bcf_update_info_float(odw->hdr, nv, "HWEGF", &MLE_HWE_GF, n_genos);
682 }
683
684 // calculate the allele frequencies under HWD
685 float MLE_AF[vn_alleles[k]];
686 float MLE_GF[vn_genos[k]];
687 n = 0;
688 Estimator::compute_gl_af(v_pls[k], nfiles, ploidy, vn_alleles[k], MLE_AF, MLE_GF, n, 1e-20);
689 if (n) {
690 float* MLE_AF_PTR = &MLE_AF[1];
691 //bcf_update_info_float(odw->hdr, nv, "HWDAF", MLE_AF_PTR, n_alleles-1);
692 bcf_update_info_float(odw->hdr, nv, "HWDGF", &MLE_GF, vn_genos[k]);
693 }
694
695 float fic = 0;
696 n = 0;
697 Estimator::compute_gl_fic(v_pls[k], nfiles, ploidy, MLE_HWE_AF, vn_alleles[k], MLE_GF, fic, n);
698 if ( std::isnan((double)fic) ) fic = 0;
699 if (n) {
700 bcf_update_info_float(odw->hdr, nv, "IBC", &fic, 1);
701 }
702
703 // calculate the LRT statistics related to HWE
704 float lrts;
705 float logp;
706 int32_t df;
707 n = 0;
708 Estimator::compute_hwe_lrt(v_pls[k], nfiles, ploidy, vn_alleles[k], MLE_HWE_GF, MLE_GF, n, lrts, logp, df);
709 if (n) {
710 if ( fic > 0 ) logp = 0-logp;
711 bcf_update_info_float(odw->hdr, nv, "HWE_SLP", &logp, 1);
712 }
713
714 // add additional annotations
715 v_ns_nrefs[k] -= (nfiles - gcs[0]);
716 bcf_update_info_int32(odw->hdr, nv, "NS_NREF", &v_ns_nrefs[k], 1);
717 v_ab_nums[k] /= (v_ab_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "ABE", &v_ab_nums[k], 1);
718 v_abz_nums[k] /= sqrt(v_abz_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "ABZ", &v_abz_nums[k], 1);
719 v_bqr_nums[k] /= sqrt(v_bqr_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "BQZ", &v_bqr_nums[k], 1);
720 v_mqr_nums[k] /= sqrt(v_mqr_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "MQZ", &v_mqr_nums[k], 1);
721 v_cyr_nums[k] /= sqrt(v_cyr_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "CYZ", &v_cyr_nums[k], 1);
722 v_str_nums[k] /= sqrt(v_str_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "STZ", &v_str_nums[k], 1);
723 v_nmr_nums[k] /= sqrt(v_nmr_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "NMZ", &v_nmr_nums[k], 1);
724 v_ior_nums[k] = log(v_ior_nums[k]/v_ior_dens[k]+1e-6)/log(10.); bcf_update_info_float(odw->hdr, nv, "IOR", &v_ior_nums[k], 1);
725 v_nm1_nums[k] /= (v_nm1_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "NM1", &v_nm1_nums[k], 1);
726 v_nm0_nums[k] /= (v_nm0_dens[k]+1e-6); bcf_update_info_float(odw->hdr, nv, "NM0", &v_nm0_nums[k], 1);
727
728 //fprintf(stderr,"AC = %f, AN = %f, NS_NREF = %f, AB = %f, BQR = %f, MQR = %f, CYR = %f, STR = %f, NMR = %f, IOR = %f, NMA = %f\n", acs[1], an, ns_nref, ab_num, bqr_num, mqr_num, cyr_num, str_num, nmr_num, ior_num, nma_num);
729
730 odw->write(nv);
731 }
732
733 odw->close();
734 };
735
print_options()736 void print_options()
737 {
738 if (!print) return;
739
740 std::clog << "paste_and_comput_features v" << version << "\n\n";
741 print_ifiles("options: input VCF file ", input_vcf_files);
742 std::clog << " [o] output VCF file " << output_vcf_file << "\n";
743 std::clog << "\n";
744 }
745
print_stats()746 void print_stats()
747 {
748 if (!print) return;
749
750 std::clog << "\n";
751 std::cerr << "stats: Total number of files pasted " << input_vcf_files.size() << "\n";
752 std::clog << "\n";
753 };
754
~Igor()755 ~Igor() {};
756
757 private:
758 };
759
760 }
761
paste_and_compute_features_sequential(int argc,char ** argv)762 bool paste_and_compute_features_sequential(int argc, char ** argv)
763 {
764 Igor igor(argc, argv);
765 igor.print_options();
766 igor.initialize();
767 igor.paste_and_compute_features_sequential();
768 igor.print_stats();
769 return igor.print;
770 }
771
772