1 /* The MIT License
2
3 Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "estimate.h"
25
26 #define EST_QUAL 0
27 #define EST_AF 1
28 #define EST_GF 2
29 #define EST_HWEAF 3
30 #define EST_MLEAF 4
31 #define EST_HWE 5
32 #define EST_FIC 6
33 #define EST_AB 7
34 #define NO_EST 8
35
36 namespace
37 {
38
39 class Igor : Program
40 {
41 public:
42
43 std::string version;
44
45 ///////////
46 //options//
47 ///////////
48 std::string input_vcf_file;
49 std::string output_vcf_file;
50 std::vector<GenomeInterval> intervals;
51 std::string interval_list;
52 std::string arg_sample_list;
53 bool compute_estimate[NO_EST];
54 std::string estimates;
55
56 char** samples;
57 int32_t *imap;
58 int32_t nsamples;
59 bool print_sites_only;
60
61 ///////
62 //i/o//
63 ///////
64 BCFOrderedReader *odr;
65 BCFOrderedWriter *odw;
66
67 //////////
68 //filter//
69 //////////
70 std::string fexp;
71 Filter filter;
72 bool filter_exists;
73
74 /////////
75 //stats//
76 /////////
77 int32_t no_samples;
78 int32_t no_variants;
79 int32_t no_variants_not_computed;
80 int32_t no_variants_computed;
81 int32_t no_variants_missing_dependencies;
82 int32_t no_reference;
83
84 /////////
85 //tools//
86 /////////
87 VariantManip *vm;
88
89 /**
90 * Parse filters. Processes the filter list first followed by the interval string. Duplicates are dropped.
91 *
92 * @filters - filters stored in this vector
93 * @filter_string - comma delimited filters in a string
94 */
parse_estimators(bool compute_estimate[],std::string estimates_string)95 void parse_estimators(bool compute_estimate[], std::string estimates_string)
96 {
97 this->estimates = estimates_string;
98
99 for (size_t i=0; i<NO_EST; ++i)
100 {
101 compute_estimate[i] = false;
102 }
103
104 std::vector<std::string> v;
105 if (estimates_string!="")
106 split(v, ",", estimates_string);
107
108 for (size_t i=0; i<v.size(); ++i)
109 {
110 if (v[i]=="AF")
111 {
112 compute_estimate[EST_AF] = true;
113 }
114 else if (v[i]=="HWEAF")
115 {
116 compute_estimate[EST_HWEAF] = true;
117 }
118 else if (v[i]=="MLEAF")
119 {
120 compute_estimate[EST_MLEAF] = true;
121 }
122 else if (v[i]=="HWE")
123 {
124 compute_estimate[EST_HWEAF] = true;
125 compute_estimate[EST_MLEAF] = true;
126 compute_estimate[EST_HWE] = true;
127 }
128 else if (v[i]=="AB")
129 {
130 compute_estimate[EST_HWEAF] = true;
131 compute_estimate[EST_AB] = true;
132 }
133 else if (v[i]=="FIC")
134 {
135 compute_estimate[EST_HWEAF] = true;
136 compute_estimate[EST_FIC] = true;
137 }
138 else
139 {
140 fprintf(stderr, "[%s:%d %s] Estimate type not recognized: %s\n", __FILE__, __LINE__, __FUNCTION__, v[i].c_str());
141 }
142 }
143
144 size_t no_est = 0;
145 for (size_t i=0; i<NO_EST; ++i)
146 {
147 if (compute_estimate[i])
148 {
149 ++no_est;
150 }
151 }
152
153 if (no_est==0)
154 {
155 fprintf(stderr, "[%s:%d %s] No valid estimate types recognized.\n", __FILE__, __LINE__, __FUNCTION__);
156 exit(1);
157 }
158 }
159
Igor(int argc,char ** argv)160 Igor(int argc, char ** argv)
161 {
162 version = "0.5";
163
164 //////////////////////////
165 //options initialization//
166 //////////////////////////
167 try
168 {
169 std::string desc = "Compute variant based estimates.\n\n"
170 " AF Genotype (GT) based allele frequencies\n"
171 " If genotypes are unavailable, best guess\n"
172 " genotypes are inferred based on genotype\n"
173 " likelihoods (GL or PL)\n"
174 " AC : Alternate Allele counts\n"
175 " AN : Total allele counts\n"
176 " NS : No. of samples.\n"
177 " AF : Alternate allele frequencies.\n"
178 " MLEAF GL based allele frequencies estimates\n"
179 " MLEAF : Alternate allele frequency derived from MLEGF\n"
180 " MLEGF : Genotype frequencies.\n"
181 " HWEAF GL based allele frequencies estimates assuming HWE\n"
182 " HWEAF : Alternate allele frequencies\n"
183 " HWEGF : Genotype frequencies derived from HWEAF.\n"
184 " HWE GL based Hardy-Weinberg statistics.\n"
185 " HWE_LLR : log likelihood ratio\n"
186 " HWE_LPVAL : log p-value\n"
187 " HWE_DF : degrees of freedom\n"
188 " AB GL based Allele Balance.\n"
189 " FIC GL based Inbreeding Coefficient\n"
190 " \n";
191
192 TCLAP::CmdLine cmd(desc, ' ', version);
193 VTOutput my;
194 cmd.setOutput(&my);
195 TCLAP::ValueArg<std::string> arg_intervals("i", "intervals", "Intervals", false, "", "str", cmd);
196 TCLAP::ValueArg<std::string> arg_interval_list("I", "interval-list", "File containing list of intervals", false, "", "file", cmd);
197 TCLAP::ValueArg<std::string> arg_fexp("f", "f", "filter expression []", false, "", "str", cmd);
198 TCLAP::ValueArg<std::string> arg_estimates("e", "e", "comma separated estimates to be computed []", false, "", "str", cmd);
199 TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF/VCF.GZ/BCF file [-]", false, "-", "str", cmd);
200 TCLAP::SwitchArg arg_print_sites_only("s", "s", "print site information only without genotypes [false]", cmd, false);
201 TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "", "file", cmd);
202
203 cmd.parse(argc, argv);
204
205 input_vcf_file = arg_input_vcf_file.getValue();
206 output_vcf_file = arg_output_vcf_file.getValue();
207 parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
208 print_sites_only = arg_print_sites_only.getValue();
209 fexp = arg_fexp.getValue();
210 parse_estimators(compute_estimate, arg_estimates.getValue());
211 }
212 catch (TCLAP::ArgException &e)
213 {
214 std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
215 abort();
216 }
217 };
218
initialize()219 void initialize()
220 {
221 //////////////////////
222 //i/o initialization//
223 //////////////////////
224 odr = new BCFOrderedReader(input_vcf_file, intervals);
225 odw = new BCFOrderedWriter(output_vcf_file);
226 if (print_sites_only)
227 {
228 odw->link_hdr(bcf_hdr_subset(odr->hdr, 0, 0, 0));
229 }
230 else
231 {
232 odw->link_hdr(odr->hdr);
233 }
234
235 if (compute_estimate[EST_AF])
236 {
237 bcf_hdr_append(odw->hdr, "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate Allele Counts\">\n");
238 bcf_hdr_append(odw->hdr, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total Number Allele Counts\">\n");
239 bcf_hdr_append(odw->hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">\n");
240 bcf_hdr_append(odw->hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Alternate Allele Frequency\">\n");
241 }
242
243 if (compute_estimate[EST_GF])
244 {
245 bcf_hdr_append(odw->hdr, "##INFO=<ID=GC,Number=G,Type=Integer,Description=\"Genotype Counts\">\n");
246 bcf_hdr_append(odw->hdr, "##INFO=<ID=GN,Number=1,Type=Integer,Description=\"Total Number of Genotypes Counts\">\n");
247 bcf_hdr_append(odw->hdr, "##INFO=<ID=GF,Number=G,Type=Float,Description=\"Genotype Frequency\">\n");
248 }
249
250 if (compute_estimate[EST_HWEAF])
251 {
252 bcf_hdr_append(odw->hdr, "##INFO=<ID=HWEAF,Number=A,Type=Float,Description=\"Genotype likelihood based MLE Allele Frequency assuming HWE\">\n");
253 bcf_hdr_append(odw->hdr, "##INFO=<ID=HWEGF,Number=G,Type=Float,Description=\"Genotype likelihood based MLE Genotype Frequency assuming HWE\">\n");
254 }
255
256 if (compute_estimate[EST_MLEAF])
257 {
258 bcf_hdr_append(odw->hdr, "##INFO=<ID=MLEAF,Number=A,Type=Float,Description=\"Genotype likelihood based MLE Allele Frequency\">\n");
259 bcf_hdr_append(odw->hdr, "##INFO=<ID=MLEGF,Number=G,Type=Float,Description=\"Genotype likelihood based MLE Genotype Frequency\">\n");
260 }
261
262 if (compute_estimate[EST_HWE])
263 {
264 bcf_hdr_append(odw->hdr, "##INFO=<ID=HWE_LLR,Number=1,Type=Float,Description=\"Genotype likelihood based Hardy Weinberg ln(Likelihood Ratio)\">\n");
265 bcf_hdr_append(odw->hdr, "##INFO=<ID=HWE_LPVAL,Number=1,Type=Float,Description=\"Genotype likelihood based Hardy Weinberg Likelihood Ratio Test Statistic ln(p-value)\">\n");
266 bcf_hdr_append(odw->hdr, "##INFO=<ID=HWE_DF,Number=1,Type=Integer,Description=\"Degrees of freedom for Genotype likelihood based Hardy Weinberg Likelihood Ratio Test Statistic\">\n");
267 }
268
269 if (compute_estimate[EST_FIC])
270 {
271 bcf_hdr_append(odw->hdr, "##INFO=<ID=FIC,Number=1,Type=Float,Description=\"Genotype likelihood based Inbreeding Coefficient\">\n");
272 }
273
274 if (compute_estimate[EST_AB])
275 {
276 bcf_hdr_append(odw->hdr, "##INFO=<ID=AB,Number=1,Type=Float,Description=\"Genotype likelihood based Allele Balance\">\n");
277 }
278
279 /////////////////////////
280 //filter initialization//
281 /////////////////////////
282 filter.parse(fexp.c_str());
283 filter_exists = fexp=="" ? false : true;
284
285 ////////////////////////
286 //stats initialization//
287 ////////////////////////
288 no_samples = bcf_hdr_nsamples(odr->hdr);
289 no_variants = 0;
290 no_variants_not_computed = 0;
291 no_variants_computed = 0;
292 no_variants_missing_dependencies = 0;
293 no_reference = 0;
294
295 if (!no_samples)
296 {
297 fprintf(stderr, "[%s:%d %s] No samples in VCF file: %s\n", __FILE__, __LINE__, __FUNCTION__, input_vcf_file.c_str());
298 exit(1);
299 }
300
301 ///////////////////////
302 //tool initialization//
303 ///////////////////////
304 vm = new VariantManip("");
305 }
306
estimate()307 void estimate()
308 {
309 bcf1_t *v = odw->get_bcf1_from_pool();
310 bcf_hdr_t *h = odr->hdr;
311 Variant variant;
312
313 int32_t *gts = NULL;
314 int32_t *pls = NULL;
315 int32_t *dps = NULL;
316 int32_t n_gts = 0;
317 int32_t n_pls = 0;
318 int32_t n_dps = 0;
319
320 odw->write_hdr();
321
322 while(odr->read(v))
323 {
324 variant.clear();
325 bool printed = false;
326 int32_t vtype = vm->classify_variant(h, v, variant);
327 if (filter_exists && !filter.apply(h,v,&variant))
328 {
329 continue;
330 }
331
332 ++no_variants;
333
334 if (vtype==VT_REF)
335 {
336 odw->write(v);
337 ++no_variants_not_computed;
338 ++no_reference;
339 continue;
340 }
341
342
343 bcf_unpack(v, BCF_UN_ALL);
344 int32_t ploidy = bcf_get_genotypes(odr->hdr, v, >s, &n_gts);
345 ploidy /= no_samples;
346
347 // if (!n_gts)
348 // {
349 // for (int32_t i=0; i<NO_EST; ++i)
350 // {
351 // compute_estimate[i] = 0;
352 // }
353 //
354 // ++no_variants_missing_dependencies;
355 // }
356
357 bcf_get_format_int32(odr->hdr, v, "PL", &pls, &n_pls);
358 int32_t no_alleles = bcf_get_n_allele(v);
359
360 // if (!n_pls)
361 // {
362 // if (compute_estimate[EST_MLEAF])
363 // {
364 // fprintf(stderr, "[%s:%d %s] No PL informationamples in VCF file: %s\n", __FILE__, __LINE__, __FUNCTION__, input_vcf_file.c_str());
365 // exit(1);
366 // }
367 // compute_estimate[i] = 0;
368 // }
369
370 float qual = 0;
371 int32_t n = 0;
372 if (compute_estimate[EST_QUAL])
373 {
374 Estimator::compute_qual(pls, no_samples, ploidy, no_alleles, qual, n);
375 if (n)
376 {
377 bcf_set_qual(v, qual);
378 }
379 }
380
381 int32_t no_genotypes = bcf_an2gn(no_alleles);
382
383 if (compute_estimate[EST_AF])
384 {
385 if (!n_gts)
386 {
387 ++no_variants_missing_dependencies;
388 }
389 else
390 {
391 int32_t g[ploidy];
392 for (int32_t i=0; i<ploidy; ++i) g[i]=0;
393 int32_t AC[no_alleles];
394 float AF[no_alleles];
395 for (int32_t i=0; i<no_alleles; ++i) {AF[i]=0;AC[i]=0;}
396 int32_t AN=0;
397 int32_t NS=0;
398
399 int32_t GC[no_genotypes];
400 int32_t GN=0;
401 float GF[no_genotypes];
402 Estimator::compute_af(gts, no_samples, ploidy, no_alleles, AC, AN, AF, GC, GN, GF, NS);
403
404 int32_t* AC_PTR = &AC[1];
405 bcf_update_info_int32(odw->hdr, v, "AC", AC_PTR, no_alleles-1);
406 bcf_update_info_int32(odw->hdr, v, "AN", &AN, 1);
407 float* AF_PTR = &AF[1];
408 bcf_update_info_float(odw->hdr, v, "AF", AF_PTR, no_alleles-1);
409 if (GN)
410 {
411 bcf_update_info_int32(odw->hdr, v, "GC", GC, no_genotypes);
412 bcf_update_info_int32(odw->hdr, v, "GN", &GN, 1);
413 bcf_update_info_float(odw->hdr, v, "GF", GF, no_genotypes);
414 }
415 bcf_update_info_int32(odw->hdr, v, "NS", &NS, 1);
416 }
417 }
418
419 if (compute_estimate[EST_GF])
420 {
421 }
422
423 float MLE_HWE_AF[no_alleles];
424 float MLE_HWE_GF[no_genotypes];
425 if (compute_estimate[EST_HWEAF])
426 {
427 if (!n_pls)
428 {
429 ++no_variants_missing_dependencies;
430 }
431 else
432 {
433 n = 0;
434 Estimator::compute_gl_af_hwe(pls, no_samples, ploidy,no_alleles, MLE_HWE_AF, MLE_HWE_GF, n, 1e-20);
435 if (n)
436 {
437 float* MLE_HWE_AF_PTR = &MLE_HWE_AF[1];
438 bcf_update_info_float(odw->hdr, v, "HWEAF", MLE_HWE_AF_PTR, no_alleles-1);
439 bcf_update_info_float(odw->hdr, v, "HWEGF", &MLE_HWE_GF, no_genotypes);
440 }
441 }
442 }
443
444 float MLE_AF[no_alleles];
445 float MLE_GF[no_genotypes];
446 if (compute_estimate[EST_MLEAF])
447 {
448 if (!n_pls)
449 {
450 ++no_variants_missing_dependencies;
451 continue;
452 }
453 else
454 {
455 n = 0;
456 Estimator::compute_gl_af(pls, no_samples, ploidy,no_alleles, MLE_AF, MLE_GF, n, 1e-20);
457 if (n)
458 {
459 float* MLE_AF_PTR = &MLE_AF[1];
460 bcf_update_info_float(odw->hdr, v, "MLEAF", MLE_AF_PTR, no_alleles-1);
461 bcf_update_info_float(odw->hdr, v, "MLEGF", &MLE_GF, no_genotypes);
462 }
463 }
464 }
465
466 if (compute_estimate[EST_HWE])
467 {
468 if (!n_pls)
469 {
470 ++no_variants_missing_dependencies;
471 continue;
472 }
473 else
474 {
475 float lrts;
476 float logp;
477 int32_t df;
478 n = 0;
479 Estimator::compute_hwe_lrt(pls, no_samples, ploidy,
480 no_alleles, MLE_HWE_GF, MLE_GF, n,
481 lrts, logp, df);
482 if (n)
483 {
484 bcf_update_info_float(odw->hdr, v, "HWE_LLR", &lrts, 1);
485 bcf_update_info_float(odw->hdr, v, "HWE_LPVAL", &logp, 1);
486 bcf_update_info_int32(odw->hdr, v, "HWE_DF", &df, 1);
487 }
488 }
489 }
490
491 if (compute_estimate[EST_FIC])
492 {
493 if (!n_pls)
494 {
495 ++no_variants_missing_dependencies;
496 continue;
497 }
498 else
499 {
500 float f;
501 n = 0;
502 Estimator::compute_gl_fic(pls, no_samples, ploidy,
503 MLE_HWE_AF, no_alleles, MLE_GF,
504 f, n);
505 if (n)
506 {
507 bcf_update_info_float(odw->hdr, v, "FIC", &f, 1);
508 }
509 }
510 }
511
512 if (compute_estimate[EST_AB])
513 {
514 bcf_get_format_int32(odr->hdr, v, "DP", &dps, &n_dps);
515
516 if (!n_pls || !n_dps)
517 {
518 ++no_variants_missing_dependencies;
519 }
520 else
521 {
522 float ab;
523 n = 0;
524 Estimator::compute_gl_ab(pls, no_samples, ploidy,
525 dps, MLE_GF, no_alleles,
526 ab, n);
527
528 if (n)
529 {
530 bcf_update_info_float(odw->hdr, v, "AB", &ab, 1);
531 }
532 }
533 }
534
535 if (print_sites_only)
536 {
537 bcf_subset(odw->hdr, v, 0, 0);
538 }
539
540 odw->write(v);
541 ++no_variants_computed;
542 }
543
544 if(n_gts) free(gts);
545 if(n_pls) free(pls);
546 if(n_dps) free(dps);
547
548 odw->close();
549 };
550
print_options()551 void print_options()
552 {
553 std::clog << "estimate v" << version << "\n";
554 std::clog << "\n";
555 std::clog << "Options: input VCF File " << input_vcf_file << "\n";
556 print_str_op(" [e] estimates ", estimates);
557 print_str_op(" [f] filter ", fexp);
558 print_int_op(" [i] Intervals ", intervals);
559 std::clog << "\n";
560 }
561
print_stats()562 void print_stats()
563 {
564 std::clog << "\n";
565 std::clog << "stats: total variant records " << no_variants << "\n";
566 std::clog << "\n";
567 std::clog << " variants computed " << no_variants_computed << "\n";
568 std::clog << " variants not computed " << no_variants_not_computed << "\n";
569 std::clog << " missing dependencies " << no_variants_missing_dependencies << "\n";
570 std::clog << " no_reference " << no_reference << "\n";
571 std::clog << "\n";
572 };
573
~Igor()574 ~Igor()
575 {
576 };
577
578 private:
579 };
580
581 }
582
estimate(int argc,char ** argv)583 void estimate(int argc, char ** argv)
584 {
585 Igor igor(argc, argv);
586 igor.print_options();
587 igor.initialize();
588 igor.estimate();
589 igor.print_stats();
590 }