1 /* The MIT License
2 
3    Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "estimate.h"
25 
26 #define EST_QUAL    0
27 #define EST_AF      1
28 #define EST_GF      2
29 #define EST_HWEAF   3
30 #define EST_MLEAF   4
31 #define EST_HWE     5
32 #define EST_FIC     6
33 #define EST_AB      7
34 #define NO_EST      8
35 
36 namespace
37 {
38 
39 class Igor : Program
40 {
41     public:
42 
43     std::string version;
44 
45     ///////////
46     //options//
47     ///////////
48     std::string input_vcf_file;
49     std::string output_vcf_file;
50     std::vector<GenomeInterval> intervals;
51     std::string interval_list;
52     std::string arg_sample_list;
53     bool compute_estimate[NO_EST];
54     std::string estimates;
55 
56     char** samples;
57     int32_t *imap;
58     int32_t nsamples;
59     bool print_sites_only;
60 
61     ///////
62     //i/o//
63     ///////
64     BCFOrderedReader *odr;
65     BCFOrderedWriter *odw;
66 
67     //////////
68     //filter//
69     //////////
70     std::string fexp;
71     Filter filter;
72     bool filter_exists;
73 
74     /////////
75     //stats//
76     /////////
77     int32_t no_samples;
78     int32_t no_variants;
79     int32_t no_variants_not_computed;
80     int32_t no_variants_computed;
81     int32_t no_variants_missing_dependencies;
82     int32_t no_reference;
83 
84     /////////
85     //tools//
86     /////////
87     VariantManip *vm;
88 
89     /**
90      * Parse filters. Processes the filter list first followed by the interval string. Duplicates are dropped.
91      *
92      * @filters       - filters stored in this vector
93      * @filter_string - comma delimited filters in a string
94      */
parse_estimators(bool compute_estimate[],std::string estimates_string)95     void parse_estimators(bool compute_estimate[], std::string estimates_string)
96     {
97         this->estimates = estimates_string;
98 
99         for (size_t i=0; i<NO_EST; ++i)
100         {
101             compute_estimate[i] = false;
102         }
103 
104         std::vector<std::string> v;
105         if (estimates_string!="")
106             split(v, ",", estimates_string);
107 
108         for (size_t i=0; i<v.size(); ++i)
109         {
110             if (v[i]=="AF")
111             {
112                 compute_estimate[EST_AF] = true;
113             }
114             else if (v[i]=="HWEAF")
115             {
116                 compute_estimate[EST_HWEAF] = true;
117             }
118             else if (v[i]=="MLEAF")
119             {
120                 compute_estimate[EST_MLEAF] = true;
121             }
122             else if (v[i]=="HWE")
123             {
124                 compute_estimate[EST_HWEAF] = true;
125                 compute_estimate[EST_MLEAF] = true;
126                 compute_estimate[EST_HWE] = true;
127             }
128             else if (v[i]=="AB")
129             {
130                 compute_estimate[EST_HWEAF] = true;
131                 compute_estimate[EST_AB] = true;
132             }
133             else if (v[i]=="FIC")
134             {
135                 compute_estimate[EST_HWEAF] = true;
136                 compute_estimate[EST_FIC] = true;
137             }
138             else
139             {
140                 fprintf(stderr, "[%s:%d %s] Estimate type not recognized: %s\n", __FILE__, __LINE__, __FUNCTION__, v[i].c_str());
141             }
142         }
143 
144         size_t no_est = 0;
145         for (size_t i=0; i<NO_EST; ++i)
146         {
147             if (compute_estimate[i])
148             {
149                 ++no_est;
150             }
151         }
152 
153         if (no_est==0)
154         {
155             fprintf(stderr, "[%s:%d %s] No valid estimate types recognized.\n", __FILE__, __LINE__, __FUNCTION__);
156             exit(1);
157         }
158     }
159 
Igor(int argc,char ** argv)160     Igor(int argc, char ** argv)
161     {
162         version = "0.5";
163 
164         //////////////////////////
165         //options initialization//
166         //////////////////////////
167         try
168         {
169             std::string desc = "Compute variant based estimates.\n\n"
170                   "   AF         Genotype (GT) based allele frequencies\n"
171                   "              If genotypes are unavailable, best guess\n"
172                   "              genotypes are inferred based on genotype\n"
173                   "              likelihoods (GL or PL)\n"
174                   "              AC        : Alternate Allele counts\n"
175                   "              AN        : Total allele counts\n"
176                   "              NS        : No. of samples.\n"
177                   "              AF        : Alternate allele frequencies.\n"
178                   "   MLEAF      GL based allele frequencies estimates\n"
179                   "              MLEAF     : Alternate allele frequency derived from MLEGF\n"
180                   "              MLEGF     : Genotype frequencies.\n"
181                   "   HWEAF      GL based allele frequencies estimates assuming HWE\n"
182                   "              HWEAF     : Alternate allele frequencies\n"
183                   "              HWEGF     : Genotype frequencies derived from HWEAF.\n"
184                   "   HWE        GL based Hardy-Weinberg statistics.\n"
185                   "              HWE_LLR   : log likelihood ratio\n"
186                   "              HWE_LPVAL : log p-value\n"
187                   "              HWE_DF    : degrees of freedom\n"
188                   "   AB         GL based Allele Balance.\n"
189                   "   FIC        GL based Inbreeding Coefficient\n"
190                   "              \n";
191 
192             TCLAP::CmdLine cmd(desc, ' ', version);
193             VTOutput my;
194             cmd.setOutput(&my);
195             TCLAP::ValueArg<std::string> arg_intervals("i", "intervals", "Intervals", false, "", "str", cmd);
196             TCLAP::ValueArg<std::string> arg_interval_list("I", "interval-list", "File containing list of intervals", false, "", "file", cmd);
197             TCLAP::ValueArg<std::string> arg_fexp("f", "f", "filter expression []", false, "", "str", cmd);
198             TCLAP::ValueArg<std::string> arg_estimates("e", "e", "comma separated estimates to be computed []", false, "", "str", cmd);
199             TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF/VCF.GZ/BCF file [-]", false, "-", "str", cmd);
200             TCLAP::SwitchArg arg_print_sites_only("s", "s", "print site information only without genotypes [false]", cmd, false);
201             TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "", "file", cmd);
202 
203             cmd.parse(argc, argv);
204 
205             input_vcf_file = arg_input_vcf_file.getValue();
206             output_vcf_file = arg_output_vcf_file.getValue();
207             parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
208             print_sites_only = arg_print_sites_only.getValue();
209             fexp = arg_fexp.getValue();
210             parse_estimators(compute_estimate, arg_estimates.getValue());
211         }
212         catch (TCLAP::ArgException &e)
213         {
214             std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
215             abort();
216         }
217     };
218 
initialize()219     void initialize()
220     {
221         //////////////////////
222         //i/o initialization//
223         //////////////////////
224         odr = new BCFOrderedReader(input_vcf_file, intervals);
225         odw = new BCFOrderedWriter(output_vcf_file);
226         if (print_sites_only)
227         {
228             odw->link_hdr(bcf_hdr_subset(odr->hdr, 0, 0, 0));
229         }
230         else
231         {
232             odw->link_hdr(odr->hdr);
233         }
234 
235         if (compute_estimate[EST_AF])
236         {
237             bcf_hdr_append(odw->hdr, "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate Allele Counts\">\n");
238             bcf_hdr_append(odw->hdr, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total Number Allele Counts\">\n");
239             bcf_hdr_append(odw->hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">\n");
240             bcf_hdr_append(odw->hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Alternate Allele Frequency\">\n");
241         }
242 
243         if (compute_estimate[EST_GF])
244         {
245             bcf_hdr_append(odw->hdr, "##INFO=<ID=GC,Number=G,Type=Integer,Description=\"Genotype Counts\">\n");
246             bcf_hdr_append(odw->hdr, "##INFO=<ID=GN,Number=1,Type=Integer,Description=\"Total Number of Genotypes Counts\">\n");
247             bcf_hdr_append(odw->hdr, "##INFO=<ID=GF,Number=G,Type=Float,Description=\"Genotype Frequency\">\n");
248         }
249 
250         if (compute_estimate[EST_HWEAF])
251         {
252             bcf_hdr_append(odw->hdr, "##INFO=<ID=HWEAF,Number=A,Type=Float,Description=\"Genotype likelihood based MLE Allele Frequency assuming HWE\">\n");
253             bcf_hdr_append(odw->hdr, "##INFO=<ID=HWEGF,Number=G,Type=Float,Description=\"Genotype likelihood based MLE Genotype Frequency assuming HWE\">\n");
254         }
255 
256         if (compute_estimate[EST_MLEAF])
257         {
258             bcf_hdr_append(odw->hdr, "##INFO=<ID=MLEAF,Number=A,Type=Float,Description=\"Genotype likelihood based MLE Allele Frequency\">\n");
259             bcf_hdr_append(odw->hdr, "##INFO=<ID=MLEGF,Number=G,Type=Float,Description=\"Genotype likelihood based MLE Genotype Frequency\">\n");
260         }
261 
262         if (compute_estimate[EST_HWE])
263         {
264             bcf_hdr_append(odw->hdr, "##INFO=<ID=HWE_LLR,Number=1,Type=Float,Description=\"Genotype likelihood based Hardy Weinberg ln(Likelihood Ratio)\">\n");
265             bcf_hdr_append(odw->hdr, "##INFO=<ID=HWE_LPVAL,Number=1,Type=Float,Description=\"Genotype likelihood based Hardy Weinberg Likelihood Ratio Test Statistic ln(p-value)\">\n");
266             bcf_hdr_append(odw->hdr, "##INFO=<ID=HWE_DF,Number=1,Type=Integer,Description=\"Degrees of freedom for Genotype likelihood based Hardy Weinberg Likelihood Ratio Test Statistic\">\n");
267         }
268 
269         if (compute_estimate[EST_FIC])
270         {
271             bcf_hdr_append(odw->hdr, "##INFO=<ID=FIC,Number=1,Type=Float,Description=\"Genotype likelihood based Inbreeding Coefficient\">\n");
272         }
273 
274         if (compute_estimate[EST_AB])
275         {
276             bcf_hdr_append(odw->hdr, "##INFO=<ID=AB,Number=1,Type=Float,Description=\"Genotype likelihood based Allele Balance\">\n");
277         }
278 
279         /////////////////////////
280         //filter initialization//
281         /////////////////////////
282         filter.parse(fexp.c_str());
283         filter_exists = fexp=="" ? false : true;
284 
285         ////////////////////////
286         //stats initialization//
287         ////////////////////////
288         no_samples = bcf_hdr_nsamples(odr->hdr);
289         no_variants = 0;
290         no_variants_not_computed = 0;
291         no_variants_computed = 0;
292         no_variants_missing_dependencies = 0;
293         no_reference = 0;
294 
295         if (!no_samples)
296         {
297             fprintf(stderr, "[%s:%d %s] No samples in VCF file: %s\n", __FILE__, __LINE__, __FUNCTION__, input_vcf_file.c_str());
298             exit(1);
299         }
300 
301         ///////////////////////
302         //tool initialization//
303         ///////////////////////
304         vm = new VariantManip("");
305     }
306 
estimate()307     void estimate()
308     {
309         bcf1_t *v = odw->get_bcf1_from_pool();
310         bcf_hdr_t *h = odr->hdr;
311         Variant variant;
312 
313         int32_t *gts = NULL;
314         int32_t *pls = NULL;
315         int32_t *dps = NULL;
316         int32_t n_gts = 0;
317         int32_t n_pls = 0;
318         int32_t n_dps = 0;
319 
320         odw->write_hdr();
321 
322         while(odr->read(v))
323         {
324             variant.clear();
325             bool printed = false;
326             int32_t vtype = vm->classify_variant(h, v, variant);
327             if (filter_exists && !filter.apply(h,v,&variant))
328             {
329                 continue;
330             }
331 
332             ++no_variants;
333 
334             if (vtype==VT_REF)
335             {
336                 odw->write(v);
337                 ++no_variants_not_computed;
338                 ++no_reference;
339                 continue;
340             }
341 
342 
343             bcf_unpack(v, BCF_UN_ALL);
344             int32_t ploidy = bcf_get_genotypes(odr->hdr, v, &gts, &n_gts);
345             ploidy /= no_samples;
346 
347 //            if (!n_gts)
348 //            {
349 //                for (int32_t i=0; i<NO_EST; ++i)
350 //                {
351 //                    compute_estimate[i] = 0;
352 //                }
353 //
354 //                ++no_variants_missing_dependencies;
355 //            }
356 
357             bcf_get_format_int32(odr->hdr, v, "PL", &pls, &n_pls);
358             int32_t no_alleles = bcf_get_n_allele(v);
359 
360 //            if (!n_pls)
361 //            {
362 //                if (compute_estimate[EST_MLEAF])
363 //                {
364 //                    fprintf(stderr, "[%s:%d %s] No PL informationamples in VCF file: %s\n", __FILE__, __LINE__, __FUNCTION__, input_vcf_file.c_str());
365 //                    exit(1);
366 //                }
367 //                compute_estimate[i] = 0;
368 //            }
369 
370             float qual = 0;
371             int32_t n = 0;
372             if (compute_estimate[EST_QUAL])
373             {
374                 Estimator::compute_qual(pls, no_samples, ploidy, no_alleles, qual, n);
375                 if (n)
376                 {
377                     bcf_set_qual(v, qual);
378                 }
379             }
380 
381             int32_t no_genotypes = bcf_an2gn(no_alleles);
382 
383             if (compute_estimate[EST_AF])
384             {
385                 if (!n_gts)
386                 {
387                     ++no_variants_missing_dependencies;
388                 }
389                 else
390                 {
391                     int32_t g[ploidy];
392                     for (int32_t i=0; i<ploidy; ++i) g[i]=0;
393                     int32_t AC[no_alleles];
394                     float AF[no_alleles];
395                     for (int32_t i=0; i<no_alleles; ++i) {AF[i]=0;AC[i]=0;}
396                     int32_t AN=0;
397                     int32_t NS=0;
398 
399                     int32_t GC[no_genotypes];
400                     int32_t GN=0;
401                     float GF[no_genotypes];
402                     Estimator::compute_af(gts, no_samples, ploidy, no_alleles, AC, AN, AF, GC, GN, GF, NS);
403 
404                     int32_t* AC_PTR = &AC[1];
405                     bcf_update_info_int32(odw->hdr, v, "AC", AC_PTR, no_alleles-1);
406                     bcf_update_info_int32(odw->hdr, v, "AN", &AN, 1);
407                     float* AF_PTR = &AF[1];
408                     bcf_update_info_float(odw->hdr, v, "AF", AF_PTR, no_alleles-1);
409                     if (GN)
410                     {
411                         bcf_update_info_int32(odw->hdr, v, "GC", GC, no_genotypes);
412                         bcf_update_info_int32(odw->hdr, v, "GN", &GN, 1);
413                         bcf_update_info_float(odw->hdr, v, "GF", GF, no_genotypes);
414                     }
415                     bcf_update_info_int32(odw->hdr, v, "NS", &NS, 1);
416                 }
417             }
418 
419             if (compute_estimate[EST_GF])
420             {
421             }
422 
423             float MLE_HWE_AF[no_alleles];
424             float MLE_HWE_GF[no_genotypes];
425             if (compute_estimate[EST_HWEAF])
426             {
427                 if (!n_pls)
428                 {
429                     ++no_variants_missing_dependencies;
430                 }
431                 else
432                 {
433                     n = 0;
434                     Estimator::compute_gl_af_hwe(pls, no_samples, ploidy,no_alleles, MLE_HWE_AF, MLE_HWE_GF,  n, 1e-20);
435                     if (n)
436                     {
437                         float* MLE_HWE_AF_PTR = &MLE_HWE_AF[1];
438                         bcf_update_info_float(odw->hdr, v, "HWEAF", MLE_HWE_AF_PTR, no_alleles-1);
439                         bcf_update_info_float(odw->hdr, v, "HWEGF", &MLE_HWE_GF, no_genotypes);
440                     }
441                 }
442             }
443 
444             float MLE_AF[no_alleles];
445             float MLE_GF[no_genotypes];
446             if (compute_estimate[EST_MLEAF])
447             {
448                 if (!n_pls)
449                 {
450                     ++no_variants_missing_dependencies;
451                     continue;
452                 }
453                 else
454                 {
455                     n = 0;
456                     Estimator::compute_gl_af(pls, no_samples, ploidy,no_alleles, MLE_AF, MLE_GF,  n, 1e-20);
457                     if (n)
458                     {
459                         float* MLE_AF_PTR = &MLE_AF[1];
460                         bcf_update_info_float(odw->hdr, v, "MLEAF", MLE_AF_PTR, no_alleles-1);
461                         bcf_update_info_float(odw->hdr, v, "MLEGF", &MLE_GF, no_genotypes);
462                     }
463                 }
464             }
465 
466             if (compute_estimate[EST_HWE])
467             {
468                 if (!n_pls)
469                 {
470                     ++no_variants_missing_dependencies;
471                     continue;
472                 }
473                 else
474                 {
475                     float lrts;
476                     float logp;
477                     int32_t df;
478                     n = 0;
479                     Estimator::compute_hwe_lrt(pls, no_samples, ploidy,
480                                          no_alleles, MLE_HWE_GF, MLE_GF, n,
481                                          lrts, logp, df);
482                     if (n)
483                     {
484                         bcf_update_info_float(odw->hdr, v, "HWE_LLR", &lrts, 1);
485                         bcf_update_info_float(odw->hdr, v, "HWE_LPVAL", &logp, 1);
486                         bcf_update_info_int32(odw->hdr, v, "HWE_DF", &df, 1);
487                     }
488                 }
489             }
490 
491             if (compute_estimate[EST_FIC])
492             {
493                 if (!n_pls)
494                 {
495                     ++no_variants_missing_dependencies;
496                     continue;
497                 }
498                 else
499                 {
500                     float f;
501                     n = 0;
502                     Estimator::compute_gl_fic(pls, no_samples, ploidy,
503                                        MLE_HWE_AF, no_alleles, MLE_GF,
504                                        f, n);
505                     if (n)
506                     {
507                         bcf_update_info_float(odw->hdr, v, "FIC", &f, 1);
508                     }
509                 }
510             }
511 
512             if (compute_estimate[EST_AB])
513             {
514                 bcf_get_format_int32(odr->hdr, v, "DP", &dps, &n_dps);
515 
516                 if (!n_pls || !n_dps)
517                 {
518                     ++no_variants_missing_dependencies;
519                 }
520                 else
521                 {
522                     float ab;
523                     n = 0;
524                     Estimator::compute_gl_ab(pls, no_samples, ploidy,
525                                        dps, MLE_GF, no_alleles,
526                                        ab, n);
527 
528                     if (n)
529                     {
530                         bcf_update_info_float(odw->hdr, v, "AB", &ab, 1);
531                     }
532                 }
533             }
534 
535             if (print_sites_only)
536             {
537                 bcf_subset(odw->hdr, v, 0, 0);
538             }
539 
540             odw->write(v);
541             ++no_variants_computed;
542         }
543 
544         if(n_gts) free(gts);
545         if(n_pls) free(pls);
546         if(n_dps) free(dps);
547 
548         odw->close();
549     };
550 
print_options()551     void print_options()
552     {
553         std::clog << "estimate v" << version << "\n";
554         std::clog << "\n";
555         std::clog << "Options:     input VCF File    " << input_vcf_file << "\n";
556         print_str_op("         [e] estimates         ", estimates);
557         print_str_op("         [f] filter            ", fexp);
558         print_int_op("         [i] Intervals         ", intervals);
559         std::clog << "\n";
560     }
561 
print_stats()562     void print_stats()
563     {
564         std::clog << "\n";
565         std::clog << "stats: total variant records      " << no_variants << "\n";
566         std::clog << "\n";
567         std::clog << "       variants computed          " << no_variants_computed << "\n";
568         std::clog << "       variants not computed      " << no_variants_not_computed << "\n";
569         std::clog << "           missing dependencies       " << no_variants_missing_dependencies << "\n";
570         std::clog << "           no_reference               " << no_reference << "\n";
571         std::clog << "\n";
572     };
573 
~Igor()574     ~Igor()
575     {
576     };
577 
578     private:
579 };
580 
581 }
582 
estimate(int argc,char ** argv)583 void estimate(int argc, char ** argv)
584 {
585     Igor igor(argc, argv);
586     igor.print_options();
587     igor.initialize();
588     igor.estimate();
589     igor.print_stats();
590 }