1 /* The MIT License
2 
3    Copyright (c) 2015 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "compute_features2.h"
25 
26 namespace
27 {
28 
29 class Igor : Program
30 {
31     public:
32 
33     std::string version;
34 
35     ///////////
36     //options//
37     ///////////
38     std::string input_vcf_file;
39     std::string output_vcf_file;
40     std::vector<GenomeInterval> intervals;
41     std::string interval_list;
42     std::string arg_sample_list;
43     char** samples;
44     int32_t *imap;
45     int32_t nsamples;
46     bool print_sites_only;
47 
48     ///////
49     //i/o//
50     ///////
51     BCFOrderedReader *odr;
52     BCFOrderedWriter *odw;
53 
54     //////////
55     //filter//
56     //////////
57     std::string fexp;
58     Filter filter;
59     bool filter_exists;
60 
61     /////////
62     //stats//
63     /////////
64     int32_t no_samples;
65     int32_t no_variants;
66 
67     /////////
68     //tools//
69     /////////
70     VariantManip *vm;
71 
Igor(int argc,char ** argv)72     Igor(int argc, char ** argv)
73     {
74         version = "0.5";
75 
76         //////////////////////////
77         //options initialization//
78         //////////////////////////
79         try
80         {
81             std::string desc = "Compute features for variants in vt pipeline from separate genotype files";
82 
83             TCLAP::CmdLine cmd(desc, ' ', version);
84             VTOutput my;
85             cmd.setOutput(&my);
86             TCLAP::ValueArg<std::string> arg_intervals("i", "intervals", "Intervals", false, "", "str", cmd);
87             TCLAP::ValueArg<std::string> arg_interval_list("I", "interval-list", "File containing list of intervals", false, "", "file", cmd);
88             TCLAP::ValueArg<std::string> arg_fexp("f", "f", "filter expression []", false, "", "str", cmd);
89             TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF/VCF.GZ/BCF file [-]", false, "-", "str", cmd);
90             TCLAP::SwitchArg arg_print_sites_only("s", "s", "print site information only without genotypes [false]", cmd, false);
91             TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "", "file", cmd);
92 
93             cmd.parse(argc, argv);
94 
95             input_vcf_file = arg_input_vcf_file.getValue();
96             output_vcf_file = arg_output_vcf_file.getValue();
97             parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
98             print_sites_only = arg_print_sites_only.getValue();
99             fexp = arg_fexp.getValue();
100         }
101         catch (TCLAP::ArgException &e)
102         {
103             std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
104             abort();
105         }
106     };
107 
initialize()108     void initialize()
109     {
110         //convert to paste like set up
111 
112         //////////////////////
113         //i/o initialization//
114         //////////////////////
115         odr = new BCFOrderedReader(input_vcf_file, intervals);
116         odw = new BCFOrderedWriter(output_vcf_file);
117         if (print_sites_only)
118         {
119             odw->link_hdr(bcf_hdr_subset(odr->hdr, 0, 0, 0));
120         }
121         else
122         {
123             odw->link_hdr(odr->hdr);
124         }
125 
126         bcf_hdr_append(odw->hdr, "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate Allele Counts\">\n");
127         bcf_hdr_append(odw->hdr, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total Number Allele Counts\">\n");
128         bcf_hdr_append(odw->hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">\n");
129         bcf_hdr_append(odw->hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Alternate Allele Frequency\">\n");
130         bcf_hdr_append(odw->hdr, "##INFO=<ID=GC,Number=G,Type=Integer,Description=\"Genotype Counts\">\n");
131         bcf_hdr_append(odw->hdr, "##INFO=<ID=GN,Number=1,Type=Integer,Description=\"Total Number of Genotypes Counts\">\n");
132         bcf_hdr_append(odw->hdr, "##INFO=<ID=GF,Number=G,Type=Float,Description=\"Genotype Frequency\">\n");
133 
134         bcf_hdr_append(odw->hdr, "##INFO=<ID=HWEAF,Number=A,Type=Float,Description=\"Genotype likelihood based MLE Allele Frequency assuming HWE\">\n");
135         bcf_hdr_append(odw->hdr, "##INFO=<ID=HWEGF,Number=G,Type=Float,Description=\"Genotype likelihood based MLE Genotype Frequency assuming HWE\">\n");
136         bcf_hdr_append(odw->hdr, "##INFO=<ID=MLEAF,Number=A,Type=Float,Description=\"Genotype likelihood based MLE Allele Frequency\">\n");
137         bcf_hdr_append(odw->hdr, "##INFO=<ID=MLEGF,Number=G,Type=Float,Description=\"Genotype likelihood based MLE Genotype Frequency\">\n");
138         bcf_hdr_append(odw->hdr, "##INFO=<ID=HWE_LLR,Number=1,Type=Float,Description=\"Genotype likelihood based Hardy Weinberg ln(Likelihood Ratio)\">\n");
139         bcf_hdr_append(odw->hdr, "##INFO=<ID=HWE_LPVAL,Number=1,Type=Float,Description=\"Genotype likelihood based Hardy Weinberg Likelihood Ratio Test Statistic ln(p-value)\">\n");
140         bcf_hdr_append(odw->hdr, "##INFO=<ID=HWE_DF,Number=1,Type=Integer,Description=\"Degrees of freedom for Genotype likelihood based Hardy Weinberg Likelihood Ratio Test Statistic\">\n");
141         bcf_hdr_append(odw->hdr, "##INFO=<ID=FIC,Number=1,Type=Float,Description=\"Genotype likelihood based Inbreeding Coefficient\">\n");
142         bcf_hdr_append(odw->hdr, "##INFO=<ID=AB,Number=1,Type=Float,Description=\"Genotype likelihood based Allele Balance\">\n");
143 
144         /////////////////////////
145         //filter initialization//
146         /////////////////////////
147         filter.parse(fexp.c_str());
148         filter_exists = fexp=="" ? false : true;
149 
150         ////////////////////////
151         //stats initialization//
152         ////////////////////////
153         no_samples = bcf_hdr_nsamples(odr->hdr);
154         no_variants = 0;
155 
156         if (!no_samples)
157         {
158             fprintf(stderr, "[%s:%d %s] No samples in VCF file: %s\n", __FILE__, __LINE__, __FUNCTION__, input_vcf_file.c_str());
159             exit(1);
160         }
161 
162         ///////////////////////
163         //tool initialization//
164         ///////////////////////
165         vm = new VariantManip("");
166     }
167 
compute_features()168     void compute_features()
169     {
170         bcf1_t *v = odw->get_bcf1_from_pool();
171         bcf_hdr_t *h = odr->hdr;
172         Variant variant;
173 
174         int32_t *gts = NULL;
175         int32_t *pls = NULL;
176         int32_t *dps = NULL;
177         int32_t n_gts = 0;
178         int32_t n_pls = 0;
179         int32_t n_dps = 0;
180 
181         odw->write_hdr();
182 
183         while(odr->read(v))
184         {
185             variant.clear();
186             bool printed = false;
187 
188             vm->classify_variant(h, v, variant);
189             if (filter_exists && !filter.apply(h,v,&variant))
190             {
191                 continue;
192             }
193 
194             bcf_unpack(v, BCF_UN_ALL);
195             int32_t ploidy = bcf_get_genotypes(odr->hdr, v, &gts, &n_gts);
196             ploidy /= no_samples;
197 
198             if (!n_gts)
199             {
200                 continue;
201             }
202 
203             bcf_get_format_int32(odr->hdr, v, "PL", &pls, &n_pls);
204             int32_t no_alleles = bcf_get_n_allele(v);
205 
206             if (!n_pls)
207             {
208                 continue;
209             }
210 
211             float qual = 0;
212             int32_t n = 0;
213             Estimator::compute_qual(pls, no_samples, ploidy, no_alleles, qual, n);
214             if (n)
215             {
216                 bcf_set_qual(v, qual);
217             }
218 
219             int32_t g[ploidy];
220             for (int32_t i=0; i<ploidy; ++i) g[i]=0;
221             int32_t AC[no_alleles];
222             float AF[no_alleles];
223             for (int32_t i=0; i<no_alleles; ++i) AC[i]=0;
224             int32_t AN=0;
225             int32_t NS=0;
226             int32_t no_genotypes = bcf_an2gn(no_alleles);
227             int32_t GC[no_genotypes];
228             int32_t GN=0;
229             float GF[no_genotypes];
230 
231             Estimator::compute_af(gts, no_samples, ploidy, no_alleles, AC, AN, AF, GC, GN, GF, NS);
232 
233             if (NS)
234             {
235                 int32_t* AC_PTR = &AC[1];
236                 bcf_update_info_int32(odw->hdr, v, "AC", AC_PTR, no_alleles-1);
237                 bcf_update_info_int32(odw->hdr, v, "AN", &AN, 1);
238                 float* AF_PTR = &AF[1];
239                 bcf_update_info_float(odw->hdr, v, "AF", AF_PTR, no_alleles-1);
240                 if (GN)
241                 {
242                     bcf_update_info_int32(odw->hdr, v, "GC", GC, no_genotypes);
243                     bcf_update_info_int32(odw->hdr, v, "GN", &GN, 1);
244                     bcf_update_info_float(odw->hdr, v, "GF", GF, no_genotypes);
245                 }
246                 bcf_update_info_int32(odw->hdr, v, "NS", &NS, 1);
247             }
248 
249             float MLE_HWE_AF[no_alleles];
250             float MLE_HWE_GF[no_genotypes];
251             n = 0;
252             Estimator::compute_gl_af_hwe(pls, no_samples, ploidy,no_alleles, MLE_HWE_AF, MLE_HWE_GF,  n, 1e-20);
253             if (n)
254             {
255                 float* MLE_HWE_AF_PTR = &MLE_HWE_AF[1];
256                 bcf_update_info_float(odw->hdr, v, "HWEAF", MLE_HWE_AF_PTR, no_alleles-1);
257                 bcf_update_info_float(odw->hdr, v, "HWEGF", &MLE_HWE_GF, no_genotypes);
258             }
259 
260             float MLE_AF[no_alleles];
261             float MLE_GF[no_genotypes];
262             n = 0;
263             Estimator::compute_gl_af(pls, no_samples, ploidy,no_alleles, MLE_AF, MLE_GF,  n, 1e-20);
264             if (n)
265             {
266                 float* MLE_AF_PTR = &MLE_AF[1];
267                 bcf_update_info_float(odw->hdr, v, "MLEAF", MLE_AF_PTR, no_alleles-1);
268                 bcf_update_info_float(odw->hdr, v, "MLEGF", &MLE_GF, no_genotypes);
269             }
270 
271             float lrts;
272             float logp;
273             int32_t df;
274             n = 0;
275             Estimator::compute_hwe_lrt(pls, no_samples, ploidy,
276                                  no_alleles, MLE_HWE_GF, MLE_GF, n,
277                                  lrts, logp, df);
278             if (n)
279             {
280                 bcf_update_info_float(odw->hdr, v, "HWE_LLR", &lrts, 1);
281                 bcf_update_info_float(odw->hdr, v, "HWE_LPVAL", &logp, 1);
282                 bcf_update_info_int32(odw->hdr, v, "HWE_DF", &df, 1);
283             }
284 
285             float f;
286             n = 0;
287             Estimator::compute_gl_fic(pls, no_samples, ploidy,
288                                MLE_HWE_AF, no_alleles, MLE_GF,
289                                f, n);
290             if (n)
291             {
292                 bcf_update_info_float(odw->hdr, v, "FIC", &f, 1);
293             }
294 
295             bcf_get_format_int32(odr->hdr, v, "DP", &dps, &n_dps);
296             if (n_dps)
297             {
298                 float ab;
299                 n = 0;
300                 Estimator::compute_gl_ab(pls, no_samples, ploidy,
301                                    dps, MLE_GF, no_alleles,
302                                    ab, n);
303 
304                 if (n)
305                 {
306                     bcf_update_info_float(odw->hdr, v, "AB", &ab, 1);
307                 }
308             }
309 
310             if (print_sites_only)
311             {
312                 bcf_subset(odw->hdr, v, 0, 0);
313             }
314 
315             odw->write(v);
316             ++no_variants;
317         }
318 
319         if(n_gts) free(gts);
320         if(n_pls) free(pls);
321         if(n_dps) free(dps);
322 
323         odw->close();
324     };
325 
print_options()326     void print_options()
327     {
328         std::clog << "compute_features v" << version << "\n";
329         std::clog << "\n";
330         std::clog << "Options:     input VCF File    " << input_vcf_file << "\n";
331         print_str_op("         [f] filter            ", fexp);
332         print_int_op("         [i] Intervals         ", intervals);
333         std::clog << "\n";
334     }
335 
print_stats()336     void print_stats()
337     {
338         std::clog << "\n";
339         std::clog << "stats: variants   : " << no_variants << "\n";
340         std::clog << "\n";
341     };
342 
~Igor()343     ~Igor()
344     {
345     };
346 
347     private:
348 };
349 
350 }
351 
compute_features2(int argc,char ** argv)352 void compute_features2(int argc, char ** argv)
353 {
354     Igor igor(argc, argv);
355     igor.print_options();
356     igor.initialize();
357     igor.compute_features();
358     igor.print_stats();
359 }
360 
361