1 /* The MIT License
2 
3    Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "view.h"
25 
26 namespace
27 {
28 
29 class Igor : Program
30 {
31     public:
32 
33     ///////////
34     //options//
35     ///////////
36     std::string input_vcf_file;
37     std::string output_vcf_file;
38     int32_t compression_level;
39     std::string streaming_selection_bed_file;
40     uint32_t left_window;
41     uint32_t right_window;
42     std::vector<GenomeInterval> intervals;
43     std::vector<std::string> samples;
44     std::string variant;
45     uint32_t sort_window_size;
46     bool stream_selection;
47     bool print_header;
48     bool print_header_only;
49     bool print_sites_only;
50     bool print;
51     bool debug;
52     int32_t no_subset_samples;
53 
54     ///////
55     //i/o//
56     ///////
57     BCFOrderedReader *odr;
58     BCFOrderedWriter *odw;
59 
60     //////////
61     //filter//
62     //////////
63     std::string fexp;
64     Filter filter;
65     bool filter_exists;
66 
67     /////////
68     //stats//
69     /////////
70     uint32_t no_variants;
71     uint32_t no_samples;
72 
73     /////////
74     //tools//
75     /////////
76     VariantManip *vm;
77     OrderedRegionOverlapMatcher *orom_regions;
78 
Igor(int argc,char ** argv)79     Igor(int argc, char **argv)
80     {
81         version = "0.5";
82 
83         //////////////////////////
84         //options initialization//
85         //////////////////////////
86         try
87         {
88             std::string desc = "Views a VCF or BCF or VCF.GZ file.";
89 
90             TCLAP::CmdLine cmd(desc, ' ', version);
91             VTOutput my;
92             cmd.setOutput(&my);
93             TCLAP::ValueArg<std::string> arg_intervals("i", "i", "intervals []", false, "", "str", cmd);
94             TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "file", cmd);
95             TCLAP::ValueArg<std::string> arg_streaming_selection_bed_file("t", "t", "bed file for variant selection via streaming []", false, "", "file", cmd);
96             TCLAP::ValueArg<int32_t> arg_compression_level("c", "c", "compression level 0-9, 0 and -1 denotes uncompressed with the former being wrapped in bgzf.[6]", false, 6, "int", cmd);
97             TCLAP::ValueArg<uint32_t> arg_left_window("l", "l", "left window size for overlap []", false, 0, "int", cmd);
98             TCLAP::ValueArg<uint32_t> arg_right_window("r", "r", "right window size for overlap []", false, 0, "int", cmd);
99             TCLAP::SwitchArg arg_print("p", "p", "print options and summary [false]", cmd, false);
100             TCLAP::SwitchArg arg_print_header("h", "h", "omit header, this option is honored only for STDOUT [false]", cmd, false);
101             TCLAP::SwitchArg arg_print_header_only("H", "H", "print header only, this option is honored only for STDOUT [false]", cmd, false);
102             TCLAP::SwitchArg arg_print_sites_only("s", "s", "print site information only without genotypes [false]", cmd, false);
103             TCLAP::ValueArg<uint32_t> arg_sort_window_size("w", "w", "local sorting window size [0]", false, 0, "int", cmd);
104             TCLAP::SwitchArg arg_debug("d", "d", "debug [false]", cmd, false);
105             //TCLAP::ValueArg<std::string> arg_sample_list("s", "s", "file containing list of sample []", false, "", "file", cmd);
106             TCLAP::ValueArg<std::string> arg_fexp("f", "f", "filter expression []", false, "", "str", cmd);
107             TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF/VCF.GZ/BCF file [-]", false, "-", "str", cmd);
108             TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "","file", cmd);
109 
110             cmd.parse(argc, argv);
111 
112             input_vcf_file = arg_input_vcf_file.getValue();
113             output_vcf_file = arg_output_vcf_file.getValue();
114             compression_level = arg_compression_level.getValue();
115             parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
116             fexp = arg_fexp.getValue();
117             streaming_selection_bed_file = arg_streaming_selection_bed_file.getValue();
118             left_window = arg_left_window.getValue();
119             right_window = arg_right_window.getValue();
120             print_header = arg_print_header.getValue();
121             print_header_only = arg_print_header_only.getValue();
122             no_subset_samples = arg_print_sites_only.getValue() ? 0 : -1;
123             print = arg_print.getValue();
124             sort_window_size = arg_sort_window_size.getValue();
125             debug = arg_debug.getValue();
126         }
127         catch (TCLAP::ArgException &e)
128         {
129             std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
130             abort();
131         }
132     };
133 
initialize()134     void initialize()
135     {
136         //////////////////////
137         //i/o initialization//
138         //////////////////////
139         stream_selection = false;
140         if (streaming_selection_bed_file != "")
141         {
142             orom_regions = new OrderedRegionOverlapMatcher(streaming_selection_bed_file);
143             stream_selection = true;
144         }
145 
146         odr = new BCFOrderedReader(input_vcf_file, intervals);
147         odw = new BCFOrderedWriter(output_vcf_file, sort_window_size, compression_level);
148         if (no_subset_samples==-1)
149         {
150             odw->link_hdr(odr->hdr);
151         }
152         else if (no_subset_samples==0)
153         {
154             odw->link_hdr(bcf_hdr_subset(odr->hdr, 0, 0, 0));
155         }
156 
157         /////////////////////////
158         //filter initialization//
159         /////////////////////////
160         filter.parse(fexp.c_str(), debug);
161         filter_exists = fexp=="" ? false : true;
162 
163         ////////////////////////
164         //stats initialization//
165         ////////////////////////
166         no_variants = 0;
167         no_samples = 0;
168 
169         ///////////////////////
170         //tool initialization//
171         ///////////////////////
172         vm = new VariantManip("");
173     }
174 
view()175     void view()
176     {
177         if (print_header_only && output_vcf_file == "-")
178         {
179             odw->write_hdr();
180             odr->close();
181             odw->close();
182             return;
183         }
184 
185         if (print_header || output_vcf_file != "-") odw->write_hdr();
186 
187         bcf1_t *v = odw->get_bcf1_from_pool();
188         bcf_hdr_t *h = odr->hdr;
189         Variant variant;
190 
191         while (odr->read(v))
192         {
193 //            bcf_print(h,v);
194 
195             if (stream_selection)
196             {
197                 bcf_unpack(v, BCF_UN_STR);
198                 std::string chrom = bcf_get_chrom(odr->hdr,v);
199                 int32_t start1 = bcf_get_pos1(v);
200                 int32_t end1 = bcf_get_end1(v);
201 
202                 if (!orom_regions->overlaps_with(chrom, start1-left_window, end1+right_window))
203                 {
204                     continue;
205                 }
206             }
207 
208             if (filter_exists)
209             {
210                 vm->classify_variant(h, v, variant);
211                 if (!filter.apply(h, v, &variant, debug))
212                 {
213                     continue;
214                 }
215             }
216 
217             if (no_subset_samples==0)
218             {
219                 bcf_subset(odw->hdr, v, 0, 0);
220                 //maybe add some additional adhoc fixing for BCF files that do not have a complete header.
221             }
222             odw->write(v);
223             if (sort_window_size)
224             {
225                 v = odw->get_bcf1_from_pool();
226             }
227             ++no_variants;
228         }
229 
230         odw->close();
231         odr->close();
232 
233         delete(odw);
234         delete(odr);
235     };
236 
print_options()237     void print_options()
238     {
239         if (!print) return;
240 
241         std::clog << "view v" << version << "\n\n";
242 
243         std::clog << "options:     input VCF file                " << input_vcf_file << "\n";
244         std::clog << "         [o] output VCF file               " << output_vcf_file << "\n";
245         std::clog << "         [w] sort window size              " << sort_window_size << "\n";
246         print_str_op("         [t] streaming selection bed file  ", streaming_selection_bed_file);
247         print_num_op("         [l] left window                   ", left_window);
248         print_num_op("         [r] right window                  ", right_window);
249         std::clog << "         [h] print header                  " << (print_header ? "yes" : "no") << "\n";
250         std::clog << "         [H] print header only             " << (print_header_only ? "yes" : "no") << "\n";
251         std::clog << "         [s] print site information only   " << (print_sites_only ? "yes" : "no") << "\n";
252         std::clog << "         [p] print options and stats       " << (print ? "yes" : "no") << "\n";
253         std::clog << "         [c] compression level             " << compression_level << "\n";
254         print_str_op("         [f] filter                        ", fexp);
255         print_int_op("         [i] intervals                     ", intervals);
256         std::clog << "\n";
257     }
258 
print_stats()259     void print_stats()
260     {
261         if (!print) return;
262 
263         std::clog << "\n";
264         std::clog << "stats: no. variants  : " << no_variants << "\n";
265         std::clog << "       no. samples   : " << no_samples << "\n";
266         std::clog << "\n";
267     };
268 
~Igor()269     ~Igor() {};
270 
271     private:
272 };
273 
274 }
275 
view(int argc,char ** argv)276 bool view(int argc, char ** argv)
277 {
278     Igor igor(argc, argv);
279     igor.print_options();
280     igor.initialize();
281     igor.view();
282     igor.print_stats();
283     return igor.print;
284 };
285