1 /* The MIT License
2
3 Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "view.h"
25
26 namespace
27 {
28
29 class Igor : Program
30 {
31 public:
32
33 ///////////
34 //options//
35 ///////////
36 std::string input_vcf_file;
37 std::string output_vcf_file;
38 int32_t compression_level;
39 std::string streaming_selection_bed_file;
40 uint32_t left_window;
41 uint32_t right_window;
42 std::vector<GenomeInterval> intervals;
43 std::vector<std::string> samples;
44 std::string variant;
45 uint32_t sort_window_size;
46 bool stream_selection;
47 bool print_header;
48 bool print_header_only;
49 bool print_sites_only;
50 bool print;
51 bool debug;
52 int32_t no_subset_samples;
53
54 ///////
55 //i/o//
56 ///////
57 BCFOrderedReader *odr;
58 BCFOrderedWriter *odw;
59
60 //////////
61 //filter//
62 //////////
63 std::string fexp;
64 Filter filter;
65 bool filter_exists;
66
67 /////////
68 //stats//
69 /////////
70 uint32_t no_variants;
71 uint32_t no_samples;
72
73 /////////
74 //tools//
75 /////////
76 VariantManip *vm;
77 OrderedRegionOverlapMatcher *orom_regions;
78
Igor(int argc,char ** argv)79 Igor(int argc, char **argv)
80 {
81 version = "0.5";
82
83 //////////////////////////
84 //options initialization//
85 //////////////////////////
86 try
87 {
88 std::string desc = "Views a VCF or BCF or VCF.GZ file.";
89
90 TCLAP::CmdLine cmd(desc, ' ', version);
91 VTOutput my;
92 cmd.setOutput(&my);
93 TCLAP::ValueArg<std::string> arg_intervals("i", "i", "intervals []", false, "", "str", cmd);
94 TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "file", cmd);
95 TCLAP::ValueArg<std::string> arg_streaming_selection_bed_file("t", "t", "bed file for variant selection via streaming []", false, "", "file", cmd);
96 TCLAP::ValueArg<int32_t> arg_compression_level("c", "c", "compression level 0-9, 0 and -1 denotes uncompressed with the former being wrapped in bgzf.[6]", false, 6, "int", cmd);
97 TCLAP::ValueArg<uint32_t> arg_left_window("l", "l", "left window size for overlap []", false, 0, "int", cmd);
98 TCLAP::ValueArg<uint32_t> arg_right_window("r", "r", "right window size for overlap []", false, 0, "int", cmd);
99 TCLAP::SwitchArg arg_print("p", "p", "print options and summary [false]", cmd, false);
100 TCLAP::SwitchArg arg_print_header("h", "h", "omit header, this option is honored only for STDOUT [false]", cmd, false);
101 TCLAP::SwitchArg arg_print_header_only("H", "H", "print header only, this option is honored only for STDOUT [false]", cmd, false);
102 TCLAP::SwitchArg arg_print_sites_only("s", "s", "print site information only without genotypes [false]", cmd, false);
103 TCLAP::ValueArg<uint32_t> arg_sort_window_size("w", "w", "local sorting window size [0]", false, 0, "int", cmd);
104 TCLAP::SwitchArg arg_debug("d", "d", "debug [false]", cmd, false);
105 //TCLAP::ValueArg<std::string> arg_sample_list("s", "s", "file containing list of sample []", false, "", "file", cmd);
106 TCLAP::ValueArg<std::string> arg_fexp("f", "f", "filter expression []", false, "", "str", cmd);
107 TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF/VCF.GZ/BCF file [-]", false, "-", "str", cmd);
108 TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "","file", cmd);
109
110 cmd.parse(argc, argv);
111
112 input_vcf_file = arg_input_vcf_file.getValue();
113 output_vcf_file = arg_output_vcf_file.getValue();
114 compression_level = arg_compression_level.getValue();
115 parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
116 fexp = arg_fexp.getValue();
117 streaming_selection_bed_file = arg_streaming_selection_bed_file.getValue();
118 left_window = arg_left_window.getValue();
119 right_window = arg_right_window.getValue();
120 print_header = arg_print_header.getValue();
121 print_header_only = arg_print_header_only.getValue();
122 no_subset_samples = arg_print_sites_only.getValue() ? 0 : -1;
123 print = arg_print.getValue();
124 sort_window_size = arg_sort_window_size.getValue();
125 debug = arg_debug.getValue();
126 }
127 catch (TCLAP::ArgException &e)
128 {
129 std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
130 abort();
131 }
132 };
133
initialize()134 void initialize()
135 {
136 //////////////////////
137 //i/o initialization//
138 //////////////////////
139 stream_selection = false;
140 if (streaming_selection_bed_file != "")
141 {
142 orom_regions = new OrderedRegionOverlapMatcher(streaming_selection_bed_file);
143 stream_selection = true;
144 }
145
146 odr = new BCFOrderedReader(input_vcf_file, intervals);
147 odw = new BCFOrderedWriter(output_vcf_file, sort_window_size, compression_level);
148 if (no_subset_samples==-1)
149 {
150 odw->link_hdr(odr->hdr);
151 }
152 else if (no_subset_samples==0)
153 {
154 odw->link_hdr(bcf_hdr_subset(odr->hdr, 0, 0, 0));
155 }
156
157 /////////////////////////
158 //filter initialization//
159 /////////////////////////
160 filter.parse(fexp.c_str(), debug);
161 filter_exists = fexp=="" ? false : true;
162
163 ////////////////////////
164 //stats initialization//
165 ////////////////////////
166 no_variants = 0;
167 no_samples = 0;
168
169 ///////////////////////
170 //tool initialization//
171 ///////////////////////
172 vm = new VariantManip("");
173 }
174
view()175 void view()
176 {
177 if (print_header_only && output_vcf_file == "-")
178 {
179 odw->write_hdr();
180 odr->close();
181 odw->close();
182 return;
183 }
184
185 if (print_header || output_vcf_file != "-") odw->write_hdr();
186
187 bcf1_t *v = odw->get_bcf1_from_pool();
188 bcf_hdr_t *h = odr->hdr;
189 Variant variant;
190
191 while (odr->read(v))
192 {
193 // bcf_print(h,v);
194
195 if (stream_selection)
196 {
197 bcf_unpack(v, BCF_UN_STR);
198 std::string chrom = bcf_get_chrom(odr->hdr,v);
199 int32_t start1 = bcf_get_pos1(v);
200 int32_t end1 = bcf_get_end1(v);
201
202 if (!orom_regions->overlaps_with(chrom, start1-left_window, end1+right_window))
203 {
204 continue;
205 }
206 }
207
208 if (filter_exists)
209 {
210 vm->classify_variant(h, v, variant);
211 if (!filter.apply(h, v, &variant, debug))
212 {
213 continue;
214 }
215 }
216
217 if (no_subset_samples==0)
218 {
219 bcf_subset(odw->hdr, v, 0, 0);
220 //maybe add some additional adhoc fixing for BCF files that do not have a complete header.
221 }
222 odw->write(v);
223 if (sort_window_size)
224 {
225 v = odw->get_bcf1_from_pool();
226 }
227 ++no_variants;
228 }
229
230 odw->close();
231 odr->close();
232
233 delete(odw);
234 delete(odr);
235 };
236
print_options()237 void print_options()
238 {
239 if (!print) return;
240
241 std::clog << "view v" << version << "\n\n";
242
243 std::clog << "options: input VCF file " << input_vcf_file << "\n";
244 std::clog << " [o] output VCF file " << output_vcf_file << "\n";
245 std::clog << " [w] sort window size " << sort_window_size << "\n";
246 print_str_op(" [t] streaming selection bed file ", streaming_selection_bed_file);
247 print_num_op(" [l] left window ", left_window);
248 print_num_op(" [r] right window ", right_window);
249 std::clog << " [h] print header " << (print_header ? "yes" : "no") << "\n";
250 std::clog << " [H] print header only " << (print_header_only ? "yes" : "no") << "\n";
251 std::clog << " [s] print site information only " << (print_sites_only ? "yes" : "no") << "\n";
252 std::clog << " [p] print options and stats " << (print ? "yes" : "no") << "\n";
253 std::clog << " [c] compression level " << compression_level << "\n";
254 print_str_op(" [f] filter ", fexp);
255 print_int_op(" [i] intervals ", intervals);
256 std::clog << "\n";
257 }
258
print_stats()259 void print_stats()
260 {
261 if (!print) return;
262
263 std::clog << "\n";
264 std::clog << "stats: no. variants : " << no_variants << "\n";
265 std::clog << " no. samples : " << no_samples << "\n";
266 std::clog << "\n";
267 };
268
~Igor()269 ~Igor() {};
270
271 private:
272 };
273
274 }
275
view(int argc,char ** argv)276 bool view(int argc, char ** argv)
277 {
278 Igor igor(argc, argv);
279 igor.print_options();
280 igor.initialize();
281 igor.view();
282 igor.print_stats();
283 return igor.print;
284 };
285