1 /* The MIT License
2
3 Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "annotate_regions.h"
25
26 namespace
27 {
28
29 class Igor : Program
30 {
31 public:
32
33 std::string version;
34
35 ///////////
36 //options//
37 ///////////
38 std::string input_vcf_file;
39 std::string ref_fasta_file;
40 std::string output_vcf_file;
41 std::vector<GenomeInterval> intervals;
42 std::string interval_list;
43 std::string regions_file;
44 std::string REGIONS_TAG;
45 std::string REGIONS_TAG_DESC;
46 std::string REGIONS_LEFT_TAG;
47 std::string REGIONS_LEFT_TAG_DESC;
48 std::string REGIONS_RIGHT_TAG;
49 std::string REGIONS_RIGHT_TAG_DESC;
50 uint32_t left_window;
51 uint32_t right_window;
52 bool without_regions;
53 bool use_bed;
54
55 ///////
56 //i/o//
57 ///////
58 BCFOrderedReader *odr;
59 BCFOrderedWriter *odw;
60
61 //////////
62 //filter//
63 //////////
64 std::vector<std::string> fexps;
65 Filter filter;
66 bool filter_exists;
67
68 /////////
69 //stats//
70 /////////
71 int32_t no_variants_annotated;
72 int32_t no_variants;
73
74 ////////////////
75 //common tools//
76 ////////////////
77 VariantManip *vm;
78 OrderedRegionOverlapMatcher *orom_regions;
79 OrderedBCFOverlapMatcher *obom_regions;
80
Igor(int argc,char ** argv)81 Igor(int argc, char **argv)
82 {
83 version = "0.5";
84
85 //////////////////////////
86 //options initialization//
87 //////////////////////////
88 try
89 {
90 std::string desc = "annotates regions in a VCF file";
91
92 TCLAP::CmdLine cmd(desc, ' ', version);
93 VTOutput my; cmd.setOutput(&my);
94 TCLAP::ValueArg<std::string> arg_intervals("i", "i", "intervals", false, "", "str", cmd);
95 TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "str", cmd);
96 TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF file [-]", false, "-", "str", cmd);
97 TCLAP::ValueArg<std::string> arg_regions_file("b", "b", "regions BED/BCF file []", true, "", "str", cmd);
98 TCLAP::ValueArg<std::string> arg_REGIONS_TAG("t", "t", "regions tag []", true, "", "str", cmd);
99 TCLAP::ValueArg<std::string> arg_REGIONS_TAG_DESC("d", "d", "regions tag description []", true, "", "str", cmd);
100 TCLAP::ValueArg<std::string> arg_fexp("f", "f", "filter expression []", false, "", "str", cmd);
101 TCLAP::ValueArg<uint32_t> arg_left_window("l", "l", "left window size for overlap []", false, 0, "int", cmd);
102 TCLAP::ValueArg<uint32_t> arg_right_window("r", "r", "right window size for overlap []", false, 0, "int", cmd);
103 TCLAP::SwitchArg arg_without_regions("w", "w", "outside of (without) regions", cmd, false);
104 TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "","file", cmd);
105
106 cmd.parse(argc, argv);
107
108 input_vcf_file = arg_input_vcf_file.getValue();
109 output_vcf_file = arg_output_vcf_file.getValue();
110 parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
111 parse_filters(fexps, arg_fexp.getValue(), 2, false);
112 regions_file = arg_regions_file.getValue();
113 left_window = arg_left_window.getValue();
114 right_window = arg_right_window.getValue();
115 without_regions = arg_without_regions.getValue();
116 REGIONS_TAG = arg_REGIONS_TAG.getValue();
117 REGIONS_TAG_DESC = arg_REGIONS_TAG_DESC.getValue();
118 REGIONS_LEFT_TAG = REGIONS_TAG + "_LEFT";
119 REGIONS_RIGHT_TAG = REGIONS_TAG + "_RIGHT";
120 }
121 catch (TCLAP::ArgException &e)
122 {
123 std::cerr << "error: " << e.error() << " for arg " << e.argId() << std::endl;
124 abort();
125 }
126 };
127
~Igor()128 ~Igor() {};
129
initialize()130 void initialize()
131 {
132 ////////////////////
133 //i/o initialization
134 ////////////////////
135 odr = new BCFOrderedReader(input_vcf_file, intervals);
136 odw = new BCFOrderedWriter(output_vcf_file);
137 odw->link_hdr(odr->hdr);
138
139 std::string hrec = "##INFO=<ID=" + REGIONS_TAG + ",Number=0,Type=Flag,Description=\"" + REGIONS_TAG_DESC + "\">";
140 bcf_hdr_append(odw->hdr, hrec.c_str());
141 if (left_window)
142 {
143 std::string hrec = "##INFO=<ID=" + REGIONS_LEFT_TAG + ",Number=0,Type=Flag,Description=\"" + REGIONS_TAG_DESC + " (Left window)\">";
144 bcf_hdr_append(odw->hdr, hrec.c_str());
145 }
146 if (right_window)
147 {
148 std::string hrec = "##INFO=<ID=" + REGIONS_RIGHT_TAG + ",Number=0,Type=Flag,Description=\"" + REGIONS_TAG_DESC + " (Right window)\">";
149 bcf_hdr_append(odw->hdr, hrec.c_str());
150 }
151
152 /////////////////////////
153 //filter initialization//
154 /////////////////////////
155 filter.parse(fexps[0].c_str(), false);
156 filter_exists = fexps[0]=="" ? false : true;
157
158 ///////////////////////
159 //tool initialization//
160 ///////////////////////
161 if (str_ends_with(regions_file, ".bed") || str_ends_with(regions_file, ".bed.gz"))
162 {
163 use_bed = true;
164 orom_regions = new OrderedRegionOverlapMatcher(regions_file);
165 }
166 else if (str_ends_with(regions_file, ".vcf") || str_ends_with(regions_file, ".vcf.gz") || str_ends_with(regions_file, ".bcf"))
167 {
168 use_bed = false;
169 obom_regions = new OrderedBCFOverlapMatcher(regions_file, intervals, fexps[1]);
170 }
171 else
172 {
173 fprintf(stderr, "[%s:%d %s] Need to at least specify either a bed or bcf file\n", __FILE__, __LINE__, __FUNCTION__);
174 exit(1);
175 }
176
177 ////////////////////////
178 //stats initialization//
179 ////////////////////////
180 no_variants_annotated = 0;
181 no_variants = 0;
182 }
183
print_options()184 void print_options()
185 {
186 std::clog << "annotate_regions v" << version << "\n";
187 std::clog << "\n";
188 std::clog << "options: input VCF file(s) " << input_vcf_file << "\n";
189 std::clog << " [o] output VCF file " << output_vcf_file << "\n";
190 print_str_op(" [f] filter 1 ", fexps[0]);
191 print_str_op(" filter 2 ", fexps[1]);
192 print_str_op(" [t] region INFO tag ", REGIONS_TAG);
193 print_str_op(" [d] region INFO description ", REGIONS_TAG_DESC);
194 print_str_op(" [b] regions file ", regions_file);
195 print_boo_op(" [w] without regions ", without_regions);
196 print_num_op(" [l] left window ", left_window);
197 print_num_op(" [r] right window ", right_window);
198 print_int_op(" [i] intervals ", intervals);
199 std::clog << "\n";
200 }
201
print_stats()202 void print_stats()
203 {
204 std::clog << "\n";
205 std::cerr << "stats: no. of variants annotated " << no_variants_annotated << "\n";
206 std::cerr << " total no. of variants " << no_variants << "\n";std::clog << "\n";
207 }
208
annotate_regions()209 void annotate_regions()
210 {
211 odw->write_hdr();
212
213 bcf_hdr_t *h = odr->hdr;
214 bcf1_t *v = bcf_init1();
215 std::vector<Interval*> overlaps;
216 Variant variant;
217 kstring_t s = {0,0,0};
218 while (odr->read(v))
219 {
220 bcf_unpack(v, BCF_UN_STR);
221 if (filter_exists)
222 {
223 vm->classify_variant(h, v, variant);
224 if (!filter.apply(h, v, &variant, false))
225 {
226 continue;
227 }
228 }
229
230 std::string chrom = bcf_get_chrom(odr->hdr,v);
231 int32_t beg1 = bcf_get_pos1(v);
232 int32_t end1 = bcf_get_end1(v);
233
234 if (use_bed)
235 {
236 bool overlaps = orom_regions->overlaps_with(chrom, beg1-left_window, end1+right_window);
237
238 if (!without_regions && overlaps)
239 {
240 if (left_window+right_window)
241 {
242 std::vector<Interval>& regs = orom_regions->overlapping_regions;
243 for (int32_t i=0; i<regs.size(); ++i)
244 {
245 if (beg1>=regs[i].beg1-left_window && beg1<=regs[i].beg1+right_window)
246 {
247 bcf_update_info_flag(odr->hdr, v, REGIONS_LEFT_TAG.c_str(), "", 1);
248 }
249 else if (end1>=regs[i].end1-right_window && end1<=regs[i].end1+right_window)
250 {
251 bcf_update_info_flag(odr->hdr, v, REGIONS_RIGHT_TAG.c_str(), "", 1);
252 }
253 }
254 }
255
256 bcf_update_info_flag(odr->hdr, v, REGIONS_TAG.c_str(), "", 1);
257 ++no_variants_annotated;
258 }
259 else if (without_regions && !overlaps)
260 {
261 bcf_update_info_flag(odr->hdr, v, REGIONS_TAG.c_str(), "", 1);
262 ++no_variants_annotated;
263 }
264 }
265 else
266 {
267 bool overlaps = orom_regions->overlaps_with(chrom, beg1-left_window, end1+right_window);
268
269 if (!without_regions && overlaps)
270 {
271 bcf_update_info_flag(odr->hdr, v, REGIONS_TAG.c_str(), "", 1);
272 ++no_variants_annotated;
273 }
274 else if (without_regions && !overlaps)
275 {
276 bcf_update_info_flag(odr->hdr, v, REGIONS_TAG.c_str(), "", 1);
277 ++no_variants_annotated;
278 }
279 }
280
281 ++no_variants;
282 odw->write(v);
283 }
284
285 odw->close();
286 };
287
288 private:
289 };
290 }
291
annotate_regions(int argc,char ** argv)292 void annotate_regions(int argc, char ** argv)
293 {
294 Igor igor(argc, argv);
295 igor.print_options();
296 igor.initialize();
297 igor.annotate_regions();
298 igor.print_stats();
299 };