1 /* The MIT License
2 
3    Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "annotate_regions.h"
25 
26 namespace
27 {
28 
29 class Igor : Program
30 {
31     public:
32 
33     std::string version;
34 
35     ///////////
36     //options//
37     ///////////
38     std::string input_vcf_file;
39     std::string ref_fasta_file;
40     std::string output_vcf_file;
41     std::vector<GenomeInterval> intervals;
42     std::string interval_list;
43     std::string regions_file;
44     std::string REGIONS_TAG;
45     std::string REGIONS_TAG_DESC;
46     std::string REGIONS_LEFT_TAG;
47     std::string REGIONS_LEFT_TAG_DESC;
48     std::string REGIONS_RIGHT_TAG;
49     std::string REGIONS_RIGHT_TAG_DESC;
50     uint32_t left_window;
51     uint32_t right_window;
52     bool without_regions;
53     bool use_bed;
54 
55     ///////
56     //i/o//
57     ///////
58     BCFOrderedReader *odr;
59     BCFOrderedWriter *odw;
60 
61     //////////
62     //filter//
63     //////////
64     std::vector<std::string> fexps;
65     Filter filter;
66     bool filter_exists;
67 
68     /////////
69     //stats//
70     /////////
71     int32_t no_variants_annotated;
72     int32_t no_variants;
73 
74     ////////////////
75     //common tools//
76     ////////////////
77     VariantManip *vm;
78     OrderedRegionOverlapMatcher *orom_regions;
79     OrderedBCFOverlapMatcher *obom_regions;
80 
Igor(int argc,char ** argv)81     Igor(int argc, char **argv)
82     {
83         version = "0.5";
84 
85         //////////////////////////
86         //options initialization//
87         //////////////////////////
88         try
89         {
90             std::string desc = "annotates regions in a VCF file";
91 
92             TCLAP::CmdLine cmd(desc, ' ', version);
93             VTOutput my; cmd.setOutput(&my);
94             TCLAP::ValueArg<std::string> arg_intervals("i", "i", "intervals", false, "", "str", cmd);
95             TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "str", cmd);
96             TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF file [-]", false, "-", "str", cmd);
97             TCLAP::ValueArg<std::string> arg_regions_file("b", "b", "regions BED/BCF file []", true, "", "str", cmd);
98             TCLAP::ValueArg<std::string> arg_REGIONS_TAG("t", "t", "regions tag []", true, "", "str", cmd);
99             TCLAP::ValueArg<std::string> arg_REGIONS_TAG_DESC("d", "d", "regions tag description []", true, "", "str", cmd);
100             TCLAP::ValueArg<std::string> arg_fexp("f", "f", "filter expression []", false, "", "str", cmd);
101             TCLAP::ValueArg<uint32_t> arg_left_window("l", "l", "left window size for overlap []", false, 0, "int", cmd);
102             TCLAP::ValueArg<uint32_t> arg_right_window("r", "r", "right window size for overlap []", false, 0, "int", cmd);
103             TCLAP::SwitchArg arg_without_regions("w", "w", "outside of (without) regions", cmd, false);
104             TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "","file", cmd);
105 
106             cmd.parse(argc, argv);
107 
108             input_vcf_file = arg_input_vcf_file.getValue();
109             output_vcf_file = arg_output_vcf_file.getValue();
110             parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
111             parse_filters(fexps, arg_fexp.getValue(), 2, false);
112             regions_file = arg_regions_file.getValue();
113             left_window = arg_left_window.getValue();
114             right_window = arg_right_window.getValue();
115             without_regions = arg_without_regions.getValue();
116             REGIONS_TAG = arg_REGIONS_TAG.getValue();
117             REGIONS_TAG_DESC = arg_REGIONS_TAG_DESC.getValue();
118             REGIONS_LEFT_TAG = REGIONS_TAG + "_LEFT";
119             REGIONS_RIGHT_TAG = REGIONS_TAG + "_RIGHT";
120         }
121         catch (TCLAP::ArgException &e)
122         {
123             std::cerr << "error: " << e.error() << " for arg " << e.argId() << std::endl;
124             abort();
125         }
126     };
127 
~Igor()128     ~Igor() {};
129 
initialize()130     void initialize()
131     {
132         ////////////////////
133         //i/o initialization
134         ////////////////////
135         odr = new BCFOrderedReader(input_vcf_file, intervals);
136         odw = new BCFOrderedWriter(output_vcf_file);
137         odw->link_hdr(odr->hdr);
138 
139         std::string hrec = "##INFO=<ID=" + REGIONS_TAG + ",Number=0,Type=Flag,Description=\"" + REGIONS_TAG_DESC + "\">";
140         bcf_hdr_append(odw->hdr, hrec.c_str());
141         if (left_window)
142         {
143             std::string hrec = "##INFO=<ID=" + REGIONS_LEFT_TAG + ",Number=0,Type=Flag,Description=\"" + REGIONS_TAG_DESC + " (Left window)\">";
144             bcf_hdr_append(odw->hdr, hrec.c_str());
145         }
146         if (right_window)
147         {
148             std::string hrec = "##INFO=<ID=" + REGIONS_RIGHT_TAG + ",Number=0,Type=Flag,Description=\"" + REGIONS_TAG_DESC + " (Right window)\">";
149             bcf_hdr_append(odw->hdr, hrec.c_str());
150         }
151 
152         /////////////////////////
153         //filter initialization//
154         /////////////////////////
155         filter.parse(fexps[0].c_str(), false);
156         filter_exists = fexps[0]=="" ? false : true;
157 
158         ///////////////////////
159         //tool initialization//
160         ///////////////////////
161         if (str_ends_with(regions_file, ".bed") || str_ends_with(regions_file, ".bed.gz"))
162         {
163             use_bed = true;
164             orom_regions = new OrderedRegionOverlapMatcher(regions_file);
165         }
166         else if (str_ends_with(regions_file, ".vcf") || str_ends_with(regions_file, ".vcf.gz") ||  str_ends_with(regions_file, ".bcf"))
167         {
168             use_bed = false;
169             obom_regions = new OrderedBCFOverlapMatcher(regions_file, intervals, fexps[1]);
170         }
171         else
172         {
173             fprintf(stderr, "[%s:%d %s] Need to at least specify either a bed or bcf file\n", __FILE__, __LINE__, __FUNCTION__);
174             exit(1);
175         }
176 
177         ////////////////////////
178         //stats initialization//
179         ////////////////////////
180         no_variants_annotated = 0;
181         no_variants = 0;
182     }
183 
print_options()184     void print_options()
185     {
186         std::clog << "annotate_regions v" << version << "\n";
187         std::clog << "\n";
188         std::clog << "options:     input VCF file(s)       " << input_vcf_file << "\n";
189         std::clog << "         [o] output VCF file         " << output_vcf_file << "\n";
190         print_str_op("         [f] filter 1                ", fexps[0]);
191         print_str_op("             filter 2                ", fexps[1]);
192         print_str_op("         [t] region INFO tag         ", REGIONS_TAG);
193         print_str_op("         [d] region INFO description ", REGIONS_TAG_DESC);
194         print_str_op("         [b] regions file            ", regions_file);
195         print_boo_op("         [w] without regions         ", without_regions);
196         print_num_op("         [l] left window             ", left_window);
197         print_num_op("         [r] right window            ", right_window);
198         print_int_op("         [i] intervals               ", intervals);
199         std::clog << "\n";
200     }
201 
print_stats()202     void print_stats()
203     {
204         std::clog << "\n";
205         std::cerr << "stats: no. of variants annotated     " << no_variants_annotated << "\n";
206         std::cerr << "       total no. of variants         " << no_variants << "\n";std::clog << "\n";
207     }
208 
annotate_regions()209     void annotate_regions()
210     {
211         odw->write_hdr();
212 
213         bcf_hdr_t *h = odr->hdr;
214         bcf1_t *v = bcf_init1();
215         std::vector<Interval*> overlaps;
216         Variant variant;
217         kstring_t s = {0,0,0};
218         while (odr->read(v))
219         {
220             bcf_unpack(v, BCF_UN_STR);
221             if (filter_exists)
222             {
223                 vm->classify_variant(h, v, variant);
224                 if (!filter.apply(h, v, &variant, false))
225                 {
226                     continue;
227                 }
228             }
229 
230             std::string chrom = bcf_get_chrom(odr->hdr,v);
231             int32_t beg1 = bcf_get_pos1(v);
232             int32_t end1 = bcf_get_end1(v);
233 
234             if (use_bed)
235             {
236                 bool overlaps = orom_regions->overlaps_with(chrom, beg1-left_window, end1+right_window);
237 
238                 if (!without_regions && overlaps)
239                 {
240                     if (left_window+right_window)
241                     {
242                         std::vector<Interval>& regs = orom_regions->overlapping_regions;
243                         for (int32_t i=0; i<regs.size(); ++i)
244                         {
245                             if (beg1>=regs[i].beg1-left_window && beg1<=regs[i].beg1+right_window)
246                             {
247                                 bcf_update_info_flag(odr->hdr, v, REGIONS_LEFT_TAG.c_str(), "", 1);
248                             }
249                             else if (end1>=regs[i].end1-right_window && end1<=regs[i].end1+right_window)
250                             {
251                                 bcf_update_info_flag(odr->hdr, v, REGIONS_RIGHT_TAG.c_str(), "", 1);
252                             }
253                         }
254                     }
255 
256                     bcf_update_info_flag(odr->hdr, v, REGIONS_TAG.c_str(), "", 1);
257                     ++no_variants_annotated;
258                 }
259                 else if (without_regions && !overlaps)
260                 {
261                     bcf_update_info_flag(odr->hdr, v, REGIONS_TAG.c_str(), "", 1);
262                     ++no_variants_annotated;
263                 }
264             }
265             else
266             {
267                 bool overlaps = orom_regions->overlaps_with(chrom, beg1-left_window, end1+right_window);
268 
269                 if (!without_regions && overlaps)
270                 {
271                     bcf_update_info_flag(odr->hdr, v, REGIONS_TAG.c_str(), "", 1);
272                     ++no_variants_annotated;
273                 }
274                 else if (without_regions && !overlaps)
275                 {
276                     bcf_update_info_flag(odr->hdr, v, REGIONS_TAG.c_str(), "", 1);
277                     ++no_variants_annotated;
278                 }
279             }
280 
281             ++no_variants;
282             odw->write(v);
283         }
284 
285         odw->close();
286     };
287 
288     private:
289 };
290 }
291 
annotate_regions(int argc,char ** argv)292 void annotate_regions(int argc, char ** argv)
293 {
294     Igor igor(argc, argv);
295     igor.print_options();
296     igor.initialize();
297     igor.annotate_regions();
298     igor.print_stats();
299 };