1 /* The MIT License
2
3 Copyright (c) 2015 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "merge_genotypes.h"
25
26 #define SINGLE 0
27 #define AGGREGATED 1
28
29 namespace
30 {
31
32 class Igor : Program
33 {
34 public:
35
36 std::string version;
37
38 ///////////
39 //options//
40 ///////////
41 std::vector<std::string> input_vcf_files;
42 std::string input_vcf_file_list;
43 std::string output_vcf_file;
44 std::string candidate_sites_vcf_file;
45 std::vector<GenomeInterval> intervals;
46 std::string interval_list;
47
48 ///////
49 //i/o//
50 ///////
51 BCFSyncedReader *sr;
52 BCFOrderedWriter *odw;
53 bcf1_t *v;
54
55 ///////////////
56 //general use//
57 ///////////////
58 std::vector<int32_t> file_types;
59 kstring_t variant;
60
61 //////////
62 //filter//
63 //////////
64 std::string fexp;
65 Filter filter;
66 bool filter_exists;
67
68 /////////
69 //stats//
70 /////////
71 uint32_t no_samples;
72 uint32_t no_snps;
73 uint32_t no_indels;
74 uint32_t no_vntrs;
75
76 /////////
77 //tools//
78 /////////
79 VariantManip * vm;
80
Igor(int argc,char ** argv)81 Igor(int argc, char ** argv)
82 {
83 //////////////////////////
84 //options initialization//
85 //////////////////////////
86 try
87 {
88 std::string desc =
89 "Merge GT, PL, DP, ADF and ADR across samples.\n\
90 Extracts only the naive genotypes based on best guess genotypes.";
91
92 version = "0.5";
93 TCLAP::CmdLine cmd(desc, ' ', version);
94 VTOutput my; cmd.setOutput(&my);
95 TCLAP::ValueArg<std::string> arg_intervals("i", "i", "intervals", false, "", "str", cmd);
96 TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "str", cmd);
97 TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF file [-]", false, "-", "", cmd);
98 TCLAP::ValueArg<std::string> arg_candidate_sites_vcf_file("c", "c", "candidate sites VCF file with annotation []", true, "-", "", cmd);
99 TCLAP::ValueArg<std::string> arg_input_vcf_file_list("L", "L", "file containing list of input VCF files", true, "", "str", cmd);
100 TCLAP::ValueArg<std::string> arg_fexp("f", "f", "filter expression (applied to candidate sites file)[]", false, "", "str", cmd);
101 TCLAP::UnlabeledMultiArg<std::string> arg_input_vcf_files("<in1.vcf>...", "Multiple VCF files",false, "files", cmd);
102
103 cmd.parse(argc, argv);
104
105 input_vcf_file_list = arg_input_vcf_file_list.getValue();
106 parse_files(input_vcf_files, arg_input_vcf_files.getValue(), input_vcf_file_list);
107 candidate_sites_vcf_file = arg_candidate_sites_vcf_file.getValue();
108 output_vcf_file = arg_output_vcf_file.getValue();
109 fexp = arg_fexp.getValue();
110 parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
111 }
112 catch (TCLAP::ArgException &e)
113 {
114 std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
115 abort();
116 }
117 };
118
initialize()119 void initialize()
120 {
121 /////////////////////////
122 //filter initialization//
123 /////////////////////////
124 filter.parse(fexp.c_str(), false);
125 filter_exists = fexp=="" ? false : true;
126
127 //////////////////////
128 //i/o initialization//
129 //////////////////////
130 fprintf(stderr, "[I:%s:%d %s] Initializing %zd VCF files ...", __FILE__, __LINE__, __FUNCTION__, input_vcf_files.size());
131 input_vcf_files.insert(input_vcf_files.begin(), candidate_sites_vcf_file);
132 sr = new BCFSyncedReader(input_vcf_files, intervals, false);
133 fprintf(stderr, " done.\n");
134
135 odw = new BCFOrderedWriter(output_vcf_file, 0);
136 bcf_hdr_append(odw->hdr, "##fileformat=VCFv4.2");
137 bcf_hdr_transfer_contigs(sr->hdrs[0], odw->hdr);
138 bool rename = true;
139
140 // odw->link_hdr(sr->hdrs[0]);
141 // //exact alignment related statisitcs
142 std::string EX_MOTIF = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_MOTIF", "1", "String", "Canonical motif in a VNTR.", rename);
143 std::string EX_RU = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_RU", "1", "String", "Repeat unit in the reference sequence.", rename);
144 std::string EX_BASIS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_BASIS", "1", "String", "Basis nucleotides in the motif.", rename);
145 std::string EX_MLEN = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_MLEN", "1", "Integer", "Motif length.", rename);
146 std::string EX_BLEN = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_BLEN", "1", "Integer", "Basis length.", rename);
147 std::string EX_REPEAT_TRACT = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_REPEAT_TRACT", "2", "Integer", "Boundary of the repeat tract detected by exact alignment.", rename);
148 std::string EX_COMP = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_COMP", "4", "Integer", "Composition(%) of bases in an exact repeat tract.", rename);
149 std::string EX_ENTROPY = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_ENTROPY", "1", "Float", "Entropy measure of an exact repeat tract [0,2].", rename);
150 std::string EX_ENTROPY2 = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_ENTROPY2", "1", "Float", "Dinucleotide entropy measure of an exact repeat tract [0,4].", rename);
151 std::string EX_KL_DIVERGENCE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_KL_DIVERGENCE", "1", "Float", "Kullback-Leibler Divergence of an exact repeat tract.", rename);
152 std::string EX_KL_DIVERGENCE2 = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_KL_DIVERGENCE2", "1", "Float", "Dinucleotide Kullback-Leibler Divergence of an exact repeat tract.", rename);
153 std::string EX_REF = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_REF", ".", "Float", "Allele lengths in repeat units from exact alignment.", rename);
154 std::string EX_RL = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_RL", "1", "Integer", "Reference exact repeat tract length in bases.", rename);
155 std::string EX_LL = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_LL", "1", "Integer", "Longest exact repeat tract length in bases.", rename);
156 std::string EX_RU_COUNTS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_RU_COUNTS", "2", "Integer", "Number of exact repeat units and total number of repeat units in exact repeat tract.", rename);
157 std::string EX_SCORE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_SCORE", "1", "Float", "Score of repeat unit in exact repeat tract.", rename);
158 std::string EX_TRF_SCORE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_TRF_SCORE", "1", "Integer", "TRF Score for M/I/D as 2/-7/-7 in exact repeat tract.", rename);
159
160 //fuzzy alignment related statisitcs
161 std::string FZ_MOTIF = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_MOTIF", "1", "String", "Canonical motif in a VNTR.", rename);
162 std::string FZ_RU = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_RU", "1", "String", "Repeat unit in the reference sequence.", rename);
163 std::string FZ_BASIS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_BASIS", "1", "String", "Basis nucleotides in the motif.", rename);
164 std::string FZ_MLEN = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_MLEN", "1", "Integer", "Motif length.", rename);
165 std::string FZ_BLEN = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_BLEN", "1", "Integer", "Basis length.", rename);
166 std::string FZ_REPEAT_TRACT = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_REPEAT_TRACT", "2", "Integer", "Boundary of the repeat tract detected by fuzzy alignment.", rename);
167 std::string FZ_COMP = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_COMP", "4", "Integer", "Composition(%) of bases in a fuzzy repeat tract.", rename);
168 std::string FZ_ENTROPY = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_ENTROPY", "1", "Float", "Entropy measure of a fuzzy repeat tract (0-2).", rename);
169 std::string FZ_ENTROPY2 = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_ENTROPY2", "1", "Float", "Dinucleotide entropy measure of a fuzzy repeat tract (0-2).", rename);
170 std::string FZ_KL_DIVERGENCE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_KL_DIVERGENCE", "1", "Float", "Kullback-Leibler Divergence of a fuzzyt repeat tract.", rename);
171 std::string FZ_KL_DIVERGENCE2 = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_KL_DIVERGENCE2", "1", "Float", "Dinucleotide Kullback-Leibler Divergence of a fuzzy repeat tract.", rename);
172 std::string FZ_REF = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_REF", ".", "Float", "Allele lengths in repeat units from fuzzy alignment.", rename);
173 std::string FZ_RL = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_RL", "1", "Integer", "Reference fuzzy repeat tract length in bases.", rename);
174 std::string FZ_LL = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_LL", "1", "Integer", "Longest fuzzy repeat tract length in bases.", rename);
175 std::string FZ_RU_COUNTS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_RU_COUNTS", "2", "Integer", "Number of exact repeat units and total number of repeat units in fuzzy repeat tract.", rename);
176 std::string FZ_SCORE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_SCORE", "1", "Float", "Score of repeat unit in fuzzy repeat tract.", rename);
177 std::string FZ_TRF_SCORE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_TRF_SCORE", "1", "Integer", "TRF Score for M/I/D as 2/-7/-7 in fuzzy repeat tract.", rename);
178
179 std::string FLANKSEQ = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FLANKSEQ", "1", "String", "Flanking sequence 10bp on either side of REF.", rename);
180 std::string EXACT_RU_AMBIGUOUS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EXACT_RU_AMBIGUOUS", "0", "Flag", "Exact motif is ambiguous.", rename);
181
182 std::string MOTIF = bcf_hdr_append_info_with_backup_naming(odw->hdr, "MOTIF", "1", "String", "Canonical motif in a VNTR.", rename);
183 std::string RU = bcf_hdr_append_info_with_backup_naming(odw->hdr, "RU", "1", "String", "Repeat unit in the reference sequence.", rename);
184 std::string FLANKS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FLANKS", "2", "Integer", "Exact left and right flank positions of the Indel.", rename);
185 std::string FZ_FLANKS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_FLANKS", "2", "Integer", "Fuzzy left and right flank positions of the Indel.", rename);
186
187
188 bcf_hdr_append(odw->hdr, "##INFO=<ID=LARGE_REPEAT_REGION,Number=0,Type=Flag,Description=\"Very large repeat region, vt only detects up to 1000bp long regions.\">");
189 bcf_hdr_append(odw->hdr, "##INFO=<ID=FLANKSEQ,Number=1,Type=String,Description=\"Flanking sequence 10bp on either side of detected repeat region.\">");
190 bcf_hdr_append(odw->hdr, "##FILTER=<ID=overlap_vntr,Description=\"Overlaps with VNTR\">");
191 bcf_hdr_append(odw->hdr, "##FILTER=<ID=overlap_indel,Description=\"Overlaps with indel\">");
192
193 //COMMON
194 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
195 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"PHRED scaled genotype likelihoods\">");
196 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=AD,Number=A,Type=Integer,Description=\"Allele Depth\">");
197 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=ADF,Number=A,Type=Integer,Description=\"Allele Depth (Forward strand)\">");
198 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=ADR,Number=A,Type=Integer,Description=\"Allele Depth (Reverse strand)\">");
199 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Depth\">");
200
201 //compute following stats for SNP filtering
202 //update from topmed freeze 3 calling pipeline
203
204
205
206 //VNTR
207 bcf_hdr_append(odw->hdr, "##FORMAT=<ID=CG,Number=.,Type=Float,Description=\"Repeat count genotype\">");
208
209 //add samples to output merged file
210 for (int32_t i=1; i<sr->hdrs.size(); ++i)
211 {
212 // printf("adding sample = %s\n", bcf_hdr_get_sample_name(sr->hdrs[i], 0));
213 bcf_hdr_add_sample(odw->hdr, bcf_hdr_get_sample_name(sr->hdrs[i], 0) );
214 }
215 bcf_hdr_sync(odw->hdr);
216
217 odw->write_hdr();
218
219 ///////////////
220 //general use//
221 ///////////////
222 variant = {0,0,0};
223 no_samples = sr->nfiles - 1;
224
225 ////////////////////////
226 //stats initialization//
227 ////////////////////////
228 no_snps = 0;
229 no_indels = 0;
230 no_vntrs = 0;
231
232 /////////
233 //tools//
234 /////////
235 vm = new VariantManip();
236 }
237
merge_genotypes()238 void merge_genotypes()
239 {
240 int32_t *NSAMPLES = NULL;
241 int32_t no_NSAMPLES = 0;
242 int32_t *GT = NULL;
243 int32_t *PL = NULL;
244 int32_t *AD = NULL;
245 int32_t *ADF = NULL;
246 int32_t *ADR = NULL;
247 int32_t *BQSUM = NULL;
248 int32_t *DP = NULL;
249 float *CG = NULL;
250
251 int32_t no_GT = 0;
252 int32_t no_PL = 0;
253 int32_t no_AD = 0;
254 int32_t no_ADF = 0;
255 int32_t no_ADR = 0;
256 int32_t no_BQSUM = 0;
257 int32_t no_DP = 0;
258 int32_t no_CG = 0;
259
260 std::vector<int32_t> gt;
261 std::vector<int32_t> pl;
262 std::vector<int32_t> ad;
263 std::vector<int32_t> dp;
264 std::vector<float> cg;
265
266 bcf1_t* nv = bcf_init();
267 Variant var;
268 std::vector<bcfptr*> current_recs;
269
270 while(sr->read_next_position(current_recs))
271 {
272 if (current_recs.size()!=no_samples+1)
273 {
274 std::string variant = bcf_variant2string(current_recs[0]->h, current_recs[0]->v);
275
276 if (current_recs.size()<no_samples+1)
277 {
278 fprintf(stderr, "[W:%s:%d %s] %d variants expected but %zd is observed for %s. Variant skipped.\n", __FILE__, __LINE__, __FUNCTION__, no_samples+1, current_recs.size(), variant.c_str());
279 continue;
280 }
281 else if (current_recs.size()>no_samples+1)
282 {
283 int32_t m = current_recs.size()/(no_samples+1);
284
285 if (m*(no_samples+1)==current_recs.size())
286 {
287 fprintf(stderr, "[W:%s:%d %s] %d variants expected but %zd is observed for %s. Seems like a duplicate variant. Will attempt to process anyway.\n", __FILE__, __LINE__, __FUNCTION__, no_samples+1, current_recs.size(), variant.c_str());
288 }
289 else
290 {
291 fprintf(stderr, "[W:%s:%d %s] %d variants expected but %zd is observed for %s. Variant skipped.\n", __FILE__, __LINE__, __FUNCTION__, no_samples+1, current_recs.size(), variant.c_str());
292 continue;
293 }
294 }
295 }
296
297 gt.resize(0);
298 pl.resize(0);
299 ad.resize(0);
300 dp.resize(0);
301 cg.resize(0);
302
303 bcf_clear(nv);
304 int32_t vtype;
305
306 if (filter_exists)
307 {
308 vm->classify_variant(current_recs[0]->h, current_recs[0]->v, var);
309 if (!filter.apply(current_recs[0]->h, current_recs[0]->v, &var, false))
310 {
311 continue;
312 }
313 }
314
315 std::vector<bool> file_processed(no_samples+1, false);
316 int32_t files_processed = 0;
317
318 // file_processed.resize(0);
319 // file_processed.resize(no_samples+1, false);
320
321 //for each file
322 for (uint32_t i=0; i<current_recs.size(); ++i)
323 {
324 int32_t file_index = current_recs[i]->file_index;
325 bcf1_t *v = current_recs[i]->v;
326 bcf_hdr_t *h = current_recs[i]->h;
327
328 if (file_processed[file_index])
329 {
330 //duplicate variant from the same file, skip
331 continue;
332 }
333 else
334 {
335 ++files_processed;
336 file_processed[file_index] = true;
337 }
338
339 // printf("\tfile index: %d\n", file_index);
340 // bcf_print(h, v);
341
342
343 //candidate sites file, populate info fields
344 if (!file_index)
345 {
346 vtype = vm->classify_variant(h, v, var);
347
348 // bcf_copy(v, nv);
349
350 bcf_set_chrom(odw->hdr, nv, bcf_get_chrom(h, v));
351 bcf_set_pos1(nv, bcf_get_pos1(v));
352 bcf_update_alleles(odw->hdr, nv, const_cast<const char**>(bcf_get_allele(v)), bcf_get_n_allele(v));
353 bcf_set_n_sample(nv, no_samples);
354
355 if (vtype==VT_SNP)
356 {
357 bcf_copy_info_field(h, v, odw->hdr, nv, "FLANKSEQ", BCF_HT_STR);
358 }
359 else if (vtype==VT_INDEL)
360 {
361 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_MOTIF", BCF_HT_STR);
362 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_MLEN", BCF_HT_INT);
363 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_RU", BCF_HT_STR);
364 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_BASIS", BCF_HT_STR);
365 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_BLEN", BCF_HT_INT);
366 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_REPEAT_TRACT", BCF_HT_INT);
367 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_COMP", BCF_HT_INT);
368 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_ENTROPY", BCF_HT_REAL);
369 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_ENTROPY2", BCF_HT_REAL);
370 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_KL_DIVERGENCE", BCF_HT_REAL);
371 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_KL_DIVERGENCE2", BCF_HT_REAL);
372 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_REF", BCF_HT_REAL);
373 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_RL", BCF_HT_INT);
374 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_LL", BCF_HT_INT);
375 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_RU_COUNTS", BCF_HT_INT);
376 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_SCORE", BCF_HT_REAL);
377 bcf_copy_info_field(h, v, odw->hdr, nv, "EX_TRF_SCORE", BCF_HT_INT);
378
379 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_MOTIF", BCF_HT_STR);
380 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_MLEN", BCF_HT_INT);
381 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_RU", BCF_HT_STR);
382 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_BASIS", BCF_HT_STR);
383 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_BLEN", BCF_HT_INT);
384 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_REPEAT_TRACT", BCF_HT_INT);
385 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_COMP", BCF_HT_INT);
386 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_ENTROPY", BCF_HT_REAL);
387 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_ENTROPY2", BCF_HT_REAL);
388 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_KL_DIVERGENCE", BCF_HT_REAL);
389 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_KL_DIVERGENCE2", BCF_HT_REAL);
390 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_REF", BCF_HT_REAL);
391 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_RL", BCF_HT_INT);
392 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_LL", BCF_HT_INT);
393 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_RU_COUNTS", BCF_HT_INT);
394 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_SCORE", BCF_HT_REAL);
395 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_TRF_SCORE", BCF_HT_INT);
396
397 bcf_copy_info_field(h, v, odw->hdr, nv, "EXACT_RU_AMBIGUOUS", BCF_HT_FLAG);
398 bcf_copy_info_field(h, v, odw->hdr, nv, "LARGE_REPEAT_REGION", BCF_HT_FLAG);
399 }
400 else if (vtype==VT_VNTR)
401 {
402 // printf("\t\tis a VNTR\n");
403 // printf("\t\t\tcopying info fields\n");
404 // bcf_print(h, v);
405
406 bcf_copy_info_field(h, v, odw->hdr, nv, "MOTIF", BCF_HT_STR);
407 bcf_copy_info_field(h, v, odw->hdr, nv, "RU", BCF_HT_STR);
408 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_RL", BCF_HT_INT);
409 bcf_copy_info_field(h, v, odw->hdr, nv, "FLANKS", BCF_HT_INT);
410 bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_FLANKS", BCF_HT_INT);
411 }
412
413 std::vector<int32_t> filter_ids;
414
415 if (bcf_has_filter(h, v, const_cast<char*>("overlap_indel"))==1)
416 {
417 filter_ids.push_back(bcf_hdr_id2int(odw->hdr, BCF_DT_ID, const_cast<char*>("overlap_indel")));
418 // int32_t overlap_indel_filter_id = bcf_hdr_id2int(odw->hdr, BCF_DT_ID, const_cast<char*>("overlap_indel"));
419 // bcf_update_filter(odw->hdr, nv, &overlap_indel_filter_id, 1);
420 }
421
422 if (bcf_has_filter(h, v, const_cast<char*>("overlap_vntr"))==1)
423 {
424 filter_ids.push_back(bcf_hdr_id2int(odw->hdr, BCF_DT_ID, const_cast<char*>("overlap_vntr")));
425 // int32_t overlap_indel_filter_id = bcf_hdr_id2int(odw->hdr, BCF_DT_ID, const_cast<char*>("overlap_vntr"));
426 // bcf_update_filter(odw->hdr, nv, &overlap_indel_filter_id, 1);
427 }
428
429 bcf_update_filter(odw->hdr, nv, &filter_ids[0], filter_ids.size());
430
431 continue;
432 }
433
434 if (vtype==VT_SNP)
435 {
436 // printf("\t\tis a SNP\n");
437
438 int32_t no_gt = bcf_get_genotypes(h, v, >, &no_GT);
439 int32_t no_pl = bcf_get_format_int32(h, v, "PL", &PL, &no_PL);
440 int32_t no_dp = bcf_get_format_int32(h, v, "DP", &DP, &no_DP);
441 int32_t no_adf = bcf_get_format_int32(h, v, "ADF", &ADF, &no_ADF);
442 int32_t no_adr = bcf_get_format_int32(h, v, "ADR", &ADR, &no_ADR);
443 int32_t no_bqsum = bcf_get_format_int32(h, v, "BQSUM", &BQSUM, &no_BQSUM);
444
445 // printf("GT: %d\n", no_gt);
446 // printf("PL: %d\n", no_pl);
447 // printf("DP: %d\n", no_dp);
448 // printf("ADF: %d\n", no_adf);
449 // printf("ADR: %d\n", no_adr);
450 // printf("BQSUM: %d\n", no_bqsum);
451
452 //GT:PL:DP:AD:ADF:ADR:BQ:MQ:CY:ST:AL:NM
453 if (no_gt > 0 &&
454 no_pl > 0 &&
455 no_dp > 0 &&
456 no_adf > 0 &&
457 no_adr > 0)
458 {
459 // printf("\t\tupdating genotypes for GT,PL\n");
460 // printf("\t\t\tno_GT %d\n", no_GT);
461 // printf("\t\t\t%d %d\n", GT[0], GT[1]);
462 gt.push_back(GT[0]);
463 gt.push_back(GT[1]);
464 pl.push_back(PL[0]);
465 pl.push_back(PL[1]);
466 pl.push_back(PL[2]);
467 dp.push_back(DP[0]);
468 ad.push_back(ADF[0]+ADR[0]);
469 ad.push_back(ADF[1]+ADR[1]);
470
471 // printf("GT: %d\n", no_GT);
472 // printf("PL: %d\n", no_PL);
473 // printf("DP: %d\n", no_DP);
474 // printf("ADF: %d\n", no_ADF);
475 // printf("ADR: %d\n", no_ADR);
476
477 }
478 //GT:BQSUM:DP
479 else if (no_gt > 0 &&
480 no_bqsum > 0 &&
481 no_dp > 0)
482 {
483 // printf("\t\tupdating genotypes for GT,BQSUM\n");
484 // printf("\t\t\tno_GT %d\n", no_GT);
485 // printf("\t\t\t%d %d\n", GT[0], GT[1]);
486 gt.push_back(GT[0]);
487 gt.push_back(GT[1]);
488 pl.push_back(0);
489 pl.push_back(BQSUM[0]/3);
490 pl.push_back(BQSUM[0]);
491 dp.push_back(DP[0]);
492 ad.push_back(DP[0]);
493 ad.push_back(0);
494 }
495 else
496 {
497 fprintf(stderr, "[E:%s:%d %s] cannot get format values GT:PL:DP:ADF:ADR or GT:BQSUM:DP from %s\n", __FILE__, __LINE__, __FUNCTION__, sr->file_names[file_index].c_str());
498 exit(1);
499 }
500 }
501 else if (vtype == VT_INDEL)
502 {
503 // printf("\t\tis an INDEL\n");
504
505 int32_t no_gt = bcf_get_genotypes(h, v, >, &no_GT);
506 int32_t no_pl = bcf_get_format_int32(h, v, "PL", &PL, &no_PL);
507 int32_t no_dp = bcf_get_format_int32(h, v, "DP", &DP, &no_DP);
508 int32_t no_adf = bcf_get_format_int32(h, v, "ADF", &ADF, &no_ADF);
509 int32_t no_adr = bcf_get_format_int32(h, v, "ADR", &ADR, &no_ADR);
510 int32_t no_bqsum = bcf_get_format_int32(h, v, "BQSUM", &BQSUM, &no_BQSUM);
511
512 if (no_gt > 0 &&
513 no_pl > 0 &&
514 no_dp > 0 &&
515 no_adf > 0 &&
516 no_adr > 0)
517 {
518 // printf("\t\tupdating genotypes for GT,PL\n");
519 // printf("\t\t\tno_GT %d\n", no_GT);
520 // printf("\t\t\t%d %d\n", GT[0], GT[1]);
521 gt.push_back(GT[0]);
522 gt.push_back(GT[1]);
523 pl.push_back(PL[0]);
524 pl.push_back(PL[1]);
525 pl.push_back(PL[2]);
526 dp.push_back(DP[0]);
527 ad.push_back(ADF[0]+ADR[0]);
528 ad.push_back(ADF[1]+ADR[1]);
529
530 // printf("GT: %d\n", no_GT);
531 // printf("PL: %d\n", no_PL);
532 // printf("DP: %d\n", no_DP);
533 // printf("ADF: %d\n", no_ADF);
534 // printf("ADR: %d\n", no_ADR);
535
536 }
537 //GT:BQSUM:DP
538 else if (no_gt > 0 &&
539 no_bqsum > 0 &&
540 no_dp > 0)
541 {
542 // printf("\t\tupdating genotypes for GT,BQSUM\n");
543 // printf("\t\t\tno_GT %d\n", no_GT);
544 // printf("\t\t\t%d %d\n", GT[0], GT[1]);
545 gt.push_back(GT[0]);
546 gt.push_back(GT[1]);
547 pl.push_back(0);
548 pl.push_back(BQSUM[0]/3);
549 pl.push_back(BQSUM[0]);
550 dp.push_back(DP[0]);
551 ad.push_back(DP[0]);
552 ad.push_back(0);
553 }
554 else
555 {
556 fprintf(stderr, "[E:%s:%d %s] cannot get format values GT:PL:DP:ADF:ADR or GT:BQSUM:DP from %s\n", __FILE__, __LINE__, __FUNCTION__, sr->file_names[file_index].c_str());
557 exit(1);
558 }
559 }
560 else if (vtype == VT_VNTR)
561 {
562 // printf("\t\tis a VNTR\n");
563 // bcf_print(h, v);
564 //
565 int32_t no_cg = bcf_get_format_float(h, v, "CG", &CG, &no_CG);
566
567 //CG
568 if (no_cg > 0)
569 {
570 // printf("\t\tupdating genotypes for CG\n");
571 // printf("\t\t\tno_CG %d\n", no_CG);
572 // printf("\t\t\t%f %f\n", CG[0], CG[1]);
573 cg.push_back(CG[0]);
574 cg.push_back(CG[1]);
575 }
576 else
577 {
578 fprintf(stderr, "[E:%s:%d %s] cannot get format values CG from %s\n", __FILE__, __LINE__, __FUNCTION__, sr->file_names[file_index].c_str());
579 }
580 }
581
582 }//end processing each file
583
584 //write to merged record
585 if (vtype==VT_SNP)
586 {
587 // printf("\tupdating genotypes %zd\n", gt.size());
588 // printf("\tn_samples %zd\n", nv->n_sample);
589 bcf_update_genotypes(odw->hdr, nv, >[0], gt.size());
590 bcf_update_format_int32(odw->hdr, nv, "PL", &pl[0], pl.size());
591 bcf_update_format_int32(odw->hdr, nv, "DP", &dp[0], dp.size());
592 bcf_update_format_int32(odw->hdr, nv, "AD", &ad[0], ad.size());
593
594 ++no_snps;
595 }
596 else if (vtype==VT_INDEL)
597 {
598 // printf("\tupdating genotypes %zd\n", gt.size());
599 // printf("\tn_samples %zd\n", nv->n_sample);
600 bcf_update_genotypes(odw->hdr, nv, >[0], gt.size());
601 bcf_update_format_int32(odw->hdr, nv, "PL", &pl[0], pl.size());
602 bcf_update_format_int32(odw->hdr, nv, "DP", &dp[0], dp.size());
603 bcf_update_format_int32(odw->hdr, nv, "AD", &ad[0], ad.size());
604
605 ++no_indels;
606 }
607 else if (vtype==VT_VNTR)
608 {
609 // printf("\tupdating genotypes %zd\n", gt.size());
610 // printf("\tn_samples %zd\n", nv->n_sample);
611 bcf_update_format_float(odw->hdr, nv, "CG", &cg[0], cg.size());
612
613 ++no_vntrs;
614
615 // bcf_print(odw->hdr, nv);
616 }
617
618 //check to make sure correct number of records are processed.
619 if (files_processed!=file_processed.size())
620 {
621 fprintf(stderr, "[I:%s:%d %s] Lesser than expected number of files processed : %d\n", __FILE__, __LINE__, __FUNCTION__, files_processed);
622 exit(1);
623 }
624
625 odw->write(nv);
626 // bcf_print(odw->hdr, nv);
627
628 //this acts as a flag to initialize a newly merged record
629 vtype = VT_UNDEFINED;
630
631 int32_t no_variants = no_snps+no_indels+no_vntrs;
632 // if ((no_variants&0x0FFF)==0x0600)
633 if ((no_variants%100)==0)
634 {
635 fprintf(stderr, "[I:%s:%d %s] Merged %d rows\n", __FILE__, __LINE__, __FUNCTION__, no_variants);
636 }
637
638 }
639
640 odw->close();
641 fprintf(stderr, "[I:%s:%d %s] Synced reader closing ...", __FILE__, __LINE__, __FUNCTION__);
642 sr->close();
643 fprintf(stderr, " closed\n");
644
645 };
646
print_options()647 void print_options()
648 {
649 std::clog << "merge_genotypes v" << version << "\n\n";
650 std::clog << "options: [L] input VCF file list " << input_vcf_file_list << " (" << input_vcf_files.size() << " files)\n";
651 std::clog << " [c] candidate sites VCF file " << candidate_sites_vcf_file << "\n";
652 std::clog << " [o] output VCF file " << output_vcf_file << "\n";
653 print_str_op(" [f] filter (applied to candidate sites file) ", fexp);
654 print_int_op(" [i] intervals ", intervals);
655 std::clog << "\n";
656 }
657
print_stats()658 void print_stats()
659 {
660 std::clog << "\n";
661 std::clog << "stats: Total no. of SNPs " << no_snps << "\n";
662 std::clog << " Total no. of Indels " << no_indels << "\n";
663 std::clog << " Total no. of VNTRs " << no_vntrs << "\n";
664 std::clog << "\n";
665 };
666
~Igor()667 ~Igor()
668 {
669 };
670
671 private:
672 };
673
674 }
675
merge_genotypes(int argc,char ** argv)676 void merge_genotypes(int argc, char ** argv)
677 {
678 Igor igor(argc, argv);
679 igor.print_options();
680 igor.initialize();
681 igor.merge_genotypes();
682 igor.print_stats();
683 }
684
685