1 /* The MIT License
2 
3    Copyright (c) 2015 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "merge_genotypes.h"
25 
26 #define SINGLE     0
27 #define AGGREGATED 1
28 
29 namespace
30 {
31 
32 class Igor : Program
33 {
34     public:
35 
36     std::string version;
37 
38     ///////////
39     //options//
40     ///////////
41     std::vector<std::string> input_vcf_files;
42     std::string input_vcf_file_list;
43     std::string output_vcf_file;
44     std::string candidate_sites_vcf_file;
45     std::vector<GenomeInterval> intervals;
46     std::string interval_list;
47 
48     ///////
49     //i/o//
50     ///////
51     BCFSyncedReader *sr;
52     BCFOrderedWriter *odw;
53     bcf1_t *v;
54 
55     ///////////////
56     //general use//
57     ///////////////
58     std::vector<int32_t> file_types;
59     kstring_t variant;
60 
61     //////////
62     //filter//
63     //////////
64     std::string fexp;
65     Filter filter;
66     bool filter_exists;
67 
68     /////////
69     //stats//
70     /////////
71     uint32_t no_samples;
72     uint32_t no_snps;
73     uint32_t no_indels;
74     uint32_t no_vntrs;
75 
76     /////////
77     //tools//
78     /////////
79     VariantManip * vm;
80 
Igor(int argc,char ** argv)81     Igor(int argc, char ** argv)
82     {
83         //////////////////////////
84         //options initialization//
85         //////////////////////////
86         try
87         {
88             std::string desc =
89 "Merge GT, PL, DP, ADF and ADR across samples.\n\
90 Extracts only the naive genotypes based on best guess genotypes.";
91 
92             version = "0.5";
93             TCLAP::CmdLine cmd(desc, ' ', version);
94             VTOutput my; cmd.setOutput(&my);
95             TCLAP::ValueArg<std::string> arg_intervals("i", "i", "intervals", false, "", "str", cmd);
96             TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "str", cmd);
97             TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF file [-]", false, "-", "", cmd);
98             TCLAP::ValueArg<std::string> arg_candidate_sites_vcf_file("c", "c", "candidate sites VCF file with annotation []", true, "-", "", cmd);
99             TCLAP::ValueArg<std::string> arg_input_vcf_file_list("L", "L", "file containing list of input VCF files", true, "", "str", cmd);
100             TCLAP::ValueArg<std::string> arg_fexp("f", "f", "filter expression (applied to candidate sites file)[]", false, "", "str", cmd);
101             TCLAP::UnlabeledMultiArg<std::string> arg_input_vcf_files("<in1.vcf>...", "Multiple VCF files",false, "files", cmd);
102 
103             cmd.parse(argc, argv);
104 
105             input_vcf_file_list = arg_input_vcf_file_list.getValue();
106             parse_files(input_vcf_files, arg_input_vcf_files.getValue(), input_vcf_file_list);
107             candidate_sites_vcf_file = arg_candidate_sites_vcf_file.getValue();
108             output_vcf_file = arg_output_vcf_file.getValue();
109             fexp = arg_fexp.getValue();
110             parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
111         }
112         catch (TCLAP::ArgException &e)
113         {
114             std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
115             abort();
116         }
117     };
118 
initialize()119     void initialize()
120     {
121         /////////////////////////
122         //filter initialization//
123         /////////////////////////
124         filter.parse(fexp.c_str(), false);
125         filter_exists = fexp=="" ? false : true;
126 
127         //////////////////////
128         //i/o initialization//
129         //////////////////////
130         fprintf(stderr, "[I:%s:%d %s] Initializing %zd VCF files ...", __FILE__, __LINE__, __FUNCTION__, input_vcf_files.size());
131         input_vcf_files.insert(input_vcf_files.begin(), candidate_sites_vcf_file);
132         sr = new BCFSyncedReader(input_vcf_files, intervals, false);
133         fprintf(stderr, " done.\n");
134 
135         odw = new BCFOrderedWriter(output_vcf_file, 0);
136         bcf_hdr_append(odw->hdr, "##fileformat=VCFv4.2");
137         bcf_hdr_transfer_contigs(sr->hdrs[0], odw->hdr);
138         bool rename = true;
139 
140 //        odw->link_hdr(sr->hdrs[0]);
141 //        //exact alignment related statisitcs
142         std::string EX_MOTIF = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_MOTIF", "1", "String", "Canonical motif in a VNTR.", rename);
143         std::string EX_RU = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_RU", "1", "String", "Repeat unit in the reference sequence.", rename);
144         std::string EX_BASIS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_BASIS", "1", "String", "Basis nucleotides in the motif.", rename);
145         std::string EX_MLEN = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_MLEN", "1", "Integer", "Motif length.", rename);
146         std::string EX_BLEN = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_BLEN", "1", "Integer", "Basis length.", rename);
147         std::string EX_REPEAT_TRACT = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_REPEAT_TRACT", "2", "Integer", "Boundary of the repeat tract detected by exact alignment.", rename);
148         std::string EX_COMP = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_COMP", "4", "Integer", "Composition(%) of bases in an exact repeat tract.", rename);
149         std::string EX_ENTROPY = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_ENTROPY", "1", "Float", "Entropy measure of an exact repeat tract [0,2].", rename);
150         std::string EX_ENTROPY2 = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_ENTROPY2", "1", "Float", "Dinucleotide entropy measure of an exact repeat tract [0,4].", rename);
151         std::string EX_KL_DIVERGENCE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_KL_DIVERGENCE", "1", "Float", "Kullback-Leibler Divergence of an exact repeat tract.", rename);
152         std::string EX_KL_DIVERGENCE2 = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_KL_DIVERGENCE2", "1", "Float", "Dinucleotide Kullback-Leibler Divergence of an exact repeat tract.", rename);
153         std::string EX_REF = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_REF", ".", "Float", "Allele lengths in repeat units from exact alignment.", rename);
154         std::string EX_RL = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_RL", "1", "Integer", "Reference exact repeat tract length in bases.", rename);
155         std::string EX_LL = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_LL", "1", "Integer", "Longest exact repeat tract length in bases.", rename);
156         std::string EX_RU_COUNTS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_RU_COUNTS", "2", "Integer", "Number of exact repeat units and total number of repeat units in exact repeat tract.", rename);
157         std::string EX_SCORE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_SCORE", "1", "Float", "Score of repeat unit in exact repeat tract.", rename);
158         std::string EX_TRF_SCORE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EX_TRF_SCORE", "1", "Integer", "TRF Score for M/I/D as 2/-7/-7 in exact repeat tract.", rename);
159 
160         //fuzzy alignment related statisitcs
161         std::string FZ_MOTIF = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_MOTIF", "1", "String", "Canonical motif in a VNTR.", rename);
162         std::string FZ_RU = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_RU", "1", "String", "Repeat unit in the reference sequence.", rename);
163         std::string FZ_BASIS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_BASIS", "1", "String", "Basis nucleotides in the motif.", rename);
164         std::string FZ_MLEN = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_MLEN", "1", "Integer", "Motif length.", rename);
165         std::string FZ_BLEN = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_BLEN", "1", "Integer", "Basis length.", rename);
166         std::string FZ_REPEAT_TRACT = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_REPEAT_TRACT", "2", "Integer", "Boundary of the repeat tract detected by fuzzy alignment.", rename);
167         std::string FZ_COMP = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_COMP", "4", "Integer", "Composition(%) of bases in a fuzzy repeat tract.", rename);
168         std::string FZ_ENTROPY = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_ENTROPY", "1", "Float", "Entropy measure of a fuzzy repeat tract (0-2).", rename);
169         std::string FZ_ENTROPY2 = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_ENTROPY2", "1", "Float", "Dinucleotide entropy measure of a fuzzy repeat tract (0-2).", rename);
170         std::string FZ_KL_DIVERGENCE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_KL_DIVERGENCE", "1", "Float", "Kullback-Leibler Divergence of a fuzzyt repeat tract.", rename);
171         std::string FZ_KL_DIVERGENCE2 = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_KL_DIVERGENCE2", "1", "Float", "Dinucleotide Kullback-Leibler Divergence of a fuzzy repeat tract.", rename);
172         std::string FZ_REF = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_REF", ".", "Float", "Allele lengths in repeat units from fuzzy alignment.", rename);
173         std::string FZ_RL = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_RL", "1", "Integer", "Reference fuzzy repeat tract length in bases.", rename);
174         std::string FZ_LL = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_LL", "1", "Integer", "Longest fuzzy repeat tract length in bases.", rename);
175         std::string FZ_RU_COUNTS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_RU_COUNTS", "2", "Integer", "Number of exact repeat units and total number of repeat units in fuzzy repeat tract.", rename);
176         std::string FZ_SCORE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_SCORE", "1", "Float", "Score of repeat unit in fuzzy repeat tract.", rename);
177         std::string FZ_TRF_SCORE = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_TRF_SCORE", "1", "Integer", "TRF Score for M/I/D as 2/-7/-7 in fuzzy repeat tract.", rename);
178 
179         std::string FLANKSEQ = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FLANKSEQ", "1", "String", "Flanking sequence 10bp on either side of REF.", rename);
180         std::string EXACT_RU_AMBIGUOUS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "EXACT_RU_AMBIGUOUS", "0", "Flag", "Exact motif is ambiguous.", rename);
181 
182         std::string MOTIF = bcf_hdr_append_info_with_backup_naming(odw->hdr, "MOTIF", "1", "String", "Canonical motif in a VNTR.", rename);
183         std::string RU = bcf_hdr_append_info_with_backup_naming(odw->hdr, "RU", "1", "String", "Repeat unit in the reference sequence.", rename);
184         std::string FLANKS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FLANKS", "2", "Integer", "Exact left and right flank positions of the Indel.", rename);
185         std::string FZ_FLANKS = bcf_hdr_append_info_with_backup_naming(odw->hdr, "FZ_FLANKS", "2", "Integer", "Fuzzy left and right flank positions of the Indel.", rename);
186 
187 
188         bcf_hdr_append(odw->hdr, "##INFO=<ID=LARGE_REPEAT_REGION,Number=0,Type=Flag,Description=\"Very large repeat region, vt only detects up to 1000bp long regions.\">");
189         bcf_hdr_append(odw->hdr, "##INFO=<ID=FLANKSEQ,Number=1,Type=String,Description=\"Flanking sequence 10bp on either side of detected repeat region.\">");
190         bcf_hdr_append(odw->hdr, "##FILTER=<ID=overlap_vntr,Description=\"Overlaps with VNTR\">");
191         bcf_hdr_append(odw->hdr, "##FILTER=<ID=overlap_indel,Description=\"Overlaps with indel\">");
192 
193         //COMMON
194         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
195         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"PHRED scaled genotype likelihoods\">");
196         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=AD,Number=A,Type=Integer,Description=\"Allele Depth\">");
197         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=ADF,Number=A,Type=Integer,Description=\"Allele Depth (Forward strand)\">");
198         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=ADR,Number=A,Type=Integer,Description=\"Allele Depth (Reverse strand)\">");
199         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Depth\">");
200 
201         //compute following stats for SNP filtering
202         //update from topmed freeze 3 calling pipeline
203 
204 
205 
206         //VNTR
207         bcf_hdr_append(odw->hdr, "##FORMAT=<ID=CG,Number=.,Type=Float,Description=\"Repeat count genotype\">");
208 
209         //add samples to output merged file
210         for (int32_t i=1; i<sr->hdrs.size(); ++i)
211         {
212 //            printf("adding sample = %s\n",  bcf_hdr_get_sample_name(sr->hdrs[i], 0));
213             bcf_hdr_add_sample(odw->hdr, bcf_hdr_get_sample_name(sr->hdrs[i], 0) );
214         }
215         bcf_hdr_sync(odw->hdr);
216 
217         odw->write_hdr();
218 
219         ///////////////
220         //general use//
221         ///////////////
222         variant = {0,0,0};
223         no_samples = sr->nfiles - 1;
224 
225         ////////////////////////
226         //stats initialization//
227         ////////////////////////
228         no_snps = 0;
229         no_indels = 0;
230         no_vntrs = 0;
231 
232         /////////
233         //tools//
234         /////////
235         vm = new VariantManip();
236     }
237 
merge_genotypes()238     void merge_genotypes()
239     {
240         int32_t *NSAMPLES = NULL;
241         int32_t no_NSAMPLES = 0;
242         int32_t *GT = NULL;
243         int32_t *PL = NULL;
244         int32_t *AD = NULL;
245         int32_t *ADF = NULL;
246         int32_t *ADR = NULL;
247         int32_t *BQSUM = NULL;
248         int32_t *DP = NULL;
249         float *CG = NULL;
250 
251         int32_t no_GT = 0;
252         int32_t no_PL = 0;
253         int32_t no_AD = 0;
254         int32_t no_ADF = 0;
255         int32_t no_ADR = 0;
256         int32_t no_BQSUM = 0;
257         int32_t no_DP = 0;
258         int32_t no_CG = 0;
259 
260         std::vector<int32_t> gt;
261         std::vector<int32_t> pl;
262         std::vector<int32_t> ad;
263         std::vector<int32_t> dp;
264         std::vector<float> cg;
265 
266         bcf1_t* nv = bcf_init();
267         Variant var;
268         std::vector<bcfptr*> current_recs;
269 
270         while(sr->read_next_position(current_recs))
271         {
272             if (current_recs.size()!=no_samples+1)
273             {
274                 std::string variant = bcf_variant2string(current_recs[0]->h, current_recs[0]->v);
275 
276                 if (current_recs.size()<no_samples+1)
277                 {
278                     fprintf(stderr, "[W:%s:%d %s] %d variants expected but %zd is observed for %s.  Variant skipped.\n", __FILE__, __LINE__, __FUNCTION__, no_samples+1, current_recs.size(), variant.c_str());
279                     continue;
280                 }
281                 else if (current_recs.size()>no_samples+1)
282                 {
283                     int32_t m = current_recs.size()/(no_samples+1);
284 
285                     if (m*(no_samples+1)==current_recs.size())
286                     {
287                         fprintf(stderr, "[W:%s:%d %s] %d variants expected but %zd is observed for %s.  Seems like a duplicate variant.  Will attempt to process anyway.\n", __FILE__, __LINE__, __FUNCTION__, no_samples+1, current_recs.size(), variant.c_str());
288                     }
289                     else
290                     {
291                         fprintf(stderr, "[W:%s:%d %s] %d variants expected but %zd is observed for %s.  Variant skipped.\n", __FILE__, __LINE__, __FUNCTION__, no_samples+1, current_recs.size(), variant.c_str());
292                         continue;
293                     }
294                 }
295             }
296 
297             gt.resize(0);
298             pl.resize(0);
299             ad.resize(0);
300             dp.resize(0);
301             cg.resize(0);
302 
303             bcf_clear(nv);
304             int32_t vtype;
305 
306             if (filter_exists)
307             {
308                 vm->classify_variant(current_recs[0]->h, current_recs[0]->v, var);
309                 if (!filter.apply(current_recs[0]->h, current_recs[0]->v, &var, false))
310                 {
311                     continue;
312                 }
313             }
314 
315             std::vector<bool> file_processed(no_samples+1, false);
316             int32_t files_processed = 0;
317 
318 //            file_processed.resize(0);
319 //            file_processed.resize(no_samples+1, false);
320 
321             //for each file
322             for (uint32_t i=0; i<current_recs.size(); ++i)
323             {
324                 int32_t file_index = current_recs[i]->file_index;
325                 bcf1_t *v = current_recs[i]->v;
326                 bcf_hdr_t *h = current_recs[i]->h;
327 
328                 if (file_processed[file_index])
329                 {
330                     //duplicate variant from the same file, skip
331                     continue;
332                 }
333                 else
334                 {
335                     ++files_processed;
336                     file_processed[file_index] = true;
337                 }
338 
339 //                printf("\tfile index: %d\n", file_index);
340 //                bcf_print(h, v);
341 
342 
343                 //candidate sites file, populate info fields
344                 if (!file_index)
345                 {
346                     vtype = vm->classify_variant(h, v, var);
347 
348 //                    bcf_copy(v, nv);
349 
350                     bcf_set_chrom(odw->hdr, nv, bcf_get_chrom(h, v));
351                     bcf_set_pos1(nv, bcf_get_pos1(v));
352                     bcf_update_alleles(odw->hdr, nv, const_cast<const char**>(bcf_get_allele(v)), bcf_get_n_allele(v));
353                     bcf_set_n_sample(nv, no_samples);
354 
355                     if (vtype==VT_SNP)
356                     {
357                         bcf_copy_info_field(h, v, odw->hdr, nv, "FLANKSEQ", BCF_HT_STR);
358                     }
359                     else if (vtype==VT_INDEL)
360                     {
361                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_MOTIF", BCF_HT_STR);
362                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_MLEN", BCF_HT_INT);
363                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_RU", BCF_HT_STR);
364                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_BASIS", BCF_HT_STR);
365                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_BLEN", BCF_HT_INT);
366                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_REPEAT_TRACT", BCF_HT_INT);
367                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_COMP", BCF_HT_INT);
368                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_ENTROPY", BCF_HT_REAL);
369                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_ENTROPY2", BCF_HT_REAL);
370                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_KL_DIVERGENCE", BCF_HT_REAL);
371                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_KL_DIVERGENCE2", BCF_HT_REAL);
372                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_REF", BCF_HT_REAL);
373                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_RL", BCF_HT_INT);
374                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_LL", BCF_HT_INT);
375                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_RU_COUNTS", BCF_HT_INT);
376                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_SCORE", BCF_HT_REAL);
377                         bcf_copy_info_field(h, v, odw->hdr, nv, "EX_TRF_SCORE", BCF_HT_INT);
378 
379                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_MOTIF", BCF_HT_STR);
380                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_MLEN", BCF_HT_INT);
381                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_RU", BCF_HT_STR);
382                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_BASIS", BCF_HT_STR);
383                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_BLEN", BCF_HT_INT);
384                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_REPEAT_TRACT", BCF_HT_INT);
385                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_COMP", BCF_HT_INT);
386                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_ENTROPY", BCF_HT_REAL);
387                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_ENTROPY2", BCF_HT_REAL);
388                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_KL_DIVERGENCE", BCF_HT_REAL);
389                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_KL_DIVERGENCE2", BCF_HT_REAL);
390                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_REF", BCF_HT_REAL);
391                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_RL", BCF_HT_INT);
392                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_LL", BCF_HT_INT);
393                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_RU_COUNTS", BCF_HT_INT);
394                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_SCORE", BCF_HT_REAL);
395                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_TRF_SCORE", BCF_HT_INT);
396 
397                         bcf_copy_info_field(h, v, odw->hdr, nv, "EXACT_RU_AMBIGUOUS", BCF_HT_FLAG);
398                         bcf_copy_info_field(h, v, odw->hdr, nv, "LARGE_REPEAT_REGION", BCF_HT_FLAG);
399                     }
400                     else if (vtype==VT_VNTR)
401                     {
402 //                        printf("\t\tis a VNTR\n");
403 //                        printf("\t\t\tcopying info fields\n");
404 //                        bcf_print(h, v);
405 
406                         bcf_copy_info_field(h, v, odw->hdr, nv, "MOTIF", BCF_HT_STR);
407                         bcf_copy_info_field(h, v, odw->hdr, nv, "RU", BCF_HT_STR);
408                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_RL", BCF_HT_INT);
409                         bcf_copy_info_field(h, v, odw->hdr, nv, "FLANKS", BCF_HT_INT);
410                         bcf_copy_info_field(h, v, odw->hdr, nv, "FZ_FLANKS", BCF_HT_INT);
411                     }
412 
413                     std::vector<int32_t> filter_ids;
414 
415                     if (bcf_has_filter(h, v, const_cast<char*>("overlap_indel"))==1)
416                     {
417                         filter_ids.push_back(bcf_hdr_id2int(odw->hdr, BCF_DT_ID, const_cast<char*>("overlap_indel")));
418 //                        int32_t overlap_indel_filter_id = bcf_hdr_id2int(odw->hdr, BCF_DT_ID, const_cast<char*>("overlap_indel"));
419 //                        bcf_update_filter(odw->hdr, nv, &overlap_indel_filter_id, 1);
420                     }
421 
422                     if (bcf_has_filter(h, v, const_cast<char*>("overlap_vntr"))==1)
423                     {
424                         filter_ids.push_back(bcf_hdr_id2int(odw->hdr, BCF_DT_ID, const_cast<char*>("overlap_vntr")));
425 //                        int32_t overlap_indel_filter_id = bcf_hdr_id2int(odw->hdr, BCF_DT_ID, const_cast<char*>("overlap_vntr"));
426 //                        bcf_update_filter(odw->hdr, nv, &overlap_indel_filter_id, 1);
427                     }
428 
429                     bcf_update_filter(odw->hdr, nv, &filter_ids[0], filter_ids.size());
430 
431                     continue;
432                 }
433 
434                 if (vtype==VT_SNP)
435                 {
436 //                    printf("\t\tis a SNP\n");
437 
438                     int32_t no_gt = bcf_get_genotypes(h, v, &GT, &no_GT);
439                     int32_t no_pl = bcf_get_format_int32(h, v, "PL", &PL, &no_PL);
440                     int32_t no_dp = bcf_get_format_int32(h, v, "DP", &DP, &no_DP);
441                     int32_t no_adf = bcf_get_format_int32(h, v, "ADF", &ADF, &no_ADF);
442                     int32_t no_adr = bcf_get_format_int32(h, v, "ADR", &ADR, &no_ADR);
443                     int32_t no_bqsum = bcf_get_format_int32(h, v, "BQSUM", &BQSUM, &no_BQSUM);
444 
445 //                    printf("GT: %d\n", no_gt);
446 //                    printf("PL: %d\n", no_pl);
447 //                    printf("DP: %d\n", no_dp);
448 //                    printf("ADF: %d\n", no_adf);
449 //                    printf("ADR: %d\n", no_adr);
450 //                    printf("BQSUM: %d\n", no_bqsum);
451 
452                     //GT:PL:DP:AD:ADF:ADR:BQ:MQ:CY:ST:AL:NM
453                     if (no_gt > 0 &&
454                         no_pl > 0 &&
455                         no_dp > 0 &&
456                         no_adf > 0 &&
457                         no_adr > 0)
458                     {
459 //                        printf("\t\tupdating genotypes for GT,PL\n");
460 //                        printf("\t\t\tno_GT %d\n", no_GT);
461 //                        printf("\t\t\t%d %d\n", GT[0], GT[1]);
462                           gt.push_back(GT[0]);
463                           gt.push_back(GT[1]);
464                           pl.push_back(PL[0]);
465                           pl.push_back(PL[1]);
466                           pl.push_back(PL[2]);
467                           dp.push_back(DP[0]);
468                           ad.push_back(ADF[0]+ADR[0]);
469                           ad.push_back(ADF[1]+ADR[1]);
470 
471 //                          printf("GT: %d\n", no_GT);
472 //                          printf("PL: %d\n", no_PL);
473 //                          printf("DP: %d\n", no_DP);
474 //                          printf("ADF: %d\n", no_ADF);
475 //                          printf("ADR: %d\n", no_ADR);
476 
477                     }
478                     //GT:BQSUM:DP
479                     else if (no_gt > 0 &&
480                              no_bqsum > 0 &&
481                              no_dp > 0)
482                     {
483 //                          printf("\t\tupdating genotypes for GT,BQSUM\n");
484 //                          printf("\t\t\tno_GT %d\n", no_GT);
485 //                          printf("\t\t\t%d %d\n", GT[0], GT[1]);
486                           gt.push_back(GT[0]);
487                           gt.push_back(GT[1]);
488                           pl.push_back(0);
489                           pl.push_back(BQSUM[0]/3);
490                           pl.push_back(BQSUM[0]);
491                           dp.push_back(DP[0]);
492                           ad.push_back(DP[0]);
493                           ad.push_back(0);
494                     }
495                     else
496                     {
497                         fprintf(stderr, "[E:%s:%d %s] cannot get format values GT:PL:DP:ADF:ADR or GT:BQSUM:DP from %s\n", __FILE__, __LINE__, __FUNCTION__, sr->file_names[file_index].c_str());
498                         exit(1);
499                     }
500                 }
501                 else if (vtype == VT_INDEL)
502                 {
503 //                    printf("\t\tis an INDEL\n");
504 
505                     int32_t no_gt = bcf_get_genotypes(h, v, &GT, &no_GT);
506                     int32_t no_pl = bcf_get_format_int32(h, v, "PL", &PL, &no_PL);
507                     int32_t no_dp = bcf_get_format_int32(h, v, "DP", &DP, &no_DP);
508                     int32_t no_adf = bcf_get_format_int32(h, v, "ADF", &ADF, &no_ADF);
509                     int32_t no_adr = bcf_get_format_int32(h, v, "ADR", &ADR, &no_ADR);
510                     int32_t no_bqsum = bcf_get_format_int32(h, v, "BQSUM", &BQSUM, &no_BQSUM);
511 
512                     if (no_gt > 0 &&
513                         no_pl > 0 &&
514                         no_dp > 0 &&
515                         no_adf > 0 &&
516                         no_adr > 0)
517                     {
518 //                        printf("\t\tupdating genotypes for GT,PL\n");
519 //                        printf("\t\t\tno_GT %d\n", no_GT);
520 //                        printf("\t\t\t%d %d\n", GT[0], GT[1]);
521                           gt.push_back(GT[0]);
522                           gt.push_back(GT[1]);
523                           pl.push_back(PL[0]);
524                           pl.push_back(PL[1]);
525                           pl.push_back(PL[2]);
526                           dp.push_back(DP[0]);
527                           ad.push_back(ADF[0]+ADR[0]);
528                           ad.push_back(ADF[1]+ADR[1]);
529 
530 //                          printf("GT: %d\n", no_GT);
531 //                          printf("PL: %d\n", no_PL);
532 //                          printf("DP: %d\n", no_DP);
533 //                          printf("ADF: %d\n", no_ADF);
534 //                          printf("ADR: %d\n", no_ADR);
535 
536                     }
537                     //GT:BQSUM:DP
538                     else if (no_gt > 0 &&
539                              no_bqsum > 0 &&
540                              no_dp > 0)
541                     {
542 //                          printf("\t\tupdating genotypes for GT,BQSUM\n");
543 //                          printf("\t\t\tno_GT %d\n", no_GT);
544 //                          printf("\t\t\t%d %d\n", GT[0], GT[1]);
545                           gt.push_back(GT[0]);
546                           gt.push_back(GT[1]);
547                           pl.push_back(0);
548                           pl.push_back(BQSUM[0]/3);
549                           pl.push_back(BQSUM[0]);
550                           dp.push_back(DP[0]);
551                           ad.push_back(DP[0]);
552                           ad.push_back(0);
553                     }
554                     else
555                     {
556                         fprintf(stderr, "[E:%s:%d %s] cannot get format values GT:PL:DP:ADF:ADR or GT:BQSUM:DP from %s\n", __FILE__, __LINE__, __FUNCTION__, sr->file_names[file_index].c_str());
557                         exit(1);
558                     }
559                 }
560                 else if (vtype == VT_VNTR)
561                 {
562 //                    printf("\t\tis a VNTR\n");
563 //                    bcf_print(h, v);
564 //
565                     int32_t no_cg = bcf_get_format_float(h, v, "CG", &CG, &no_CG);
566 
567                     //CG
568                     if (no_cg > 0)
569                     {
570 //                          printf("\t\tupdating genotypes for CG\n");
571 //                          printf("\t\t\tno_CG %d\n", no_CG);
572 //                          printf("\t\t\t%f %f\n", CG[0], CG[1]);
573                           cg.push_back(CG[0]);
574                           cg.push_back(CG[1]);
575                     }
576                     else
577                     {
578                         fprintf(stderr, "[E:%s:%d %s] cannot get format values CG from %s\n", __FILE__, __LINE__, __FUNCTION__, sr->file_names[file_index].c_str());
579                     }
580                 }
581 
582             }//end processing each file
583 
584             //write to merged record
585             if (vtype==VT_SNP)
586             {
587 //                printf("\tupdating genotypes %zd\n", gt.size());
588 //                printf("\tn_samples %zd\n", nv->n_sample);
589                 bcf_update_genotypes(odw->hdr, nv, &gt[0], gt.size());
590                 bcf_update_format_int32(odw->hdr, nv, "PL", &pl[0], pl.size());
591                 bcf_update_format_int32(odw->hdr, nv, "DP", &dp[0], dp.size());
592                 bcf_update_format_int32(odw->hdr, nv, "AD", &ad[0], ad.size());
593 
594                 ++no_snps;
595             }
596             else if (vtype==VT_INDEL)
597             {
598 //                printf("\tupdating genotypes %zd\n", gt.size());
599 //                printf("\tn_samples %zd\n", nv->n_sample);
600                 bcf_update_genotypes(odw->hdr, nv, &gt[0], gt.size());
601                 bcf_update_format_int32(odw->hdr, nv, "PL", &pl[0], pl.size());
602                 bcf_update_format_int32(odw->hdr, nv, "DP", &dp[0], dp.size());
603                 bcf_update_format_int32(odw->hdr, nv, "AD", &ad[0], ad.size());
604 
605                 ++no_indels;
606             }
607             else if (vtype==VT_VNTR)
608             {
609 //                printf("\tupdating genotypes %zd\n", gt.size());
610 //                printf("\tn_samples %zd\n", nv->n_sample);
611                 bcf_update_format_float(odw->hdr, nv, "CG", &cg[0], cg.size());
612 
613                 ++no_vntrs;
614 
615 //                bcf_print(odw->hdr, nv);
616             }
617 
618             //check to make sure correct number of records are processed.
619             if (files_processed!=file_processed.size())
620             {
621                 fprintf(stderr, "[I:%s:%d %s] Lesser than expected number of files processed : %d\n", __FILE__, __LINE__, __FUNCTION__, files_processed);
622                 exit(1);
623             }
624 
625             odw->write(nv);
626 //            bcf_print(odw->hdr, nv);
627 
628             //this acts as a flag to initialize a newly merged record
629             vtype = VT_UNDEFINED;
630 
631             int32_t no_variants = no_snps+no_indels+no_vntrs;
632 //            if ((no_variants&0x0FFF)==0x0600)
633             if ((no_variants%100)==0)
634             {
635                 fprintf(stderr, "[I:%s:%d %s] Merged %d rows\n", __FILE__, __LINE__, __FUNCTION__, no_variants);
636             }
637 
638         }
639 
640         odw->close();
641         fprintf(stderr, "[I:%s:%d %s] Synced reader closing ...", __FILE__, __LINE__, __FUNCTION__);
642         sr->close();
643         fprintf(stderr, " closed\n");
644 
645     };
646 
print_options()647     void print_options()
648     {
649         std::clog << "merge_genotypes v" << version << "\n\n";
650         std::clog << "options: [L] input VCF file list                      " << input_vcf_file_list << " (" << input_vcf_files.size() << " files)\n";
651         std::clog << "         [c] candidate sites VCF file                 " << candidate_sites_vcf_file << "\n";
652         std::clog << "         [o] output VCF file                          " << output_vcf_file << "\n";
653         print_str_op("         [f] filter (applied to candidate sites file) ", fexp);
654         print_int_op("         [i] intervals                                ", intervals);
655         std::clog << "\n";
656     }
657 
print_stats()658     void print_stats()
659     {
660         std::clog << "\n";
661         std::clog << "stats: Total no. of SNPs                 " << no_snps << "\n";
662         std::clog << "       Total no. of Indels               " << no_indels << "\n";
663         std::clog << "       Total no. of VNTRs                " << no_vntrs << "\n";
664         std::clog << "\n";
665     };
666 
~Igor()667     ~Igor()
668     {
669     };
670 
671     private:
672 };
673 
674 }
675 
merge_genotypes(int argc,char ** argv)676 void merge_genotypes(int argc, char ** argv)
677 {
678     Igor igor(argc, argv);
679     igor.print_options();
680     igor.initialize();
681     igor.merge_genotypes();
682     igor.print_stats();
683 }
684 
685