1 /* The MIT License
2 
3    Copyright (c) 2015 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "decompose2.h"
25 
26 namespace
27 {
28 
29 class Igor : Program
30 {
31     public:
32 
33     ///////////
34     //options//
35     ///////////
36     std::string input_vcf_file;
37     std::string output_vcf_file;
38     std::vector<GenomeInterval> intervals;
39     std::string ref_fasta_file;
40     bool smart;
41 
42     ///////
43     //i/o//
44     ///////
45     BCFOrderedReader *odr;
46     BCFOrderedWriter *odw;
47 
48     kstring_t s;
49     kstring_t new_alleles;
50     kstring_t old_alleles;
51 
52     /////////
53     //stats//
54     /////////
55     uint32_t no_variants;
56     uint32_t new_no_variants;
57     uint32_t no_biallelic;
58     uint32_t no_multiallelic;
59     uint32_t no_additional_biallelic;
60 
61     /////////
62     //tools//
63     /////////
64     VariantManip *vm;
65 
Igor(int argc,char ** argv)66     Igor(int argc, char **argv)
67     {
68         version = "0.5";
69 
70         //////////////////////////
71         //options initialization//
72         //////////////////////////
73         try
74         {
75             std::string desc = "decomposes multiallelic variants into biallelic in a VCF file.";
76 
77             TCLAP::CmdLine cmd(desc, ' ', version);
78             VTOutput my; cmd.setOutput(&my);
79             TCLAP::ValueArg<std::string> arg_intervals("i", "i", "intervals []", false, "", "str", cmd);
80             TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "file", cmd);
81             TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF file [-]", false, "-", "str", cmd);
82             TCLAP::SwitchArg arg_smart("s", "s", "smart decomposition [false]", cmd, false);
83             TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "","file", cmd);
84 
85             cmd.parse(argc, argv);
86 
87             input_vcf_file = arg_input_vcf_file.getValue();
88             output_vcf_file = arg_output_vcf_file.getValue();
89             smart = arg_smart.getValue();
90             parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
91         }
92         catch (TCLAP::ArgException &e)
93         {
94             std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
95             abort();
96         }
97     };
98 
initialize()99     void initialize()
100     {
101         //////////////////////
102         //i/o initialization//
103         //////////////////////
104         odr = new BCFOrderedReader(input_vcf_file, intervals);
105 
106         odw = new BCFOrderedWriter(output_vcf_file);
107         odw->link_hdr(odr->hdr);
108         //bcf_hdr_append_info_with_backup_naming(odw->hdr, "OLD_MULTIALLELIC", "1", "String", "Original chr:pos:ref:alt encoding", false);
109         bcf_hdr_append(odw->hdr, "##INFO=<ID=,Number=1,Type=String,Description=\"Original chr:pos:ref:alt encoding\">\n");
110         odw->write_hdr();
111 
112         s = {0,0,0};
113         old_alleles = {0,0,0};
114         new_alleles = {0,0,0};
115 
116         ////////////////////////
117         //stats initialization//
118         ////////////////////////
119         no_variants = 0;
120         no_biallelic = 0;
121         no_multiallelic = 0;
122 
123         no_additional_biallelic = 0;
124 
125         ////////////////////////
126         //tools initialization//
127         ////////////////////////
128     }
129 
130     /**
131      * n choose r.
132      */
choose(uint32_t n,uint32_t r)133     uint32_t choose(uint32_t n, uint32_t r)
134     {
135         if (r>n)
136         {
137             return 0;
138         }
139         else if (r==n)
140         {
141             return 1;
142         }
143         else if (r==0)
144         {
145             return 1;
146         }
147         else
148         {
149             if (r>(n>>1))
150             {
151                 r = n-r;
152             }
153 
154             uint32_t num = n;
155             uint32_t denum = 1;
156 
157             for (uint32_t i=1; i<r; ++i)
158             {
159                 num *= n-i;
160                 denum *= i+1;
161             }
162 
163             return num/denum;
164         }
165     }
166 
decompose2()167     void decompose2()
168     {
169         bcf1_t* v = bcf_init();
170         Variant variant;
171 
172         while (odr->read(v))
173         {
174 //            std::cerr << "=============================\n";
175 //            bcf_print(odr->hdr, v);
176 //            std::cerr << "=============================\n";
177 
178             int32_t n_allele = bcf_get_n_allele(v);
179 
180             if (n_allele > 2)
181             {
182                 if (!smart)
183                 {
184                     ++no_multiallelic;
185                     no_additional_biallelic += n_allele-2;
186 
187                     old_alleles.l = 0;
188                     bcf_variant2string(odw->hdr, v, &old_alleles);
189 
190                     int32_t rid = bcf_get_rid(v);
191                     int32_t pos1 = bcf_get_pos1(v);
192                     char** allele = bcf_get_allele(v);
193 
194                     int32_t *gt = NULL;
195                     int32_t *pl = NULL;
196                     float *gl = NULL;
197                     int32_t *dp = NULL;
198                     size_t no_samples = bcf_hdr_nsamples(odr->hdr);
199                     bool has_GT = false;
200                     bool has_PL = false;
201                     bool has_GL = false;
202                     bool has_DP = false;
203                     int32_t ploidy = 0;
204                     int32_t n_genotype;
205                     int32_t n_genotype2;
206 
207                     int32_t *gts = NULL;
208                     int32_t *pls = NULL;
209                     float *gls = NULL;
210                     int32_t *dps = NULL;
211 
212                     if (no_samples)
213                     {
214                         bcf_unpack(v, BCF_UN_FMT);
215 
216                         int32_t n = 0;
217                         int32_t ret = bcf_get_genotypes(odr->hdr, v, &gt, &n);
218                         if (ret>0) has_GT = true;
219                         ploidy = n/bcf_hdr_nsamples(odr->hdr);
220                         n_genotype = bcf_ap2g(n_allele, ploidy);
221                         n_genotype2 = bcf_ap2g(2, ploidy);
222                         if (ret>0) gts = (int32_t*) malloc(no_samples*n_genotype2*sizeof(int32_t));
223 
224                         n=0;
225                         ret = bcf_get_format_int32(odr->hdr, v, "PL", &pl, &n);
226                         if (ret>0)
227                         {
228                             has_PL = true;
229                             pls = (int32_t*) malloc(no_samples*n_genotype2*sizeof(int32_t));
230                         }
231 
232                         n=0;
233                         ret = bcf_get_format_float(odr->hdr, v, "GL", &gl, &n);
234                         if (ret>0)
235                         {
236                             has_GL = true;
237                             gls = (float*) malloc(no_samples*n_genotype2*sizeof(float));
238                         }
239 
240                         n=0;
241                         ret = bcf_get_format_int32(odr->hdr, v, "DP", &dp, &n);
242                         if (ret>0)
243                         {
244                             has_DP = true;
245                             dps = (int32_t*) malloc(no_samples*sizeof(int32_t));
246                         }
247                     }
248 
249 //                    std::cerr << "*****************************\n";
250 //                    bcf_print(odw->hdr, v);
251 //                    std::cerr << "*****************************\n";
252 
253                     for (size_t i=1; i<n_allele; ++i)
254                     {
255                         //bcf_clear(v);
256                         bcf1_t* nv = bcf_dup(v);
257                         new_alleles.l=0;
258                         kputs(allele[0], &new_alleles);
259                         kputc(',', &new_alleles);
260                         kputs(allele[i], &new_alleles);
261 
262                         bcf_update_info_string(odw->hdr, nv, "OLD_MULTIALLELIC", old_alleles.s);
263                         bcf_update_alleles_str(odw->hdr, nv, new_alleles.s);
264 
265                         if (no_samples)
266                         {
267                             //get array genotypes
268                             for (size_t j=0; j<no_samples; ++j)
269                             {
270                                 if (has_GT)
271                                 {
272                                     for (size_t k=0; k<ploidy; ++k)
273                                     {
274                                         int32_t _a = gt[j*ploidy+k];
275                                         if (_a<0)
276                                         {
277                                             gts[j*ploidy+k] = _a;
278                                         }
279                                         else
280                                         {
281                                             int32_t a = bcf_gt_allele(_a);
282 
283                                             if (a)
284                                             {
285                                                 a = a==i ? 1 : -1;
286                                             }
287 
288                                             gts[j*ploidy+k] =  ((a+1)<<1) | bcf_gt_is_phased(_a);
289                                         }
290                                     }
291                                 }
292 
293                                 if (has_PL)
294                                 {
295                                     pls[j*n_genotype2] = pl[j*n_genotype];
296                                     uint32_t index = 0;
297                                     for (uint32_t k = 1; k<n_genotype2; ++k)
298                                     {
299                                         index += choose(ploidy-(k-1)+i-1,i-1);
300                                         pls[j*n_genotype2+k] = pl[j*n_genotype+index];
301                                     }
302                                 }
303 
304                                 if (has_GL)
305                                 {
306                                     gls[j*n_genotype2] = gl[j*n_genotype];
307                                     uint32_t index = 0;
308                                     for (uint32_t k = 1; k<n_genotype2; ++k)
309                                     {
310                                         index += choose(ploidy-(k-1)+i-1,i-1);
311                                         gls[j*n_genotype2+k] = gl[j*n_genotype+index];
312                                     }
313                                 }
314 
315                                 if (has_DP)
316                                 {
317                                     dps[j] = dp[j];
318                                 }
319                             }
320 
321                             //remove other format values except for GT, PL, GL and DP
322                             if (i==1)
323                             {
324                                 bcf_fmt_t *fmt = v->d.fmt;
325                                 for (size_t j = 0; j < v->n_fmt; ++j)
326                                 {
327                                     const char* tag = odw->hdr->id[BCF_DT_ID][fmt[j].id].key;
328 
329                                     if (strcmp(tag,"GT")&&strcmp(tag,"PL")&&strcmp(tag,"GL")&&strcmp(tag,"DP"))
330                                     {
331                                         bcf_update_format_int32(odw->hdr, nv, tag, 0, 0);
332                                     }
333                                 }
334                             }
335 
336                             if (has_GT) bcf_update_genotypes(odw->hdr, nv, gts, no_samples*ploidy);
337                             if (has_PL) bcf_update_format_int32(odw->hdr, nv, "PL", pls, no_samples*n_genotype2);
338                             if (has_GL) bcf_update_format_float(odw->hdr, nv, "GL", gls, no_samples*n_genotype2);
339                             if (has_DP) bcf_update_format_int32(odw->hdr, nv, "DP", dps, no_samples);
340                         }
341 
342                         //remove other format values except for GT, PL, GL and DP
343                         bcf_fmt_t *fmt = v->d.fmt;
344                         for (size_t j = 0; j < v->n_fmt; ++j)
345                         {
346                             const char* tag = odw->hdr->id[BCF_DT_ID][fmt[j].id].key;
347 
348                             if (strcmp(tag,"GT")&&strcmp(tag,"PL")&&strcmp(tag,"GL")&&strcmp(tag,"DP"))
349                             {
350                                 bcf_update_format_int32(odw->hdr, nv, tag, 0, 0);
351                             }
352                         }
353 
354                         odw->write(nv);
355                         bcf_destroy(nv);
356                     }
357 
358                     if (has_GT) {free(gt); free(gts);}
359                     if (has_PL) {free(pl); free(pls);}
360                     if (has_GL) {free(gl); free(gls);}
361                     if (has_DP) {free(dp); free(dps);}
362                 }
363                 else //smart decomposition
364                 {
365                     ++no_multiallelic;
366                     no_additional_biallelic += n_allele-2;
367 
368                     old_alleles.l = 0;
369                     bcf_variant2string(odw->hdr, v, &old_alleles);
370 
371                     int32_t rid = bcf_get_rid(v);
372                     int32_t pos1 = bcf_get_pos1(v);
373                     char** allele = bcf_get_allele(v);
374                     uint32_t no_samples = bcf_hdr_nsamples(odr->hdr);
375 
376                     char** alleles = (char**) malloc(n_allele*sizeof(char*));
377                     for (size_t i=0; i<n_allele; ++i)
378                     {
379                         alleles[i] = strdup(allele[i]);
380                     }
381 
382 //                    std::cerr << "=============================\n";
383 //                    bcf_print(odr->hdr, v);
384 //                    std::cerr << "=============================\n";
385 //
386                     for (size_t i=1; i<n_allele; ++i)
387                     {
388                         bcf1_t *nv = bcf_dup(v);
389                         bcf_set_rid(nv, rid);
390                         bcf_set_pos1(nv, pos1);
391                         new_alleles.l=0;
392                         kputs(allele[0], &new_alleles);
393                         kputc(',', &new_alleles);
394                         kputs(allele[i], &new_alleles);
395 
396                         //////////////////////
397                         //split up INFO fields
398                         //////////////////////
399                         if (nv->n_info)
400                         {
401                             bcf_unpack(v, BCF_UN_INFO);
402 
403                             bcf_hdr_t* hdr = odr->hdr;
404                             bcf_info_t *info = v->d.info;
405 
406                             for (uint32_t j = 0; j < v->n_info; ++j)
407                             {
408                                 int32_t key = info[j].key;
409 
410                                 if (key<0)
411                                 {
412                                     fprintf(stderr, "[E::%s] invalid BCF, the INFO key key=%d not present in the header.\n", __func__, key);
413                                     abort();
414                                 }
415 
416                                 const char* tag = hdr->id[BCF_DT_ID][key].key;
417                                 int32_t var_len = bcf_hdr_id2length(hdr,BCF_HL_INFO,key);
418                                 int32_t type = info[j].type;
419 
420                                 if (var_len==BCF_VL_G)
421                                 {
422                                     if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
423                                     {
424                                         int32_t n = 0;
425                                         int32_t* g = 0;
426                                         int32_t ret = bcf_get_info_int32(odr->hdr, v, tag, &g, &n);
427 
428                                         if (ret>0)
429                                         {
430                                             int32_t n_genotype = n;
431                                             int32_t ploidy = bcf_ag2p(n_allele, n_genotype);
432                                             int32_t n_genotype2 = bcf_ap2g(2, ploidy);
433                                             int32_t* gs = (int32_t*) malloc(n_genotype2*sizeof(int32_t));
434 
435                                             gs[0] = g[0];
436                                             uint32_t index = 0;
437                                             for (uint32_t k = 1; k<n_genotype2; ++k)
438                                             {
439                                                 index += choose(ploidy-(k-1)+i-1,i-1);
440                                                 gs[k] = g[index];
441                                             }
442 
443                                             bcf_update_info_int32(odw->hdr, nv, tag, gs, no_samples*n_genotype2);
444                                             free(gs);
445                                             free(g);
446                                         }
447                                     }
448                                     else if (type==BCF_BT_FLOAT)
449                                     {
450                                         int32_t n = 0;
451                                         float* g = 0;
452                                         int32_t ret = bcf_get_info_float(odr->hdr, v, tag, &g, &n);
453 
454                                         if (ret>0)
455                                         {
456                                             int32_t n_genotype = n;
457                                             int32_t ploidy = bcf_ag2p(n_allele, n_genotype);
458                                             int32_t n_genotype2 = bcf_ap2g(2, ploidy);
459                                             float* gs = (float*) malloc(n_genotype2*sizeof(float));
460 
461                                             gs[0] = g[0];
462                                             uint32_t index = 0;
463                                             for (uint32_t k = 1; k<n_genotype2; ++k)
464                                             {
465                                                 index += choose(ploidy-(k-1)+i-1,i-1);
466                                                 gs[k] = g[index];
467                                             }
468 
469                                             bcf_update_info_float(odw->hdr, nv, tag, gs, n_genotype2);
470                                             free(gs);
471                                             free(g);
472                                         }
473                                     }
474                                     else if (type==BCF_BT_CHAR)
475                                     {
476                                         //not necessary
477                                     }
478                                 }
479                                 else if (var_len == BCF_VL_A)
480                                 {
481                                     if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
482                                     {
483                                         int32_t n = 0;
484                                         int32_t* a = 0;
485                                         int32_t ret = bcf_get_info_int32(odr->hdr, v, tag, &a, &n);
486                                         if (ret>0)
487                                         {
488                                             int32_t as = a[i-1];
489                                             bcf_update_info_int32(odw->hdr, nv, tag, &as, 1);
490                                         }
491                                         free(a);
492                                     }
493                                     else if (type==BCF_BT_FLOAT)
494                                     {
495                                         int32_t n = 0;
496                                         float* a = 0;
497                                         int32_t ret = bcf_get_info_float(odr->hdr, v, tag, &a, &n);
498                                         if (ret>0)
499                                         {
500                                             float as = a[i-1];
501                                             bcf_update_info_float(odw->hdr, nv, tag, &as, 1);
502                                         }
503                                         free(a);
504                                     }
505                                     else if (type==BCF_BT_CHAR)
506                                     {
507                                         int32_t n = 0;
508                                         char* a = 0;
509                                         int32_t ret = bcf_get_info_string(odr->hdr, v, tag, &a, &n);
510 
511                                         if (ret>0)
512                                         {
513                                             char* b = a;
514                                             char* c = a;
515                                             int32_t e = 0;
516                                             while (*b)
517                                             {
518                                                 if (*b==',')
519                                                 {
520                                                     *b = 0;
521                                                     ++e;
522 
523                                                     if (e==i)
524                                                     {
525                                                         bcf_update_info_string(odw->hdr, nv, tag, c);
526                                                         break;
527                                                     }
528 
529                                                     c = b+1;
530                                                 }
531                                                 else if (*(b+1)==0)
532                                                 {
533                                                     bcf_update_info_string(odw->hdr, nv, tag, c);
534                                                     break;
535                                                 }
536 
537                                                 ++b;
538                                             }
539 
540                                             if (e>i)
541                                             {
542                                                 bcf_update_info_string(odw->hdr, nv, tag, NULL);
543                                             }
544 
545                                             free(a);
546                                         }
547                                     }
548                                 }
549                                 else if (var_len == BCF_VL_R)
550                                 {
551                                     if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
552                                     {
553                                         int32_t n = 0;
554                                         int32_t* a = 0;
555                                         int32_t ret = bcf_get_info_int32(odr->hdr, v, tag, &a, &n);
556                                         if (ret)
557                                         {
558                                             int32_t* as = (int32_t*) malloc(sizeof(int32_t)*2);
559                                             as[0] = a[0];
560                                             as[1] = a[i];
561                                             bcf_update_info_int32(odw->hdr, nv, tag, as, 2);
562                                             free(as);
563                                             free(a);
564                                         }
565                                     }
566                                     else if (type==BCF_BT_FLOAT)
567                                     {
568                                         int32_t n = 0;
569                                         float* a = 0;
570                                         int32_t ret = bcf_get_info_float(odr->hdr, v, tag, &a, &n);
571                                         if (ret>0)
572                                         {
573                                             float* as = (float*) malloc(sizeof(float)*2);
574                                             as[0] = a[0];
575                                             as[1] = a[i];
576                                             bcf_update_info_float(odw->hdr, nv, tag, as, 2);
577                                             free(as);
578                                             free(a);
579                                         }
580                                     }
581                                     else if (type==BCF_BT_CHAR)
582                                     {
583                                         int32_t n = 0;
584                                         char* a = 0;
585                                         int32_t ret = bcf_get_info_string(odr->hdr, v, tag, &a, &n);
586 
587                                         if (ret>0)
588                                         {
589                                             char* b = a;
590                                             char* c = a;
591                                             int32_t e = 0;
592                                             while (*b)
593                                             {
594                                                 if (*b==',')
595                                                 {
596                                                     *b = 0;
597                                                     ++e;
598 
599                                                     if (e==i)
600                                                     {
601                                                         bcf_update_info_string(odw->hdr, nv, tag, c);
602                                                         break;
603                                                     }
604 
605                                                     c = b+1;
606                                                 }
607                                                 //last occurrence
608                                                 else if (*(b+1)==0)
609                                                 {
610                                                     bcf_update_info_string(odw->hdr, nv, tag, c);
611                                                     break;
612                                                 }
613 
614                                                 ++b;
615                                             }
616 
617                                             if (e>i)
618                                             {
619                                                 bcf_update_info_string(odw->hdr, nv, tag, NULL);
620                                             }
621 
622                                             free(a);
623                                         }
624                                     }
625                                 }
626                                 else if (var_len == BCF_VL_FIXED)
627                                 {
628                                     //leave it there
629                                 }
630                                 else if (var_len == BCF_VL_VAR)
631                                 {
632                                     //leave it there
633                                 }
634                             }
635                         }
636 
637                         ////////////////////////
638                         //split up FORMAT fields
639                         ////////////////////////
640                         if (no_samples)
641                         {
642                             bcf_unpack(v, BCF_UN_FMT);
643 
644                             if (nv->n_fmt)
645                             {
646                                 bcf_hdr_t* hdr = odr->hdr;
647                                 bcf_fmt_t *fmt = v->d.fmt;
648 
649                                 for (uint32_t j = 0; j < (int32_t)v->n_fmt; ++j)
650                                 {
651                                     int32_t id = fmt[j].id;
652 
653                                     if (id<0)
654                                     {
655                                         fprintf(stderr, "[E::%s] invalid BCF, the FORMAT tag id=%d not present in the header.\n", __func__, id);
656                                         abort();
657                                     }
658 
659                                     const char* tag = hdr->id[BCF_DT_ID][id].key;
660                                     int32_t var_len = bcf_hdr_id2length(hdr,BCF_HL_FMT,id);
661                                     int32_t type = fmt[j].type;
662 
663                                     if (var_len==BCF_VL_G)
664                                     {
665                                         if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
666                                         {
667                                             int32_t n = 0;
668                                             int32_t* g = 0;
669                                             int32_t ret = bcf_get_format_int32(odr->hdr, v, tag, &g, &n);
670                                             int32_t n_genotype = n/no_samples;
671 
672                                             int32_t ploidy = bcf_ag2p(n_allele, n_genotype);
673                                             int32_t n_genotype2 = bcf_ap2g(2, ploidy);
674                                             int32_t* gs = (int32_t*) malloc(no_samples*n_genotype2*sizeof(int32_t));
675 
676                                             //get array genotypes
677                                             for (uint32_t j=0; j<no_samples; ++j)
678                                             {
679                                                 gs[j*n_genotype2] = g[j*n_genotype];
680                                                 uint32_t index = 0;
681                                                 for (uint32_t k = 1; k<n_genotype2; ++k)
682                                                 {
683                                                     index += choose(ploidy-(k-1)+i-1,i-1);
684                                                     gs[j*n_genotype2+k] = g[j*n_genotype+index];
685                                                 }
686                                             }
687 
688                                             bcf_update_format_int32(odw->hdr, nv, tag, gs, no_samples*n_genotype2);
689                                             free(gs);
690                                             free(g);
691                                         }
692                                         else if (type==BCF_BT_FLOAT)
693                                         {
694                                             int32_t n = 0;
695                                             float* g = 0;
696                                             int32_t ret = bcf_get_format_float(odr->hdr, v, tag, &g, &n);
697                                             int32_t n_genotype = n/no_samples;
698                                             int32_t ploidy = bcf_ag2p(n_allele, n_genotype);
699                                             int32_t n_genotype2 = bcf_ap2g(2, ploidy);
700                                             float* gs = (float*) malloc(no_samples*n_genotype2*sizeof(float));
701 
702                                             for (uint32_t j=0; j<no_samples; ++j)
703                                             {
704                                                 gs[j*n_genotype2] = g[j*n_genotype];
705                                                 uint32_t index = 0;
706                                                 for (uint32_t k = 1; k<n_genotype2; ++k)
707                                                 {
708                                                     index += choose(ploidy-(k-1)+i-1,i-1);
709                                                     gs[j*n_genotype2+k] = g[j*n_genotype+index];
710                                                 }
711                                             }
712 
713                                             bcf_update_format_float(odw->hdr, nv, tag, gs, no_samples*n_genotype2);
714                                             free(gs);
715                                             free(g);
716                                         }
717                                         else if (type==BCF_BT_CHAR)
718                                         {
719                                             //to be implemented
720                                         }
721                                     }
722                                     else if (var_len == BCF_VL_A)
723                                     {
724                                         if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
725                                         {
726                                             int32_t n = 0;
727                                             int32_t* a = 0;
728                                             int32_t ret = bcf_get_format_int32(odr->hdr, v, tag, &a, &n);
729                                             int32_t* as = (int32_t*) malloc(no_samples*sizeof(int32_t));
730 
731                                             if (ret!=(n_allele-1)*no_samples)
732                                             {
733                                                 for (uint32_t j=0; j<no_samples; ++j)
734                                                 {
735                                                     as[j] = bcf_int32_missing;
736                                                 }
737                                             }
738                                             else
739                                             {
740                                                 for (uint32_t j=0; j<no_samples; ++j)
741                                                 {
742                                                     if (a[j*(n_allele-1)+(i-1)]==bcf_int32_missing ||
743                                                         a[j*(n_allele-1)+(i-1)]==bcf_int32_vector_end)
744                                                     {
745                                                         as[j] = bcf_int32_missing;
746                                                     }
747                                                     else
748                                                     {
749                                                         as[j] = a[j*(n_allele-1)+(i-1)];
750                                                     }
751                                                 }
752                                             }
753                                             bcf_update_format_int32(odw->hdr, nv, tag, as, no_samples);
754                                             free(as);
755                                             free(a);
756                                         }
757                                         else if (type==BCF_BT_FLOAT)
758                                         {
759                                             int32_t n = 0;
760                                             float* a = 0;
761                                             int32_t ret = bcf_get_format_float(odr->hdr, v, tag, &a, &n);
762                                             float* as = (float*) malloc(no_samples*sizeof(float));
763 
764                                             if (ret!=(n_allele-1)*no_samples)
765                                             {
766                                                 for (uint32_t j=0; j<no_samples; ++j)
767                                                 {
768                                                     bcf_float_set_missing(as[j]);;
769                                                 }
770                                             }
771                                             else
772                                             {
773                                                 for (uint32_t j=0; j<no_samples; ++j)
774                                                 {
775                                                     if (bcf_float_is_missing(a[j*(n_allele-1)+(i-1)]) ||
776                                                         bcf_float_is_vector_end(a[j*(n_allele-1)+(i-1)]))
777                                                     {
778                                                         bcf_float_set_missing(as[j]);
779                                                     }
780                                                     else
781                                                     {
782                                                         as[j] = a[j*(n_allele-1)+(i-1)];
783                                                     }
784                                                 }
785                                             }
786 
787                                             bcf_update_format_float(odw->hdr, nv, tag, as, no_samples);
788                                             free(as);
789                                             free(a);
790                                         }
791                                         else if (type==BCF_BT_CHAR)
792                                         {
793                                             //to be implemented
794                                         }
795                                     }
796                                     else if (var_len == BCF_VL_R)
797                                     {
798                                         if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
799                                         {
800                                             int32_t n = 0;
801                                             int32_t* a = 0;
802                                             int32_t ret = bcf_get_format_int32(odr->hdr, v, tag, &a, &n);
803                                             int32_t* as = (int32_t*) malloc(no_samples*sizeof(int32_t)*2);
804 
805                                             if (ret!=n_allele*no_samples)
806                                             {
807                                                 for (uint32_t j=0; j<no_samples; ++j)
808                                                 {
809                                                     as[j*2] = bcf_int32_missing;
810                                                     as[j*2+1] = bcf_int32_vector_end;
811                                                 }
812                                             }
813                                             else
814                                             {
815                                                 for (uint32_t j=0; j<no_samples; ++j)
816                                                 {
817                                                     as[j*2] = a[j*n_allele];
818                                                     as[j*2+1] = a[j*n_allele+i];
819                                                 }
820                                             }
821 
822                                             bcf_update_format_int32(odw->hdr, nv, tag, as, no_samples*2);
823                                             free(as);
824                                             free(a);
825                                         }
826                                         else if (type==BCF_BT_FLOAT)
827                                         {
828                                             int32_t n = 0;
829                                             float* a = 0;
830                                             int32_t ret = bcf_get_format_float(odr->hdr, v, tag, &a, &n);
831                                             float* as = (float*) malloc(no_samples*sizeof(float));
832 
833                                             if (ret!=n_allele*no_samples)
834                                             {
835                                                 for (uint32_t j=0; j<no_samples; ++j)
836                                                 {
837                                                     bcf_float_set_missing(as[j*2]);
838                                                     bcf_float_set_vector_end(as[j*2+1]);
839                                                 }
840                                             }
841                                             else
842                                             {
843                                                 for (uint32_t j=0; j<no_samples; ++j)
844                                                 {
845                                                     as[j*2] = a[j*n_allele];
846                                                     as[j*2+1] = a[j*n_allele+i];
847                                                 }
848                                             }
849 
850                                             bcf_update_format_float(odw->hdr, nv, tag, as, no_samples*2);
851                                             free(as);
852                                             free(a);
853                                         }
854                                         else if (type==BCF_BT_CHAR)
855                                         {
856                                             //to be implemented
857                                         }
858                                     }
859                                     else if (var_len == BCF_VL_FIXED)
860                                     {
861                                         if (strcmp(tag,"GT")==0)
862                                         {
863                                             int32_t n = 0;
864                                             int32_t* gt = 0;
865                                             int32_t ret = bcf_get_genotypes(odr->hdr, v, &gt, &n);
866 
867                                             int32_t ploidy = n/bcf_hdr_nsamples(odr->hdr);
868                                             int32_t n_genotype = bcf_ap2g(n_allele, ploidy);
869                                             int32_t n_genotype2 = bcf_ap2g(2, ploidy);
870 
871                                             int32_t* gts = (int32_t*) malloc(no_samples*n_genotype2*sizeof(int32_t));
872 
873                                             for (uint32_t j=0; j<no_samples; ++j)
874                                             {
875                                                 for (size_t k=0; k<ploidy; ++k)
876                                                 {
877                                                     int32_t a = gt[j*ploidy+k];
878                                                     if (a<0)
879                                                     {
880                                                         gts[j*ploidy+k] = a;
881                                                     }
882                                                     else
883                                                     {
884                                                         int32_t na = bcf_gt_allele(a);
885 
886                                                         if (na)
887                                                         {
888                                                             na = na==i ? 1 : -1;
889                                                         }
890 
891                                                         gts[j*ploidy+k] =  ((na+1)<<1) | bcf_gt_is_phased(a);
892                                                     }
893                                                 }
894                                             }
895 
896                                             bcf_update_genotypes(odw->hdr, nv, gts, no_samples*ploidy);
897                                             free(gts);
898                                             free(gt);
899                                         }
900                                         else
901                                         {
902                                             //leave it there...
903                                         }
904                                     }
905                                     else if (var_len == BCF_VL_VAR)
906                                     {
907                                         //leave it there
908                                     }
909                                 }
910                             }
911                         }
912 
913                         bcf_update_info_string(odw->hdr, nv, "OLD_MULTIALLELIC", old_alleles.s);
914                         bcf_update_alleles_str(odw->hdr, nv, new_alleles.s);
915 
916                         odw->write(nv);
917                         bcf_destroy(nv);
918                     }
919                 }
920             }
921             else
922             {
923                 odw->write(v);
924                 ++no_biallelic;
925             }
926 
927             ++no_variants;
928         }
929 
930         odr->close();
931         odw->close();
932     };
933 
print_options()934     void print_options()
935     {
936         std::clog << "decompose v" << version << "\n";
937         std::clog << "\n";
938         std::clog << "options:     input VCF file        " << input_vcf_file << "\n";
939         std::clog << "         [s] smart decomposition   " << (smart ? "true" : "false") << " (experimental)\n";
940         std::clog << "         [o] output VCF file       " << output_vcf_file << "\n";
941         print_int_op("         [i] intervals             ", intervals);
942         std::clog << "\n";
943     }
944 
print_stats()945     void print_stats()
946     {
947         std::clog << "\n";
948         std::clog << "stats: no. variants                 : " << no_variants << "\n";
949         std::clog << "       no. biallelic variants       : " << no_biallelic << "\n";
950         std::clog << "       no. multiallelic variants    : " << no_multiallelic << "\n";
951         std::clog << "\n";
952         std::clog << "       no. additional biallelics    : " << no_additional_biallelic << "\n";
953         std::clog << "       total no. of biallelics      : " << no_additional_biallelic + no_variants << "\n";
954         std::clog << "\n";
955     };
956 
~Igor()957     ~Igor() {};
958 
959     private:
960 };
961 
962 }
963 
decompose2(int argc,char ** argv)964 void decompose2(int argc, char ** argv)
965 {
966     Igor igor(argc, argv);
967     igor.print_options();
968     igor.initialize();
969     igor.decompose2();
970     igor.print_stats();
971 };
972