1 /* The MIT License
2
3 Copyright (c) 2015 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "decompose2.h"
25
26 namespace
27 {
28
29 class Igor : Program
30 {
31 public:
32
33 ///////////
34 //options//
35 ///////////
36 std::string input_vcf_file;
37 std::string output_vcf_file;
38 std::vector<GenomeInterval> intervals;
39 std::string ref_fasta_file;
40 bool smart;
41
42 ///////
43 //i/o//
44 ///////
45 BCFOrderedReader *odr;
46 BCFOrderedWriter *odw;
47
48 kstring_t s;
49 kstring_t new_alleles;
50 kstring_t old_alleles;
51
52 /////////
53 //stats//
54 /////////
55 uint32_t no_variants;
56 uint32_t new_no_variants;
57 uint32_t no_biallelic;
58 uint32_t no_multiallelic;
59 uint32_t no_additional_biallelic;
60
61 /////////
62 //tools//
63 /////////
64 VariantManip *vm;
65
Igor(int argc,char ** argv)66 Igor(int argc, char **argv)
67 {
68 version = "0.5";
69
70 //////////////////////////
71 //options initialization//
72 //////////////////////////
73 try
74 {
75 std::string desc = "decomposes multiallelic variants into biallelic in a VCF file.";
76
77 TCLAP::CmdLine cmd(desc, ' ', version);
78 VTOutput my; cmd.setOutput(&my);
79 TCLAP::ValueArg<std::string> arg_intervals("i", "i", "intervals []", false, "", "str", cmd);
80 TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "file", cmd);
81 TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF file [-]", false, "-", "str", cmd);
82 TCLAP::SwitchArg arg_smart("s", "s", "smart decomposition [false]", cmd, false);
83 TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "","file", cmd);
84
85 cmd.parse(argc, argv);
86
87 input_vcf_file = arg_input_vcf_file.getValue();
88 output_vcf_file = arg_output_vcf_file.getValue();
89 smart = arg_smart.getValue();
90 parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
91 }
92 catch (TCLAP::ArgException &e)
93 {
94 std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
95 abort();
96 }
97 };
98
initialize()99 void initialize()
100 {
101 //////////////////////
102 //i/o initialization//
103 //////////////////////
104 odr = new BCFOrderedReader(input_vcf_file, intervals);
105
106 odw = new BCFOrderedWriter(output_vcf_file);
107 odw->link_hdr(odr->hdr);
108 //bcf_hdr_append_info_with_backup_naming(odw->hdr, "OLD_MULTIALLELIC", "1", "String", "Original chr:pos:ref:alt encoding", false);
109 bcf_hdr_append(odw->hdr, "##INFO=<ID=,Number=1,Type=String,Description=\"Original chr:pos:ref:alt encoding\">\n");
110 odw->write_hdr();
111
112 s = {0,0,0};
113 old_alleles = {0,0,0};
114 new_alleles = {0,0,0};
115
116 ////////////////////////
117 //stats initialization//
118 ////////////////////////
119 no_variants = 0;
120 no_biallelic = 0;
121 no_multiallelic = 0;
122
123 no_additional_biallelic = 0;
124
125 ////////////////////////
126 //tools initialization//
127 ////////////////////////
128 }
129
130 /**
131 * n choose r.
132 */
choose(uint32_t n,uint32_t r)133 uint32_t choose(uint32_t n, uint32_t r)
134 {
135 if (r>n)
136 {
137 return 0;
138 }
139 else if (r==n)
140 {
141 return 1;
142 }
143 else if (r==0)
144 {
145 return 1;
146 }
147 else
148 {
149 if (r>(n>>1))
150 {
151 r = n-r;
152 }
153
154 uint32_t num = n;
155 uint32_t denum = 1;
156
157 for (uint32_t i=1; i<r; ++i)
158 {
159 num *= n-i;
160 denum *= i+1;
161 }
162
163 return num/denum;
164 }
165 }
166
decompose2()167 void decompose2()
168 {
169 bcf1_t* v = bcf_init();
170 Variant variant;
171
172 while (odr->read(v))
173 {
174 // std::cerr << "=============================\n";
175 // bcf_print(odr->hdr, v);
176 // std::cerr << "=============================\n";
177
178 int32_t n_allele = bcf_get_n_allele(v);
179
180 if (n_allele > 2)
181 {
182 if (!smart)
183 {
184 ++no_multiallelic;
185 no_additional_biallelic += n_allele-2;
186
187 old_alleles.l = 0;
188 bcf_variant2string(odw->hdr, v, &old_alleles);
189
190 int32_t rid = bcf_get_rid(v);
191 int32_t pos1 = bcf_get_pos1(v);
192 char** allele = bcf_get_allele(v);
193
194 int32_t *gt = NULL;
195 int32_t *pl = NULL;
196 float *gl = NULL;
197 int32_t *dp = NULL;
198 size_t no_samples = bcf_hdr_nsamples(odr->hdr);
199 bool has_GT = false;
200 bool has_PL = false;
201 bool has_GL = false;
202 bool has_DP = false;
203 int32_t ploidy = 0;
204 int32_t n_genotype;
205 int32_t n_genotype2;
206
207 int32_t *gts = NULL;
208 int32_t *pls = NULL;
209 float *gls = NULL;
210 int32_t *dps = NULL;
211
212 if (no_samples)
213 {
214 bcf_unpack(v, BCF_UN_FMT);
215
216 int32_t n = 0;
217 int32_t ret = bcf_get_genotypes(odr->hdr, v, >, &n);
218 if (ret>0) has_GT = true;
219 ploidy = n/bcf_hdr_nsamples(odr->hdr);
220 n_genotype = bcf_ap2g(n_allele, ploidy);
221 n_genotype2 = bcf_ap2g(2, ploidy);
222 if (ret>0) gts = (int32_t*) malloc(no_samples*n_genotype2*sizeof(int32_t));
223
224 n=0;
225 ret = bcf_get_format_int32(odr->hdr, v, "PL", &pl, &n);
226 if (ret>0)
227 {
228 has_PL = true;
229 pls = (int32_t*) malloc(no_samples*n_genotype2*sizeof(int32_t));
230 }
231
232 n=0;
233 ret = bcf_get_format_float(odr->hdr, v, "GL", &gl, &n);
234 if (ret>0)
235 {
236 has_GL = true;
237 gls = (float*) malloc(no_samples*n_genotype2*sizeof(float));
238 }
239
240 n=0;
241 ret = bcf_get_format_int32(odr->hdr, v, "DP", &dp, &n);
242 if (ret>0)
243 {
244 has_DP = true;
245 dps = (int32_t*) malloc(no_samples*sizeof(int32_t));
246 }
247 }
248
249 // std::cerr << "*****************************\n";
250 // bcf_print(odw->hdr, v);
251 // std::cerr << "*****************************\n";
252
253 for (size_t i=1; i<n_allele; ++i)
254 {
255 //bcf_clear(v);
256 bcf1_t* nv = bcf_dup(v);
257 new_alleles.l=0;
258 kputs(allele[0], &new_alleles);
259 kputc(',', &new_alleles);
260 kputs(allele[i], &new_alleles);
261
262 bcf_update_info_string(odw->hdr, nv, "OLD_MULTIALLELIC", old_alleles.s);
263 bcf_update_alleles_str(odw->hdr, nv, new_alleles.s);
264
265 if (no_samples)
266 {
267 //get array genotypes
268 for (size_t j=0; j<no_samples; ++j)
269 {
270 if (has_GT)
271 {
272 for (size_t k=0; k<ploidy; ++k)
273 {
274 int32_t _a = gt[j*ploidy+k];
275 if (_a<0)
276 {
277 gts[j*ploidy+k] = _a;
278 }
279 else
280 {
281 int32_t a = bcf_gt_allele(_a);
282
283 if (a)
284 {
285 a = a==i ? 1 : -1;
286 }
287
288 gts[j*ploidy+k] = ((a+1)<<1) | bcf_gt_is_phased(_a);
289 }
290 }
291 }
292
293 if (has_PL)
294 {
295 pls[j*n_genotype2] = pl[j*n_genotype];
296 uint32_t index = 0;
297 for (uint32_t k = 1; k<n_genotype2; ++k)
298 {
299 index += choose(ploidy-(k-1)+i-1,i-1);
300 pls[j*n_genotype2+k] = pl[j*n_genotype+index];
301 }
302 }
303
304 if (has_GL)
305 {
306 gls[j*n_genotype2] = gl[j*n_genotype];
307 uint32_t index = 0;
308 for (uint32_t k = 1; k<n_genotype2; ++k)
309 {
310 index += choose(ploidy-(k-1)+i-1,i-1);
311 gls[j*n_genotype2+k] = gl[j*n_genotype+index];
312 }
313 }
314
315 if (has_DP)
316 {
317 dps[j] = dp[j];
318 }
319 }
320
321 //remove other format values except for GT, PL, GL and DP
322 if (i==1)
323 {
324 bcf_fmt_t *fmt = v->d.fmt;
325 for (size_t j = 0; j < v->n_fmt; ++j)
326 {
327 const char* tag = odw->hdr->id[BCF_DT_ID][fmt[j].id].key;
328
329 if (strcmp(tag,"GT")&&strcmp(tag,"PL")&&strcmp(tag,"GL")&&strcmp(tag,"DP"))
330 {
331 bcf_update_format_int32(odw->hdr, nv, tag, 0, 0);
332 }
333 }
334 }
335
336 if (has_GT) bcf_update_genotypes(odw->hdr, nv, gts, no_samples*ploidy);
337 if (has_PL) bcf_update_format_int32(odw->hdr, nv, "PL", pls, no_samples*n_genotype2);
338 if (has_GL) bcf_update_format_float(odw->hdr, nv, "GL", gls, no_samples*n_genotype2);
339 if (has_DP) bcf_update_format_int32(odw->hdr, nv, "DP", dps, no_samples);
340 }
341
342 //remove other format values except for GT, PL, GL and DP
343 bcf_fmt_t *fmt = v->d.fmt;
344 for (size_t j = 0; j < v->n_fmt; ++j)
345 {
346 const char* tag = odw->hdr->id[BCF_DT_ID][fmt[j].id].key;
347
348 if (strcmp(tag,"GT")&&strcmp(tag,"PL")&&strcmp(tag,"GL")&&strcmp(tag,"DP"))
349 {
350 bcf_update_format_int32(odw->hdr, nv, tag, 0, 0);
351 }
352 }
353
354 odw->write(nv);
355 bcf_destroy(nv);
356 }
357
358 if (has_GT) {free(gt); free(gts);}
359 if (has_PL) {free(pl); free(pls);}
360 if (has_GL) {free(gl); free(gls);}
361 if (has_DP) {free(dp); free(dps);}
362 }
363 else //smart decomposition
364 {
365 ++no_multiallelic;
366 no_additional_biallelic += n_allele-2;
367
368 old_alleles.l = 0;
369 bcf_variant2string(odw->hdr, v, &old_alleles);
370
371 int32_t rid = bcf_get_rid(v);
372 int32_t pos1 = bcf_get_pos1(v);
373 char** allele = bcf_get_allele(v);
374 uint32_t no_samples = bcf_hdr_nsamples(odr->hdr);
375
376 char** alleles = (char**) malloc(n_allele*sizeof(char*));
377 for (size_t i=0; i<n_allele; ++i)
378 {
379 alleles[i] = strdup(allele[i]);
380 }
381
382 // std::cerr << "=============================\n";
383 // bcf_print(odr->hdr, v);
384 // std::cerr << "=============================\n";
385 //
386 for (size_t i=1; i<n_allele; ++i)
387 {
388 bcf1_t *nv = bcf_dup(v);
389 bcf_set_rid(nv, rid);
390 bcf_set_pos1(nv, pos1);
391 new_alleles.l=0;
392 kputs(allele[0], &new_alleles);
393 kputc(',', &new_alleles);
394 kputs(allele[i], &new_alleles);
395
396 //////////////////////
397 //split up INFO fields
398 //////////////////////
399 if (nv->n_info)
400 {
401 bcf_unpack(v, BCF_UN_INFO);
402
403 bcf_hdr_t* hdr = odr->hdr;
404 bcf_info_t *info = v->d.info;
405
406 for (uint32_t j = 0; j < v->n_info; ++j)
407 {
408 int32_t key = info[j].key;
409
410 if (key<0)
411 {
412 fprintf(stderr, "[E::%s] invalid BCF, the INFO key key=%d not present in the header.\n", __func__, key);
413 abort();
414 }
415
416 const char* tag = hdr->id[BCF_DT_ID][key].key;
417 int32_t var_len = bcf_hdr_id2length(hdr,BCF_HL_INFO,key);
418 int32_t type = info[j].type;
419
420 if (var_len==BCF_VL_G)
421 {
422 if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
423 {
424 int32_t n = 0;
425 int32_t* g = 0;
426 int32_t ret = bcf_get_info_int32(odr->hdr, v, tag, &g, &n);
427
428 if (ret>0)
429 {
430 int32_t n_genotype = n;
431 int32_t ploidy = bcf_ag2p(n_allele, n_genotype);
432 int32_t n_genotype2 = bcf_ap2g(2, ploidy);
433 int32_t* gs = (int32_t*) malloc(n_genotype2*sizeof(int32_t));
434
435 gs[0] = g[0];
436 uint32_t index = 0;
437 for (uint32_t k = 1; k<n_genotype2; ++k)
438 {
439 index += choose(ploidy-(k-1)+i-1,i-1);
440 gs[k] = g[index];
441 }
442
443 bcf_update_info_int32(odw->hdr, nv, tag, gs, no_samples*n_genotype2);
444 free(gs);
445 free(g);
446 }
447 }
448 else if (type==BCF_BT_FLOAT)
449 {
450 int32_t n = 0;
451 float* g = 0;
452 int32_t ret = bcf_get_info_float(odr->hdr, v, tag, &g, &n);
453
454 if (ret>0)
455 {
456 int32_t n_genotype = n;
457 int32_t ploidy = bcf_ag2p(n_allele, n_genotype);
458 int32_t n_genotype2 = bcf_ap2g(2, ploidy);
459 float* gs = (float*) malloc(n_genotype2*sizeof(float));
460
461 gs[0] = g[0];
462 uint32_t index = 0;
463 for (uint32_t k = 1; k<n_genotype2; ++k)
464 {
465 index += choose(ploidy-(k-1)+i-1,i-1);
466 gs[k] = g[index];
467 }
468
469 bcf_update_info_float(odw->hdr, nv, tag, gs, n_genotype2);
470 free(gs);
471 free(g);
472 }
473 }
474 else if (type==BCF_BT_CHAR)
475 {
476 //not necessary
477 }
478 }
479 else if (var_len == BCF_VL_A)
480 {
481 if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
482 {
483 int32_t n = 0;
484 int32_t* a = 0;
485 int32_t ret = bcf_get_info_int32(odr->hdr, v, tag, &a, &n);
486 if (ret>0)
487 {
488 int32_t as = a[i-1];
489 bcf_update_info_int32(odw->hdr, nv, tag, &as, 1);
490 }
491 free(a);
492 }
493 else if (type==BCF_BT_FLOAT)
494 {
495 int32_t n = 0;
496 float* a = 0;
497 int32_t ret = bcf_get_info_float(odr->hdr, v, tag, &a, &n);
498 if (ret>0)
499 {
500 float as = a[i-1];
501 bcf_update_info_float(odw->hdr, nv, tag, &as, 1);
502 }
503 free(a);
504 }
505 else if (type==BCF_BT_CHAR)
506 {
507 int32_t n = 0;
508 char* a = 0;
509 int32_t ret = bcf_get_info_string(odr->hdr, v, tag, &a, &n);
510
511 if (ret>0)
512 {
513 char* b = a;
514 char* c = a;
515 int32_t e = 0;
516 while (*b)
517 {
518 if (*b==',')
519 {
520 *b = 0;
521 ++e;
522
523 if (e==i)
524 {
525 bcf_update_info_string(odw->hdr, nv, tag, c);
526 break;
527 }
528
529 c = b+1;
530 }
531 else if (*(b+1)==0)
532 {
533 bcf_update_info_string(odw->hdr, nv, tag, c);
534 break;
535 }
536
537 ++b;
538 }
539
540 if (e>i)
541 {
542 bcf_update_info_string(odw->hdr, nv, tag, NULL);
543 }
544
545 free(a);
546 }
547 }
548 }
549 else if (var_len == BCF_VL_R)
550 {
551 if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
552 {
553 int32_t n = 0;
554 int32_t* a = 0;
555 int32_t ret = bcf_get_info_int32(odr->hdr, v, tag, &a, &n);
556 if (ret)
557 {
558 int32_t* as = (int32_t*) malloc(sizeof(int32_t)*2);
559 as[0] = a[0];
560 as[1] = a[i];
561 bcf_update_info_int32(odw->hdr, nv, tag, as, 2);
562 free(as);
563 free(a);
564 }
565 }
566 else if (type==BCF_BT_FLOAT)
567 {
568 int32_t n = 0;
569 float* a = 0;
570 int32_t ret = bcf_get_info_float(odr->hdr, v, tag, &a, &n);
571 if (ret>0)
572 {
573 float* as = (float*) malloc(sizeof(float)*2);
574 as[0] = a[0];
575 as[1] = a[i];
576 bcf_update_info_float(odw->hdr, nv, tag, as, 2);
577 free(as);
578 free(a);
579 }
580 }
581 else if (type==BCF_BT_CHAR)
582 {
583 int32_t n = 0;
584 char* a = 0;
585 int32_t ret = bcf_get_info_string(odr->hdr, v, tag, &a, &n);
586
587 if (ret>0)
588 {
589 char* b = a;
590 char* c = a;
591 int32_t e = 0;
592 while (*b)
593 {
594 if (*b==',')
595 {
596 *b = 0;
597 ++e;
598
599 if (e==i)
600 {
601 bcf_update_info_string(odw->hdr, nv, tag, c);
602 break;
603 }
604
605 c = b+1;
606 }
607 //last occurrence
608 else if (*(b+1)==0)
609 {
610 bcf_update_info_string(odw->hdr, nv, tag, c);
611 break;
612 }
613
614 ++b;
615 }
616
617 if (e>i)
618 {
619 bcf_update_info_string(odw->hdr, nv, tag, NULL);
620 }
621
622 free(a);
623 }
624 }
625 }
626 else if (var_len == BCF_VL_FIXED)
627 {
628 //leave it there
629 }
630 else if (var_len == BCF_VL_VAR)
631 {
632 //leave it there
633 }
634 }
635 }
636
637 ////////////////////////
638 //split up FORMAT fields
639 ////////////////////////
640 if (no_samples)
641 {
642 bcf_unpack(v, BCF_UN_FMT);
643
644 if (nv->n_fmt)
645 {
646 bcf_hdr_t* hdr = odr->hdr;
647 bcf_fmt_t *fmt = v->d.fmt;
648
649 for (uint32_t j = 0; j < (int32_t)v->n_fmt; ++j)
650 {
651 int32_t id = fmt[j].id;
652
653 if (id<0)
654 {
655 fprintf(stderr, "[E::%s] invalid BCF, the FORMAT tag id=%d not present in the header.\n", __func__, id);
656 abort();
657 }
658
659 const char* tag = hdr->id[BCF_DT_ID][id].key;
660 int32_t var_len = bcf_hdr_id2length(hdr,BCF_HL_FMT,id);
661 int32_t type = fmt[j].type;
662
663 if (var_len==BCF_VL_G)
664 {
665 if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
666 {
667 int32_t n = 0;
668 int32_t* g = 0;
669 int32_t ret = bcf_get_format_int32(odr->hdr, v, tag, &g, &n);
670 int32_t n_genotype = n/no_samples;
671
672 int32_t ploidy = bcf_ag2p(n_allele, n_genotype);
673 int32_t n_genotype2 = bcf_ap2g(2, ploidy);
674 int32_t* gs = (int32_t*) malloc(no_samples*n_genotype2*sizeof(int32_t));
675
676 //get array genotypes
677 for (uint32_t j=0; j<no_samples; ++j)
678 {
679 gs[j*n_genotype2] = g[j*n_genotype];
680 uint32_t index = 0;
681 for (uint32_t k = 1; k<n_genotype2; ++k)
682 {
683 index += choose(ploidy-(k-1)+i-1,i-1);
684 gs[j*n_genotype2+k] = g[j*n_genotype+index];
685 }
686 }
687
688 bcf_update_format_int32(odw->hdr, nv, tag, gs, no_samples*n_genotype2);
689 free(gs);
690 free(g);
691 }
692 else if (type==BCF_BT_FLOAT)
693 {
694 int32_t n = 0;
695 float* g = 0;
696 int32_t ret = bcf_get_format_float(odr->hdr, v, tag, &g, &n);
697 int32_t n_genotype = n/no_samples;
698 int32_t ploidy = bcf_ag2p(n_allele, n_genotype);
699 int32_t n_genotype2 = bcf_ap2g(2, ploidy);
700 float* gs = (float*) malloc(no_samples*n_genotype2*sizeof(float));
701
702 for (uint32_t j=0; j<no_samples; ++j)
703 {
704 gs[j*n_genotype2] = g[j*n_genotype];
705 uint32_t index = 0;
706 for (uint32_t k = 1; k<n_genotype2; ++k)
707 {
708 index += choose(ploidy-(k-1)+i-1,i-1);
709 gs[j*n_genotype2+k] = g[j*n_genotype+index];
710 }
711 }
712
713 bcf_update_format_float(odw->hdr, nv, tag, gs, no_samples*n_genotype2);
714 free(gs);
715 free(g);
716 }
717 else if (type==BCF_BT_CHAR)
718 {
719 //to be implemented
720 }
721 }
722 else if (var_len == BCF_VL_A)
723 {
724 if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
725 {
726 int32_t n = 0;
727 int32_t* a = 0;
728 int32_t ret = bcf_get_format_int32(odr->hdr, v, tag, &a, &n);
729 int32_t* as = (int32_t*) malloc(no_samples*sizeof(int32_t));
730
731 if (ret!=(n_allele-1)*no_samples)
732 {
733 for (uint32_t j=0; j<no_samples; ++j)
734 {
735 as[j] = bcf_int32_missing;
736 }
737 }
738 else
739 {
740 for (uint32_t j=0; j<no_samples; ++j)
741 {
742 if (a[j*(n_allele-1)+(i-1)]==bcf_int32_missing ||
743 a[j*(n_allele-1)+(i-1)]==bcf_int32_vector_end)
744 {
745 as[j] = bcf_int32_missing;
746 }
747 else
748 {
749 as[j] = a[j*(n_allele-1)+(i-1)];
750 }
751 }
752 }
753 bcf_update_format_int32(odw->hdr, nv, tag, as, no_samples);
754 free(as);
755 free(a);
756 }
757 else if (type==BCF_BT_FLOAT)
758 {
759 int32_t n = 0;
760 float* a = 0;
761 int32_t ret = bcf_get_format_float(odr->hdr, v, tag, &a, &n);
762 float* as = (float*) malloc(no_samples*sizeof(float));
763
764 if (ret!=(n_allele-1)*no_samples)
765 {
766 for (uint32_t j=0; j<no_samples; ++j)
767 {
768 bcf_float_set_missing(as[j]);;
769 }
770 }
771 else
772 {
773 for (uint32_t j=0; j<no_samples; ++j)
774 {
775 if (bcf_float_is_missing(a[j*(n_allele-1)+(i-1)]) ||
776 bcf_float_is_vector_end(a[j*(n_allele-1)+(i-1)]))
777 {
778 bcf_float_set_missing(as[j]);
779 }
780 else
781 {
782 as[j] = a[j*(n_allele-1)+(i-1)];
783 }
784 }
785 }
786
787 bcf_update_format_float(odw->hdr, nv, tag, as, no_samples);
788 free(as);
789 free(a);
790 }
791 else if (type==BCF_BT_CHAR)
792 {
793 //to be implemented
794 }
795 }
796 else if (var_len == BCF_VL_R)
797 {
798 if (type==BCF_BT_INT8||type==BCF_BT_INT16||type==BCF_BT_INT32)
799 {
800 int32_t n = 0;
801 int32_t* a = 0;
802 int32_t ret = bcf_get_format_int32(odr->hdr, v, tag, &a, &n);
803 int32_t* as = (int32_t*) malloc(no_samples*sizeof(int32_t)*2);
804
805 if (ret!=n_allele*no_samples)
806 {
807 for (uint32_t j=0; j<no_samples; ++j)
808 {
809 as[j*2] = bcf_int32_missing;
810 as[j*2+1] = bcf_int32_vector_end;
811 }
812 }
813 else
814 {
815 for (uint32_t j=0; j<no_samples; ++j)
816 {
817 as[j*2] = a[j*n_allele];
818 as[j*2+1] = a[j*n_allele+i];
819 }
820 }
821
822 bcf_update_format_int32(odw->hdr, nv, tag, as, no_samples*2);
823 free(as);
824 free(a);
825 }
826 else if (type==BCF_BT_FLOAT)
827 {
828 int32_t n = 0;
829 float* a = 0;
830 int32_t ret = bcf_get_format_float(odr->hdr, v, tag, &a, &n);
831 float* as = (float*) malloc(no_samples*sizeof(float));
832
833 if (ret!=n_allele*no_samples)
834 {
835 for (uint32_t j=0; j<no_samples; ++j)
836 {
837 bcf_float_set_missing(as[j*2]);
838 bcf_float_set_vector_end(as[j*2+1]);
839 }
840 }
841 else
842 {
843 for (uint32_t j=0; j<no_samples; ++j)
844 {
845 as[j*2] = a[j*n_allele];
846 as[j*2+1] = a[j*n_allele+i];
847 }
848 }
849
850 bcf_update_format_float(odw->hdr, nv, tag, as, no_samples*2);
851 free(as);
852 free(a);
853 }
854 else if (type==BCF_BT_CHAR)
855 {
856 //to be implemented
857 }
858 }
859 else if (var_len == BCF_VL_FIXED)
860 {
861 if (strcmp(tag,"GT")==0)
862 {
863 int32_t n = 0;
864 int32_t* gt = 0;
865 int32_t ret = bcf_get_genotypes(odr->hdr, v, >, &n);
866
867 int32_t ploidy = n/bcf_hdr_nsamples(odr->hdr);
868 int32_t n_genotype = bcf_ap2g(n_allele, ploidy);
869 int32_t n_genotype2 = bcf_ap2g(2, ploidy);
870
871 int32_t* gts = (int32_t*) malloc(no_samples*n_genotype2*sizeof(int32_t));
872
873 for (uint32_t j=0; j<no_samples; ++j)
874 {
875 for (size_t k=0; k<ploidy; ++k)
876 {
877 int32_t a = gt[j*ploidy+k];
878 if (a<0)
879 {
880 gts[j*ploidy+k] = a;
881 }
882 else
883 {
884 int32_t na = bcf_gt_allele(a);
885
886 if (na)
887 {
888 na = na==i ? 1 : -1;
889 }
890
891 gts[j*ploidy+k] = ((na+1)<<1) | bcf_gt_is_phased(a);
892 }
893 }
894 }
895
896 bcf_update_genotypes(odw->hdr, nv, gts, no_samples*ploidy);
897 free(gts);
898 free(gt);
899 }
900 else
901 {
902 //leave it there...
903 }
904 }
905 else if (var_len == BCF_VL_VAR)
906 {
907 //leave it there
908 }
909 }
910 }
911 }
912
913 bcf_update_info_string(odw->hdr, nv, "OLD_MULTIALLELIC", old_alleles.s);
914 bcf_update_alleles_str(odw->hdr, nv, new_alleles.s);
915
916 odw->write(nv);
917 bcf_destroy(nv);
918 }
919 }
920 }
921 else
922 {
923 odw->write(v);
924 ++no_biallelic;
925 }
926
927 ++no_variants;
928 }
929
930 odr->close();
931 odw->close();
932 };
933
print_options()934 void print_options()
935 {
936 std::clog << "decompose v" << version << "\n";
937 std::clog << "\n";
938 std::clog << "options: input VCF file " << input_vcf_file << "\n";
939 std::clog << " [s] smart decomposition " << (smart ? "true" : "false") << " (experimental)\n";
940 std::clog << " [o] output VCF file " << output_vcf_file << "\n";
941 print_int_op(" [i] intervals ", intervals);
942 std::clog << "\n";
943 }
944
print_stats()945 void print_stats()
946 {
947 std::clog << "\n";
948 std::clog << "stats: no. variants : " << no_variants << "\n";
949 std::clog << " no. biallelic variants : " << no_biallelic << "\n";
950 std::clog << " no. multiallelic variants : " << no_multiallelic << "\n";
951 std::clog << "\n";
952 std::clog << " no. additional biallelics : " << no_additional_biallelic << "\n";
953 std::clog << " total no. of biallelics : " << no_additional_biallelic + no_variants << "\n";
954 std::clog << "\n";
955 };
956
~Igor()957 ~Igor() {};
958
959 private:
960 };
961
962 }
963
decompose2(int argc,char ** argv)964 void decompose2(int argc, char ** argv)
965 {
966 Igor igor(argc, argv);
967 igor.print_options();
968 igor.initialize();
969 igor.decompose2();
970 igor.print_stats();
971 };
972