1 /* The MIT License
2
3 Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "hts_utils.h"
25 #include "Rmath/Rmath.h"
26
27 /********
28 *General
29 ********/
30
31 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
32 typedef khash_t(vdict) vdict_t;
33
34 /**********
35 *FAI UTILS
36 **********/
37
38 /**
39 * An alternate sequence fetcher for upper case sequence.
40 */
faidx_fetch_uc_seq(const faidx_t * fai,const char * c_name,int p_beg_i,int p_end_i,int * len)41 char *faidx_fetch_uc_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
42 {
43 int l;
44 char c;
45 khiter_t iter;
46 faidx1_t val;
47 char *seq=NULL;
48
49 // Adjust position
50 iter = kh_get(s, fai->hash, c_name);
51 if(iter == kh_end(fai->hash)) return 0;
52 val = kh_value(fai->hash, iter);
53 if(p_end_i < p_beg_i) p_beg_i = p_end_i;
54 if(p_beg_i < 0) p_beg_i = 0;
55 else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
56 if(p_end_i < 0) p_end_i = 0;
57 else if(val.len <= p_end_i) p_end_i = val.len - 1;
58
59 // Now retrieve the sequence
60 int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
61 if ( ret<0 )
62 {
63 *len = -1;
64 fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
65 return NULL;
66 }
67 l = 0;
68 seq = (char*)malloc(p_end_i - p_beg_i + 2);
69 while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1)
70 if (isgraph(c)) seq[l++] = toupper(c);
71 seq[l] = '\0';
72 *len = l;
73 return seq;
74 }
75
76 /**********
77 *HTS UTILS
78 **********/
79
80 /**
81 * Checks file extension for use in writing files.
82 */
str_ends_with(std::string const & value,std::string const & ending)83 bool str_ends_with(std::string const & value, std::string const & ending)
84 {
85 if (ending.size() > value.size()) return false;
86 return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
87 }
88
89 /**
90 * Checks file extension based on filename.
91 */
hts_filename_type(std::string const & value)92 int32_t hts_filename_type(std::string const& value)
93 {
94 if (str_ends_with(value,".vcf") || str_ends_with(value,".vcf.gz"))
95 {
96 return vcf;
97 }
98 else if (str_ends_with(value,".bcf"))
99 {
100 return bcf;
101 }
102 else if (str_ends_with(value,".sam"))
103 {
104 return sam;
105 }
106 else if (str_ends_with(value,".bam"))
107 {
108 return bam;
109 }
110 else if (str_ends_with(value,".cram"))
111 {
112 return cram;
113 }
114 else
115 {
116 return unknown_format;
117 }
118 }
119
120 /**************
121 *BAM HDR UTILS
122 **************/
123
124 /**
125 * Copies contigs found in bam header to bcf header.
126 */
bam_hdr_transfer_contigs_to_bcf_hdr(const bam_hdr_t * sh,bcf_hdr_t * vh)127 void bam_hdr_transfer_contigs_to_bcf_hdr(const bam_hdr_t *sh, bcf_hdr_t *vh)
128 {
129 kstring_t s = {0,0,0};
130 for (size_t i=0; i<bam_hdr_get_n_targets(sh); ++i)
131 {
132 s.l = 0;
133 ksprintf(&s, "##contig=<ID=%s,length=%d>", bam_hdr_get_target_name(sh)[i], bam_hdr_get_target_len(sh)[i]);
134 bcf_hdr_append(vh, s.s);
135 }
136 if (s.m) free(s.s);
137 }
138
139 /**
140 * Gets sample names from bam header.
141 */
bam_hdr_get_sample_name(bam_hdr_t * hdr)142 std::string bam_hdr_get_sample_name(bam_hdr_t* hdr) {
143 if ( !hdr )
144 error("Failed to read the BAM header");
145
146 const char *p = hdr->text;
147 const char *q, *r;
148 int n = 0;
149 std::string sm;
150 while( ( q = strstr(p, "@RG" ) ) != 0 ) {
151 p = q + 3;
152 r = q = 0;
153 if ( ( q = strstr(p, "\tID:" ) ) != 0 ) q += 4;
154 if ( ( r = strstr(p, "\tSM:" ) ) != 0 ) r += 4;
155 if ( r && q ) {
156 char *u, *v;
157 for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
158 for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
159 *u = *v = '\0';
160 if ( sm.empty() )
161 sm = r;
162 else if ( sm.compare(r) != 0 )
163 error("Multiple sample IDs are included in one BAM file - %s, %s", sm.c_str(), r);
164 }
165 else break;
166 p = q > r ? q : r;
167 ++n;
168 }
169 if ( sm.empty() )
170 error("Sample ID information cannot be found");
171 return sm;
172 }
173
174
175 /**********
176 *BAM UTILS
177 **********/
178
179 /**
180 * Gets the end position of the last mapped base in the read.
181 */
bam_get_end_pos1(bam1_t * s)182 int32_t bam_get_end_pos1(bam1_t *s)
183 {
184 int32_t end_pos1 = bam_get_pos1(s);
185 int32_t n_cigar_op = bam_get_n_cigar_op(s);
186 if (n_cigar_op)
187 {
188 uint32_t *cigar = bam_get_cigar(s);
189 for (int32_t i = 0; i < n_cigar_op; ++i)
190 {
191 int32_t opchr = bam_cigar_opchr(cigar[i]);
192
193 if (opchr=='M' || opchr=='D' || opchr=='N' || opchr=='=' || opchr=='X')
194 {
195 end_pos1 += bam_cigar_oplen(cigar[i]);
196 }
197 }
198 }
199
200 return end_pos1-1;
201 }
202
203 /**
204 * Gets the read sequence from a bam record
205 */
bam_get_seq_string(bam1_t * s,kstring_t * seq)206 void bam_get_seq_string(bam1_t *s, kstring_t *seq)
207 {
208 seq->l=0;
209 uint8_t* sq = bam_get_seq(s);
210 for (uint16_t i = 0; i < bam_get_l_qseq(s); ++i)
211 {
212 kputc("=ACMGRSVTWYHKDBN"[bam_seqi(sq, i)], seq);
213 }
214 };
215
216 /**
217 * Gets the base qualities from a bam record, when N is observed, a placeholder value of 0(!, 33 adjusted) is entered
218 */
bam_get_qual_string(bam1_t * s,kstring_t * qual)219 void bam_get_qual_string(bam1_t *s, kstring_t *qual)
220 {
221 qual->l=0;
222 uint32_t offset = 0;
223 uint8_t* q = bam_get_qual(s);
224 for (int32_t i = 0; i < bam_get_l_qseq(s); ++i)
225 {
226 kputc(q[i-offset] + 33, qual);
227 }
228 };
229
230 /**
231 * Gets the cigar from a BAM record
232 */
bam_get_cigar_string(bam1_t * s,kstring_t * cigar_string)233 void bam_get_cigar_string(bam1_t *s, kstring_t *cigar_string)
234 {
235 cigar_string->l=0;
236 int32_t n_cigar_op = bam_get_n_cigar_op(s);
237 if (n_cigar_op)
238 {
239 uint32_t *cigar = bam_get_cigar(s);
240 for (int32_t i = 0; i < n_cigar_op; ++i)
241 {
242 kputw(bam_cigar_oplen(cigar[i]), cigar_string);
243 kputc(bam_cigar_opchr(cigar[i]), cigar_string);
244 }
245 }
246 }
247
248 /**
249 * Gets the cigar string from a bam record
250 */
bam_get_cigar_expanded_string(bam1_t * s,kstring_t * cigar_expanded_string)251 void bam_get_cigar_expanded_string(bam1_t *s, kstring_t *cigar_expanded_string)
252 {
253 kstring_t cigar_string = {0,0,0};
254 bam_get_cigar_string(s, &cigar_string);
255
256 cigar_expanded_string->l = 0;
257 int32_t lastIndex = cigar_string.l;
258 int32_t i = 0;
259 kstring_t token = {0,0,0};
260
261 if (lastIndex<0)
262 {
263 return;
264 }
265 char c;
266 bool seenM = false;
267
268 while (i<=lastIndex)
269 {
270 c = cigar_string.s[i];
271
272 //captures the numeric count
273 if (c<'A')
274 {
275 kputc(c, &token);
276 }
277
278 if (c>'A' || i==lastIndex)
279 {
280 //it is possible for I's to be observed before the first M's in the cigar string
281 //in this case, we treat them as 'S'
282 if (!seenM)
283 {
284 if (c=='I')
285 {
286 c = 'S';
287 }
288 else if (c=='M')
289 {
290 seenM = true;
291 }
292 }
293
294 int32_t count = atoi(token.s);
295 for (uint32_t j=0; j<count; ++j)
296 kputc(c, cigar_expanded_string);
297 token.l = 0;;
298 }
299
300 ++i;
301 }
302
303 if (cigar_string.m) free(cigar_string.s);
304 if (token.m) free(token.s);
305 }
306
307 /**
308 * Gets the base in the read that is mapped to a genomic position.
309 * Extracts the read sequence and qualities too.
310 */
bam_get_base_and_qual_and_read_and_qual(bam1_t * srec,uint32_t pos,char & base,char & qual,int32_t & rpos,kstring_t * readseq,kstring_t * readqual)311 void bam_get_base_and_qual_and_read_and_qual(bam1_t *srec, uint32_t pos, char& base, char& qual, int32_t& rpos, kstring_t* readseq, kstring_t* readqual)
312 {
313 bam1_core_t *c = &srec->core;
314 int32_t rlen = c->l_qseq;
315 uint32_t cpos = c->pos; //reference coordinates of the first mapped base
316 rpos = 0; //read coordinates
317
318 kstring_t str;
319 str.l = str.m = 0, str.s = 0;
320 base = 'N';
321 qual = 0;
322
323 if (c->n_cigar)
324 {
325 uint32_t *cigar = bam_get_cigar(srec);
326 for (uint32_t i = 0; i < c->n_cigar; ++i)
327 {
328 char op = bam_cigar_opchr(cigar[i]);
329 str.l = 0;
330 kputw(bam_cigar_oplen(cigar[i]), &str);
331 char* stop;
332 uint32_t len = strtol(str.s, &stop, 10);
333 assert(stop);
334
335 if (op=='M')
336 {
337 if (pos>=cpos && pos<=cpos+len-1)
338 {
339 rpos += pos-cpos;
340 break;
341 }
342
343 cpos += len;
344 rpos += len;
345 }
346 else if (op=='D')
347 {
348 if (pos>=cpos && pos<=cpos+len-1)
349 {
350 rpos = -1;
351 break;
352 }
353
354 cpos += len;
355 }
356 else if (op=='S' || op=='I')
357 {
358 rpos += len;
359 }
360 }
361
362 //std::cout << "bpos " << bpos << "\n";
363 if (rpos>=0 && rpos<=rlen)
364 {
365 //sequence
366 bam_get_seq_string(srec, readseq);
367 base = readseq->s[rpos];
368
369 //qual
370 bam_get_qual_string(srec, readqual);
371 qual = readqual->s[rpos];
372 }
373 else
374 {
375 rpos = BAM_READ_INDEX_NA;
376 }
377 }
378 // std::cout << "b: " << base << "\n";
379 // std::cout << "q: " << s[bpos-1] << " " << q << "\n";
380 // for (uint32_t i = 0; i < c->l_qseq; ++i) std::cerr << ((char)(s[i] + 33));
381 };
382
383 /**
384 * Prints a bam.
385 */
bam_print(bam_hdr_t * h,bam1_t * s)386 void bam_print(bam_hdr_t *h, bam1_t *s)
387 {
388 const char* chrom = bam_get_chrom(h, s);
389 uint32_t pos1 = bam_get_pos1(s);
390 kstring_t seq = {0,0,0};
391 bam_get_seq_string(s, &seq);
392 uint32_t len = bam_get_l_qseq(s);
393 kstring_t qual = {0,0,0};
394 bam_get_qual_string(s, &qual);
395 kstring_t cigar_string = {0,0,0};
396 bam_get_cigar_string(s, &cigar_string);
397 kstring_t cigar_expanded_string = {0,0,0};
398 bam_get_cigar_expanded_string(s, &cigar_expanded_string);
399 uint16_t flag = bam_get_flag(s);
400 uint32_t mapq = bam_get_mapq(s);
401
402 std::cerr << "##################" << "\n";
403 std::cerr << "chrom-pos: " << chrom << "-" << pos1 << "\n";
404 std::cerr << "read : " << seq.s << "\n";
405 std::cerr << "qual : " << qual.s << "\n";
406 std::cerr << "cigar_str: " << cigar_string.s << "\n";
407 std::cerr << "cigar : " << cigar_expanded_string.s << "\n";
408 std::cerr << "len : " << len << "\n";
409 std::cerr << "mapq : " << mapq << "\n";
410 std::cerr << "mpos1 : " << bam_get_mpos1(s) << "\n";
411 std::cerr << "mtid : " << bam_get_mtid(s) << "\n";
412
413 if (seq.m) free(seq.s);
414 if (qual.m) free(qual.s);
415 if (cigar_string.m) free(cigar_string.s);
416 if (cigar_expanded_string.m) free(cigar_expanded_string.s);
417 }
418
419 /**************
420 *BCF HDR UTILS
421 **************/
422
423 /**
424 * Prints header.
425 */
bcf_hdr_print(bcf_hdr_t * h)426 void bcf_hdr_print(bcf_hdr_t *h)
427 {
428 if ( h->dirty ) bcf_hdr_sync(h);
429 kstring_t htxt = {0,0,0};
430 bcf_hdr_format(h, 0, &htxt);
431 std::cerr << htxt.s;
432 if (htxt.m) free(htxt.s);
433 }
434
435 /**
436 * Copies contigs found in bcf header to another bcf header.
437 */
bcf_hdr_transfer_contigs(const bcf_hdr_t * hsrc,bcf_hdr_t * hdest)438 void bcf_hdr_transfer_contigs(const bcf_hdr_t *hsrc, bcf_hdr_t *hdest)
439 {
440 vdict_t *d = (vdict_t*)hsrc->dict[BCF_DT_CTG];
441 int tid, m = kh_size(d);
442 const char **names = (const char**) calloc(m,sizeof(const char*));
443 int len[m];
444 khint_t k;
445
446 for (k=kh_begin(d); k<kh_end(d); k++)
447 {
448 if ( !kh_exist(d,k) ) continue;
449 tid = kh_val(d,k).id;
450 len[tid] = bcf_hrec_find_key(kh_val(d, k).hrec[0],"length");
451 int j;
452 if ( sscanf(kh_val(d, k).hrec[0]->vals[len[tid]],"%d",&j) )
453 len[tid] = j;
454 names[tid] = kh_key(d,k);
455 }
456
457 kstring_t s = {0,0,0};
458 for (tid=0; tid<m; tid++)
459 {
460 s.l = 0;
461 ksprintf(&s, "##contig=<ID=%s,length=%d>", names[tid], len[tid]);
462 bcf_hdr_append(hdest, s.s);
463 }
464 if (s.m) free(s.s);
465 }
466
467 /**
468 * Checks if a particular header type exists
469 * @hdr - header
470 * @type - BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT, BCF_HL_CTG
471 * @key - the key name
472 */
bcf_hdr_exists(bcf_hdr_t * hdr,int32_t type,const char * key)473 bool bcf_hdr_exists(bcf_hdr_t *hdr, int32_t type, const char *key)
474 {
475 if (type>=BCF_HL_FLT && type<= BCF_HL_CTG) //FLT, INFO, FMT, CTG
476 {
477 for (uint32_t i=0; i<hdr->nhrec; ++i)
478 {
479 if (hdr->hrec[i]->type==type)
480 {
481 int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
482 if (j>=0)
483 {
484 char* val = hdr->hrec[i]->vals[j];
485
486 if (strcmp(key, val)==0)
487 {
488 return true;
489 }
490 }
491 }
492 }
493 }
494
495 return false;
496 }
497
498 /**
499 * Extracts sequence length by rid.
500 */
bcf_hdr_seqlen(const bcf_hdr_t * hdr,int32_t * nseq)501 int32_t* bcf_hdr_seqlen(const bcf_hdr_t *hdr, int32_t *nseq)
502 {
503 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
504 int tid, m = kh_size(d);
505 int32_t *len = (int32_t*) malloc(m*sizeof(int32_t));
506 khint_t k;
507
508 for (k=kh_begin(d); k<kh_end(d); k++)
509 {
510 if ( !kh_exist(d,k) ) continue;
511 tid = kh_val(d,k).id;
512 len[tid] = bcf_hrec_find_key(kh_val(d, k).hrec[0],"length");
513 int j;
514 if ( sscanf(kh_val(d, k).hrec[0]->vals[len[tid]],"%d",&j) )
515 len[tid] = j;
516 }
517
518 return len;
519 }
520
521 /**
522 * Translates BCF_VL types to string.
523 */
bcf_hdr_vl2str(int32_t id)524 std::string bcf_hdr_vl2str(int32_t id)
525 {
526 std::cerr << "vl id: " << id << "\n";
527
528 if (id==BCF_VL_FIXED)
529 {
530 return "BCF_VL_FIXED";
531 }
532 else if (id==BCF_VL_VAR)
533 {
534 return "BCF_VL_VAR";
535 }
536 else if (id==BCF_VL_A)
537 {
538 return "BCF_VL_A";
539 }
540 else if (id==BCF_VL_G)
541 {
542 return "BCF_VL_G";
543 }
544 else if (id==BCF_VL_R)
545 {
546 return "BCF_VL_R";
547 }
548 else
549 {
550 return "UNRECOGNIZED BCF_VL_* TYPE";
551 }
552 }
553
554 /**
555 * Translates BCF_HT types to string.
556 */
bcf_hdr_ht2str(int32_t id)557 std::string bcf_hdr_ht2str(int32_t id)
558 {
559 if (id==BCF_HT_FLAG)
560 {
561 return "BCF_HT_FLAG";
562 }
563 else if (id==BCF_HT_INT)
564 {
565 return "BCF_HT_INT";
566 }
567 else if (id==BCF_HT_REAL)
568 {
569 return "BCF_HT_REAL";
570 }
571 else if (id==BCF_HT_STR)
572 {
573 return "BCF_HT_STR";
574 }
575 else
576 {
577 return "UNRECOGNIZED BCF_HT_* TYPE";
578 }
579 }
580
581
582 /**********
583 *BCF UTILS
584 **********/
585
586 /**
587 * Copies an info fields from one record to another
588 * @hsrc - source header
589 * @vsrc - source bcf1_t
590 * @hdest - destination header
591 * @vdest - destination bcf1_t
592 * @key - the key name
593 * @type - BCF_HT_FLAG, BCF_HT_INT, BCF_HT_REAL, BCF_HT_STR
594 */
bcf_copy_info_field(bcf_hdr_t * hsrc,bcf1_t * vsrc,bcf_hdr_t * hdest,bcf1_t * vdest,const char * key,int32_t type)595 void bcf_copy_info_field(bcf_hdr_t *hsrc, bcf1_t* vsrc, bcf_hdr_t *hdest, bcf1_t* vdest, const char* key, int32_t type)
596 {
597 if (type==BCF_HT_FLAG)
598 {
599 if (bcf_get_info_flag(hsrc, vsrc, key, NULL, NULL)>0)
600 {
601 bcf_update_info_flag(hdest, vdest, key, NULL, 1);
602 }
603 }
604 else if (type==BCF_HT_INT)
605 {
606 int32_t* str = NULL;
607 int32_t n = 0;
608 if (bcf_get_info_int32(hsrc, vsrc, key, &str, &n)>0)
609 {
610 bcf_update_info_int32(hdest, vdest, key, str, n);
611 free(str);
612 }
613 }
614 else if (type==BCF_HT_REAL)
615 {
616 float* str = NULL;
617 int32_t n = 0;
618 if (bcf_get_info_float(hsrc, vsrc, key, &str, &n)>0)
619 {
620 bcf_update_info_float(hdest, vdest, key, str, n);
621 free(str);
622 }
623 }
624 else if (type==BCF_HT_STR)
625 {
626 char** str = NULL;
627 int32_t n = 0;
628 if (bcf_get_info_string(hsrc, vsrc, key, &str, &n)>0)
629 {
630 bcf_update_info_string(hdest, vdest, key, str);
631 free(str);
632 }
633 }
634 }
635
636 /**
637 * Prints a VCF record to STDERR.
638 */
bcf_print(bcf_hdr_t * h,bcf1_t * v)639 void bcf_print(bcf_hdr_t *h, bcf1_t *v)
640 {
641 kstring_t s = {0,0,0,};
642 vcf_format(h, v, &s);
643 std::cerr << s.s;
644 if (s.m) free(s.s);
645 };
646
647 /**
648 * Prints a VCF record in compact string representation to STDERR.
649 */
bcf_print_liten(bcf_hdr_t * h,bcf1_t * v)650 void bcf_print_liten(bcf_hdr_t *h, bcf1_t *v)
651 {
652 kstring_t s = {0,0,0,};
653 bcf_variant2string(h, v, &s);
654 std::cerr << s.s << "\n";
655 if (s.m) free(s.s);
656 };
657
658 /**
659 * Prints a VCF record in compact string representation to STDERR.
660 */
bcf_print_lite(bcf_hdr_t * h,bcf1_t * v)661 void bcf_print_lite(bcf_hdr_t *h, bcf1_t *v)
662 {
663 kstring_t s = {0,0,0,};
664 bcf_variant2string(h, v, &s);
665 std::cerr << s.s;
666 if (s.m) free(s.s);
667 };
668
669 /**
670 * Prints a VCF record in compact string representation to STDERR with alleles sorted.
671 */
bcf_print_lite_sorted(bcf_hdr_t * h,bcf1_t * v)672 void bcf_print_lite_sorted(bcf_hdr_t *h, bcf1_t *v)
673 {
674 kstring_t s = {0,0,0,};
675 bcf_variant2string_sorted(h, v, &s);
676 std::cerr << s.s;
677 if (s.m) free(s.s);
678 };
679
680 /**
681 * Reads header of a VCF file and returns the bcf header object.
682 * This wraps around vcf_hdr_read from the original htslib to
683 * allow for an alternative header file to be read in.
684 *
685 * this searches for the alternative header saved as <filename>.hdr
686 * If the VCF files is BCF, any alternative header is ignored.
687 */
bcf_alt_hdr_read(htsFile * fp)688 bcf_hdr_t *bcf_alt_hdr_read(htsFile *fp)
689 {
690 bcf_hdr_t *h = NULL;
691
692 //check for existence of alternative header
693 kstring_t alt_hdr_fn = {0, 0, 0};
694 kputs(fp->fn, &alt_hdr_fn);
695 kputs(".hdr", &alt_hdr_fn);
696
697 FILE *file = NULL;
698 struct stat st;
699 if (stat(alt_hdr_fn.s, &st)==0 && st.st_size)
700 {
701 file = fopen(alt_hdr_fn.s, "r");
702 }
703
704 if (fp->format.format ==bcf || !file)
705 {
706 h = bcf_hdr_read(fp);
707 }
708 else
709 {
710 fprintf(stderr, "[I:%s:%d %s] read alternative header for %s\n", __FILE__, __LINE__, __FUNCTION__, fp->fn);
711 fclose(file);
712 htsFile *alt_hdr = hts_open(alt_hdr_fn.s, "r");
713 h = bcf_hdr_read(alt_hdr);
714 hts_close(alt_hdr);
715
716 //helps move the pointer to the right place
717 bcf_hdr_t *temp_h = bcf_hdr_read(fp);
718 bcf_hdr_destroy(temp_h);
719 }
720
721 if (alt_hdr_fn.m) free(alt_hdr_fn.s);
722 return h;
723 }
724
725 /**
726 * Get number of samples in bcf header
727 */
bcf_hdr_get_n_sample(bcf_hdr_t * h)728 int32_t bcf_hdr_get_n_sample(bcf_hdr_t *h)
729 {
730 vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
731 return kh_size(d);
732 }
733
734 /**
735 * Help function for adding a header with a backup tag name.
736 * If the <tag> is already present, a new tag is attempted
737 * in the format <tag>_1 to <tag>_9. If <tag>_9 failed,
738 * the function will not add any new tag and will return
739 * an empty string.
740 *
741 * Returns the tag that was inserted or updated.
742 */
bcf_hdr_append_info_with_backup_naming(bcf_hdr_t * h,std::string tag,std::string number,std::string type,std::string description,bool rename)743 std::string bcf_hdr_append_info_with_backup_naming(bcf_hdr_t *h, std::string tag, std::string number, std::string type, std::string description, bool rename)
744 {
745 if (bcf_hdr_id2int(h, BCF_DT_ID, tag.c_str())==-1)
746 {
747 std::string meta_hdr = "##INFO=<ID=" + tag +
748 ",Number=" + number +
749 ",Type=" + type +
750 ",Description=\"" + description + "\">";
751
752 bcf_hdr_append(h, meta_hdr.c_str());
753 }
754 else
755 {
756 if (rename)
757 {
758 std::string new_tag = "";
759 for (uint32_t i=0; i<=9; ++i)
760 {
761 char c = 49+i;
762 new_tag = tag + "_" + c;
763 if (bcf_hdr_id2int(h, BCF_DT_ID, new_tag.c_str())==-1)
764 {
765 std::string meta_hdr = "##INFO=<ID=" + new_tag +
766 ",Number=" + number +
767 ",Type=" + type +
768 ",Description=\"" + description + "\">";
769
770 bcf_hdr_append(h, meta_hdr.c_str());
771
772 break;
773 }
774
775 new_tag = "";
776 }
777
778 return new_tag;
779 }
780 else
781 {
782 return tag;
783 }
784 }
785
786 return tag;
787 }
788
789 /**
790 * bcf_get_format_*() - same as bcf_get_info*() above
791 *
792 * The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
793 * see the description of bcf_update_format_string() and bcf_update_format_char() above.
794 * Unlike other bcf_get_format__*() functions, bcf_get_format_string() allocates two arrays:
795 * a single block of \0-terminated strings collapsed into a single array and an array of pointers
796 * to these strings. Both arrays must be cleaned by the user.
797 *
798 * Returns negative value on error or the number of written values on success.
799 *
800 * Example:
801 * int ndst = 0; char **dst = NULL;
802 * if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
803 * for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i]);
804 * free(dst[0]); free(dst);
805 *
806 * Example:
807 * int ngt, *gt_arr = NULL, ngt_arr = 0;
808 * ngt = bcf_get_genotypes(hdr, line, >_arr, &ngt_arr);
809 *
810 * todo: modify to allow direct reading, instead of copying to char*
811 */
bcf_get_format_string_ro(const bcf_hdr_t * hdr,bcf1_t * line,const char * tag,char *** dst,int * ndst)812 int32_t bcf_get_format_string_ro(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
813 {
814 int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
815 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
816 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type
817
818 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
819
820 for (i=0; i<line->n_fmt; i++)
821 if ( line->d.fmt[i].id==tag_id ) break;
822 if ( i==line->n_fmt ) return -3; // the tag is not present in this record
823 bcf_fmt_t *fmt = &line->d.fmt[i];
824
825 int nsmpl = bcf_hdr_nsamples(hdr);
826 if ( !*dst )
827 {
828 *dst = (char**) malloc(sizeof(char*)*nsmpl);
829 if ( !*dst ) return -4; // could not alloc
830 (*dst)[0] = NULL;
831 }
832 int n = (fmt->n+1)*nsmpl;
833 if ( *ndst < n )
834 {
835 (*dst)[0] = (char*) realloc((*dst)[0], n);
836 if ( !(*dst)[0] ) return -4; // could not alloc
837 *ndst = n;
838 }
839 for (i=0; i<nsmpl; i++)
840 {
841 uint8_t *src = fmt->p + i*fmt->n;
842 uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
843 memcpy(tmp,src,fmt->n);
844 tmp[fmt->n] = 0;
845 (*dst)[i] = (char*) tmp;
846 }
847 return n;
848 }
849
850 /**********
851 *BCF UTILS
852 **********/
853
854 /**
855 * Gets number of genotypes from number of alleles and ploidy.
856 */
bcf_ap2g(uint32_t no_allele,uint32_t no_ploidy)857 uint32_t bcf_ap2g(uint32_t no_allele, uint32_t no_ploidy)
858 {
859 if (!no_ploidy || !no_allele)
860 {
861 return 0;
862 }
863 else if (no_ploidy==1)
864 {
865 return no_allele;
866 }
867 else if (no_ploidy==2)
868 {
869 return (((no_allele+1)*(no_allele))>>1); ;
870 }
871 else
872 {
873 return choose(no_ploidy+no_allele-1, no_allele-1);
874 }
875 }
876
877 /**
878 * Gets alleles from number of ploidy and genotype index.
879 */
bcf_pg2a(uint32_t no_ploidy,uint32_t genotype_index,std::vector<int32_t> & alleles)880 void bcf_pg2a(uint32_t no_ploidy, uint32_t genotype_index, std::vector<int32_t>& alleles)
881 {
882
883 }
884
885 /**
886 * Gets number of ploidy from number of alleles and genotypes.
887 *
888 * Returns 0 if number of alleles and genotypes are not consistent.
889 */
bcf_ag2p(uint32_t no_alleles,uint32_t no_genotypes)890 uint32_t bcf_ag2p(uint32_t no_alleles, uint32_t no_genotypes)
891 {
892 //hard coded simple cases
893 if (no_alleles==2 && no_genotypes==3)
894 {
895 return 2;
896 }
897 else if (no_alleles==3 && no_genotypes==6)
898 {
899 return 2;
900 }
901 else if (no_alleles==4 && no_genotypes==10)
902 {
903 return 2;
904 }
905 else if (no_alleles == no_genotypes)
906 {
907 return 1;
908 }
909
910 //this works in general for all number of alleles and number of genotypes
911 uint32_t no_ploidy = 1;
912 while (true)
913 {
914 uint32_t k = choose(no_ploidy+no_alleles-1, no_alleles-1);
915
916 if (k==no_genotypes)
917 {
918 return no_ploidy;
919 }
920 //number of alleles and genotypes are not consistent for any ploidy
921 else if (k>no_genotypes)
922 {
923 return 0;
924 }
925
926 ++no_ploidy;
927 }
928 }
929
930 /**
931 * Gets genotype from genotype index and ploidy.
932 *
933 * The genotype index is computed by a summation of a series which is
934 * monotonically decreasing. This allows you to compute the inverse function
935 * from index to the ordered genotypes by using a "water rapids algorithm" with
936 * decreasing height of each mini water fall.
937 */
bcf_ip2g(int32_t genotype_index,uint32_t no_ploidy)938 std::vector<int32_t> bcf_ip2g(int32_t genotype_index, uint32_t no_ploidy)
939 {
940 std::vector<int32_t> genotype(no_ploidy, 0);
941
942 int32_t pth = no_ploidy;
943 int32_t max_allele_index = genotype_index;
944 int32_t leftover_genotype_index = genotype_index;
945
946 while (pth>0)
947 {
948 for (int32_t allele_index=0; allele_index <= max_allele_index; ++allele_index)
949 {
950 int32_t i = choose(pth+allele_index-1, pth);
951
952 if (i>=leftover_genotype_index || allele_index==max_allele_index)
953 {
954 if (i>leftover_genotype_index) --allele_index;
955 leftover_genotype_index -= choose(pth+allele_index-1, pth);
956 --pth;
957 max_allele_index = allele_index;
958 genotype[pth] = allele_index;
959 break;
960 }
961 }
962 }
963
964 return genotype;
965 }
966
967 /**
968 * Gets index of a genotype of n ploidy.
969 */
bcf_g2i(int32_t * g,uint32_t n)970 uint32_t bcf_g2i(int32_t* g, uint32_t n)
971 {
972 if (n==1)
973 {
974 return g[0];
975 }
976 if (n==2)
977 {
978 return g[0] + (((g[1]+1)*(g[1]))>>1);
979 }
980 else
981 {
982 uint32_t index = 0;
983 for (uint32_t i=0; i<n; ++i)
984 {
985 index += bcf_ap2g(g[i], i+1);
986 }
987 return index;
988 }
989 }
990
991 /**
992 * Gets index of a genotype of n ploidy.
993 */
bcf_g2i(std::vector<int32_t> & g)994 uint32_t bcf_g2i(std::vector<int32_t>& g)
995 {
996 int32_t n = g.size();
997
998 if (n==1)
999 {
1000 return g[0];
1001 }
1002 if (n==2)
1003 {
1004 return g[0] + (((g[1]+1)*(g[1]))>>1);
1005 }
1006 else
1007 {
1008 uint32_t index = 0;
1009 for (uint32_t i=0; i<n; ++i)
1010 {
1011 index += bcf_ap2g(g[i], i+1);
1012 }
1013 return index;
1014 }
1015 }
1016
1017 /**
1018 * Gets a string representation of a variant.
1019 */
bcf_variant2string(bcf_hdr_t * h,bcf1_t * v)1020 std::string bcf_variant2string(bcf_hdr_t *h, bcf1_t *v)
1021 {
1022 kstring_t var = {0,0,0};
1023 bcf_variant2string(h, v, &var);
1024 std::string var2(var.s);
1025 if (var.m) free(var.s);
1026 return var2;
1027 }
1028
1029 /**
1030 * Gets a string representation of a variant.
1031 */
bcf_variant2string(bcf_hdr_t * h,bcf1_t * v,kstring_t * var)1032 void bcf_variant2string(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
1033 {
1034 bcf_unpack(v, BCF_UN_STR);
1035 var->l = 0;
1036 kputs(bcf_get_chrom(h, v), var);
1037 kputc(':', var);
1038 kputw(bcf_get_pos1(v), var);
1039 kputc(':', var);
1040 for (size_t i=0; i<bcf_get_n_allele(v); ++i)
1041 {
1042 if (i) kputc('/', var);
1043 kputs(bcf_get_alt(v, i), var);
1044 }
1045 }
1046
1047 /**
1048 * strcmp wrapper for qsort.
1049 */
cmpstr(void const * a,void const * b)1050 int32_t cmpstr(void const *a, void const *b)
1051 {
1052 char const *aa = *(char const **)a;
1053 char const *bb = *(char const **)b;
1054
1055 return strcmp(aa, bb);
1056 }
1057
1058 /**
1059 * Returns true if a is before b, false otherwise.
1060 */
bcf_is_in_order(bcf1_t * a,bcf1_t * b)1061 bool bcf_is_in_order(bcf1_t *a, bcf1_t *b)
1062 {
1063 if (bcf_get_rid(a)==bcf_get_rid(b))
1064 {
1065 return bcf_get_pos0(a)<=bcf_get_pos0(b);
1066 }
1067
1068 return bcf_get_rid(a)<bcf_get_rid(b);
1069 }
1070
1071 /**
1072 * Returns a copy v that only has the chr:pos1:ref:alt information.
1073 */
bcf_copy_variant(bcf_hdr_t * h,bcf1_t * v)1074 bcf1_t* bcf_copy_variant(bcf_hdr_t *h, bcf1_t *v)
1075 {
1076 bcf1_t* nv = bcf_init1();
1077 bcf_clear(nv);
1078 bcf_set_n_sample(nv, bcf_get_n_sample(v));
1079
1080 bcf_set_rid(nv, bcf_get_rid(v));
1081 bcf_set_pos1(nv, bcf_get_pos1(v));
1082 kstring_t s = {0,0,0};
1083 bcf_alleles2string(h, v, &s);
1084 bcf_update_alleles_str(h, nv, s.s);
1085 if (s.m) free(s.s);
1086
1087 return nv;
1088 }
1089
1090 /**
1091 * Gets a sorted string representation of a variant.
1092 */
bcf_variant2string_sorted(bcf_hdr_t * h,bcf1_t * v,kstring_t * var)1093 void bcf_variant2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
1094 {
1095 bcf_unpack(v, BCF_UN_STR);
1096 var->l = 0;
1097 kputs(bcf_get_chrom(h, v), var);
1098 kputc(':', var);
1099 kputw(bcf_get_pos1(v), var);
1100 kputc(':', var);
1101
1102 if (v->n_allele==2)
1103 {
1104 kputs(bcf_get_alt(v, 0), var);
1105 kputc(',', var);
1106 kputs(bcf_get_alt(v, 1), var);
1107 }
1108 else
1109 {
1110 char** allele = bcf_get_allele(v);
1111 char** temp = (char**) malloc((bcf_get_n_allele(v)-1)*sizeof(char*));
1112 for (size_t i=1; i<v->n_allele; ++i)
1113 {
1114 temp[i] = allele[i];
1115 }
1116 std::qsort(temp, bcf_get_n_allele(v), sizeof(char*), cmpstr);
1117 kputs(bcf_get_alt(v, 0), var);
1118 for (size_t i=0; i<v->n_allele-1; ++i)
1119 {
1120 kputc('/', var);
1121 kputs(temp[i], var);
1122 }
1123 free(temp);
1124 }
1125 }
1126
1127 /**
1128 * Gets a string representation of the alleles of a variant.
1129 */
bcf_alleles2string(bcf_hdr_t * h,bcf1_t * v,kstring_t * var)1130 void bcf_alleles2string(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
1131 {
1132 bcf_unpack(v, BCF_UN_STR);
1133 var->l = 0;
1134
1135 if (v->n_allele==2)
1136 {
1137 kputs(bcf_get_alt(v, 0), var);
1138 kputc(',', var);
1139 kputs(bcf_get_alt(v, 1), var);
1140 }
1141 else
1142 {
1143 char** allele = bcf_get_allele(v);
1144 for (int32_t i=0; i<v->n_allele; ++i)
1145 {
1146 if (i) kputc(',', var);
1147 kputs(allele[i], var);
1148 }
1149 }
1150 }
1151
1152 /**
1153 * Gets a sorted string representation of the alleles of a variant.
1154 */
bcf_alleles2string_sorted(bcf_hdr_t * h,bcf1_t * v,kstring_t * var)1155 void bcf_alleles2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
1156 {
1157 bcf_unpack(v, BCF_UN_STR);
1158 var->l = 0;
1159
1160 if (v->n_allele==2)
1161 {
1162 kputs(bcf_get_alt(v, 0), var);
1163 kputc(',', var);
1164 kputs(bcf_get_alt(v, 1), var);
1165 }
1166 else
1167 {
1168 char** allele = bcf_get_allele(v);
1169 char** temp = (char**) malloc((bcf_get_n_allele(v)-1)*sizeof(char*));
1170 for (int32_t i=1; i<v->n_allele; ++i)
1171 {
1172 temp[i-1] = allele[i];
1173 }
1174
1175 std::qsort(temp, bcf_get_n_allele(v)-1, sizeof(char*), cmpstr);
1176
1177 kputs(bcf_get_alt(v, 0), var);
1178 for (int32_t i=0; i<v->n_allele-1; ++i)
1179 {
1180 kputc(',', var);
1181 kputs(temp[i], var);
1182 }
1183
1184 free(temp);
1185 }
1186 }
1187
1188 /**
1189 * Get chromosome name
1190 */
bcf_get_chrom(bcf_hdr_t * h,bcf1_t * v)1191 const char* bcf_get_chrom(bcf_hdr_t *h, bcf1_t *v)
1192 {
1193 if (v->rid >= h->n[BCF_DT_CTG])
1194 {
1195 fprintf(stderr, "[E:%s:%d %s] rid '%d' does not have an associated contig defined in the header. Try tabix workaround or just add the header.\n", __FILE__, __LINE__, __FUNCTION__, v->rid);
1196 exit(1);
1197 }
1198 else
1199 {
1200 return h->id[BCF_DT_CTG][v->rid].key;
1201 }
1202 }
1203
1204 /**
1205 *Set chromosome name.
1206 */
bcf_set_chrom(bcf_hdr_t * h,bcf1_t * v,const char * chrom)1207 void bcf_set_chrom(bcf_hdr_t *h, bcf1_t *v, const char* chrom)
1208 {
1209 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
1210 khint_t k = kh_get(vdict, d, chrom);
1211 if (k == kh_end(d))
1212 {
1213 fprintf(stderr, "[E:%s:%d %s] contig '%s' is not defined in the header\n", __FILE__, __LINE__, __FUNCTION__, chrom);
1214 kstring_t contig = {0,0,0};
1215 ksprintf(&contig, "##contig=<ID=%s,length=2147483647>", chrom);
1216 bcf_hdr_append(h, contig.s);
1217 if (contig.m) free(contig.s);
1218 k = kh_get(vdict, d, chrom);
1219 }
1220 v->rid = kh_val(d, k).id;
1221 };
1222
1223 /**
1224 * Set id.
1225 */
bcf_set_id(bcf1_t * v,char * id)1226 void bcf_set_id(bcf1_t *v, char* id)
1227 {
1228 if (v->d.id)
1229 {
1230 free(v->d.id);
1231 }
1232
1233 v->d.id = strdup(id);
1234 };
1235
1236 /**
1237 * Gets an info flag.
1238 */
bcf_get_info_flg(bcf_hdr_t * h,bcf1_t * v,const char * tag)1239 bool bcf_get_info_flg(bcf_hdr_t *h, bcf1_t *v, const char* tag)
1240 {
1241 return (bcf_get_info_flag(h, v, tag, 0, 0) ? true : false);
1242 }
1243
1244 /**
1245 * Sets an info flag.
1246 */
bcf_set_info_flg(bcf_hdr_t * h,bcf1_t * v,const char * tag,bool value)1247 void bcf_set_info_flg(bcf_hdr_t *h, bcf1_t *v, const char* tag, bool value)
1248 {
1249 bcf_update_info_flag(h, v, tag, "", (value ? 1 : 0));
1250 }
1251
1252 /**
1253 * Gets an info integer.
1254 */
bcf_get_info_int(bcf_hdr_t * h,bcf1_t * v,const char * tag,int32_t default_value)1255 int32_t bcf_get_info_int(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_value)
1256 {
1257 int32_t i = 0;
1258 int32_t* temp_i = NULL;
1259 int32_t n = 0;
1260 if (bcf_get_info_int32(h, v, tag, &temp_i, &n)>0)
1261 {
1262 i = *temp_i;
1263 free(temp_i);
1264 }
1265 else
1266 {
1267 return default_value;
1268 }
1269
1270 return i;
1271 }
1272
1273 /**
1274 * Sets an info integer.
1275 */
bcf_set_info_int(bcf_hdr_t * h,bcf1_t * v,const char * tag,int32_t value)1276 void bcf_set_info_int(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t value)
1277 {
1278 bcf_update_info_int32(h, v, tag, &value, 1);
1279 }
1280
1281 /**
1282 * Gets an info integer vector.
1283 */
bcf_get_info_int_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,int32_t default_size,int32_t default_value)1284 std::vector<int32_t> bcf_get_info_int_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_size, int32_t default_value)
1285 {
1286 std::vector<int32_t> i_vec;
1287 int32_t* temp_i = NULL;
1288 int32_t n = 0;
1289 if (bcf_get_info_int32(h, v, tag, &temp_i, &n)>0)
1290 {
1291 for (int32_t j=0; j<n; ++j) i_vec.push_back(temp_i[j]);
1292 free(temp_i);
1293 }
1294 else
1295 {
1296 i_vec.resize(default_size, default_value);
1297 }
1298
1299 return i_vec;
1300 }
1301
1302 /**
1303 * Sets an info integer vector.
1304 */
bcf_set_info_int_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::vector<int32_t> & values)1305 void bcf_set_info_int_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<int32_t>& values)
1306 {
1307 bcf_update_info_int32(h, v, tag, &values[0], values.size());
1308 }
1309
1310 /**
1311 * Sets an info integer vector.
1312 */
bcf_set_info_int(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::vector<int32_t> & values)1313 void bcf_set_info_int(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<int32_t>& values)
1314 {
1315 bcf_update_info_int32(h, v, tag, &values[0], values.size());
1316 }
1317
1318 /**
1319 * Gets an info float.
1320 */
bcf_get_info_flt(bcf_hdr_t * h,bcf1_t * v,const char * tag,float default_value)1321 float bcf_get_info_flt(bcf_hdr_t *h, bcf1_t *v, const char* tag, float default_value)
1322 {
1323 float f = 0;
1324 float* temp_f = NULL;
1325 int32_t n = 0;
1326 if (bcf_get_info_float(h, v, tag, &temp_f, &n)>0)
1327 {
1328 f = *temp_f;
1329 free(temp_f);
1330 }
1331 else
1332 {
1333 return default_value;
1334 }
1335
1336 return f;
1337 }
1338
1339 /**
1340 * Sets an info float.
1341 */
bcf_set_info_flt(bcf_hdr_t * h,bcf1_t * v,const char * tag,float value)1342 void bcf_set_info_flt(bcf_hdr_t *h, bcf1_t *v, const char* tag, float value)
1343 {
1344 bcf_update_info_float(h, v, tag, &value, 1);
1345 }
1346
1347 /**
1348 * Gets an info integer vector.
1349 */
bcf_get_info_flt_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,int32_t default_size,float default_value)1350 std::vector<float> bcf_get_info_flt_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_size, float default_value)
1351 {
1352 std::vector<float> vec;
1353
1354 return vec;
1355 }
1356
1357 /**
1358 * Sets an info float vector.
1359 */
bcf_set_info_flt_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::vector<float> & values)1360 void bcf_set_info_flt_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<float>& values)
1361 {
1362 }
1363
1364 /**
1365 * Gets an info string.
1366 */
bcf_get_info_str(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::string default_value)1367 std::string bcf_get_info_str(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value)
1368 {
1369 std::string str = "";
1370 char* s = NULL;
1371 int32_t n = 0;
1372 if (bcf_get_info_string(h, v, tag, &s, &n)>0)
1373 {
1374 str.assign(s);
1375 free(s);
1376 }
1377 else
1378 {
1379 return default_value;
1380 }
1381
1382 return str;
1383 }
1384
1385 /**
1386 * Sets an info string.
1387 */
bcf_set_info_str(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::string default_value)1388 void bcf_set_info_str(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value)
1389 {
1390 }
1391
1392 /**
1393 * Gets an info string vector.
1394 */
bcf_get_info_str_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::string default_value)1395 std::vector<std::string> bcf_get_info_str_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value)
1396 {
1397 std::vector<std::string> vec;
1398 std::string str = "";
1399 char* s = NULL;
1400 int32_t n = 0;
1401 int32_t ret = 0;
1402 if ((ret=bcf_get_info_string(h, v, tag, &s, &n))>0)
1403 {
1404 str.assign(s);
1405 free(s);
1406 vec.push_back(str);
1407 }
1408 else
1409 {
1410 vec.push_back(default_value);
1411 }
1412
1413 return vec;
1414 }
1415 /**
1416 * Sets an info string vector.
1417 */
bcf_set_info_str_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::vector<std::string> values)1418 void bcf_set_info_str_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<std::string> values)
1419 {
1420 }
1421
1422 /**
1423 * Creates a dummy header with hs37d5 contigs for testing purposes.
1424 */
bcf_create_dummy_hdr()1425 bcf_hdr_t* bcf_create_dummy_hdr()
1426 {
1427 bcf_hdr_t* h = bcf_hdr_init("r");
1428 bcf_hdr_append(h, "##fileformat=VCFv4.1");
1429 bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1430 bcf_hdr_append(h, "##contig=<ID=1,assembly=b37,length=249250621>");
1431 bcf_hdr_append(h, "##contig=<ID=2,assembly=b37,length=243199373>");
1432 bcf_hdr_append(h, "##contig=<ID=3,assembly=b37,length=198022430>");
1433 bcf_hdr_append(h, "##contig=<ID=4,assembly=b37,length=191154276>");
1434 bcf_hdr_append(h, "##contig=<ID=5,assembly=b37,length=180915260>");
1435 bcf_hdr_append(h, "##contig=<ID=6,assembly=b37,length=171115067>");
1436 bcf_hdr_append(h, "##contig=<ID=7,assembly=b37,length=159138663>");
1437 bcf_hdr_append(h, "##contig=<ID=8,assembly=b37,length=146364022>");
1438 bcf_hdr_append(h, "##contig=<ID=9,assembly=b37,length=141213431>");
1439 bcf_hdr_append(h, "##contig=<ID=10,assembly=b37,length=135534747>");
1440 bcf_hdr_append(h, "##contig=<ID=11,assembly=b37,length=135006516>");
1441 bcf_hdr_append(h, "##contig=<ID=12,assembly=b37,length=133851895>");
1442 bcf_hdr_append(h, "##contig=<ID=13,assembly=b37,length=115169878>");
1443 bcf_hdr_append(h, "##contig=<ID=14,assembly=b37,length=107349540>");
1444 bcf_hdr_append(h, "##contig=<ID=15,assembly=b37,length=102531392>");
1445 bcf_hdr_append(h, "##contig=<ID=16,assembly=b37,length=90354753>");
1446 bcf_hdr_append(h, "##contig=<ID=17,assembly=b37,length=81195210>");
1447 bcf_hdr_append(h, "##contig=<ID=18,assembly=b37,length=78077248>");
1448 bcf_hdr_append(h, "##contig=<ID=19,assembly=b37,length=59128983>");
1449 bcf_hdr_append(h, "##contig=<ID=20,assembly=b37,length=63025520>");
1450 bcf_hdr_append(h, "##contig=<ID=21,assembly=b37,length=48129895>");
1451 bcf_hdr_append(h, "##contig=<ID=22,assembly=b37,length=51304566>");
1452 bcf_hdr_append(h, "##contig=<ID=X,assembly=b37,length=155270560>");
1453 bcf_hdr_append(h, "##contig=<ID=Y,assembly=b37,length=59373566>");
1454 bcf_hdr_append(h, "##contig=<ID=MT,assembly=b37,length=16569>");
1455
1456 return h;
1457 }
1458
1459 /**
1460 * Creates a dummy bcf record representing the variant for testing purposes.
1461 *
1462 * @variant - 1:123:ACT:AC/ACCCC
1463 */
bcf_create_dummy_record(bcf_hdr_t * h,std::string & variant)1464 bcf1_t* bcf_create_dummy_record(bcf_hdr_t* h, std::string& variant)
1465 {
1466 bcf1_t* v = bcf_init1();
1467
1468 std::vector<std::string> var;
1469 split(var, ":", variant);
1470
1471 bcf_set_rid(v, bcf_hdr_name2id(h, var[0].c_str()));
1472 bcf_set_pos1(v, str2int32(var[1]));
1473
1474 std::vector<std::string> alleles;
1475 split(alleles, ":", var[3]);
1476
1477 std::string new_alleles = var[2];
1478
1479 for (uint32_t i=0; i<alleles.size(); ++i)
1480 {
1481 new_alleles.append(1, ',');
1482 new_alleles.append(alleles[i]);
1483 }
1484
1485 bcf_update_alleles_str(h, v, new_alleles.c_str());
1486
1487 return v;
1488 }
1489
1490 /**
1491 * Prints a message to STDERR and abort.
1492 */
error(const char * msg,...)1493 void error(const char * msg, ...)
1494 {
1495 va_list ap;
1496 va_start(ap, msg);
1497
1498 fprintf(stderr, "\nFATAL ERROR - \n");
1499 vfprintf(stderr, msg, ap);
1500 fprintf(stderr, "\n\n");
1501
1502 va_end(ap);
1503
1504 abort();
1505 //throw pexception;
1506 //exit(EXIT_FAILURE);
1507 }
1508
1509 /**
1510 * Prints a message to STDERR.
1511 */
notice(const char * msg,...)1512 void notice(const char * msg, ...)
1513 {
1514 va_list ap;
1515 va_start(ap, msg);
1516
1517 time_t current_time;
1518 char buff[255];
1519 current_time = time(NULL);
1520
1521 strftime(buff, 120, "%Y/%m/%d %H:%M:%S", localtime(¤t_time));
1522
1523 fprintf(stderr,"NOTICE [%s] - ", buff);
1524 vfprintf(stderr, msg, ap);
1525 fprintf(stderr,"\n");
1526
1527 va_end(ap);
1528 }