1 /* The MIT License
2 
3    Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "hts_utils.h"
25 #include "Rmath/Rmath.h"
26 
27 /********
28  *General
29  ********/
30 
31 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
32 typedef khash_t(vdict) vdict_t;
33 
34 /**********
35  *FAI UTILS
36  **********/
37 
38 /**
39  * An alternate sequence fetcher for upper case sequence.
40  */
faidx_fetch_uc_seq(const faidx_t * fai,const char * c_name,int p_beg_i,int p_end_i,int * len)41 char *faidx_fetch_uc_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
42 {
43     int l;
44     char c;
45     khiter_t iter;
46     faidx1_t val;
47     char *seq=NULL;
48 
49     // Adjust position
50     iter = kh_get(s, fai->hash, c_name);
51     if(iter == kh_end(fai->hash)) return 0;
52     val = kh_value(fai->hash, iter);
53     if(p_end_i < p_beg_i) p_beg_i = p_end_i;
54     if(p_beg_i < 0) p_beg_i = 0;
55     else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
56     if(p_end_i < 0) p_end_i = 0;
57     else if(val.len <= p_end_i) p_end_i = val.len - 1;
58 
59     // Now retrieve the sequence
60     int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
61     if ( ret<0 )
62     {
63         *len = -1;
64         fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
65         return NULL;
66     }
67     l = 0;
68     seq = (char*)malloc(p_end_i - p_beg_i + 2);
69     while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1)
70         if (isgraph(c)) seq[l++] = toupper(c);
71     seq[l] = '\0';
72     *len = l;
73     return seq;
74 }
75 
76 /**********
77  *HTS UTILS
78  **********/
79 
80 /**
81  * Checks file extension for use in writing files.
82  */
str_ends_with(std::string const & value,std::string const & ending)83 bool str_ends_with(std::string const & value, std::string const & ending)
84 {
85     if (ending.size() > value.size()) return false;
86     return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
87 }
88 
89 /**
90  * Checks file extension based on filename.
91  */
hts_filename_type(std::string const & value)92 int32_t hts_filename_type(std::string const& value)
93 {
94     if (str_ends_with(value,".vcf") || str_ends_with(value,".vcf.gz"))
95     {
96         return vcf;
97     }
98     else if (str_ends_with(value,".bcf"))
99     {
100         return bcf;
101     }
102      else if (str_ends_with(value,".sam"))
103     {
104         return sam;
105     }
106     else if (str_ends_with(value,".bam"))
107     {
108         return bam;
109     }
110     else if (str_ends_with(value,".cram"))
111     {
112         return cram;
113     }
114     else
115     {
116         return unknown_format;
117     }
118 }
119 
120 /**************
121  *BAM HDR UTILS
122  **************/
123 
124 /**
125  * Copies contigs found in bam header to bcf header.
126  */
bam_hdr_transfer_contigs_to_bcf_hdr(const bam_hdr_t * sh,bcf_hdr_t * vh)127 void bam_hdr_transfer_contigs_to_bcf_hdr(const bam_hdr_t *sh, bcf_hdr_t *vh)
128 {
129     kstring_t s = {0,0,0};
130     for (size_t i=0; i<bam_hdr_get_n_targets(sh); ++i)
131     {
132         s.l = 0;
133         ksprintf(&s, "##contig=<ID=%s,length=%d>", bam_hdr_get_target_name(sh)[i], bam_hdr_get_target_len(sh)[i]);
134         bcf_hdr_append(vh, s.s);
135     }
136     if (s.m) free(s.s);
137 }
138 
139 /**
140  * Gets sample names from bam header.
141  */
bam_hdr_get_sample_name(bam_hdr_t * hdr)142 std::string bam_hdr_get_sample_name(bam_hdr_t* hdr) {
143   if ( !hdr )
144     error("Failed to read the BAM header");
145 
146   const char *p = hdr->text;
147   const char *q, *r;
148   int n = 0;
149   std::string sm;
150   while( ( q = strstr(p, "@RG" ) ) != 0 ) {
151     p = q + 3;
152     r = q = 0;
153     if ( ( q = strstr(p, "\tID:" ) ) != 0 ) q += 4;
154     if ( ( r = strstr(p, "\tSM:" ) ) != 0 ) r += 4;
155     if ( r && q ) {
156       char *u, *v;
157       for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
158       for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
159       *u = *v = '\0';
160       if ( sm.empty() )
161     sm = r;
162       else if ( sm.compare(r) != 0 )
163     error("Multiple sample IDs are included in one BAM file - %s, %s", sm.c_str(), r);
164     }
165     else break;
166     p = q > r ? q : r;
167     ++n;
168   }
169   if ( sm.empty() )
170     error("Sample ID information cannot be found");
171   return sm;
172 }
173 
174 
175 /**********
176  *BAM UTILS
177  **********/
178 
179 /**
180  * Gets the end position of the last mapped base in the read.
181  */
bam_get_end_pos1(bam1_t * s)182 int32_t bam_get_end_pos1(bam1_t *s)
183 {
184     int32_t end_pos1 = bam_get_pos1(s);
185     int32_t n_cigar_op = bam_get_n_cigar_op(s);
186     if (n_cigar_op)
187     {
188         uint32_t *cigar = bam_get_cigar(s);
189         for (int32_t i = 0; i < n_cigar_op; ++i)
190         {
191             int32_t opchr = bam_cigar_opchr(cigar[i]);
192 
193             if (opchr=='M' || opchr=='D' || opchr=='N' || opchr=='=' || opchr=='X')
194             {
195                 end_pos1 += bam_cigar_oplen(cigar[i]);
196             }
197         }
198     }
199 
200     return end_pos1-1;
201 }
202 
203 /**
204  * Gets the read sequence from a bam record
205  */
bam_get_seq_string(bam1_t * s,kstring_t * seq)206 void bam_get_seq_string(bam1_t *s, kstring_t *seq)
207 {
208     seq->l=0;
209     uint8_t* sq = bam_get_seq(s);
210     for (uint16_t i = 0; i < bam_get_l_qseq(s); ++i)
211     {
212         kputc("=ACMGRSVTWYHKDBN"[bam_seqi(sq, i)], seq);
213     }
214 };
215 
216 /**
217  * Gets the base qualities from a bam record, when N is observed, a placeholder value of 0(!, 33 adjusted) is entered
218  */
bam_get_qual_string(bam1_t * s,kstring_t * qual)219 void bam_get_qual_string(bam1_t *s, kstring_t *qual)
220 {
221     qual->l=0;
222     uint32_t offset = 0;
223     uint8_t* q = bam_get_qual(s);
224     for (int32_t i = 0; i < bam_get_l_qseq(s); ++i)
225     {
226         kputc(q[i-offset] + 33, qual);
227     }
228 };
229 
230 /**
231  * Gets the cigar from a BAM record
232  */
bam_get_cigar_string(bam1_t * s,kstring_t * cigar_string)233 void bam_get_cigar_string(bam1_t *s, kstring_t *cigar_string)
234 {
235     cigar_string->l=0;
236     int32_t n_cigar_op = bam_get_n_cigar_op(s);
237     if (n_cigar_op)
238     {
239         uint32_t *cigar = bam_get_cigar(s);
240         for (int32_t i = 0; i < n_cigar_op; ++i)
241         {
242             kputw(bam_cigar_oplen(cigar[i]), cigar_string);
243             kputc(bam_cigar_opchr(cigar[i]), cigar_string);
244         }
245     }
246 }
247 
248 /**
249  * Gets the cigar string from a bam record
250  */
bam_get_cigar_expanded_string(bam1_t * s,kstring_t * cigar_expanded_string)251 void bam_get_cigar_expanded_string(bam1_t *s, kstring_t *cigar_expanded_string)
252 {
253     kstring_t cigar_string = {0,0,0};
254     bam_get_cigar_string(s, &cigar_string);
255 
256     cigar_expanded_string->l = 0;
257     int32_t lastIndex = cigar_string.l;
258     int32_t i = 0;
259     kstring_t token = {0,0,0};
260 
261     if (lastIndex<0)
262     {
263         return;
264     }
265     char c;
266     bool seenM = false;
267 
268     while (i<=lastIndex)
269     {
270         c = cigar_string.s[i];
271 
272         //captures the numeric count
273         if (c<'A')
274         {
275             kputc(c, &token);
276         }
277 
278         if (c>'A' || i==lastIndex)
279         {
280             //it is possible for I's to be observed before the first M's in the cigar string
281             //in this case, we treat them as 'S'
282             if (!seenM)
283             {
284                 if (c=='I')
285                 {
286                     c = 'S';
287                 }
288                 else if (c=='M')
289                 {
290                     seenM = true;
291                 }
292             }
293 
294             int32_t count = atoi(token.s);
295             for (uint32_t j=0; j<count; ++j)
296                 kputc(c, cigar_expanded_string);
297             token.l = 0;;
298         }
299 
300         ++i;
301     }
302 
303     if (cigar_string.m) free(cigar_string.s);
304     if (token.m) free(token.s);
305 }
306 
307 /**
308  * Gets the base in the read that is mapped to a genomic position.
309  * Extracts the read sequence and qualities too.
310  */
bam_get_base_and_qual_and_read_and_qual(bam1_t * srec,uint32_t pos,char & base,char & qual,int32_t & rpos,kstring_t * readseq,kstring_t * readqual)311 void bam_get_base_and_qual_and_read_and_qual(bam1_t *srec, uint32_t pos, char& base, char& qual, int32_t& rpos, kstring_t* readseq, kstring_t* readqual)
312 {
313     bam1_core_t *c = &srec->core;
314     int32_t rlen = c->l_qseq;
315     uint32_t cpos = c->pos; //reference coordinates of the first mapped base
316     rpos = 0; //read coordinates
317 
318     kstring_t str;
319     str.l = str.m = 0, str.s = 0;
320     base = 'N';
321     qual = 0;
322 
323     if (c->n_cigar)
324     {
325         uint32_t *cigar = bam_get_cigar(srec);
326         for (uint32_t i = 0; i < c->n_cigar; ++i)
327         {
328             char op = bam_cigar_opchr(cigar[i]);
329             str.l = 0;
330             kputw(bam_cigar_oplen(cigar[i]), &str);
331             char* stop;
332             uint32_t len = strtol(str.s, &stop, 10);
333             assert(stop);
334 
335             if (op=='M')
336             {
337                 if (pos>=cpos && pos<=cpos+len-1)
338                 {
339                     rpos += pos-cpos;
340                     break;
341                 }
342 
343                 cpos += len;
344                 rpos += len;
345             }
346             else if (op=='D')
347             {
348                 if (pos>=cpos && pos<=cpos+len-1)
349                 {
350                     rpos = -1;
351                     break;
352                 }
353 
354                 cpos += len;
355             }
356             else if (op=='S' || op=='I')
357             {
358                 rpos += len;
359             }
360         }
361 
362         //std::cout << "bpos " << bpos << "\n";
363         if (rpos>=0 && rpos<=rlen)
364         {
365             //sequence
366             bam_get_seq_string(srec, readseq);
367             base = readseq->s[rpos];
368 
369             //qual
370             bam_get_qual_string(srec, readqual);
371             qual = readqual->s[rpos];
372         }
373         else
374         {
375             rpos = BAM_READ_INDEX_NA;
376         }
377     }
378 //    std::cout << "b: " << base << "\n";
379 //    std::cout << "q: " << s[bpos-1] << " " << q << "\n";
380 //    for (uint32_t i = 0; i < c->l_qseq; ++i) std::cerr << ((char)(s[i] + 33));
381 };
382 
383 /**
384  * Prints a bam.
385  */
bam_print(bam_hdr_t * h,bam1_t * s)386 void bam_print(bam_hdr_t *h, bam1_t *s)
387 {
388     const char* chrom = bam_get_chrom(h, s);
389     uint32_t pos1 = bam_get_pos1(s);
390     kstring_t seq = {0,0,0};
391     bam_get_seq_string(s, &seq);
392     uint32_t len = bam_get_l_qseq(s);
393     kstring_t qual = {0,0,0};
394     bam_get_qual_string(s, &qual);
395     kstring_t cigar_string = {0,0,0};
396     bam_get_cigar_string(s, &cigar_string);
397     kstring_t cigar_expanded_string = {0,0,0};
398     bam_get_cigar_expanded_string(s, &cigar_expanded_string);
399     uint16_t flag = bam_get_flag(s);
400     uint32_t mapq = bam_get_mapq(s);
401 
402     std::cerr << "##################" << "\n";
403     std::cerr << "chrom-pos: " << chrom << "-" << pos1 << "\n";
404     std::cerr << "read     : " << seq.s << "\n";
405     std::cerr << "qual     : " << qual.s << "\n";
406     std::cerr << "cigar_str: " << cigar_string.s << "\n";
407     std::cerr << "cigar    : " << cigar_expanded_string.s << "\n";
408     std::cerr << "len      : " << len << "\n";
409     std::cerr << "mapq     : " << mapq << "\n";
410     std::cerr << "mpos1    : " << bam_get_mpos1(s) << "\n";
411     std::cerr << "mtid     : " << bam_get_mtid(s) << "\n";
412 
413     if (seq.m) free(seq.s);
414     if (qual.m) free(qual.s);
415     if (cigar_string.m) free(cigar_string.s);
416     if (cigar_expanded_string.m) free(cigar_expanded_string.s);
417 }
418 
419 /**************
420  *BCF HDR UTILS
421  **************/
422 
423 /**
424  * Prints header.
425  */
bcf_hdr_print(bcf_hdr_t * h)426 void bcf_hdr_print(bcf_hdr_t *h)
427 {
428     if ( h->dirty ) bcf_hdr_sync(h);
429     kstring_t htxt = {0,0,0};
430     bcf_hdr_format(h, 0, &htxt);
431     std::cerr << htxt.s;
432     if (htxt.m) free(htxt.s);
433 }
434 
435 /**
436  * Copies contigs found in bcf header to another bcf header.
437  */
bcf_hdr_transfer_contigs(const bcf_hdr_t * hsrc,bcf_hdr_t * hdest)438 void bcf_hdr_transfer_contigs(const bcf_hdr_t *hsrc, bcf_hdr_t *hdest)
439 {
440     vdict_t *d = (vdict_t*)hsrc->dict[BCF_DT_CTG];
441     int tid, m = kh_size(d);
442     const char **names = (const char**) calloc(m,sizeof(const char*));
443     int len[m];
444     khint_t k;
445 
446     for (k=kh_begin(d); k<kh_end(d); k++)
447     {
448         if ( !kh_exist(d,k) ) continue;
449         tid = kh_val(d,k).id;
450         len[tid] = bcf_hrec_find_key(kh_val(d, k).hrec[0],"length");
451         int j;
452         if ( sscanf(kh_val(d, k).hrec[0]->vals[len[tid]],"%d",&j) )
453             len[tid] = j;
454         names[tid] = kh_key(d,k);
455     }
456 
457     kstring_t s = {0,0,0};
458     for (tid=0; tid<m; tid++)
459     {
460         s.l = 0;
461         ksprintf(&s, "##contig=<ID=%s,length=%d>", names[tid], len[tid]);
462         bcf_hdr_append(hdest, s.s);
463     }
464     if (s.m) free(s.s);
465 }
466 
467 /**
468  * Checks if a particular header type exists
469  * @hdr  - header
470  * @type - BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT, BCF_HL_CTG
471  * @key  - the key name
472  */
bcf_hdr_exists(bcf_hdr_t * hdr,int32_t type,const char * key)473 bool bcf_hdr_exists(bcf_hdr_t *hdr, int32_t type, const char *key)
474 {
475     if (type>=BCF_HL_FLT && type<= BCF_HL_CTG) //FLT, INFO, FMT, CTG
476     {
477         for (uint32_t i=0; i<hdr->nhrec; ++i)
478         {
479             if (hdr->hrec[i]->type==type)
480             {
481                 int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
482                 if (j>=0)
483                 {
484                     char* val = hdr->hrec[i]->vals[j];
485 
486                     if (strcmp(key, val)==0)
487                     {
488                         return true;
489                     }
490                 }
491             }
492         }
493     }
494 
495     return false;
496 }
497 
498 /**
499  * Extracts sequence length by rid.
500  */
bcf_hdr_seqlen(const bcf_hdr_t * hdr,int32_t * nseq)501 int32_t* bcf_hdr_seqlen(const bcf_hdr_t *hdr, int32_t *nseq)
502 {
503     vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
504     int tid, m = kh_size(d);
505     int32_t *len = (int32_t*) malloc(m*sizeof(int32_t));
506     khint_t k;
507 
508     for (k=kh_begin(d); k<kh_end(d); k++)
509     {
510         if ( !kh_exist(d,k) ) continue;
511         tid = kh_val(d,k).id;
512         len[tid] = bcf_hrec_find_key(kh_val(d, k).hrec[0],"length");
513         int j;
514         if ( sscanf(kh_val(d, k).hrec[0]->vals[len[tid]],"%d",&j) )
515             len[tid] = j;
516     }
517 
518     return len;
519 }
520 
521 /**
522  * Translates BCF_VL types to string.
523  */
bcf_hdr_vl2str(int32_t id)524 std::string bcf_hdr_vl2str(int32_t id)
525 {
526     std::cerr << "vl id: " << id << "\n";
527 
528     if (id==BCF_VL_FIXED)
529     {
530         return "BCF_VL_FIXED";
531     }
532     else if (id==BCF_VL_VAR)
533     {
534         return "BCF_VL_VAR";
535     }
536     else if (id==BCF_VL_A)
537     {
538         return "BCF_VL_A";
539     }
540     else if (id==BCF_VL_G)
541     {
542         return "BCF_VL_G";
543     }
544     else if (id==BCF_VL_R)
545     {
546         return "BCF_VL_R";
547     }
548     else
549     {
550         return "UNRECOGNIZED BCF_VL_* TYPE";
551     }
552 }
553 
554 /**
555  * Translates BCF_HT types to string.
556  */
bcf_hdr_ht2str(int32_t id)557 std::string bcf_hdr_ht2str(int32_t id)
558 {
559     if (id==BCF_HT_FLAG)
560     {
561         return "BCF_HT_FLAG";
562     }
563     else if (id==BCF_HT_INT)
564     {
565         return "BCF_HT_INT";
566     }
567     else if (id==BCF_HT_REAL)
568     {
569         return "BCF_HT_REAL";
570     }
571     else if (id==BCF_HT_STR)
572     {
573         return "BCF_HT_STR";
574     }
575     else
576     {
577         return "UNRECOGNIZED BCF_HT_* TYPE";
578     }
579 }
580 
581 
582 /**********
583  *BCF UTILS
584  **********/
585 
586 /**
587  * Copies an info fields from one record to another
588  * @hsrc  - source header
589  * @vsrc  - source bcf1_t
590  * @hdest - destination header
591  * @vdest - destination bcf1_t
592  * @key   - the key name
593  * @type  - BCF_HT_FLAG, BCF_HT_INT, BCF_HT_REAL, BCF_HT_STR
594  */
bcf_copy_info_field(bcf_hdr_t * hsrc,bcf1_t * vsrc,bcf_hdr_t * hdest,bcf1_t * vdest,const char * key,int32_t type)595 void bcf_copy_info_field(bcf_hdr_t *hsrc, bcf1_t* vsrc, bcf_hdr_t *hdest, bcf1_t* vdest, const char* key, int32_t type)
596 {
597     if (type==BCF_HT_FLAG)
598     {
599         if (bcf_get_info_flag(hsrc, vsrc, key, NULL, NULL)>0)
600         {
601             bcf_update_info_flag(hdest, vdest, key, NULL, 1);
602         }
603     }
604     else if (type==BCF_HT_INT)
605     {
606         int32_t* str = NULL;
607         int32_t n = 0;
608         if (bcf_get_info_int32(hsrc, vsrc, key, &str, &n)>0)
609         {
610             bcf_update_info_int32(hdest, vdest, key, str, n);
611             free(str);
612         }
613     }
614     else if (type==BCF_HT_REAL)
615     {
616         float* str = NULL;
617         int32_t n = 0;
618         if (bcf_get_info_float(hsrc, vsrc, key, &str, &n)>0)
619         {
620             bcf_update_info_float(hdest, vdest, key, str, n);
621             free(str);
622         }
623     }
624     else if (type==BCF_HT_STR)
625     {
626         char** str = NULL;
627         int32_t n = 0;
628         if (bcf_get_info_string(hsrc, vsrc, key, &str, &n)>0)
629         {
630             bcf_update_info_string(hdest, vdest, key, str);
631             free(str);
632         }
633     }
634 }
635 
636 /**
637  * Prints a VCF record to STDERR.
638  */
bcf_print(bcf_hdr_t * h,bcf1_t * v)639 void bcf_print(bcf_hdr_t *h, bcf1_t *v)
640 {
641     kstring_t s = {0,0,0,};
642     vcf_format(h, v, &s);
643     std::cerr << s.s;
644     if (s.m) free(s.s);
645 };
646 
647 /**
648  * Prints a VCF record in compact string representation to STDERR.
649  */
bcf_print_liten(bcf_hdr_t * h,bcf1_t * v)650 void bcf_print_liten(bcf_hdr_t *h, bcf1_t *v)
651 {
652     kstring_t s = {0,0,0,};
653     bcf_variant2string(h, v, &s);
654     std::cerr << s.s << "\n";
655     if (s.m) free(s.s);
656 };
657 
658 /**
659  * Prints a VCF record in compact string representation to STDERR.
660  */
bcf_print_lite(bcf_hdr_t * h,bcf1_t * v)661 void bcf_print_lite(bcf_hdr_t *h, bcf1_t *v)
662 {
663     kstring_t s = {0,0,0,};
664     bcf_variant2string(h, v, &s);
665     std::cerr << s.s;
666     if (s.m) free(s.s);
667 };
668 
669 /**
670  * Prints a VCF record in compact string representation to STDERR with alleles sorted.
671  */
bcf_print_lite_sorted(bcf_hdr_t * h,bcf1_t * v)672 void bcf_print_lite_sorted(bcf_hdr_t *h, bcf1_t *v)
673 {
674     kstring_t s = {0,0,0,};
675     bcf_variant2string_sorted(h, v, &s);
676     std::cerr << s.s;
677     if (s.m) free(s.s);
678 };
679 
680 /**
681  * Reads header of a VCF file and returns the bcf header object.
682  * This wraps around vcf_hdr_read from the original htslib to
683  * allow for an alternative header file to be read in.
684  *
685  * this searches for the alternative header saved as <filename>.hdr
686  * If the VCF files is BCF, any alternative header is ignored.
687  */
bcf_alt_hdr_read(htsFile * fp)688 bcf_hdr_t *bcf_alt_hdr_read(htsFile *fp)
689 {
690     bcf_hdr_t *h = NULL;
691 
692     //check for existence of alternative header
693     kstring_t alt_hdr_fn = {0, 0, 0};
694     kputs(fp->fn, &alt_hdr_fn);
695     kputs(".hdr", &alt_hdr_fn);
696 
697     FILE *file = NULL;
698     struct stat st;
699     if (stat(alt_hdr_fn.s, &st)==0 && st.st_size)
700     {
701         file = fopen(alt_hdr_fn.s, "r");
702     }
703 
704     if (fp->format.format ==bcf || !file)
705     {
706         h = bcf_hdr_read(fp);
707     }
708     else
709     {
710         fprintf(stderr, "[I:%s:%d %s] read alternative header for %s\n", __FILE__, __LINE__, __FUNCTION__, fp->fn);
711         fclose(file);
712         htsFile *alt_hdr = hts_open(alt_hdr_fn.s, "r");
713         h = bcf_hdr_read(alt_hdr);
714         hts_close(alt_hdr);
715 
716         //helps move the pointer to the right place
717         bcf_hdr_t *temp_h = bcf_hdr_read(fp);
718         bcf_hdr_destroy(temp_h);
719     }
720 
721     if (alt_hdr_fn.m) free(alt_hdr_fn.s);
722     return h;
723 }
724 
725 /**
726  * Get number of samples in bcf header
727  */
bcf_hdr_get_n_sample(bcf_hdr_t * h)728 int32_t bcf_hdr_get_n_sample(bcf_hdr_t *h)
729 {
730     vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
731     return kh_size(d);
732 }
733 
734 /**
735  * Help function for adding a header with a backup tag name.
736  * If the <tag> is already present, a new tag is attempted
737  * in the format <tag>_1 to <tag>_9.  If <tag>_9 failed,
738  * the function will not add any new tag and will return
739  * an empty string.
740  *
741  * Returns the tag that was inserted or updated.
742  */
bcf_hdr_append_info_with_backup_naming(bcf_hdr_t * h,std::string tag,std::string number,std::string type,std::string description,bool rename)743 std::string bcf_hdr_append_info_with_backup_naming(bcf_hdr_t *h, std::string tag, std::string number, std::string type, std::string description, bool rename)
744 {
745     if (bcf_hdr_id2int(h,  BCF_DT_ID, tag.c_str())==-1)
746     {
747         std::string meta_hdr = "##INFO=<ID=" + tag +
748                                  ",Number=" + number +
749                                  ",Type=" + type +
750                                  ",Description=\"" + description + "\">";
751 
752         bcf_hdr_append(h, meta_hdr.c_str());
753     }
754     else
755     {
756         if (rename)
757         {
758             std::string new_tag = "";
759             for (uint32_t i=0; i<=9; ++i)
760             {
761                 char c = 49+i;
762                 new_tag = tag +  "_" + c;
763                 if (bcf_hdr_id2int(h,  BCF_DT_ID, new_tag.c_str())==-1)
764                 {
765                     std::string meta_hdr = "##INFO=<ID=" + new_tag +
766                                  ",Number=" + number +
767                                  ",Type=" + type +
768                                  ",Description=\"" + description + "\">";
769 
770                     bcf_hdr_append(h, meta_hdr.c_str());
771 
772                     break;
773                 }
774 
775                 new_tag = "";
776             }
777 
778             return new_tag;
779         }
780         else
781         {
782             return tag;
783         }
784     }
785 
786     return tag;
787 }
788 
789 /**
790  *  bcf_get_format_*() - same as bcf_get_info*() above
791  *
792  *  The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
793  *  see the description of bcf_update_format_string() and bcf_update_format_char() above.
794  *  Unlike other bcf_get_format__*() functions, bcf_get_format_string() allocates two arrays:
795  *  a single block of \0-terminated strings collapsed into a single array and an array of pointers
796  *  to these strings. Both arrays must be cleaned by the user.
797  *
798  *  Returns negative value on error or the number of written values on success.
799  *
800  *  Example:
801  *      int ndst = 0; char **dst = NULL;
802  *      if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
803  *          for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i]);
804  *      free(dst[0]); free(dst);
805  *
806  *  Example:
807  *      int ngt, *gt_arr = NULL, ngt_arr = 0;
808  *      ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr);
809  *
810  *  todo: modify to allow direct reading, instead of copying to char*
811  */
bcf_get_format_string_ro(const bcf_hdr_t * hdr,bcf1_t * line,const char * tag,char *** dst,int * ndst)812 int32_t bcf_get_format_string_ro(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
813 {
814     int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
815     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
816     if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
817 
818     if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
819 
820     for (i=0; i<line->n_fmt; i++)
821         if ( line->d.fmt[i].id==tag_id ) break;
822     if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
823     bcf_fmt_t *fmt = &line->d.fmt[i];
824 
825     int nsmpl = bcf_hdr_nsamples(hdr);
826     if ( !*dst )
827     {
828         *dst = (char**) malloc(sizeof(char*)*nsmpl);
829         if ( !*dst ) return -4;     // could not alloc
830         (*dst)[0] = NULL;
831     }
832     int n = (fmt->n+1)*nsmpl;
833     if ( *ndst < n )
834     {
835         (*dst)[0] = (char*) realloc((*dst)[0], n);
836         if ( !(*dst)[0] ) return -4;    // could not alloc
837         *ndst = n;
838     }
839     for (i=0; i<nsmpl; i++)
840     {
841         uint8_t *src = fmt->p + i*fmt->n;
842         uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
843         memcpy(tmp,src,fmt->n);
844         tmp[fmt->n] = 0;
845         (*dst)[i] = (char*) tmp;
846     }
847     return n;
848 }
849 
850 /**********
851  *BCF UTILS
852  **********/
853 
854 /**
855  * Gets number of genotypes from number of alleles and ploidy.
856  */
bcf_ap2g(uint32_t no_allele,uint32_t no_ploidy)857 uint32_t bcf_ap2g(uint32_t no_allele, uint32_t no_ploidy)
858 {
859     if (!no_ploidy || !no_allele)
860     {
861         return 0;
862     }
863     else if (no_ploidy==1)
864     {
865         return no_allele;
866     }
867     else if (no_ploidy==2)
868     {
869         return (((no_allele+1)*(no_allele))>>1); ;
870     }
871     else
872     {
873         return choose(no_ploidy+no_allele-1, no_allele-1);
874     }
875 }
876 
877 /**
878  * Gets alleles from number of ploidy and genotype index.
879  */
bcf_pg2a(uint32_t no_ploidy,uint32_t genotype_index,std::vector<int32_t> & alleles)880 void bcf_pg2a(uint32_t no_ploidy, uint32_t genotype_index, std::vector<int32_t>& alleles)
881 {
882 
883 }
884 
885 /**
886  * Gets number of ploidy from number of alleles and genotypes.
887  *
888  * Returns 0 if number of alleles and genotypes are not consistent.
889  */
bcf_ag2p(uint32_t no_alleles,uint32_t no_genotypes)890 uint32_t bcf_ag2p(uint32_t no_alleles, uint32_t no_genotypes)
891 {
892     //hard coded simple cases
893     if (no_alleles==2 && no_genotypes==3)
894     {
895         return 2;
896     }
897     else if (no_alleles==3 && no_genotypes==6)
898     {
899         return 2;
900     }
901     else if (no_alleles==4 && no_genotypes==10)
902     {
903         return 2;
904     }
905     else if (no_alleles == no_genotypes)
906     {
907         return 1;
908     }
909 
910     //this works in general for all number of alleles and number of genotypes
911     uint32_t no_ploidy = 1;
912     while (true)
913     {
914         uint32_t k = choose(no_ploidy+no_alleles-1, no_alleles-1);
915 
916         if (k==no_genotypes)
917         {
918             return no_ploidy;
919         }
920         //number of alleles and genotypes are not consistent for any ploidy
921         else if (k>no_genotypes)
922         {
923             return 0;
924         }
925 
926         ++no_ploidy;
927     }
928 }
929 
930 /**
931  * Gets genotype from genotype index and ploidy.
932  *
933  * The genotype index is computed by a summation of a series which is
934  * monotonically decreasing.  This allows you to compute the inverse function
935  * from index to the ordered genotypes by using a "water rapids algorithm" with
936  * decreasing height of each mini water fall.
937  */
bcf_ip2g(int32_t genotype_index,uint32_t no_ploidy)938 std::vector<int32_t> bcf_ip2g(int32_t genotype_index, uint32_t no_ploidy)
939 {
940     std::vector<int32_t> genotype(no_ploidy, 0);
941 
942     int32_t pth = no_ploidy;
943     int32_t max_allele_index = genotype_index;
944     int32_t leftover_genotype_index = genotype_index;
945 
946     while (pth>0)
947     {
948         for (int32_t allele_index=0; allele_index <= max_allele_index; ++allele_index)
949         {
950             int32_t i = choose(pth+allele_index-1, pth);
951 
952             if (i>=leftover_genotype_index || allele_index==max_allele_index)
953             {
954                 if (i>leftover_genotype_index) --allele_index;
955                 leftover_genotype_index -= choose(pth+allele_index-1, pth);
956                 --pth;
957                 max_allele_index = allele_index;
958                 genotype[pth] = allele_index;
959                 break;
960             }
961         }
962     }
963 
964     return genotype;
965 }
966 
967 /**
968  * Gets index of a genotype of n ploidy.
969  */
bcf_g2i(int32_t * g,uint32_t n)970 uint32_t bcf_g2i(int32_t* g, uint32_t n)
971 {
972     if (n==1)
973     {
974         return g[0];
975     }
976     if (n==2)
977     {
978         return g[0] + (((g[1]+1)*(g[1]))>>1);
979     }
980     else
981     {
982         uint32_t index = 0;
983         for (uint32_t i=0; i<n; ++i)
984         {
985             index += bcf_ap2g(g[i], i+1);
986         }
987         return index;
988     }
989 }
990 
991 /**
992  * Gets index of a genotype of n ploidy.
993  */
bcf_g2i(std::vector<int32_t> & g)994 uint32_t bcf_g2i(std::vector<int32_t>& g)
995 {
996     int32_t n = g.size();
997 
998     if (n==1)
999     {
1000         return g[0];
1001     }
1002     if (n==2)
1003     {
1004         return g[0] + (((g[1]+1)*(g[1]))>>1);
1005     }
1006     else
1007     {
1008         uint32_t index = 0;
1009         for (uint32_t i=0; i<n; ++i)
1010         {
1011             index += bcf_ap2g(g[i], i+1);
1012         }
1013         return index;
1014     }
1015 }
1016 
1017 /**
1018  * Gets a string representation of a variant.
1019  */
bcf_variant2string(bcf_hdr_t * h,bcf1_t * v)1020 std::string bcf_variant2string(bcf_hdr_t *h, bcf1_t *v)
1021 {
1022     kstring_t var = {0,0,0};
1023     bcf_variant2string(h, v, &var);
1024     std::string var2(var.s);
1025     if (var.m) free(var.s);
1026     return var2;
1027 }
1028 
1029 /**
1030  * Gets a string representation of a variant.
1031  */
bcf_variant2string(bcf_hdr_t * h,bcf1_t * v,kstring_t * var)1032 void bcf_variant2string(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
1033 {
1034     bcf_unpack(v, BCF_UN_STR);
1035     var->l = 0;
1036     kputs(bcf_get_chrom(h, v), var);
1037     kputc(':', var);
1038     kputw(bcf_get_pos1(v), var);
1039     kputc(':', var);
1040     for (size_t i=0; i<bcf_get_n_allele(v); ++i)
1041     {
1042         if (i) kputc('/', var);
1043         kputs(bcf_get_alt(v, i), var);
1044     }
1045 }
1046 
1047 /**
1048  * strcmp wrapper for qsort.
1049  */
cmpstr(void const * a,void const * b)1050 int32_t cmpstr(void const *a, void const *b)
1051 {
1052     char const *aa = *(char const **)a;
1053     char const *bb = *(char const **)b;
1054 
1055     return strcmp(aa, bb);
1056 }
1057 
1058 /**
1059  * Returns true if a is before b, false otherwise.
1060  */
bcf_is_in_order(bcf1_t * a,bcf1_t * b)1061 bool bcf_is_in_order(bcf1_t *a, bcf1_t *b)
1062 {
1063     if (bcf_get_rid(a)==bcf_get_rid(b))
1064     {
1065         return bcf_get_pos0(a)<=bcf_get_pos0(b);
1066     }
1067 
1068     return bcf_get_rid(a)<bcf_get_rid(b);
1069 }
1070 
1071 /**
1072  * Returns a copy v that only has the chr:pos1:ref:alt information.
1073  */
bcf_copy_variant(bcf_hdr_t * h,bcf1_t * v)1074 bcf1_t* bcf_copy_variant(bcf_hdr_t *h, bcf1_t *v)
1075 {
1076     bcf1_t* nv = bcf_init1();
1077     bcf_clear(nv);
1078     bcf_set_n_sample(nv, bcf_get_n_sample(v));
1079 
1080     bcf_set_rid(nv, bcf_get_rid(v));
1081     bcf_set_pos1(nv, bcf_get_pos1(v));
1082     kstring_t s = {0,0,0};
1083     bcf_alleles2string(h, v, &s);
1084     bcf_update_alleles_str(h, nv, s.s);
1085     if (s.m) free(s.s);
1086 
1087     return nv;
1088 }
1089 
1090 /**
1091  * Gets a sorted string representation of a variant.
1092  */
bcf_variant2string_sorted(bcf_hdr_t * h,bcf1_t * v,kstring_t * var)1093 void bcf_variant2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
1094 {
1095     bcf_unpack(v, BCF_UN_STR);
1096     var->l = 0;
1097     kputs(bcf_get_chrom(h, v), var);
1098     kputc(':', var);
1099     kputw(bcf_get_pos1(v), var);
1100     kputc(':', var);
1101 
1102     if (v->n_allele==2)
1103     {
1104         kputs(bcf_get_alt(v, 0), var);
1105         kputc(',', var);
1106         kputs(bcf_get_alt(v, 1), var);
1107     }
1108     else
1109     {
1110         char** allele = bcf_get_allele(v);
1111         char** temp = (char**) malloc((bcf_get_n_allele(v)-1)*sizeof(char*));
1112         for (size_t i=1; i<v->n_allele; ++i)
1113         {
1114             temp[i] = allele[i];
1115         }
1116         std::qsort(temp, bcf_get_n_allele(v), sizeof(char*), cmpstr);
1117         kputs(bcf_get_alt(v, 0), var);
1118         for (size_t i=0; i<v->n_allele-1; ++i)
1119         {
1120             kputc('/', var);
1121             kputs(temp[i], var);
1122         }
1123         free(temp);
1124     }
1125 }
1126 
1127 /**
1128  * Gets a string representation of the alleles of a variant.
1129  */
bcf_alleles2string(bcf_hdr_t * h,bcf1_t * v,kstring_t * var)1130 void bcf_alleles2string(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
1131 {
1132     bcf_unpack(v, BCF_UN_STR);
1133     var->l = 0;
1134 
1135     if (v->n_allele==2)
1136     {
1137         kputs(bcf_get_alt(v, 0), var);
1138         kputc(',', var);
1139         kputs(bcf_get_alt(v, 1), var);
1140     }
1141     else
1142     {
1143         char** allele = bcf_get_allele(v);
1144         for (int32_t i=0; i<v->n_allele; ++i)
1145         {
1146             if (i) kputc(',', var);
1147             kputs(allele[i], var);
1148         }
1149     }
1150 }
1151 
1152 /**
1153  * Gets a sorted string representation of the alleles of a variant.
1154  */
bcf_alleles2string_sorted(bcf_hdr_t * h,bcf1_t * v,kstring_t * var)1155 void bcf_alleles2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
1156 {
1157     bcf_unpack(v, BCF_UN_STR);
1158     var->l = 0;
1159 
1160     if (v->n_allele==2)
1161     {
1162         kputs(bcf_get_alt(v, 0), var);
1163         kputc(',', var);
1164         kputs(bcf_get_alt(v, 1), var);
1165     }
1166     else
1167     {
1168         char** allele = bcf_get_allele(v);
1169         char** temp = (char**) malloc((bcf_get_n_allele(v)-1)*sizeof(char*));
1170         for (int32_t i=1; i<v->n_allele; ++i)
1171         {
1172             temp[i-1] = allele[i];
1173         }
1174 
1175         std::qsort(temp, bcf_get_n_allele(v)-1, sizeof(char*), cmpstr);
1176 
1177         kputs(bcf_get_alt(v, 0), var);
1178         for (int32_t i=0; i<v->n_allele-1; ++i)
1179         {
1180             kputc(',', var);
1181             kputs(temp[i], var);
1182         }
1183 
1184         free(temp);
1185     }
1186 }
1187 
1188 /**
1189  * Get chromosome name
1190  */
bcf_get_chrom(bcf_hdr_t * h,bcf1_t * v)1191 const char* bcf_get_chrom(bcf_hdr_t *h, bcf1_t *v)
1192 {
1193     if (v->rid >= h->n[BCF_DT_CTG])
1194     {
1195         fprintf(stderr, "[E:%s:%d %s] rid '%d' does not have an associated contig defined in the header.  Try tabix workaround or just add the header.\n", __FILE__, __LINE__, __FUNCTION__, v->rid);
1196         exit(1);
1197     }
1198     else
1199     {
1200         return h->id[BCF_DT_CTG][v->rid].key;
1201     }
1202 }
1203 
1204 /**
1205  *Set chromosome name.
1206  */
bcf_set_chrom(bcf_hdr_t * h,bcf1_t * v,const char * chrom)1207 void bcf_set_chrom(bcf_hdr_t *h, bcf1_t *v, const char* chrom)
1208 {
1209     vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
1210     khint_t k = kh_get(vdict, d, chrom);
1211     if (k == kh_end(d))
1212     {
1213         fprintf(stderr, "[E:%s:%d %s] contig '%s' is not defined in the header\n", __FILE__, __LINE__, __FUNCTION__, chrom);
1214         kstring_t contig = {0,0,0};
1215         ksprintf(&contig, "##contig=<ID=%s,length=2147483647>", chrom);
1216         bcf_hdr_append(h, contig.s);
1217         if (contig.m) free(contig.s);
1218         k = kh_get(vdict, d, chrom);
1219     }
1220     v->rid = kh_val(d, k).id;
1221 };
1222 
1223 /**
1224  * Set id.
1225  */
bcf_set_id(bcf1_t * v,char * id)1226 void bcf_set_id(bcf1_t *v, char* id)
1227 {
1228     if (v->d.id)
1229     {
1230         free(v->d.id);
1231     }
1232 
1233     v->d.id = strdup(id);
1234 };
1235 
1236 /**
1237  * Gets an info flag.
1238  */
bcf_get_info_flg(bcf_hdr_t * h,bcf1_t * v,const char * tag)1239 bool bcf_get_info_flg(bcf_hdr_t *h, bcf1_t *v, const char* tag)
1240 {
1241     return (bcf_get_info_flag(h, v, tag, 0, 0) ? true : false);
1242 }
1243 
1244 /**
1245  * Sets an info flag.
1246  */
bcf_set_info_flg(bcf_hdr_t * h,bcf1_t * v,const char * tag,bool value)1247 void bcf_set_info_flg(bcf_hdr_t *h, bcf1_t *v, const char* tag, bool value)
1248 {
1249     bcf_update_info_flag(h, v, tag, "", (value ? 1 : 0));
1250 }
1251 
1252 /**
1253  * Gets an info integer.
1254  */
bcf_get_info_int(bcf_hdr_t * h,bcf1_t * v,const char * tag,int32_t default_value)1255 int32_t bcf_get_info_int(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_value)
1256 {
1257     int32_t i = 0;
1258     int32_t* temp_i = NULL;
1259     int32_t n = 0;
1260     if (bcf_get_info_int32(h, v, tag, &temp_i, &n)>0)
1261     {
1262         i = *temp_i;
1263         free(temp_i);
1264     }
1265     else
1266     {
1267         return default_value;
1268     }
1269 
1270     return i;
1271 }
1272 
1273 /**
1274  * Sets an info integer.
1275  */
bcf_set_info_int(bcf_hdr_t * h,bcf1_t * v,const char * tag,int32_t value)1276 void bcf_set_info_int(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t value)
1277 {
1278     bcf_update_info_int32(h, v, tag, &value, 1);
1279 }
1280 
1281 /**
1282  * Gets an info integer vector.
1283  */
bcf_get_info_int_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,int32_t default_size,int32_t default_value)1284 std::vector<int32_t> bcf_get_info_int_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_size, int32_t default_value)
1285 {
1286     std::vector<int32_t> i_vec;
1287     int32_t* temp_i = NULL;
1288     int32_t n = 0;
1289     if (bcf_get_info_int32(h, v, tag, &temp_i, &n)>0)
1290     {
1291         for (int32_t j=0; j<n; ++j) i_vec.push_back(temp_i[j]);
1292         free(temp_i);
1293     }
1294     else
1295     {
1296         i_vec.resize(default_size, default_value);
1297     }
1298 
1299     return i_vec;
1300 }
1301 
1302 /**
1303  * Sets an info integer vector.
1304  */
bcf_set_info_int_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::vector<int32_t> & values)1305 void bcf_set_info_int_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<int32_t>& values)
1306 {
1307     bcf_update_info_int32(h, v, tag, &values[0], values.size());
1308 }
1309 
1310 /**
1311  * Sets an info integer vector.
1312  */
bcf_set_info_int(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::vector<int32_t> & values)1313 void bcf_set_info_int(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<int32_t>& values)
1314 {
1315     bcf_update_info_int32(h, v, tag, &values[0], values.size());
1316 }
1317 
1318 /**
1319  * Gets an info float.
1320  */
bcf_get_info_flt(bcf_hdr_t * h,bcf1_t * v,const char * tag,float default_value)1321 float bcf_get_info_flt(bcf_hdr_t *h, bcf1_t *v, const char* tag, float default_value)
1322 {
1323     float f = 0;
1324     float* temp_f = NULL;
1325     int32_t n = 0;
1326     if (bcf_get_info_float(h, v, tag, &temp_f, &n)>0)
1327     {
1328         f = *temp_f;
1329         free(temp_f);
1330     }
1331     else
1332     {
1333         return default_value;
1334     }
1335 
1336     return f;
1337 }
1338 
1339 /**
1340  * Sets an info float.
1341  */
bcf_set_info_flt(bcf_hdr_t * h,bcf1_t * v,const char * tag,float value)1342 void bcf_set_info_flt(bcf_hdr_t *h, bcf1_t *v, const char* tag, float value)
1343 {
1344     bcf_update_info_float(h, v, tag, &value, 1);
1345 }
1346 
1347 /**
1348  * Gets an info integer vector.
1349  */
bcf_get_info_flt_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,int32_t default_size,float default_value)1350 std::vector<float> bcf_get_info_flt_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_size, float default_value)
1351 {
1352     std::vector<float> vec;
1353 
1354     return vec;
1355 }
1356 
1357 /**
1358  * Sets an info float vector.
1359  */
bcf_set_info_flt_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::vector<float> & values)1360 void bcf_set_info_flt_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<float>& values)
1361 {
1362 }
1363 
1364 /**
1365  * Gets an info string.
1366  */
bcf_get_info_str(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::string default_value)1367 std::string bcf_get_info_str(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value)
1368 {
1369     std::string str = "";
1370     char* s = NULL;
1371     int32_t n = 0;
1372     if (bcf_get_info_string(h, v, tag, &s, &n)>0)
1373     {
1374         str.assign(s);
1375         free(s);
1376     }
1377     else
1378     {
1379         return default_value;
1380     }
1381 
1382     return str;
1383 }
1384 
1385 /**
1386  * Sets an info string.
1387  */
bcf_set_info_str(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::string default_value)1388 void bcf_set_info_str(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value)
1389 {
1390 }
1391 
1392 /**
1393  * Gets an info string vector.
1394  */
bcf_get_info_str_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::string default_value)1395 std::vector<std::string> bcf_get_info_str_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value)
1396 {
1397     std::vector<std::string> vec;
1398     std::string str = "";
1399     char* s = NULL;
1400     int32_t n = 0;
1401     int32_t ret = 0;
1402     if ((ret=bcf_get_info_string(h, v, tag, &s, &n))>0)
1403     {
1404         str.assign(s);
1405         free(s);
1406         vec.push_back(str);
1407     }
1408     else
1409     {
1410         vec.push_back(default_value);
1411     }
1412 
1413     return vec;
1414 }
1415 /**
1416  * Sets an info string vector.
1417  */
bcf_set_info_str_vec(bcf_hdr_t * h,bcf1_t * v,const char * tag,std::vector<std::string> values)1418 void bcf_set_info_str_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<std::string> values)
1419 {
1420 }
1421 
1422 /**
1423  * Creates a dummy header with hs37d5 contigs for testing purposes.
1424  */
bcf_create_dummy_hdr()1425 bcf_hdr_t* bcf_create_dummy_hdr()
1426 {
1427     bcf_hdr_t* h = bcf_hdr_init("r");
1428     bcf_hdr_append(h, "##fileformat=VCFv4.1");
1429     bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1430     bcf_hdr_append(h, "##contig=<ID=1,assembly=b37,length=249250621>");
1431     bcf_hdr_append(h, "##contig=<ID=2,assembly=b37,length=243199373>");
1432     bcf_hdr_append(h, "##contig=<ID=3,assembly=b37,length=198022430>");
1433     bcf_hdr_append(h, "##contig=<ID=4,assembly=b37,length=191154276>");
1434     bcf_hdr_append(h, "##contig=<ID=5,assembly=b37,length=180915260>");
1435     bcf_hdr_append(h, "##contig=<ID=6,assembly=b37,length=171115067>");
1436     bcf_hdr_append(h, "##contig=<ID=7,assembly=b37,length=159138663>");
1437     bcf_hdr_append(h, "##contig=<ID=8,assembly=b37,length=146364022>");
1438     bcf_hdr_append(h, "##contig=<ID=9,assembly=b37,length=141213431>");
1439     bcf_hdr_append(h, "##contig=<ID=10,assembly=b37,length=135534747>");
1440     bcf_hdr_append(h, "##contig=<ID=11,assembly=b37,length=135006516>");
1441     bcf_hdr_append(h, "##contig=<ID=12,assembly=b37,length=133851895>");
1442     bcf_hdr_append(h, "##contig=<ID=13,assembly=b37,length=115169878>");
1443     bcf_hdr_append(h, "##contig=<ID=14,assembly=b37,length=107349540>");
1444     bcf_hdr_append(h, "##contig=<ID=15,assembly=b37,length=102531392>");
1445     bcf_hdr_append(h, "##contig=<ID=16,assembly=b37,length=90354753>");
1446     bcf_hdr_append(h, "##contig=<ID=17,assembly=b37,length=81195210>");
1447     bcf_hdr_append(h, "##contig=<ID=18,assembly=b37,length=78077248>");
1448     bcf_hdr_append(h, "##contig=<ID=19,assembly=b37,length=59128983>");
1449     bcf_hdr_append(h, "##contig=<ID=20,assembly=b37,length=63025520>");
1450     bcf_hdr_append(h, "##contig=<ID=21,assembly=b37,length=48129895>");
1451     bcf_hdr_append(h, "##contig=<ID=22,assembly=b37,length=51304566>");
1452     bcf_hdr_append(h, "##contig=<ID=X,assembly=b37,length=155270560>");
1453     bcf_hdr_append(h, "##contig=<ID=Y,assembly=b37,length=59373566>");
1454     bcf_hdr_append(h, "##contig=<ID=MT,assembly=b37,length=16569>");
1455 
1456     return h;
1457 }
1458 
1459 /**
1460  * Creates a dummy bcf record representing the variant for testing purposes.
1461  *
1462  * @variant - 1:123:ACT:AC/ACCCC
1463  */
bcf_create_dummy_record(bcf_hdr_t * h,std::string & variant)1464 bcf1_t* bcf_create_dummy_record(bcf_hdr_t* h, std::string& variant)
1465 {
1466     bcf1_t* v = bcf_init1();
1467 
1468     std::vector<std::string> var;
1469     split(var, ":", variant);
1470 
1471     bcf_set_rid(v, bcf_hdr_name2id(h, var[0].c_str()));
1472     bcf_set_pos1(v, str2int32(var[1]));
1473 
1474     std::vector<std::string> alleles;
1475     split(alleles, ":", var[3]);
1476 
1477     std::string new_alleles = var[2];
1478 
1479     for (uint32_t i=0; i<alleles.size(); ++i)
1480     {
1481         new_alleles.append(1, ',');
1482         new_alleles.append(alleles[i]);
1483     }
1484 
1485     bcf_update_alleles_str(h, v, new_alleles.c_str());
1486 
1487     return v;
1488 }
1489 
1490 /**
1491  * Prints a message to STDERR and abort.
1492  */
error(const char * msg,...)1493 void error(const char * msg, ...)
1494 {
1495     va_list ap;
1496     va_start(ap, msg);
1497 
1498     fprintf(stderr, "\nFATAL ERROR - \n");
1499     vfprintf(stderr, msg, ap);
1500     fprintf(stderr, "\n\n");
1501 
1502     va_end(ap);
1503 
1504     abort();
1505     //throw pexception;
1506     //exit(EXIT_FAILURE);
1507 }
1508 
1509 /**
1510  * Prints a message to STDERR.
1511  */
notice(const char * msg,...)1512 void notice(const char * msg, ...)
1513 {
1514     va_list ap;
1515     va_start(ap, msg);
1516 
1517     time_t current_time;
1518     char buff[255];
1519     current_time = time(NULL);
1520 
1521     strftime(buff, 120, "%Y/%m/%d %H:%M:%S", localtime(&current_time));
1522 
1523     fprintf(stderr,"NOTICE [%s] - ", buff);
1524     vfprintf(stderr, msg, ap);
1525     fprintf(stderr,"\n");
1526 
1527     va_end(ap);
1528 }