1 /*  vcfannotate.c -- Annotate and edit VCF/BCF files.
2 
3     Copyright (C) 2013-2021 Genome Research Ltd.
4 
5     Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.  */
24 
25 #include <stdio.h>
26 #include <strings.h>
27 #include <unistd.h>
28 #include <getopt.h>
29 #include <assert.h>
30 #include <ctype.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <sys/stat.h>
34 #include <sys/types.h>
35 #include <dirent.h>
36 #include <math.h>
37 #include <inttypes.h>
38 #include <htslib/vcf.h>
39 #include <htslib/synced_bcf_reader.h>
40 #include <htslib/kseq.h>
41 #include <htslib/khash_str2int.h>
42 #include "bcftools.h"
43 #include "vcmp.h"
44 #include "filter.h"
45 #include "convert.h"
46 #include "smpl_ilist.h"
47 #include "regidx.h"
48 
49 struct _args_t;
50 
51 typedef struct _rm_tag_t
52 {
53     char *key;
54     int hdr_id;
55     void (*handler)(struct _args_t *, bcf1_t *, struct _rm_tag_t *);
56 }
57 rm_tag_t;
58 
59 typedef struct
60 {
61     char **cols;
62     int ncols, mcols;
63     char **als;
64     int nals, mals;
65     kstring_t line;
66     int rid, start, end;
67 }
68 annot_line_t;
69 
70 #define REPLACE_MISSING     (1<<0)   // -c +TAG  .. replace only missing values
71 #define REPLACE_ALL         (1<<1)   // -c TAG   .. replace both missing and existing values
72 #define REPLACE_NON_MISSING (1<<2)   // -c -TAG  .. replace only if tgt is not missing
73 #define SET_OR_APPEND       (1<<3)   // -c =TAG  .. set new value if missing or non-existent, append otherwise
74 #define MATCH_VALUE         (1<<4)   // -c ~ID   .. do not set, just match the value
75 #define CARRY_OVER_MISSING  (1<<5)   // -c .TAG  .. carry over source missing values as well
76 #define MM_FIRST   0    // if multiple annotation lines overlap a VCF record, use the first, discarding the rest
77 #define MM_APPEND  1    // append, possibly multiple times
78 #define MM_UNIQUE  2    // append, only unique values
79 #define MM_SUM     3
80 #define MM_AVG     4
81 #define MM_MIN     5
82 #define MM_MAX     6
83 #define MM_APPEND_MISSING 7     // missing values will be transferred as well
84 typedef struct _annot_col_t
85 {
86     int icol, replace, number;  // number: one of BCF_VL_* types
87     char *hdr_key_src, *hdr_key_dst;
88     // The setters return 0 on successful update of the bcf record, negative value (bcf_update_* return status) on errors,
89     // or 1 on (repeated partial updates) concluded with a src=NULL call
90     int (*setter)(struct _args_t *, bcf1_t *dst, struct _annot_col_t *, void *src); // the last is the annotation line, either src bcf1_t or annot_line_t
91     int (*getter)(struct _args_t *, bcf1_t *src, struct _annot_col_t *, void **ptr, int *mptr);
92     int merge_method;               // one of the MM_* defines
93     khash_t(str2int) *mm_str_hash;  // lookup table to ensure uniqueness of added string values
94     kstring_t mm_kstr;
95     size_t
96         mm_dbl_nalloc,  // the allocated size --merge-logic values array
97         mm_dbl_nused,   // the number of used elements in the mm_dbl array
98         mm_dbl_ndat;    // the number of merged rows (for calculating the average)
99     double
100         *mm_dbl;
101     void *ptr;
102     int mptr, done;
103 }
104 annot_col_t;
105 
106 // Logic of the filters: include or exclude sites which match the filters?
107 #define FLT_INCLUDE 1
108 #define FLT_EXCLUDE 2
109 
110 #define MARK_LISTED   1
111 #define MARK_UNLISTED 2
112 
113 typedef struct _args_t
114 {
115     bcf_srs_t *files;
116     bcf_hdr_t *hdr, *hdr_out, *tgts_hdr;
117     htsFile *out_fh;
118     int output_type, n_threads, clevel;
119     bcf_sr_regions_t *tgts;
120 
121     regidx_t *tgt_idx;  // keep everything in memory only with .tab annotation file and -c BEG,END columns
122     regitr_t *tgt_itr;
123     int tgt_is_bed;
124 
125     filter_t *filter;
126     char *filter_str;
127     int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
128     int keep_sites;
129 
130     rm_tag_t *rm;           // tags scheduled for removal
131     int nrm;
132     int flt_keep_pass;      // when all filters removed, reset to PASS
133 
134     vcmp_t *vcmp;           // for matching annotation and VCF lines by allele
135     annot_line_t *alines;   // buffered annotation lines
136     annot_line_t *aline_missing;
137     uint32_t *srt_alines;   // sorted indexes (iALT<<16 || iAline)
138     int nalines, malines, nsrt_alines, msrt_alines;
139     int ref_idx, alt_idx, chr_idx, beg_idx, end_idx;   // -1 if not present
140     annot_col_t *cols;      // column indexes and setters
141     int ncols;
142     int match_id;           // set iff `-c ~ID` given
143     int match_end;          // set iff `-c ~INFO/END` is given
144 
145     char *set_ids_fmt;
146     convert_t *set_ids;
147     int set_ids_replace;
148 
149     int nsmpl_annot;
150     int *sample_map, nsample_map, sample_is_file;   // map[idst] -> isrc
151     uint8_t *src_smpl_pld, *dst_smpl_pld;   // for Number=G format fields
152     int mtmpi, mtmpf, mtmps;
153     int mtmpi2, mtmpf2, mtmps2;
154     int mtmpi3, mtmpf3, mtmps3;
155     int32_t *tmpi, *tmpi2, *tmpi3;
156     float *tmpf, *tmpf2, *tmpf3;
157     char *tmps, *tmps2, **tmpp, **tmpp2;
158     kstring_t tmpks;
159 
160     char **argv, *output_fname, *targets_fname, *regions_list, *header_fname;
161     char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites;
162     kstring_t merge_method_str;
163     int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps;
164     int columns_is_file, has_append_mode;
165 }
166 args_t;
167 
168 char *msprintf(const char *fmt, ...);
169 
parse_with_payload(const char * line,char ** chr_beg,char ** chr_end,uint32_t * beg,uint32_t * end,void * payload,void * usr)170 int parse_with_payload(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
171 {
172     args_t *args = (args_t*) usr;
173     int ret = args->tgt_is_bed ? regidx_parse_bed(line, chr_beg, chr_end, beg, end, NULL, NULL) : regidx_parse_tab(line, chr_beg, chr_end, beg, end, NULL, NULL);
174     if ( ret<0 ) return ret;
175     *((char **)payload) = strdup(line);
176     return 0;
177 }
free_payload(void * payload)178 void free_payload(void *payload)
179 {
180     char *str = *((char**)payload);
181     free(str);
182 }
183 
remove_id(args_t * args,bcf1_t * line,rm_tag_t * tag)184 void remove_id(args_t *args, bcf1_t *line, rm_tag_t *tag)
185 {
186     bcf_update_id(args->hdr,line,NULL);
187 }
remove_filter(args_t * args,bcf1_t * line,rm_tag_t * tag)188 void remove_filter(args_t *args, bcf1_t *line, rm_tag_t *tag)
189 {
190     if ( tag->key && tag->hdr_id<0 )
191     {
192         error("Error: Cannot proceed, not even with the --force option, bad things could happen.\n"
193               "       Note that \"bcftools annotate -x FILTER\" can be used to remove ALL filters.\n"
194               "       Even better, use \"bcftools view -h\" and \"bcftools reheader\" to fix the header!\n"
195               );
196     }
197     if ( !tag->key ) bcf_update_filter(args->hdr, line, NULL, args->flt_keep_pass);
198     else bcf_remove_filter(args->hdr, line, tag->hdr_id, args->flt_keep_pass);
199 }
remove_qual(args_t * args,bcf1_t * line,rm_tag_t * tag)200 void remove_qual(args_t *args, bcf1_t *line, rm_tag_t *tag)
201 {
202     bcf_float_set_missing(line->qual);
203 }
remove_info(args_t * args,bcf1_t * line,rm_tag_t * tag)204 void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag)
205 {
206     // remove all INFO fields
207     if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
208 
209     int i;
210     for (i=0; i<line->n_info; i++)
211     {
212         bcf_info_t *inf = &line->d.info[i];
213         if (  !strcmp("END",bcf_hdr_int2id(args->hdr,BCF_DT_ID,inf->key)) )
214             line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
215         if ( inf->vptr_free )
216         {
217             free(inf->vptr - inf->vptr_off);
218             inf->vptr_free = 0;
219         }
220         line->d.shared_dirty |= BCF1_DIRTY_INF;
221         inf->vptr = NULL;
222         inf->vptr_off = inf->vptr_len = 0;
223     }
224 }
remove_info_tag(args_t * args,bcf1_t * line,rm_tag_t * tag)225 void remove_info_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
226 {
227     bcf_update_info(args->hdr, line, tag->key, NULL, 0, BCF_HT_INT);  // the type does not matter with n=0
228 }
remove_format_tag(args_t * args,bcf1_t * line,rm_tag_t * tag)229 void remove_format_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
230 {
231     bcf_update_format(args->hdr, line, tag->key, NULL, 0, BCF_HT_INT);  // the type does not matter with n=0
232 }
remove_format(args_t * args,bcf1_t * line,rm_tag_t * tag)233 void remove_format(args_t *args, bcf1_t *line, rm_tag_t *tag)
234 {
235     // remove all FORMAT fields except GT
236     if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
237 
238     int i;
239     for (i=0; i<line->n_fmt; i++)
240     {
241         bcf_fmt_t *fmt = &line->d.fmt[i];
242         const char *key = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id);
243         if ( key[0]=='G' && key[1]=='T' && !key[2] ) continue;
244 
245         if ( fmt->p_free )
246         {
247             free(fmt->p - fmt->p_off);
248             fmt->p_free = 0;
249         }
250         line->d.indiv_dirty = 1;
251         fmt->p = NULL;
252     }
253 }
254 
255 #include "htslib/khash.h"
256 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
257 typedef khash_t(vdict) vdict_t;
258 
remove_hdr_lines(bcf_hdr_t * hdr,int type)259 static void remove_hdr_lines(bcf_hdr_t *hdr, int type)
260 {
261     int i = 0, nrm = 0;
262     while ( i<hdr->nhrec )
263     {
264         if ( hdr->hrec[i]->type!=type ) { i++; continue; }
265         bcf_hrec_t *hrec = hdr->hrec[i];
266         if ( type==BCF_HL_FMT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
267         {
268             // everything except FORMAT/GT
269             int id = bcf_hrec_find_key(hrec, "ID");
270             if ( id>=0 )
271             {
272                 if ( type==BCF_HL_FMT && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
273                 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
274                 khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[id]);
275                 kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
276                 kh_val(d, k).info[type] |= 0xf;
277             }
278         }
279         nrm++;
280         hdr->nhrec--;
281         if ( i < hdr->nhrec )
282             memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
283         bcf_hrec_destroy(hrec);
284     }
285     if ( nrm ) {
286         if (bcf_hdr_sync(hdr) < 0)
287             error_errno("[%s] Failed to update header", __func__);
288     }
289 }
290 
init_remove_annots(args_t * args)291 static void init_remove_annots(args_t *args)
292 {
293     int keep_info = 0, keep_fmt = 0, keep_flt = 0;
294     void *keep = khash_str2int_init();
295     kstring_t str = {0,0,0};
296     char *ss = args->remove_annots;
297 
298     int i, ntags, needs_info = 0;
299     if ( args->set_ids )
300     {
301         const char **tags = convert_list_used_tags(args->set_ids,&ntags);
302         for (i=0; i<ntags; i++)
303             if ( !strncmp("INFO/",tags[i],4) ) needs_info = 1;
304     }
305 
306     while ( *ss )
307     {
308         args->nrm++;
309         args->rm = (rm_tag_t*) realloc(args->rm,sizeof(rm_tag_t)*args->nrm);
310         rm_tag_t *tag = &args->rm[args->nrm-1];
311         tag->key = NULL;
312 
313         int type = BCF_HL_GEN;
314         if ( !strncasecmp("INFO/",ss,5) ) { type = BCF_HL_INFO; ss += 5; }
315         else if ( !strncasecmp("INF/",ss,4) ) { type = BCF_HL_INFO; ss += 4; }
316         else if ( !strncasecmp("FORMAT/",ss,7) ) { type = BCF_HL_FMT; ss += 7; }
317         else if ( !strncasecmp("FMT/",ss,4) ) { type = BCF_HL_FMT; ss += 4; }
318         else if ( !strncasecmp("FILTER/",ss,7) ) { type = BCF_HL_FLT; ss += 7; }
319         else if ( !strncasecmp("^INFO/",ss,6) ) { type = BCF_HL_INFO; ss += 6; keep_info = 1; }
320         else if ( !strncasecmp("^INF/",ss,5) ) { type = BCF_HL_INFO; ss += 5; keep_info = 1; }
321         else if ( !strncasecmp("^FORMAT/",ss,8) ) { type = BCF_HL_FMT; ss += 8; keep_fmt = 1; }
322         else if ( !strncasecmp("^FMT/",ss,5) ) { type = BCF_HL_FMT; ss += 5; keep_fmt = 1; }
323         else if ( !strncasecmp("^FILTER/",ss,8) ) { type = BCF_HL_FLT; ss += 8; keep_flt = 1; }
324 
325         char *se = ss;
326         while ( *se && *se!=',' ) se++;
327         str.l = 0;
328         kputsn(ss, se-ss, &str);
329 
330         if ( type==BCF_HL_FLT )
331         {
332             if ( !keep_flt )
333             {
334                 args->flt_keep_pass = 1;
335                 tag->handler = remove_filter;
336                 tag->key = strdup(str.s);
337                 tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key);
338                 if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) )
339                 {
340                     if ( args->keep_sites )
341                         error("Error: The filter \"%s\" is not defined in the header, cannot use the -k option\n", str.s);
342                     else
343                         fprintf(stderr,"Warning: The filter \"%s\" is not defined in the header\n", str.s);
344                 }
345                 else if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
346             }
347             else
348             {
349                 int value, ret = khash_str2int_get(keep, str.s, &value);
350                 if ( ret==-1 ) khash_str2int_set(keep, strdup(str.s),1<<BCF_HL_FLT);
351                 else khash_str2int_set(keep, str.s, value | 1<<BCF_HL_FLT);
352                 args->nrm--;
353             }
354         }
355         else if ( type!=BCF_HL_GEN )
356         {
357             int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,str.s);
358             if ( !bcf_hdr_idinfo_exists(args->hdr,type,id) )
359             {
360                 if ( args->keep_sites )
361                     error("Error: The tag \"%s\" is not defined in the header, cannot use the -k option\n", str.s);
362                 else
363                     fprintf(stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s);
364 
365                 tag->key = strdup(str.s);
366                 if ( type==BCF_HL_INFO )
367                 {
368                     tag->handler = remove_info_tag;
369                     if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot combine with `--set-id %s`\n",tag->key,args->set_ids_fmt);
370                 }
371                 else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
372             }
373             else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) )
374             {
375                 int value, ret = khash_str2int_get(keep, str.s, &value);
376                 if ( ret==-1 ) khash_str2int_set(keep, strdup(str.s),1<<type);
377                 else khash_str2int_set(keep, str.s, value | 1<<type);
378                 args->nrm--;
379             }
380             else
381             {
382                 tag->key = strdup(str.s);
383                 if ( type==BCF_HL_INFO )
384                 {
385                     tag->handler = remove_info_tag;
386                     if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot combine with `--set-id %s`\n",tag->key,args->set_ids_fmt);
387                 }
388                 else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
389                 if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key);
390             }
391         }
392         else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id;
393         else if ( !strcasecmp("FILTER",str.s) )
394         {
395             tag->handler = remove_filter;
396             if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
397         }
398         else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
399         else if ( !strcasecmp("INFO",str.s) )
400         {
401             if ( needs_info ) error("Error: `--remove INFO` is executed first, cannot combine with `--set-id %s`\n",args->set_ids_fmt);
402             tag->handler = remove_info;
403             if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
404         }
405         else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) )
406         {
407             tag->handler = remove_format;
408             if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
409         }
410         else if ( str.l )
411         {
412             int id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, str.s);
413             if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,id) ) error("Error: did you mean INFO/%s?\n",str.s);
414             if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) error("Error: did you mean FORMAT/%s?\n",str.s);
415 
416             if ( !args->keep_sites )
417             {
418                 if ( str.s[0]=='#' && str.s[1]=='#' )
419                     bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
420                 else
421                     bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
422             }
423             args->nrm--;
424         }
425 
426         ss = *se ? se+1 : se;
427     }
428     free(str.s);
429     if ( keep_flt || keep_info || keep_fmt )
430     {
431         int j;
432         for (j=0; j<args->hdr->nhrec; j++)
433         {
434             bcf_hrec_t *hrec = args->hdr->hrec[j];
435             if ( hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FMT ) continue;
436             if ( !keep_flt && hrec->type==BCF_HL_FLT ) continue;
437             if ( !keep_info && hrec->type==BCF_HL_INFO ) continue;
438             if ( !keep_fmt && hrec->type==BCF_HL_FMT ) continue;
439             int k = bcf_hrec_find_key(hrec,"ID");
440             assert( k>=0 ); // this should always be true for valid VCFs
441             int value, ret = khash_str2int_get(keep,hrec->vals[k],&value);
442             if ( ret==0 && value>>hrec->type ) // keep
443             {
444                 if ( hrec->type==BCF_HL_FLT && !strcmp("PASS",hrec->vals[k]) ) args->flt_keep_pass = 1;
445                 continue;
446             }
447             args->nrm++;
448             args->rm = (rm_tag_t*) realloc(args->rm,sizeof(rm_tag_t)*args->nrm);
449             rm_tag_t *tag = &args->rm[args->nrm-1];
450             if ( hrec->type==BCF_HL_INFO ) tag->handler = remove_info_tag;
451             else if ( hrec->type==BCF_HL_FMT ) tag->handler = remove_format_tag;
452             else
453             {
454                 tag->handler = remove_filter;
455                 tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]);
456             }
457             tag->key = strdup(hrec->vals[k]);
458             if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
459         }
460     }
461     khash_str2int_destroy_free(keep);
462     if ( !args->nrm ) error("No matching tag in -x %s\n", args->remove_annots);
463     if (bcf_hdr_sync(args->hdr_out) < 0)
464         error_errno("[%s] Failed to update header", __func__);
465 }
init_header_lines(args_t * args)466 static void init_header_lines(args_t *args)
467 {
468     htsFile *file = hts_open(args->header_fname, "rb");
469     if ( !file ) error("Error reading %s\n", args->header_fname);
470     kstring_t str = {0,0,0};
471     while ( hts_getline(file, KS_SEP_LINE, &str) > 0 )
472     {
473         if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s);
474         bcf_hdr_append(args->hdr,str.s);    // the input file may not have the header line if run with -h (and nothing else)
475     }
476     if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->header_fname);
477     free(str.s);
478     if (bcf_hdr_sync(args->hdr_out) < 0)
479         error_errno("[%s] Failed to update output header", __func__);
480     if (bcf_hdr_sync(args->hdr) < 0)
481         error_errno("[%s] Failed to update input header", __func__);
482 }
vcf_getter_info_str2str(args_t * args,bcf1_t * rec,annot_col_t * col,void ** ptr,int * mptr)483 static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
484 {
485     return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr);
486 }
vcf_getter_id2str(args_t * args,bcf1_t * rec,annot_col_t * col,void ** ptr,int * mptr)487 static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
488 {
489     char *str = *((char**)ptr);
490     int len = strlen(rec->d.id);
491     if ( len >= *mptr ) str = realloc(str, len+1);
492     strcpy(str, rec->d.id);
493     *((char**)ptr) = str;
494     *mptr = len+1;
495     return len;
496 }
vcf_getter_filter2str(args_t * args,bcf1_t * rec,annot_col_t * col,void ** ptr,int * mptr)497 static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
498 {
499     kstring_t str;
500     str.s = *((char**)ptr);
501     str.m = *mptr;
502     str.l = 0;
503 
504     int i;
505     if ( rec->d.n_flt )
506     {
507         for (i=0; i<rec->d.n_flt; i++)
508         {
509             if (i) kputc(';', &str);
510             kputs(bcf_hdr_int2id(args->tgts_hdr,BCF_DT_ID,rec->d.flt[i]), &str);
511         }
512     }
513     else kputc('.', &str);
514 
515     *((char**)ptr) = str.s;
516     *mptr = str.m;
517     return str.l;
518 }
setter_filter(args_t * args,bcf1_t * line,annot_col_t * col,void * data)519 static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
520 {
521     if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n");
522 
523     // note: so far this works only with one filter, not a list of filters
524     annot_line_t *tab = (annot_line_t*) data;
525     if ( tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) // don't overwrite with a missing value unless asked
526     {
527         if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0);
528         return 0;
529     }
530     hts_expand(int,1,args->mtmpi,args->tmpi);
531     args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]);
532     if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]);
533     if ( col->replace & SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]);
534     if ( !(col->replace & REPLACE_MISSING) )
535     {
536         bcf_update_filter(args->hdr_out,line,NULL,0);
537         return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
538     }
539 
540     // only update missing FILTER
541     if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
542     if ( !line->d.n_flt )
543         return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
544 
545     return 0;
546 }
vcf_setter_filter(args_t * args,bcf1_t * line,annot_col_t * col,void * data)547 static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
548 {
549     int i, ret = 0;
550     bcf1_t *rec = (bcf1_t*) data;
551     if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
552     if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
553     if ( !rec->d.n_flt ) // don't overwrite with a missing value unless asked
554     {
555         if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0);
556         return 0;
557     }
558     if ( col->replace & (SET_OR_APPEND|REPLACE_MISSING) )
559     {
560         if ( (col->replace & REPLACE_MISSING) && line->d.n_flt ) return 0; // only update missing FILTER
561         for (i=0; i<rec->d.n_flt; i++)
562         {
563             const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
564             if ( bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)) < 0 ) ret = -1;
565         }
566         return ret;
567     }
568     hts_expand(int,rec->d.n_flt,args->mtmpi,args->tmpi);
569     for (i=0; i<rec->d.n_flt; i++)
570     {
571         const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
572         args->tmpi[i] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt);
573     }
574     bcf_update_filter(args->hdr_out,line,NULL,0);
575     return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
576 }
setter_pos(args_t * args,bcf1_t * line,annot_col_t * col,void * data)577 static int setter_pos(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
578 {
579     annot_line_t *tab = (annot_line_t*) data;
580     if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
581     char *tmp;
582     int pos = strtol(tab->cols[col->icol], &tmp, 10);
583     if ( tmp==tab->cols[col->icol] )
584         error("Could not parse ~POS at %s:%"PRId64" .. [%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,tab->cols[col->icol]);
585     line->pos = pos - 1;
586     return 0;
587 }
setter_id(args_t * args,bcf1_t * line,annot_col_t * col,void * data)588 static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
589 {
590     if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n");
591     if ( col->replace & MATCH_VALUE ) return 0;
592 
593     // possible cases:
594     //      IN  ANNOT   OUT     ACHIEVED_BY
595     //      x   y       x        -c +ID
596     //      x   y       y        -c ID
597     //      x   y       x,y      -c =ID
598     //      x   .       x        -c +ID, ID
599     //      x   .       .        -x ID
600     //      .   y       y        -c +ID, -c ID
601     //
602     annot_line_t *tab = (annot_line_t*) data;
603     if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
604     if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]);
605     if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
606 
607     // running with +ID, only update missing ids
608     if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
609         return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
610     return 0;
611 }
vcf_setter_id(args_t * args,bcf1_t * line,annot_col_t * col,void * data)612 static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
613 {
614     if ( col->replace & MATCH_VALUE ) return 0;
615 
616     bcf1_t *rec = (bcf1_t*) data;
617 
618     char *id;
619     if ( col->getter )
620     {
621         int nret = col->getter(args,rec,col,&col->ptr,&col->mptr);
622         id = (char*) col->ptr;
623         if ( nret<=0 || (nret==1 && *id=='.') ) return 0;   // don't replace with "."
624     }
625     else
626     {
627         if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0;    // don't replace with "."
628         id = rec->d.id;
629     }
630     if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id);
631     if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,id);
632 
633     // running with +ID, only update missing ids
634     if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
635         return bcf_update_id(args->hdr_out,line,id);
636     return 0;
637 }
vcf_setter_ref(args_t * args,bcf1_t * line,annot_col_t * col,void * data)638 static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
639 {
640     bcf1_t *rec = (bcf1_t*) data;
641     if ( !strcmp(rec->d.allele[0],line->d.allele[0]) ) return 0;    // no update necessary
642     const char **als = (const char**) malloc(sizeof(char*)*line->n_allele);
643     als[0] = rec->d.allele[0];
644     int i;
645     for (i=1; i<line->n_allele; i++) als[i] = line->d.allele[i];
646     int ret = bcf_update_alleles(args->hdr_out, line, als, line->n_allele);
647     free(als);
648     return ret;
649 }
vcf_setter_alt(args_t * args,bcf1_t * line,annot_col_t * col,void * data)650 static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
651 {
652     bcf1_t *rec = (bcf1_t*) data;
653     int i;
654     if ( rec->n_allele==line->n_allele )
655     {
656         for (i=1; i<rec->n_allele; i++) if ( strcmp(rec->d.allele[i],line->d.allele[i]) ) break;
657         if ( i==rec->n_allele ) return 0;   // no update necessary
658     }
659     const char **als = (const char**) malloc(sizeof(char*)*rec->n_allele);
660     als[0] = line->d.allele[0];
661     for (i=1; i<rec->n_allele; i++) als[i] = rec->d.allele[i];
662     int ret = bcf_update_alleles(args->hdr_out, line, als, rec->n_allele);
663     free(als);
664     return ret;
665 }
setter_qual(args_t * args,bcf1_t * line,annot_col_t * col,void * data)666 static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
667 {
668     if ( !data ) error("Error: the --merge-logic option cannot be used with QUAL (yet?)\n");
669 
670     annot_line_t *tab = (annot_line_t*) data;
671     char *str = tab->cols[col->icol];
672     if ( str[0]=='.' && str[1]==0 ) // don't overwrite with a missing value unless asked
673     {
674         if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual);
675         return 0;
676     }
677     if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0;
678 
679     line->qual = strtod(str, &str);
680     if ( str == tab->cols[col->icol] )
681         error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
682     return 0;
683 }
vcf_setter_qual(args_t * args,bcf1_t * line,annot_col_t * col,void * data)684 static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
685 {
686     bcf1_t *rec = (bcf1_t*) data;
687     if ( bcf_float_is_missing(rec->qual) )  // don't overwrite with a missing value unless asked
688     {
689         if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual);
690         return 0;
691     }
692     if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0;
693     line->qual = rec->qual;
694     return 0;
695 }
setter_info_flag(args_t * args,bcf1_t * line,annot_col_t * col,void * data)696 static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
697 {
698     if ( !data ) error("Error: the --merge-logic option cannot be used with INFO type=Flag (yet?)\n");
699 
700     annot_line_t *tab = (annot_line_t*) data;
701     char *str = tab->cols[col->icol];
702     if ( str[0]=='.' && str[1]==0 ) // don't overwrite with a missing value unless asked
703     {
704         if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
705         return 0;
706     }
707 
708     if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1);
709     if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
710     error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
711     return -1;
712 }
vcf_setter_info_flag(args_t * args,bcf1_t * line,annot_col_t * col,void * data)713 static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
714 {
715     bcf1_t *rec = (bcf1_t*) data;
716     int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key_src,NULL,NULL);
717     bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,flag);
718     return 0;
719 }
setter_ARinfo_int32(args_t * args,bcf1_t * line,annot_col_t * col,int nals,char ** als,int ntmpi)720 static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi)
721 {
722     if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
723         error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
724     else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
725         error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
726 
727     int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
728     int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
729     if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
730 
731     // fill in any missing values in the target VCF (or all, if not present)
732     int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
733     if ( ntmpi2 < ndst ) hts_expand(int32_t,ndst,args->mtmpi2,args->tmpi2);
734 
735     int i;
736     for (i=0; i<ndst; i++)
737     {
738         if ( map[i]<0 )
739         {
740             if ( ntmpi2 < ndst ) args->tmpi2[i] = bcf_int32_missing;
741             continue;
742         }
743         if ( ntmpi2==ndst && (col->replace & REPLACE_MISSING)
744                 && args->tmpi2[i]!=bcf_int32_missing
745                 && args->tmpi2[i]!=bcf_int32_vector_end ) continue;
746 
747         args->tmpi2[i] = args->tmpi[ map[i] ];
748     }
749     return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
750 }
setter_info_int(args_t * args,bcf1_t * line,annot_col_t * col,void * data)751 static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
752 {
753     annot_line_t *tab = (annot_line_t*) data;
754 
755     // This is a bit hacky, only to reuse existing code with minimal changes:
756     //      -c =TAG will now behave as -l TAG:APPEND for integers
757     if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND;
758 
759     if ( !tab )
760     {
761         if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
762              col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
763              col->merge_method!=MM_APPEND &&
764              col->merge_method!=MM_APPEND_MISSING )
765             error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n");
766     }
767 
768     int i,ntmpi = 0;
769     if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused )
770     {
771         ntmpi = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi, &args->mtmpi);
772         if ( ntmpi>0 && (args->tmpi[0]!=bcf_int32_missing || (col->replace & CARRY_OVER_MISSING)) )
773         {
774             col->mm_dbl_nused = col->mm_dbl_ndat = ntmpi;
775             hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
776             for (i=0; i<ntmpi; i++)
777                 col->mm_dbl[i] = args->tmpi[i];
778             col->mm_dbl_ndat = 1;
779         }
780         ntmpi = 0;
781     }
782     if ( tab )  // has data, not flushing yet
783     {
784         char *str = tab->cols[col->icol], *end = str;
785         if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
786 
787         while ( *end )
788         {
789             ntmpi++;
790             hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
791             if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
792             {
793                 if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) )
794                     args->tmpi[ntmpi-1] = bcf_int32_missing;
795                 else
796                     ntmpi--;
797                 if ( str[1]==0 ) end = str+1;
798                 str += 2;
799             }
800             else
801             {
802                 args->tmpi[ntmpi-1] = strtol(str, &end, 10);
803                 if ( end==str )
804                     error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
805                 str = end+1;
806             }
807         }
808         if ( col->merge_method!=MM_FIRST )
809         {
810             if ( !col->mm_dbl_nused )
811             {
812                 col->mm_dbl_nused = ntmpi;
813                 hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
814                 for (i=0; i<ntmpi; i++)
815                     col->mm_dbl[i] = args->tmpi[i];
816             }
817             else
818             {
819                 if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
820                 {
821                     int nori = col->mm_dbl_nused;
822                     col->mm_dbl_nused += ntmpi;
823                     hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
824                     for (i=0; i<ntmpi; i++)
825                         col->mm_dbl[i+nori] = args->tmpi[i];
826                 }
827                 else
828                 {
829                     if ( ntmpi!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n");
830                     if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG )
831                         for (i=0; i<ntmpi; i++) col->mm_dbl[i] += args->tmpi[i];
832                     else if ( col->merge_method==MM_MIN )
833                         for (i=0; i<ntmpi; i++) { if ( col->mm_dbl[i] > args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; }
834                     else if ( col->merge_method==MM_MAX )
835                         for (i=0; i<ntmpi; i++) { if ( col->mm_dbl[i] < args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; }
836                 }
837             }
838             col->mm_dbl_ndat++;
839             return 1;
840         }
841     }
842     else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
843     {
844         ntmpi = col->mm_dbl_nused;
845         hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
846         for (i=0; i<ntmpi; i++) args->tmpi[i] = col->mm_dbl[i];
847         col->mm_dbl_nused = col->mm_dbl_ndat = 0;
848     }
849     else if ( col->merge_method==MM_AVG )
850     {
851         ntmpi = col->mm_dbl_nused;
852         hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
853         for (i=0; i<ntmpi; i++) args->tmpi[i] = col->mm_dbl[i]/col->mm_dbl_ndat;
854         col->mm_dbl_nused = col->mm_dbl_ndat = 0;
855     }
856 
857     if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
858         return setter_ARinfo_int32(args,line,col,tab->nals,tab->als,ntmpi);
859 
860     if ( col->replace & REPLACE_MISSING )
861     {
862         int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
863         if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
864     }
865     return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
866 }
vcf_setter_info_int(args_t * args,bcf1_t * line,annot_col_t * col,void * data)867 static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
868 {
869     bcf1_t *rec = (bcf1_t*) data;
870     int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
871     if ( ntmpi < 0 ) return 0;    // nothing to add
872 
873     if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
874         return setter_ARinfo_int32(args,line,col,rec->n_allele,rec->d.allele,ntmpi);
875 
876     if ( col->replace & REPLACE_MISSING )
877     {
878         int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
879         if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
880     }
881 
882     return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
883 }
setter_ARinfo_real(args_t * args,bcf1_t * line,annot_col_t * col,int nals,char ** als,int ntmpf)884 static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
885 {
886     if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
887         error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
888     else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
889         error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
890 
891     int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
892     int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
893     if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
894 
895     // fill in any missing values in the target VCF (or all, if not present)
896     int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
897     if ( ntmpf2 < ndst ) hts_expand(float,ndst,args->mtmpf2,args->tmpf2);
898 
899     int i;
900     for (i=0; i<ndst; i++)
901     {
902         if ( map[i]<0 )
903         {
904             if ( ntmpf2 < ndst ) bcf_float_set_missing(args->tmpf2[i]);
905             continue;
906         }
907         if ( ntmpf2==ndst && (col->replace & REPLACE_MISSING)
908                 && !bcf_float_is_missing(args->tmpf2[i])
909                 && !bcf_float_is_vector_end(args->tmpf2[i]) ) continue;
910 
911         args->tmpf2[i] = args->tmpf[ map[i] ];
912     }
913     return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
914 }
setter_info_real(args_t * args,bcf1_t * line,annot_col_t * col,void * data)915 static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
916 {
917     annot_line_t *tab = (annot_line_t*) data;
918 
919     // This is a bit hacky, only to reuse existing code with minimal changes:
920     //      -c =TAG will now behave as -l TAG:APPEND for floats
921     if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND;
922 
923     if ( !tab )
924     {
925         if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
926              col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
927              col->merge_method!=MM_APPEND &&
928              col->merge_method!=MM_APPEND_MISSING )
929             error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Float\n");
930     }
931 
932     int i,ntmpf = 0;
933     if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused )
934     {
935         ntmpf = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf, &args->mtmpf);
936         if ( ntmpf>0 && (!bcf_float_is_missing(args->tmpf[0]) || (col->replace & CARRY_OVER_MISSING)) )
937         {
938             col->mm_dbl_nused = ntmpf;
939             hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
940             for (i=0; i<ntmpf; i++)
941                 if ( bcf_float_is_missing(args->tmpf[i]) )
942                     bcf_double_set_missing(col->mm_dbl[i]);
943                 else
944                     col->mm_dbl[i] = args->tmpf[i];
945             col->mm_dbl_ndat = 1;
946         }
947         ntmpf = 0;
948     }
949     if ( tab )  // data row, not just flushing
950     {
951         char *str = tab->cols[col->icol], *end = str;
952         if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
953 
954         while ( *end )
955         {
956             ntmpf++;
957             hts_expand(float,ntmpf,args->mtmpf,args->tmpf);
958             if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
959             {
960                 if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) )
961                     bcf_float_set_missing(args->tmpf[ntmpf-1]);
962                 else
963                     ntmpf--;
964                 if ( str[1]==0 ) end = str+1;
965                 str += 2;
966             }
967             else
968             {
969                 args->tmpf[ntmpf-1] = strtod(str, &end);
970                 if ( end==str )
971                     error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
972                 str = end+1;
973             }
974         }
975         if ( col->merge_method!=MM_FIRST )
976         {
977             if ( !col->mm_dbl_nused )
978             {
979                 col->mm_dbl_nused = ntmpf;
980                 hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
981                 for (i=0; i<ntmpf; i++)
982                 {
983                     if ( bcf_float_is_missing(args->tmpf[i]) )
984                         bcf_double_set_missing(col->mm_dbl[i]);
985                     else
986                         col->mm_dbl[i] = args->tmpf[i];
987                 }
988             }
989             else
990             {
991                 if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
992                 {
993                     int nori = col->mm_dbl_nused;
994                     col->mm_dbl_nused += ntmpf;
995                     hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
996                     for (i=0; i<ntmpf; i++)
997                     {
998                         if ( bcf_float_is_missing(args->tmpf[i]) )
999                             bcf_double_set_missing(col->mm_dbl[i+nori]);
1000                         else
1001                             col->mm_dbl[i+nori] = args->tmpf[i];
1002                     }
1003                 }
1004                 else
1005                 {
1006                     if ( ntmpf!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n");
1007                     if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG )
1008                         for (i=0; i<ntmpf; i++) col->mm_dbl[i] += args->tmpf[i];
1009                     else if ( col->merge_method==MM_MIN )
1010                         for (i=0; i<ntmpf; i++) { if ( col->mm_dbl[i] > args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; }
1011                     else if ( col->merge_method==MM_MAX )
1012                         for (i=0; i<ntmpf; i++) { if ( col->mm_dbl[i] < args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; }
1013                 }
1014             }
1015             col->mm_dbl_ndat++;
1016             return 1;
1017         }
1018     }
1019     else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
1020     {
1021         ntmpf = col->mm_dbl_nused;
1022         hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf);
1023         for (i=0; i<ntmpf; i++)
1024         {
1025             if ( bcf_double_is_missing(col->mm_dbl[i]) )
1026                 bcf_float_set_missing(args->tmpf[i]);
1027             else
1028                 args->tmpf[i] = col->mm_dbl[i];
1029         }
1030         col->mm_dbl_nused = col->mm_dbl_ndat = 0;
1031     }
1032     else if ( col->merge_method==MM_AVG )
1033     {
1034         ntmpf = col->mm_dbl_nused;
1035         hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf);
1036         for (i=0; i<ntmpf; i++) args->tmpf[i] = col->mm_dbl[i]/col->mm_dbl_ndat;
1037         col->mm_dbl_nused = col->mm_dbl_ndat = 0;
1038     }
1039 
1040     if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
1041         return setter_ARinfo_real(args,line,col,tab->nals,tab->als,ntmpf);
1042 
1043     if ( col->replace & REPLACE_MISSING )
1044     {
1045         int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
1046         if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
1047     }
1048 
1049     return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
1050 }
vcf_setter_info_real(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1051 static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1052 {
1053     bcf1_t *rec = (bcf1_t*) data;
1054     int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
1055     if ( ntmpf < 0 ) return 0;    // nothing to add
1056 
1057     if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
1058         return setter_ARinfo_real(args,line,col,rec->n_allele,rec->d.allele,ntmpf);
1059 
1060     if ( col->replace & REPLACE_MISSING )
1061     {
1062         int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
1063         if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
1064     }
1065 
1066     return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
1067 }
1068 int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
setter_ARinfo_string(args_t * args,bcf1_t * line,annot_col_t * col,int nals,char ** als)1069 static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als)
1070 {
1071     assert( col->merge_method==MM_FIRST );
1072 
1073     int nsrc = 1, lsrc = 0;
1074     while ( args->tmps[lsrc] )
1075     {
1076         if ( args->tmps[lsrc]==',' ) nsrc++;
1077         lsrc++;
1078     }
1079     if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
1080         error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1081     else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
1082         error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1083 
1084     int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
1085     int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
1086     if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1087 
1088     // fill in any missing values in the target VCF (or all, if not present)
1089     int i, empty = 0, nstr, mstr = args->tmpks.m;
1090     nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr);
1091     args->tmpks.m = mstr;
1092     if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' && args->tmpks.s[1]==0) )
1093     {
1094         empty = 0;
1095         args->tmpks.l = 0;
1096         kputc('.',&args->tmpks);
1097         for (i=1; i<ndst; i++) kputs(",.",&args->tmpks);
1098     }
1099     else args->tmpks.l = nstr;
1100     for (i=0; i<ndst; i++)
1101     {
1102         if ( map[i]<0 )
1103         {
1104             if ( empty ) copy_string_field(".",0,1,&args->tmpks,i);
1105             continue;
1106         }
1107         if ( col->replace & REPLACE_MISSING )
1108         {
1109             // Do not replace filled values. The field must be looked up again because
1110             // of realloc in copy_string_field
1111             int n = 0;
1112             char *str = args->tmpks.s;
1113             while ( *str && n<i )
1114             {
1115                 if ( *str==',' ) n++;
1116                 str++;
1117             }
1118             if ( str[0]!='.' || (str[1]!=',' && str[1]!=0) ) continue;  // value already set
1119         }
1120         int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
1121         if ( ret!=0 ) error("[%s:%d %s] Failed to copy a string field\n",  __FILE__,__LINE__,__func__);
1122     }
1123     return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
1124 }
khash_str2int_clear_free(void * _hash)1125 void khash_str2int_clear_free(void *_hash)
1126 {
1127     khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
1128     khint_t k;
1129     if (hash == 0) return;
1130     for (k = 0; k < kh_end(hash); ++k)
1131         if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
1132     kh_clear(str2int, hash);
1133 }
setter_info_str(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1134 static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1135 {
1136     if ( (col->replace & REPLACE_MISSING) && col->number!=BCF_VL_A && col->number!=BCF_VL_R )
1137     {
1138         int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
1139         if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
1140     }
1141 
1142     // This is a bit hacky, only to reuse existing code with minimal changes:
1143     //      -c =TAG will now behave as -l TAG:unique for strings
1144     if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
1145 
1146     annot_line_t *tab = (annot_line_t*) data;
1147 
1148     int len = 0;
1149     if ( tab )
1150     {
1151         len = strlen(tab->cols[col->icol]);
1152         if ( !len ) return 0;
1153         if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
1154     }
1155 
1156     if ( col->merge_method!=MM_FIRST )
1157     {
1158         if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
1159             error("Error: the --merge-logic option cannot be used with INFO tags Type=String,Number={A,R,G}\n");
1160 
1161         if ( data )
1162         {
1163             assert( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING || col->merge_method==MM_UNIQUE );
1164             if ( col->merge_method==MM_UNIQUE )
1165             {
1166                 if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init();
1167                 if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1;
1168                 khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol]));
1169             }
1170 
1171             if ( (col->replace & SET_OR_APPEND) && !col->mm_kstr.l )
1172             {
1173                 int m = col->mm_kstr.m;
1174                 int n = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &col->mm_kstr.s, &m);
1175                 col->mm_kstr.m = m;
1176                 if ( n>0 && ((col->replace & CARRY_OVER_MISSING) || col->mm_kstr.s[0]!='.' || col->mm_kstr.s[1]) ) col->mm_kstr.l = n;
1177             }
1178 
1179             if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr);
1180             kputs(tab->cols[col->icol], &col->mm_kstr);
1181             return 1;
1182         }
1183 
1184         if ( col->mm_kstr.l )
1185         {
1186             hts_expand(char,col->mm_kstr.l+1,args->mtmps,args->tmps);
1187             memcpy(args->tmps,col->mm_kstr.s,col->mm_kstr.l+1);
1188         }
1189         else
1190             return 0;
1191 
1192         // flush the line
1193         if ( col->merge_method==MM_UNIQUE )
1194             khash_str2int_clear_free(col->mm_str_hash);
1195         col->mm_kstr.l = 0;
1196     }
1197     else
1198     {
1199         assert(tab);
1200         hts_expand(char,len+1,args->mtmps,args->tmps);
1201         memcpy(args->tmps,tab->cols[col->icol],len+1);
1202 
1203         if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
1204             return setter_ARinfo_string(args,line,col,tab->nals,tab->als);
1205     }
1206 
1207     return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
1208 }
vcf_setter_info_str(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1209 static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1210 {
1211     bcf1_t *rec = (bcf1_t*) data;
1212 
1213     if ( col->getter )
1214         col->getter(args,rec,col,(void**)&args->tmps, &args->mtmps);
1215     else
1216     {
1217         int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
1218         if ( ntmps < 0 ) return 0;    // nothing to add
1219     }
1220 
1221     if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
1222         return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele);
1223 
1224     if ( col->replace & REPLACE_MISSING )
1225     {
1226         int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
1227         if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
1228     }
1229 
1230     return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
1231 }
genotypes_to_string(args_t * args,int nsrc1,int32_t * src,int nsmpl_dst,kstring_t * str)1232 static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str)
1233 {
1234     int i, isrc, idst;
1235     int blen = nsrc1 > 1 ? nsrc1 + 1 : 1;   // typically the genotypes take three bytes 0/1, no 0-termination is needed
1236 
1237 gt_length_too_big:
1238     str->l = 0;
1239     for (idst=0; idst<nsmpl_dst; idst++)
1240     {
1241         isrc = args->sample_map ? args->sample_map[idst] : idst;
1242         if ( isrc==-1 )
1243         {
1244             kputc_('.', str);
1245             for (i=1; i < blen; i++) kputc_(0, str);
1246             continue;
1247         }
1248 
1249         size_t plen = str->l;
1250         int32_t *ptr = src + isrc*nsrc1;
1251         for (i=0; i<nsrc1 && ptr[i]!=bcf_int32_vector_end; i++)
1252         {
1253             if ( i ) kputc("/|"[bcf_gt_is_phased(ptr[i])], str);
1254             if ( bcf_gt_is_missing(ptr[i]) ) kputc('.', str);
1255             else kputw(bcf_gt_allele(ptr[i]), str);
1256         }
1257         if ( i==0 ) kputc('.', str);
1258         if ( str->l - plen > blen )
1259         {
1260             // too many alternate alleles or ploidy is too large, the genotype does not fit
1261             // three characters ("0/0" vs "10/10").
1262             blen *= 2;
1263             goto gt_length_too_big;
1264         }
1265         plen = str->l - plen;
1266         while ( plen < blen )
1267         {
1268             kputc_(0, str);
1269             plen++;
1270         }
1271     }
1272     return 0;
1273 }
vcf_setter_format_gt(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1274 static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1275 {
1276     bcf1_t *rec = (bcf1_t*) data;
1277     int nsrc = bcf_get_genotypes(args->files->readers[1].header,rec,&args->tmpi,&args->mtmpi);
1278     if ( nsrc==-3 ) return 0;    // the tag is not present
1279     if ( nsrc<=0 ) return 1;     // error
1280 
1281     // Genotypes are internally represented as integers. This is a complication when
1282     // adding as a different Type=String field, such as FMT/newGT:=GT
1283     if ( strcmp(col->hdr_key_src,col->hdr_key_dst) )
1284     {
1285         int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out);
1286         int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header);
1287         genotypes_to_string(args,nsrc/nsmpl_src,args->tmpi,nsmpl_dst,&args->tmpks);
1288         return bcf_update_format_char(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s,args->tmpks.l);
1289     }
1290 
1291     if ( !args->sample_map )
1292         return bcf_update_genotypes(args->hdr_out,line,args->tmpi,nsrc);
1293 
1294     int i, j, ndst = bcf_get_genotypes(args->hdr,line,&args->tmpi2,&args->mtmpi2);
1295     if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
1296     nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
1297     if ( ndst<=0 )  // field not present in dst file
1298     {
1299         if ( col->replace & REPLACE_NON_MISSING ) return 0;
1300         hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
1301         for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1302         {
1303             int32_t *dst = args->tmpi2 + nsrc*i;
1304             if ( args->sample_map[i]==-1 )
1305             {
1306                 dst[0] = bcf_gt_missing;
1307                 for (j=1; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
1308             }
1309             else
1310             {
1311                 int32_t *src = args->tmpi + nsrc*args->sample_map[i];
1312                 for (j=0; j<nsrc; j++) dst[j] = src[j];
1313             }
1314         }
1315         return bcf_update_genotypes(args->hdr_out,line,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out));
1316     }
1317     else if ( ndst >= nsrc )
1318     {
1319         for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1320         {
1321             if ( args->sample_map[i]==-1 ) continue;
1322             int32_t *src = args->tmpi  + nsrc*args->sample_map[i];
1323             int32_t *dst = args->tmpi2 + ndst*i;
1324             if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(dst[0]) ) continue;
1325             if ( (col->replace & REPLACE_MISSING)  && !bcf_gt_is_missing(dst[0]) ) continue;
1326             for (j=0; j<nsrc; j++) dst[j] = src[j];
1327             for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
1328         }
1329         return bcf_update_genotypes(args->hdr_out,line,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
1330     }
1331     else    // ndst < nsrc
1332     {
1333         hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
1334         for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1335         {
1336             int32_t *ori = args->tmpi2 + ndst*i;
1337             int32_t *dst = args->tmpi3 + nsrc*i;
1338             int keep_ori = 0;
1339             if ( args->sample_map[i]==-1 ) keep_ori = 1;
1340             else if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
1341             else if ( (col->replace & REPLACE_MISSING)  && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
1342             if ( keep_ori )
1343             {
1344                 for (j=0; j<ndst; j++) dst[j] = ori[j];
1345                 for (; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
1346             }
1347             else
1348             {
1349                 int32_t *src = args->tmpi + nsrc*args->sample_map[i];
1350                 for (j=0; j<nsrc; j++) dst[j] = src[j];
1351             }
1352         }
1353         return bcf_update_genotypes(args->hdr_out,line,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
1354     }
1355 }
count_vals(annot_line_t * tab,int icol_beg,int icol_end)1356 static int count_vals(annot_line_t *tab, int icol_beg, int icol_end)
1357 {
1358     int i, nmax = 1;
1359     for (i=icol_beg; i<icol_end; i++)
1360     {
1361         char *str = tab->cols[i], *end = str;
1362         if ( str[0]=='.' && !str[1] )
1363         {
1364             // missing value
1365             if ( !nmax ) nmax = 1;
1366             continue;
1367         }
1368         int n = 1;
1369         while ( *end )
1370         {
1371             if ( *end==',' ) n++;
1372             end++;
1373         }
1374         if ( nmax<n ) nmax = n;
1375     }
1376     return nmax;
1377 }
core_setter_format_int(args_t * args,bcf1_t * line,annot_col_t * col,int32_t * vals,int nvals)1378 static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, int32_t *vals, int nvals)
1379 {
1380     if ( !args->sample_map )
1381         return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
1382 
1383     int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2);
1384     if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
1385     if ( ndst<=0 )
1386     {
1387         if ( col->replace & REPLACE_NON_MISSING ) return 0;    // overwrite only if present
1388         hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
1389         for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1390         {
1391             int32_t *dst = args->tmpi2 + nvals*i;
1392             if ( args->sample_map[i]==-1 )
1393             {
1394                 dst[0] = bcf_int32_missing;
1395                 for (j=1; j<nvals; j++) dst[j] = bcf_int32_vector_end;
1396             }
1397             else
1398             {
1399                 int32_t *src = vals + nvals*args->sample_map[i];
1400                 for (j=0; j<nvals; j++) dst[j] = src[j];
1401             }
1402         }
1403         return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nvals*bcf_hdr_nsamples(args->hdr_out));
1404     }
1405     else if ( ndst >= nvals )
1406     {
1407         for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1408         {
1409             if ( args->sample_map[i]==-1 ) continue;
1410             int32_t *src = vals  + nvals*args->sample_map[i];
1411             int32_t *dst = args->tmpi2 + ndst*i;
1412             // possible cases:
1413             //      in annot out
1414             //       x  y     x     TAG,-TAG,=TAG    .. REPLACE_ALL, REPLACE_NON_MISSING, SET_OR_APPEND
1415             //       x  y     y    +TAG              .. REPLACE_MISSING
1416             //       .  y     .    =TAG              .. SET_OR_APPEND
1417             //       .  y     y     TAG,+TAG,-TAG    .. REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING
1418             //       x  .     x     TAG,+TAG         .. REPLACE_ALL, REPLACE_MISSING
1419             //       x  .     .    -TAG              .. REPLACE_NON_MISSING
1420             if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; }
1421             else if ( col->replace & REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; }
1422             else if ( col->replace & REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; }
1423             for (j=0; j<nvals; j++) dst[j] = src[j];
1424             for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
1425         }
1426         return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
1427     }
1428     else    // ndst < nvals
1429     {
1430         hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
1431         for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1432         {
1433             int32_t *ann = vals + nvals*args->sample_map[i];
1434             int32_t *ori = args->tmpi2 + ndst*i;                // ori vcf line
1435             int32_t *dst = args->tmpi3 + nvals*i;               // expanded buffer
1436             int use_new_ann = 1;
1437             if ( args->sample_map[i]==-1 ) use_new_ann = 0;
1438             else if ( col->replace & REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; }
1439             else if ( col->replace & REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; }
1440             else if ( col->replace & REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; }
1441             if ( !use_new_ann )
1442             {
1443                 for (j=0; j<ndst; j++) dst[j] = ori[j];
1444                 for (; j<nvals; j++) dst[j] = bcf_int32_vector_end;
1445             }
1446             else
1447                 for (j=0; j<nvals; j++) dst[j] = ann[j];
1448         }
1449         return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi3,nvals*bcf_hdr_nsamples(args->hdr_out));
1450     }
1451 }
core_setter_format_real(args_t * args,bcf1_t * line,annot_col_t * col,float * vals,int nvals)1452 static int core_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, float *vals, int nvals)
1453 {
1454     if ( !args->sample_map )
1455         return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
1456 
1457     int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2);
1458     if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
1459     if ( ndst<=0 )
1460     {
1461         if ( col->replace & REPLACE_NON_MISSING ) return 0;    // overwrite only if present
1462         hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
1463         for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1464         {
1465             float *dst = args->tmpf2 + nvals*i;
1466             if ( args->sample_map[i]==-1 )
1467             {
1468                 bcf_float_set_missing(dst[0]);
1469                 for (j=1; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
1470             }
1471             else
1472             {
1473                 float *src = vals + nvals*args->sample_map[i];
1474                 for (j=0; j<nvals; j++) dst[j] = src[j];
1475             }
1476         }
1477         return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nvals*bcf_hdr_nsamples(args->hdr_out));
1478     }
1479     else if ( ndst >= nvals )
1480     {
1481         for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1482         {
1483             if ( args->sample_map[i]==-1 ) continue;
1484             float *src = vals  + nvals*args->sample_map[i];
1485             float *dst = args->tmpf2 + ndst*i;
1486             if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; }
1487             else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; }
1488             else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; }
1489             for (j=0; j<nvals; j++) dst[j] = src[j];
1490             for (; j<ndst; j++) bcf_float_set_vector_end(dst[j]);
1491         }
1492         return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
1493     }
1494     else    // ndst < nvals
1495     {
1496         hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
1497         for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1498         {
1499             float *ann = vals + nvals*args->sample_map[i];
1500             float *ori = args->tmpf2 + ndst*i;                // ori vcf line
1501             float *dst = args->tmpf3 + nvals*i;               // expanded buffer
1502             int use_new_ann = 1;
1503             if ( args->sample_map[i]==-1 ) use_new_ann = 0;
1504             else if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
1505             else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
1506             else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; }
1507             if ( !use_new_ann )
1508             {
1509                 for (j=0; j<ndst; j++) dst[j] = ori[j];
1510                 for (; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
1511             }
1512             else
1513                 for (j=0; j<nvals; j++) dst[j] = ann[j];
1514         }
1515         return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf3,nvals*bcf_hdr_nsamples(args->hdr_out));
1516     }
1517 }
core_setter_format_str(args_t * args,bcf1_t * line,annot_col_t * col,char ** vals)1518 static int core_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, char **vals)
1519 {
1520     if ( !args->sample_map )
1521         return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)vals,args->nsmpl_annot);
1522 
1523     int i;
1524     args->tmpp2[0] = args->tmps2;
1525     int ret = bcf_get_format_string(args->hdr,line,col->hdr_key_dst,&args->tmpp2,&args->mtmps2);
1526     args->tmps2 = args->tmpp2[0];   // tmps2 might be realloced
1527 
1528     int nsmpl = bcf_hdr_nsamples(args->hdr_out);
1529     if ( ret<=0 )   // not present in dst
1530     {
1531         hts_expand(char,bcf_hdr_nsamples(args->hdr_out)*2,args->mtmps2,args->tmps2);
1532         char *tmp = args->tmps2;
1533         for (i=0; i<nsmpl; i++)
1534         {
1535             tmp[0] = '.';
1536             tmp[1] = 0;
1537             args->tmpp2[i] = tmp;
1538             tmp += 2;
1539         }
1540     }
1541     for (i=0; i<nsmpl; i++)
1542     {
1543         if ( args->sample_map[i]==-1 ) continue;
1544         char **src = vals + args->sample_map[i];
1545         char **dst = args->tmpp2 + i;
1546 
1547         if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; }
1548         else if ( col->replace & REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; }
1549         else if ( col->replace & REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; }
1550         *dst = *src;
1551     }
1552     return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl);
1553 }
setter_format_int(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1554 static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1555 {
1556     if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n");
1557 
1558     annot_line_t *tab = (annot_line_t*) data;
1559     if ( col->icol+args->nsmpl_annot > tab->ncols )
1560         error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1561     int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
1562     hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi);
1563 
1564     int icol = col->icol, ismpl;
1565     for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
1566     {
1567         int32_t *ptr = args->tmpi + ismpl*nvals;
1568         int ival = 0;
1569 
1570         char *str = tab->cols[icol];
1571         while ( *str )
1572         {
1573             if ( str[0]=='.' && (!str[1] || str[1]==',') )  // missing value
1574             {
1575                 ptr[ival++] = bcf_int32_missing;
1576                 str += str[1] ? 2 : 1;
1577                 continue;
1578             }
1579 
1580             char *end = str;
1581             ptr[ival] = strtol(str, &end, 10);
1582             if ( end==str )
1583                 error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
1584 
1585             ival++;
1586             str = *end ? end+1 : end;
1587         }
1588         while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
1589         icol++;
1590     }
1591     return core_setter_format_int(args,line,col,args->tmpi,nvals);
1592 }
setter_format_real(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1593 static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1594 {
1595     if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n");
1596 
1597     annot_line_t *tab = (annot_line_t*) data;
1598     if ( col->icol+args->nsmpl_annot > tab->ncols )
1599         error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1600     int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
1601     hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf);
1602 
1603     int icol = col->icol, ismpl;
1604     for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
1605     {
1606         float *ptr = args->tmpf + ismpl*nvals;
1607         int ival = 0;
1608 
1609         char *str = tab->cols[icol];
1610         while ( *str )
1611         {
1612             if ( str[0]=='.' && (!str[1] || str[1]==',') )  // missing value
1613             {
1614                 bcf_float_set_missing(ptr[ival]);
1615                 ival++;
1616                 str += str[1] ? 2 : 1;
1617                 continue;
1618             }
1619 
1620             char *end = str;
1621             ptr[ival] = strtod(str, &end);
1622             if ( end==str )
1623                 error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
1624 
1625             ival++;
1626             str = *end ? end+1 : end;
1627         }
1628         while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
1629         icol++;
1630     }
1631     return core_setter_format_real(args,line,col,args->tmpf,nvals);
1632 }
setter_format_str(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1633 static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1634 {
1635     if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n");
1636 
1637     annot_line_t *tab = (annot_line_t*) data;
1638     if ( col->icol+args->nsmpl_annot > tab->ncols )
1639         error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1640 
1641     int ismpl;
1642     for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
1643         args->tmpp[ismpl] = tab->cols[col->icol + ismpl];
1644 
1645     return core_setter_format_str(args,line,col,args->tmpp);
1646 }
determine_ploidy(int nals,int * vals,int nvals1,uint8_t * smpl,int nsmpl)1647 static int determine_ploidy(int nals, int *vals, int nvals1, uint8_t *smpl, int nsmpl)
1648 {
1649     int i, j, ndip = nals*(nals+1)/2, max_ploidy = 0;
1650     for (i=0; i<nsmpl; i++)
1651     {
1652         int *ptr = vals + i*nvals1;
1653         int has_value = 0;
1654         for (j=0; j<nvals1; j++)
1655         {
1656             if ( ptr[j]==bcf_int32_vector_end ) break;
1657             if ( ptr[j]!=bcf_int32_missing ) has_value = 1;
1658         }
1659         if ( has_value )
1660         {
1661             if ( j==ndip )
1662             {
1663                 smpl[i] = 2;
1664                 max_ploidy = 2;
1665             }
1666             else if ( j==nals )
1667             {
1668                 smpl[i] = 1;
1669                 if ( !max_ploidy ) max_ploidy = 1;
1670             }
1671             else return -j;
1672         }
1673         else smpl[i] = 0;
1674     }
1675     return max_ploidy;
1676 }
vcf_setter_format_int(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1677 static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1678 {
1679     bcf1_t *rec = (bcf1_t*) data;
1680     int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
1681     if ( nsrc==-3 ) return 0;    // the tag is not present
1682     if ( nsrc<=0 ) return 1;     // error
1683     int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header);
1684     int nsrc1 = nsrc / nsmpl_src;
1685     if ( col->number!=BCF_VL_G && col->number!=BCF_VL_R && col->number!=BCF_VL_A )
1686         return core_setter_format_int(args,line,col,args->tmpi,nsrc1);
1687 
1688     // create mapping from src to dst genotypes, haploid and diploid version
1689     int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1;
1690     int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele);
1691     if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1692 
1693     int i, j;
1694     if ( rec->n_allele==line->n_allele )
1695     {
1696         // alleles unchanged?
1697         for (i=0; i<rec->n_allele; i++) if ( map_hap[i]!=i ) break;
1698         if ( i==rec->n_allele )
1699             return core_setter_format_int(args,line,col,args->tmpi,nsrc1);
1700     }
1701 
1702     int nsmpl_dst = rec->n_sample;
1703     int ndst  = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2);
1704     int ndst1 = ndst / nsmpl_dst;
1705     if ( ndst <= 0 )
1706     {
1707         if ( col->replace & REPLACE_NON_MISSING ) return 0;  // overwrite only if present
1708         if ( col->number==BCF_VL_G )
1709             ndst1 = line->n_allele*(line->n_allele+1)/2;
1710         else
1711             ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
1712         hts_expand(int, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2);
1713         for (i=0; i<nsmpl_dst; i++)
1714         {
1715             int32_t *dst = args->tmpi2 + i*ndst1;
1716             for (j=0; j<ndst1; j++) dst[j] = bcf_int32_missing;
1717         }
1718     }
1719 
1720     int nmap_dip = 0, *map_dip = NULL;
1721     if ( col->number==BCF_VL_G )
1722     {
1723         map_dip = vcmp_map_dipGvalues(args->vcmp, &nmap_dip);
1724         if ( !args->src_smpl_pld )
1725         {
1726             args->src_smpl_pld = (uint8_t*) malloc(nsmpl_src);
1727             args->dst_smpl_pld = (uint8_t*) malloc(nsmpl_dst);
1728         }
1729         int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src);
1730         if ( pld_src<0 )
1731             error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1);
1732         int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst);
1733         if ( pld_dst<0 )
1734             error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1735 
1736         int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2;
1737         if ( ndst1_new != ndst1 )
1738         {
1739             if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d  at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1740             ndst1 = ndst1_new;
1741             hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2);
1742         }
1743     }
1744     else if ( !ndst1 )
1745     {
1746         ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
1747         hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2);
1748     }
1749 
1750     for (i=0; i<nsmpl_dst; i++)
1751     {
1752         int ii = args->sample_map ? args->sample_map[i] : i;
1753         int32_t *ptr_src = args->tmpi + i*nsrc1;
1754         int32_t *ptr_dst = args->tmpi2 + ii*ndst1;
1755 
1756         if ( col->number==BCF_VL_G )
1757         {
1758             if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] )
1759                 error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1760             if ( !args->dst_smpl_pld[i] )
1761                 for (j=0; j<ndst1; j++) ptr_dst[j] = bcf_int32_missing;
1762         }
1763         if ( col->number!=BCF_VL_G || args->src_smpl_pld[i]==1 )
1764         {
1765             for (j=0; j<nmap_hap; j++)
1766             {
1767                 int k = map_hap[j];
1768                 if ( k>=0 ) ptr_dst[k] = ptr_src[j];
1769             }
1770             if ( col->number==BCF_VL_G )
1771                 for (j=line->n_allele; j<ndst1; j++) ptr_dst[j++] = bcf_int32_vector_end;
1772         }
1773         else
1774         {
1775             for (j=0; j<nmap_dip; j++)
1776             {
1777                 int k = map_dip[j];
1778                 if ( k>=0 ) ptr_dst[k] = ptr_src[j];
1779             }
1780         }
1781     }
1782     return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nsmpl_dst*ndst1);
1783 }
vcf_setter_format_real(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1784 static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1785 {
1786     bcf1_t *rec = (bcf1_t*) data;
1787     int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
1788     if ( nsrc==-3 ) return 0;    // the tag is not present
1789     if ( nsrc<=0 ) return 1;     // error
1790     int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header);
1791     int nsrc1 = nsrc / nsmpl_src;
1792     if ( col->number!=BCF_VL_G && col->number!=BCF_VL_R && col->number!=BCF_VL_A )
1793         return core_setter_format_real(args,line,col,args->tmpf,nsrc1);
1794 
1795     // create mapping from src to dst genotypes, haploid and diploid version
1796     int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1;
1797     int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele);
1798     if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1799 
1800     int i, j;
1801     if ( rec->n_allele==line->n_allele )
1802     {
1803         // alleles unchanged?
1804         for (i=0; i<rec->n_allele; i++) if ( map_hap[i]!=i ) break;
1805         if ( i==rec->n_allele )
1806             return core_setter_format_real(args,line,col,args->tmpf,nsrc1);
1807     }
1808 
1809     int nsmpl_dst = rec->n_sample;
1810     int ndst  = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2);
1811     int ndst1 = ndst / nsmpl_dst;
1812     if ( ndst <= 0 )
1813     {
1814         if ( col->replace & REPLACE_NON_MISSING ) return 0;  // overwrite only if present
1815         if ( col->number==BCF_VL_G )
1816             ndst1 = line->n_allele*(line->n_allele+1)/2;
1817         else
1818             ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
1819         hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2);
1820         for (i=0; i<nsmpl_dst; i++)
1821         {
1822             float *dst = args->tmpf2 + i*ndst1;
1823             for (j=0; j<ndst1; j++) bcf_float_set_missing(dst[j]);
1824         }
1825     }
1826 
1827     int nmap_dip = 0, *map_dip = NULL;
1828     if ( col->number==BCF_VL_G )
1829     {
1830         map_dip = vcmp_map_dipGvalues(args->vcmp, &nmap_dip);
1831         if ( !args->src_smpl_pld )
1832         {
1833             args->src_smpl_pld = (uint8_t*) malloc(nsmpl_src);
1834             args->dst_smpl_pld = (uint8_t*) malloc(nsmpl_dst);
1835         }
1836         int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src);
1837         if ( pld_src<0 )
1838             error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1);
1839         int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst);
1840         if ( pld_dst<0 )
1841             error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1842 
1843         int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2;
1844         if ( ndst1_new != ndst1 )
1845         {
1846             if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d  at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1847             ndst1 = ndst1_new;
1848             hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2);
1849         }
1850     }
1851     else if ( !ndst1 )
1852     {
1853         ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
1854         hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2);
1855     }
1856 
1857     for (i=0; i<nsmpl_dst; i++)
1858     {
1859         int ii = args->sample_map ? args->sample_map[i] : i;
1860         float *ptr_src = args->tmpf + i*nsrc1;
1861         float *ptr_dst = args->tmpf2 + ii*ndst1;
1862 
1863         if ( col->number==BCF_VL_G )
1864         {
1865             if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] )
1866                 error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1867             if ( !args->dst_smpl_pld[i] )
1868                 for (j=0; j<ndst1; j++) bcf_float_set_missing(ptr_dst[j]);
1869         }
1870         if ( col->number!=BCF_VL_G || args->src_smpl_pld[i]==1 )
1871         {
1872             for (j=0; j<nmap_hap; j++)
1873             {
1874                 int k = map_hap[j];
1875                 if ( k>=0 )
1876                 {
1877                     if ( bcf_float_is_missing(ptr_src[j]) ) bcf_float_set_missing(ptr_dst[k]);
1878                     else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]);
1879                     else ptr_dst[k] = ptr_src[j];
1880                 }
1881             }
1882             if ( col->number==BCF_VL_G )
1883                 for (j=line->n_allele; j<ndst1; j++) bcf_float_set_vector_end(ptr_dst[j]);
1884         }
1885         else
1886         {
1887             for (j=0; j<nmap_dip; j++)
1888             {
1889                 int k = map_dip[j];
1890                 if ( k>=0 )
1891                 {
1892                     if ( bcf_float_is_missing(ptr_src[j]) ) bcf_float_set_missing(ptr_dst[k]);
1893                     else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]);
1894                     else ptr_dst[k] = ptr_src[j];
1895                 }
1896             }
1897         }
1898     }
1899     return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nsmpl_dst*ndst1);
1900 }
1901 
vcf_setter_format_str(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1902 static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1903 {
1904     bcf1_t *rec = (bcf1_t*) data;
1905     args->tmpp[0] = args->tmps;
1906     int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpp,&args->mtmps);
1907     args->tmps = args->tmpp[0]; // tmps might be realloced
1908     if ( ret==-3 ) return 0;    // the tag is not present
1909     if ( ret<=0 ) return 1;     // error
1910     if ( strcmp("GT",col->hdr_key_dst) )
1911         return core_setter_format_str(args,line,col,args->tmpp);
1912 
1913     // Genotypes are internally represented as integers. This is a complication for FMT/GT:=oldGT
1914     // First determine the maximum number of alleles per-sample ndst1
1915     int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header);
1916     int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out);
1917     int isrc,idst, ndst1 = 0, nsrc1 = ret / nsmpl_src;
1918     char *ptr = args->tmps, *ptr_end = ptr + ret;
1919     while ( ptr < ptr_end )
1920     {
1921         char *smpl_end = ptr + nsrc1;
1922         int n = 1;
1923         while ( ptr < smpl_end )
1924         {
1925             if ( *ptr=='/' || *ptr=='|' ) n++;
1926             ptr++;
1927         }
1928         if ( ndst1 < n ) ndst1 = n;
1929     }
1930     assert( ndst1 );
1931 
1932     int ndst = ndst1*nsmpl_dst;
1933     hts_expand(int32_t,ndst,args->mtmpi,args->tmpi);
1934     hts_expand(char,ret+1,args->mtmps,args->tmps); args->tmps[ret] = 0; // the FORMAT string may not be 0-terminated
1935     for (idst=0; idst<nsmpl_dst; idst++)
1936     {
1937         int i = 0, is_phased = 0;
1938         int32_t *dst = args->tmpi + idst*ndst1;
1939         isrc = args->sample_map ? args->sample_map[idst] : idst;
1940         if ( isrc==-1 )
1941         {
1942             dst[0] = bcf_gt_missing;
1943             for (i=1; i<ndst1; i++) dst[i] = bcf_int32_vector_end;
1944             continue;
1945         }
1946         char *beg = args->tmps + isrc*nsrc1, *tmp;
1947         char *keep_ptr = beg+nsrc1, keep = *keep_ptr; *keep_ptr = 0;
1948         while ( *beg )
1949         {
1950             char *end = beg;
1951             while ( *end && *end!='/' && *end!='|' ) end++;
1952             if ( *beg=='.' && end-beg==1 ) dst[i] = bcf_gt_missing;
1953             else
1954             {
1955                 if ( *end=='|' ) is_phased = 1;
1956                 dst[i] = strtol(beg, &tmp, 10);
1957                 if ( tmp!=end )
1958                     error("Could not parse the %s field at %s:%"PRId64" in %s\n", col->hdr_key_src,bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1,args->targets_fname);
1959                 if ( dst[i] >= line->n_allele )
1960                     error("The source allele index is bigger than the number of destination alleles at %s:%"PRId64"\n", bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1);
1961                 dst[i] = is_phased ? bcf_gt_phased(dst[i]) : bcf_gt_unphased(dst[i]);
1962             }
1963             beg = *end ? end+1 : end;
1964             i++;
1965         }
1966         *keep_ptr = keep;
1967         for (; i<ndst1; i++) dst[i] = bcf_int32_vector_end;
1968     }
1969     return bcf_update_genotypes(args->hdr_out,line,args->tmpi,ndst);
1970 }
init_sample_map(args_t * args,bcf_hdr_t * src,bcf_hdr_t * dst)1971 static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst)
1972 {
1973     int i;
1974     if ( !args->sample_names )
1975     {
1976         args->nsmpl_annot = bcf_hdr_nsamples(dst);
1977 
1978         // tab annotation file, expecting that all samples are present: sample map not needed
1979         if ( !src ) return 0;
1980 
1981         int nmatch = 0;
1982         for (i=0; i<bcf_hdr_nsamples(src); i++)
1983         {
1984             int id = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, src->samples[i]);
1985             if ( id!=-1 ) nmatch++;
1986         }
1987         if ( !nmatch ) return -1;   // No matching samples found in the source and the destination file
1988 
1989         args->nsample_map = bcf_hdr_nsamples(dst);
1990         args->sample_map  = (int*) malloc(sizeof(int)*args->nsample_map);
1991         for (i=0; i<args->nsample_map; i++)
1992         {
1993             int id = bcf_hdr_id2int(src, BCF_DT_SAMPLE, dst->samples[i]);
1994             args->sample_map[i] = id;   // idst -> isrc, -1 if not present
1995         }
1996         return 1;
1997     }
1998 
1999     args->nsample_map = bcf_hdr_nsamples(dst);
2000     args->sample_map  = (int*) malloc(sizeof(int)*args->nsample_map);
2001     for (i=0; i<args->nsample_map; i++) args->sample_map[i] = -1;
2002 
2003     int flags = !src ? SMPL_STRICT|SMPL_SINGLE : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is vcf vs tab annotation file
2004     smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, flags);    // gives mapping dst->src
2005     if ( !ilist || !ilist->n ) error("Could not parse the samples: %s\n", args->sample_names);
2006     args->nsmpl_annot = ilist->n;
2007     int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1;
2008     for (i=0; i<args->nsmpl_annot; i++)
2009     {
2010         int idst = ilist->idx[i];
2011         const char *src_name = ilist->pair && ilist->pair[i] ? ilist->pair[i] : bcf_hdr_int2id(dst, BCF_DT_SAMPLE, idst);
2012         int isrc = i;
2013         if ( src )     // the annotation file is a VCF, not a tab-delimited file
2014         {
2015             isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE, src_name);
2016             if ( isrc==-1 ) error("Sample \"%s\" not found in the annotation file\n", src_name);
2017         }
2018         if ( isrc!=idst ) need_sample_map = 1;
2019         args->sample_map[idst] = isrc;
2020     }
2021     smpl_ilist_destroy(ilist);
2022     return need_sample_map;
2023 }
columns_complement(char * columns,void ** skip_info,void ** skip_fmt)2024 static char *columns_complement(char *columns, void **skip_info, void **skip_fmt)
2025 {
2026     kstring_t str = {0,0,0};
2027     char *ss = columns, *se = ss;
2028     while ( *ss )
2029     {
2030         if ( *se && *se!=',' ) { se++; continue; }
2031         if ( *ss!='^' )
2032         {
2033             if ( str.l ) kputc(',',&str);
2034             kputsn(ss, se-ss, &str);
2035             if ( !*se ) break;
2036             ss = ++se;
2037             continue;
2038         }
2039 
2040         if ( !strncasecmp("^INFO/",ss,6) )
2041         {
2042             if ( !*skip_info )
2043             {
2044                 *skip_info = khash_str2int_init();
2045                 if ( str.l ) kputc(',',&str);
2046                 kputs("INFO",&str);
2047             }
2048             char tmp = *se; *se = 0;
2049             khash_str2int_inc(*skip_info, strdup(ss+6));
2050             *se = tmp;
2051         }
2052         else if ( !strncasecmp("^FORMAT/",ss,8) || !strncasecmp("^FMT/",ss,5) )
2053         {
2054             int n = !strncasecmp("^FMT/",ss,5) ? 5 : 8;
2055             if ( !*skip_fmt )
2056             {
2057                 *skip_fmt = khash_str2int_init();
2058                 if ( str.l ) kputc(',',&str);
2059                 kputs("FORMAT",&str);
2060             }
2061             char tmp = *se; *se = 0;
2062             khash_str2int_inc(*skip_fmt, strdup(ss+n));
2063             *se = tmp;
2064         }
2065         else
2066         {
2067             if ( !*skip_info )
2068             {
2069                 *skip_info = khash_str2int_init();
2070                 if ( str.l ) kputc(',',&str);
2071                 kputs("INFO",&str);
2072             }
2073             char tmp = *se; *se = 0;
2074             khash_str2int_inc(*skip_info, strdup(ss+1));
2075             *se = tmp;
2076         }
2077 
2078         if ( !*se ) break;
2079         ss = ++se;
2080     }
2081     free(columns);
2082     return str.s;
2083 }
bcf_hrec_format_rename(bcf_hrec_t * hrec,char * tag,kstring_t * str)2084 static void bcf_hrec_format_rename(bcf_hrec_t *hrec, char *tag, kstring_t *str)
2085 {
2086     int j, nout = 0;
2087     ksprintf(str, "##%s=<", hrec->key);
2088     for (j=0; j<hrec->nkeys; j++)
2089     {
2090         if ( !strcmp("IDX",hrec->keys[j]) ) continue;
2091         if ( nout ) kputc(',',str);
2092         if ( !strcmp("ID", hrec->keys[j]) )
2093             ksprintf(str,"%s=%s", hrec->keys[j], tag);
2094         else
2095             ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
2096         nout++;
2097     }
2098     ksprintf(str,">\n");
2099 }
set_replace_mode(char * ss,int * replace)2100 static char *set_replace_mode(char *ss, int *replace)
2101 {
2102     int mode = 0;
2103     while (*ss)
2104     {
2105         if ( *ss=='+' ) mode |= REPLACE_MISSING;
2106         else if ( *ss=='-' ) mode |= REPLACE_NON_MISSING;
2107         else if ( *ss=='=' ) mode |= SET_OR_APPEND;
2108         else if ( *ss=='.' ) mode |= CARRY_OVER_MISSING;
2109         else break;
2110         ss++;
2111     }
2112     if ( !mode ) mode = REPLACE_ALL;
2113 // is exactly one bit set?
2114 //    if ( mode && !(mode && ((mode & mode-1) == 0)) )
2115     *replace = mode;
2116     return ss;
2117 }
init_columns(args_t * args)2118 static void init_columns(args_t *args)
2119 {
2120     int need_sample_map = 0;
2121     int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr);
2122 
2123     kstring_t tmp = {0,0,0};
2124     if ( args->columns_is_file )
2125     {
2126         int i,n;
2127         char **str = hts_readlist(args->columns, args->columns_is_file, &n);
2128         if ( !str ) error("Could not parse %s\n", args->columns);
2129         for (i=0; i<n; i++)
2130         {
2131             char *ptr = str[i];
2132             while ( *ptr && !isspace(*ptr) ) ptr++;
2133             if ( *ptr )
2134             {
2135                 *ptr = 0;
2136                 ptr++;
2137                 while ( *ptr && isspace(*ptr) ) ptr++;
2138                 if ( *ptr )
2139                 {
2140                     if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
2141                     kputs(str[i],&args->merge_method_str);
2142                     kputc(':',&args->merge_method_str);
2143                     kputs(ptr,&args->merge_method_str);
2144                 }
2145             }
2146             if ( tmp.l ) kputc(',',&tmp);
2147             kputs(str[i],&tmp);
2148             free(str[i]);
2149         }
2150         free(str);
2151         free(args->columns);
2152         args->columns = tmp.s;
2153         tmp.l = tmp.m = 0;
2154         tmp.s = NULL;
2155     }
2156 
2157     void *skip_fmt = NULL, *skip_info = NULL;
2158     if ( args->tgts_is_vcf )
2159         args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
2160 
2161     kstring_t str = {0,0,0};
2162     char *ss = args->columns, *se = ss;
2163     args->ncols = 0;
2164     int icol = -1, has_fmt_str = 0;
2165     while ( *ss )
2166     {
2167         if ( *se && *se!=',' ) { se++; continue; }
2168         int replace;
2169         ss = set_replace_mode(ss, &replace);
2170         icol++;
2171         str.l = 0;
2172         kputsn(ss, se-ss, &str);
2173         if ( !str.s[0] || !strcasecmp("-",str.s) ) ;
2174         else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol;
2175         else if ( !strcasecmp("POS",str.s) ) args->beg_idx = icol;
2176         else if ( !strcasecmp("FROM",str.s) || !strcasecmp("BEG",str.s) ) args->beg_idx = icol;
2177         else if ( !strcasecmp("TO",str.s) || !strcasecmp("END",str.s) ) args->end_idx = icol;
2178         else if ( !strcasecmp("REF",str.s) )
2179         {
2180             if ( args->tgts_is_vcf )
2181             {
2182                 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2183                 annot_col_t *col = &args->cols[args->ncols-1];
2184                 memset(col,0,sizeof(*col));
2185                 col->setter = vcf_setter_ref;
2186                 col->hdr_key_src = strdup(str.s);
2187                 col->hdr_key_dst = strdup(str.s);
2188             }
2189             else args->ref_idx = icol;
2190         }
2191         else if ( !strcasecmp("ALT",str.s) )
2192         {
2193             if ( args->tgts_is_vcf )
2194             {
2195                 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2196                 annot_col_t *col = &args->cols[args->ncols-1];
2197                 memset(col,0,sizeof(*col));
2198                 col->setter = vcf_setter_alt;
2199                 col->hdr_key_src = strdup(str.s);
2200                 col->hdr_key_dst = strdup(str.s);
2201             }
2202             else args->alt_idx = icol;
2203         }
2204         else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) )
2205         {
2206             if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
2207             if ( str.s[0]=='~' ) replace = MATCH_VALUE;
2208             if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n");
2209             args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2210             annot_col_t *col = &args->cols[args->ncols-1];
2211             memset(col,0,sizeof(*col));
2212             col->icol = icol;
2213             col->replace = replace;
2214             col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
2215             col->hdr_key_src = strdup(str.s);
2216             col->hdr_key_dst = strdup(str.s);
2217             if ( replace & MATCH_VALUE ) args->match_id = icol;
2218         }
2219         else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf )
2220         {
2221             replace = MATCH_VALUE;
2222             args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2223             annot_col_t *col = &args->cols[args->ncols-1];
2224             memset(col,0,sizeof(*col));
2225             col->icol = icol;
2226             col->replace = replace;
2227             col->setter  = NULL;
2228             col->hdr_key_src = strdup(str.s);
2229             col->hdr_key_dst = strdup(str.s);
2230             args->match_end = icol;
2231         }
2232         else if ( !strcasecmp("~POS",str.s) && !args->tgts_is_vcf )
2233         {
2234             if ( args->tgts_is_vcf ) error("Error: cannot use ~POS, position can be replaced only from a tab-delimited file\n");
2235             args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2236             annot_col_t *col = &args->cols[args->ncols-1];
2237             memset(col,0,sizeof(*col));
2238             col->icol = icol;
2239             col->replace = replace;
2240             col->setter  = setter_pos;
2241             col->hdr_key_src = strdup(str.s);
2242             col->hdr_key_dst = strdup(str.s);
2243             args->match_end = icol;
2244         }
2245         else if ( !strncasecmp("ID:=",str.s,4) )    // transfer a tag from INFO to ID column
2246         {
2247             if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s);
2248             if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
2249             args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2250             annot_col_t *col = &args->cols[args->ncols-1];
2251             memset(col,0,sizeof(*col));
2252             col->icol = icol;
2253             col->replace = replace;
2254             col->setter = vcf_setter_id;
2255             col->getter = vcf_getter_info_str2str;
2256             str.s[2] = 0;
2257             col->hdr_key_dst = strdup(str.s);
2258             col->hdr_key_src = strncasecmp("INFO/",str.s+4,5) ? strdup(str.s+4) : strdup(str.s+4+5);
2259             int hdr_id = bcf_hdr_id2int(args->tgts_hdr, BCF_DT_ID,col->hdr_key_src);
2260             if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) )
2261                 error("The INFO tag \"%s\" is not defined in %s\n", col->hdr_key_src, args->targets_fname);
2262             if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR )
2263                 error("Only Type=String tags can be used to annotate the ID column\n");
2264         }
2265         else if ( !strcasecmp("FILTER",str.s) )
2266         {
2267             if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
2268             args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2269             annot_col_t *col = &args->cols[args->ncols-1];
2270             memset(col,0,sizeof(*col));
2271             col->icol = icol;
2272             col->replace = replace;
2273             col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
2274             col->hdr_key_src = strdup(str.s);
2275             col->hdr_key_dst = strdup(str.s);
2276             if ( args->tgts_is_vcf )
2277             {
2278                 bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
2279                 int j;
2280                 for (j=0; j<tgts_hdr->nhrec; j++)
2281                 {
2282                     bcf_hrec_t *hrec = tgts_hdr->hrec[j];
2283                     if ( hrec->type!=BCF_HL_FLT ) continue;
2284                     int k = bcf_hrec_find_key(hrec,"ID");
2285                     if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__);
2286                     tmp.l = 0;
2287                     bcf_hrec_format(hrec, &tmp);
2288                     bcf_hdr_append(args->hdr_out, tmp.s);
2289                 }
2290                 if (bcf_hdr_sync(args->hdr_out) < 0)
2291                     error_errno("[%s] Failed to update header", __func__);
2292             }
2293         }
2294         else if ( !strcasecmp("QUAL",str.s) )
2295         {
2296             if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
2297             if ( replace & SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
2298             args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2299             annot_col_t *col = &args->cols[args->ncols-1];
2300             memset(col,0,sizeof(*col));
2301             col->icol = icol;
2302             col->replace = replace;
2303             col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
2304             col->hdr_key_src = strdup(str.s);
2305             col->hdr_key_dst = strdup(str.s);
2306         }
2307         else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
2308         {
2309             if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
2310             if ( replace & SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n");
2311             bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
2312             int j;
2313             for (j=0; j<tgts_hdr->nhrec; j++)
2314             {
2315                 bcf_hrec_t *hrec = tgts_hdr->hrec[j];
2316                 if ( hrec->type!=BCF_HL_INFO ) continue;
2317                 int k = bcf_hrec_find_key(hrec,"ID");
2318                 assert( k>=0 ); // this should always be true for valid VCFs
2319                 if ( skip_info && khash_str2int_has_key(skip_info,hrec->vals[k]) ) continue;
2320                 tmp.l = 0;
2321                 bcf_hrec_format(hrec, &tmp);
2322                 bcf_hdr_append(args->hdr_out, tmp.s);
2323                 if (bcf_hdr_sync(args->hdr_out) < 0)
2324                     error_errno("[%s] Failed to update header", __func__);
2325                 int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
2326                 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2327                 annot_col_t *col = &args->cols[args->ncols-1];
2328                 memset(col,0,sizeof(*col));
2329                 col->icol = -1;
2330                 col->replace = replace;
2331                 col->hdr_key_src = strdup(hrec->vals[k]);
2332                 col->hdr_key_dst = strdup(hrec->vals[k]);
2333                 col->number  = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
2334                 switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
2335                 {
2336                     case BCF_HT_FLAG:   col->setter = vcf_setter_info_flag; break;
2337                     case BCF_HT_INT:    col->setter = vcf_setter_info_int; break;
2338                     case BCF_HT_REAL:   col->setter = vcf_setter_info_real; break;
2339                     case BCF_HT_STR:    col->setter = vcf_setter_info_str; break;
2340                     default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
2341                 }
2342             }
2343         }
2344         else if ( args->tgts_is_vcf && (!strcasecmp("FORMAT",str.s) || !strcasecmp("FMT",str.s)) ) // All FORMAT fields
2345         {
2346             bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
2347             need_sample_map = 1;
2348             int j;
2349             for (j=0; j<tgts_hdr->nhrec; j++)
2350             {
2351                 bcf_hrec_t *hrec = tgts_hdr->hrec[j];
2352                 if ( hrec->type!=BCF_HL_FMT) continue;
2353                 int k = bcf_hrec_find_key(hrec,"ID");
2354                 assert( k>=0 ); // this should always be true for valid VCFs
2355                 if ( skip_fmt && khash_str2int_has_key(skip_fmt,hrec->vals[k]) ) continue;
2356                 tmp.l = 0;
2357                 bcf_hrec_format(hrec, &tmp);
2358                 bcf_hdr_append(args->hdr_out, tmp.s);
2359                 if (bcf_hdr_sync(args->hdr_out) < 0)
2360                     error_errno("[%s] Failed to update header", __func__);
2361                 int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
2362                 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2363                 annot_col_t *col = &args->cols[args->ncols-1];
2364                 memset(col,0,sizeof(*col));
2365                 col->icol = -1;
2366                 col->replace = replace;
2367                 col->hdr_key_src = strdup(hrec->vals[k]);
2368                 col->hdr_key_dst = strdup(hrec->vals[k]);
2369                 if ( !strcasecmp("GT",col->hdr_key_src) )
2370                 {
2371                     if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
2372                     col->setter = vcf_setter_format_gt;
2373                 }
2374                 else
2375                     switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
2376                     {
2377                         case BCF_HT_INT:    col->setter = vcf_setter_format_int; break;
2378                         case BCF_HT_REAL:   col->setter = vcf_setter_format_real; break;
2379                         case BCF_HT_STR:    col->setter = vcf_setter_format_str; has_fmt_str = 1; break;
2380                         default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
2381                     }
2382                 hdr_id = bcf_hdr_id2int(tgts_hdr, BCF_DT_ID, hrec->vals[k]);
2383                 col->number = bcf_hdr_id2length(tgts_hdr,BCF_HL_FMT,hdr_id);
2384             }
2385         }
2386         else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
2387         {
2388             char *key_dst = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
2389             char *key_src = strstr(key_dst,":=");
2390             if ( key_src )
2391             {
2392                 *key_src = 0;
2393                 key_src += 2;
2394                 if ( !strncasecmp("FORMAT/",key_src,7) ) key_src += 7;
2395                 else if ( !strncasecmp("FMT/",key_src,4) ) key_src += 4;
2396             }
2397             else
2398                 key_src = key_dst;
2399             need_sample_map = 1;
2400             if ( args->tgts_is_vcf )
2401             {
2402                 bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL);
2403                 if ( !hrec ) error("No such annotation \"%s\" in %s\n", key_src,args->targets_fname);
2404                 tmp.l = 0;
2405                 bcf_hrec_format_rename(hrec, key_dst, &tmp);
2406                 bcf_hdr_append(args->hdr_out, tmp.s);
2407                 if (bcf_hdr_sync(args->hdr_out) < 0)
2408                     error_errno("[%s] Failed to update header", __func__);
2409             }
2410             int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
2411             if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
2412                 error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname);
2413             args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2414             annot_col_t *col = &args->cols[args->ncols-1];
2415             memset(col,0,sizeof(*col));
2416             if ( !args->tgts_is_vcf )
2417             {
2418                 col->icol = icol;
2419                 icol += args->nsmpl_annot - 1;
2420             }
2421             else
2422                 col->icol = -1;
2423             col->replace = replace;
2424             col->hdr_key_src = strdup(key_src);
2425             col->hdr_key_dst = strdup(key_dst);
2426             if ( !strcasecmp("GT",key_src) )
2427             {
2428                 if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
2429                 col->setter = vcf_setter_format_gt;
2430             }
2431             else
2432                 switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
2433                 {
2434                     case BCF_HT_INT:    col->setter = args->tgts_is_vcf ? vcf_setter_format_int  : setter_format_int; break;
2435                     case BCF_HT_REAL:   col->setter = args->tgts_is_vcf ? vcf_setter_format_real : setter_format_real; break;
2436                     case BCF_HT_STR:    col->setter = args->tgts_is_vcf ? vcf_setter_format_str  : setter_format_str; has_fmt_str = 1; break;
2437                     default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
2438                 }
2439             if ( args->tgts_is_vcf )
2440             {
2441                 bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
2442                 hdr_id = bcf_hdr_id2int(tgts_hdr, BCF_DT_ID, col->hdr_key_src);
2443                 col->number = bcf_hdr_id2length(tgts_hdr,BCF_HL_FMT,hdr_id);
2444             }
2445         }
2446         else
2447         {
2448             if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
2449             if ( replace & SET_OR_APPEND )
2450             {
2451                 if ( args->tgts_is_vcf )
2452                     error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n"
2453                           "       (the annotation type is modified to \"Number=.\" and allele ordering is disregarded)\n");
2454                 fprintf(stderr,"Warning: the =INFO/TAG feature modifies the annotation to \"Number=.\" and disregards allele ordering\n");
2455             }
2456             int explicit_src_info = 0;
2457             int explicit_dst_info = 0;
2458             char *key_dst;
2459             if ( !strncasecmp("INFO/",str.s,5) )
2460             {
2461                 key_dst = str.s + 5;
2462                 explicit_dst_info = 1;
2463             }
2464             else if ( !strcasecmp("~INFO/END",str.s) )
2465             {
2466                 key_dst = str.s + 6;
2467                 explicit_dst_info = 1;
2468             }
2469             else
2470                 key_dst = str.s;
2471             char *key_src = strstr(key_dst,":=");
2472             if ( key_src )
2473             {
2474                 *key_src = 0;
2475                 key_src += 2;
2476                 if ( !strncasecmp("INFO/",key_src,5) )
2477                 {
2478                     key_src += 5;
2479                     explicit_src_info = 1;
2480                 }
2481                 else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) )
2482                 {
2483                     key_src[-2] = ':';
2484                     error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
2485                 }
2486             }
2487             else
2488                 key_src = key_dst;
2489 
2490             args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2491             annot_col_t *col = &args->cols[args->ncols-1];
2492             memset(col,0,sizeof(*col));
2493             col->icol = icol;
2494             col->replace = replace;
2495             col->hdr_key_src = strdup(key_src);
2496             col->hdr_key_dst = strdup(key_dst);
2497 
2498             int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
2499             if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
2500             {
2501                 if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
2502                 {
2503                     if ( !strcasecmp("ID",key_src) && !explicit_src_info )
2504                     {
2505                         // transferring ID column into a new INFO tag
2506                         tmp.l = 0;
2507                         ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred ID column\">",key_dst);
2508                     }
2509                     else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info )
2510                     {
2511                         // transferring FILTER column into a new INFO tag
2512                         tmp.l = 0;
2513                         ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred FILTER column\">",key_dst);
2514                     }
2515                     else
2516                     {
2517                         bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
2518                         if ( !hrec )
2519                         {
2520                             if ( explicit_dst_info+explicit_src_info==0 && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) )
2521                                 error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
2522                             char *ptr = strchr(key_src,'=');
2523                             if ( ptr )
2524                             {
2525                                 *ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '=';
2526                                 error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s);
2527                             }
2528                             error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname);
2529                         }
2530                         tmp.l = 0;
2531                         bcf_hrec_format_rename(hrec, key_dst, &tmp);
2532                     }
2533                     bcf_hdr_append(args->hdr_out, tmp.s);
2534                     if (bcf_hdr_sync(args->hdr_out) < 0)
2535                         error_errno("[%s] Failed to update header", __func__);
2536                     hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
2537                 }
2538                 else
2539                     error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname);
2540                 assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
2541             }
2542             if  ( args->tgts_is_vcf )
2543             {
2544                 if ( !strcasecmp("ID",key_src) && !explicit_src_info ) col->getter = vcf_getter_id2str;
2545                 else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) col->getter = vcf_getter_filter2str;
2546             }
2547             col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
2548             switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
2549             {
2550                 case BCF_HT_FLAG:   col->setter = args->tgts_is_vcf ? vcf_setter_info_flag : setter_info_flag; break;
2551                 case BCF_HT_INT:    col->setter = args->tgts_is_vcf ? vcf_setter_info_int  : setter_info_int; break;
2552                 case BCF_HT_REAL:   col->setter = args->tgts_is_vcf ? vcf_setter_info_real : setter_info_real; break;
2553                 case BCF_HT_STR:    col->setter = args->tgts_is_vcf ? vcf_setter_info_str  : setter_info_str; break;
2554                 default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
2555             }
2556             if ( replace & SET_OR_APPEND )   // change to Number=.
2557             {
2558                 bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL);
2559                 if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst);
2560                 hrec = bcf_hrec_dup(hrec);
2561                 int j = bcf_hrec_find_key(hrec, "Number");
2562                 if ( j<0 ) error("Uh, could not find the entry Number in the header record of %s\n",key_dst);
2563                 free(hrec->vals[j]);
2564                 hrec->vals[j] = strdup(".");
2565                 bcf_hdr_remove(args->hdr_out,BCF_HL_INFO, key_dst);
2566                 bcf_hdr_add_hrec(args->hdr_out, hrec);
2567             }
2568         }
2569         if ( !*se ) break;
2570         ss = ++se;
2571     }
2572     free(str.s);
2573     free(tmp.s);
2574     free(args->columns);
2575     if ( skip_info ) khash_str2int_destroy_free(skip_info);
2576     if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt);
2577     if ( has_fmt_str )
2578     {
2579         int n = bcf_hdr_nsamples(args->hdr_out);
2580         if ( args->tgts_is_vcf && n<bcf_hdr_nsamples(args->files->readers[1].header) ) n = bcf_hdr_nsamples(args->files->readers[1].header);
2581         args->tmpp  = (char**)malloc(sizeof(char*)*n);
2582         args->tmpp2 = (char**)malloc(sizeof(char*)*n);
2583     }
2584     if ( !need_sample_map )
2585     {
2586         free(args->sample_map);
2587         args->sample_map = NULL;
2588     }
2589     else if ( sample_map_ok<0 )
2590         error("No matching samples in source and destination file?\n");
2591 }
init_merge_method(args_t * args)2592 static void init_merge_method(args_t *args)
2593 {
2594     int i;
2595     for (i=0; i<args->ncols; i++)
2596     {
2597         args->cols[i].merge_method = MM_FIRST;
2598         args->cols[i].mm_str_hash = NULL;
2599         args->cols[i].mm_dbl = NULL;
2600         args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0;
2601         memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr));
2602     }
2603     if ( !args->merge_method_str.l ) return;
2604     if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n");
2605     if ( !args->tgt_idx && !args->tgts ) error("Error: BEG,END (or FROM,TO) columns or REF,ALT columns are expected with the --merge-logic option.\n");
2606     char *sb = args->merge_method_str.s;
2607     while ( *sb )
2608     {
2609         char *se = sb;
2610         while ( *se && *se!=',' ) se++;
2611         args->tmpks.l = 0;
2612         kputsn(sb, se-sb, &args->tmpks);
2613         kputc(0, &args->tmpks);
2614         char *mm_type_str = args->tmpks.s + args->tmpks.l;
2615         while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--;
2616         if ( *mm_type_str!=':' )
2617             error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str.s);
2618         *mm_type_str = 0;
2619         mm_type_str++;
2620         int mm_type = MM_FIRST;
2621         if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE;
2622         else if ( !strcasecmp("first",mm_type_str) ) mm_type = MM_FIRST;
2623         else if ( !strcasecmp("append",mm_type_str) ) mm_type = MM_APPEND;
2624         else if ( !strcasecmp("append-missing",mm_type_str) )
2625         {
2626             mm_type = MM_APPEND_MISSING;
2627             if ( args->ref_idx!=-1 ) args->has_append_mode = 1;
2628         }
2629         else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM;
2630         else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG;
2631         else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN;
2632         else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX;
2633         else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str.s,mm_type_str);
2634         for (i=0; i<args->ncols; i++)
2635         {
2636             if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue;
2637             if ( (mm_type==MM_APPEND || mm_type==MM_APPEND_MISSING) && args->cols[i].number!=BCF_VL_VAR )
2638                 error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n");
2639             args->cols[i].merge_method = mm_type;
2640             break;
2641         }
2642         if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s);
2643         sb = *se ? se + 1 : se;
2644     }
2645     if ( args->has_append_mode )
2646     {
2647         // create a missing line to insert missing values when VCF ALT finds no match in the annotation file
2648         args->aline_missing = (annot_line_t*)calloc(1,sizeof(*args->aline_missing));
2649         int ncol = 0;
2650         for (i=0; i<args->ncols; i++)
2651             if ( ncol < args->cols[i].icol + 1 ) ncol = args->cols[i].icol + 1;
2652         if ( ncol < args->ref_idx + 1 ) ncol = args->ref_idx + 1;
2653         args->aline_missing->mcols = ncol;
2654         args->aline_missing->ncols = ncol;
2655         args->aline_missing->cols = (char**) malloc(ncol*sizeof(char*));
2656         for (i=0; i<ncol; i++)
2657             args->aline_missing->cols[i] = strdup(".");
2658     }
2659 }
2660 
rename_chrs(args_t * args,char * fname)2661 static void rename_chrs(args_t *args, char *fname)
2662 {
2663     int n, i;
2664     char **map = hts_readlist(fname, 1, &n);
2665     if ( !map ) error("Could not read: %s\n", fname);
2666     for (i=0; i<n; i++)
2667     {
2668         char *ss = map[i];
2669         while ( *ss && !isspace(*ss) ) ss++;
2670         if ( !*ss ) error("Could not parse: %s\n", fname);
2671         *ss = 0;
2672         int rid = bcf_hdr_name2id(args->hdr_out, map[i]);
2673         bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_CTG, "ID", map[i], NULL);
2674         if ( !hrec ) continue;  // the sequence not present
2675         int j = bcf_hrec_find_key(hrec, "ID");
2676         assert( j>=0 );
2677         free(hrec->vals[j]);
2678         ss++;
2679         while ( *ss && isspace(*ss) ) ss++;
2680         char *se = ss;
2681         while ( *se && !isspace(*se) ) se++;
2682         *se = 0;
2683         hrec->vals[j] = strdup(ss);
2684         args->hdr_out->id[BCF_DT_CTG][rid].key = hrec->vals[j];
2685     }
2686     for (i=0; i<n; i++) free(map[i]);
2687     free(map);
2688 }
2689 
rename_annots(args_t * args,char * fname)2690 static void rename_annots(args_t *args, char *fname)
2691 {
2692     int n, i;
2693     char **map = hts_readlist(fname, 1, &n);
2694     if ( !map ) error("Could not read: %s\n", fname);
2695     for (i=0; i<n; i++)
2696     {
2697         char *sb = NULL, *ss = map[i];
2698         while ( *ss && !isspace(*ss) ) ss++;
2699         if ( !*ss ) error("Could not parse: %s\n", fname);
2700         *ss = 0;
2701         int type;
2702         if ( !strncasecmp("info/",map[i],5) ) type = BCF_HL_INFO, sb = map[i] + 5;
2703         else if ( !strncasecmp("format/",map[i],7) ) type = BCF_HL_FMT, sb = map[i] + 7;
2704         else if ( !strncasecmp("fmt/",map[i],4) ) type = BCF_HL_FMT, sb = map[i] + 4;
2705         else if ( !strncasecmp("filter/",map[i],7) ) type = BCF_HL_FLT, sb = map[i] + 7;
2706         else error("Could not parse \"%s\", expected INFO, FORMAT, or FILTER prefix for each line: %s\n",map[i],fname);
2707         int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, sb);
2708         if ( id<0 ) continue;
2709         bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL);
2710         if ( !hrec ) continue;  // the sequence not present
2711         int j = bcf_hrec_find_key(hrec, "ID");
2712         assert( j>=0 );
2713         free(hrec->vals[j]);
2714         ss++;
2715         while ( *ss && isspace(*ss) ) ss++;
2716         char *se = ss;
2717         while ( *se && !isspace(*se) ) se++;
2718         *se = 0;
2719         hrec->vals[j] = strdup(ss);
2720         args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j];
2721     }
2722     for (i=0; i<n; i++) free(map[i]);
2723     free(map);
2724 }
2725 
init_data(args_t * args)2726 static void init_data(args_t *args)
2727 {
2728     args->hdr = args->files->readers[0].header;
2729     args->hdr_out = bcf_hdr_dup(args->hdr);
2730 
2731     if ( args->set_ids_fmt )
2732     {
2733         if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; }
2734         args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt);
2735     }
2736     if ( args->remove_annots ) init_remove_annots(args);
2737     if ( args->header_fname ) init_header_lines(args);
2738     if ( args->targets_fname && args->tgts_is_vcf )
2739     {
2740         // reading annots from a VCF
2741         if ( !bcf_sr_add_reader(args->files, args->targets_fname) )
2742             error("Failed to open %s: %s\n", args->targets_fname,bcf_sr_strerror(args->files->errnum));
2743         args->tgts_hdr = args->files->readers[1].header;
2744     }
2745     if ( args->columns ) init_columns(args);
2746     if ( args->targets_fname && !args->tgts_is_vcf )
2747     {
2748         if ( !args->columns ) error("The -c option not given\n");
2749         if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n");
2750         if ( args->beg_idx==-1 ) error("The -c POS option not given\n");
2751         if ( args->single_overlaps && args->merge_method_str.l ) error("The options --merge-logic and --single-overlaps cannot be combined\n");
2752         if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str.l) )
2753         {
2754             args->end_idx = -args->beg_idx - 1;
2755             args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx);
2756             if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", args->targets_fname);
2757             if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname);
2758         }
2759         else
2760         {
2761             if ( args->ref_idx!=-1 ) error("Error: the REF columns will be ignored when BEG,END (or FROM,TO) is present. Replace END (or TO) with \"-\".\n");
2762             int len = strlen(args->targets_fname);
2763             if ( len>=7 && !strcasecmp(".bed.gz",args->targets_fname+len-7) ) args->tgt_is_bed = 1;
2764             else if ( len>=8 && !strcasecmp(".bed.bgz",args->targets_fname+len-8) ) args->tgt_is_bed = 1;
2765             else if ( len>=4 && !strcasecmp(".bed",args->targets_fname+len-4) ) args->tgt_is_bed = 1;
2766             args->tgt_idx = regidx_init(args->targets_fname,parse_with_payload,free_payload,sizeof(char*),args);
2767             if ( !args->tgt_idx ) error("Failed to parse: %s\n", args->targets_fname);
2768             args->tgt_itr = regitr_init(args->tgt_idx);
2769             args->nalines++;
2770             hts_expand0(annot_line_t,args->nalines,args->malines,args->alines);
2771         }
2772     }
2773     init_merge_method(args);
2774     args->vcmp = vcmp_init();
2775 
2776     if ( args->filter_str )
2777         args->filter = filter_init(args->hdr, args->filter_str);
2778 
2779     if ( args->mark_sites )
2780     {
2781         if ( !args->targets_fname ) error("The -a option not given\n");
2782         bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
2783             args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
2784     }
2785 
2786     if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
2787     if ( !args->drop_header )
2788     {
2789         if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
2790         if ( args->rename_annots ) rename_annots(args, args->rename_annots);
2791 
2792         char wmode[8];
2793         set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
2794         args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
2795         if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno));
2796         if ( args->n_threads )
2797             hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
2798         if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname);
2799     }
2800 }
2801 
destroy_data(args_t * args)2802 static void destroy_data(args_t *args)
2803 {
2804     int i;
2805     for (i=0; i<args->nrm; i++) free(args->rm[i].key);
2806     free(args->rm);
2807     if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
2808     if (args->vcmp) vcmp_destroy(args->vcmp);
2809     for (i=0; i<args->ncols; i++)
2810     {
2811         free(args->cols[i].hdr_key_src);
2812         free(args->cols[i].hdr_key_dst);
2813         free(args->cols[i].mm_kstr.s);
2814         if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash);
2815         free(args->cols[i].mm_dbl);
2816         free(args->cols[i].ptr);
2817     }
2818     free(args->cols);
2819     if ( args->aline_missing )
2820     {
2821         for (i=0; i<args->aline_missing->ncols; i++) free(args->aline_missing->cols[i]);
2822         free(args->aline_missing->cols);
2823         free(args->aline_missing);
2824     }
2825     for (i=0; i<args->malines; i++)
2826     {
2827         free(args->alines[i].cols);
2828         free(args->alines[i].als);
2829         free(args->alines[i].line.s);
2830     }
2831     free(args->alines);
2832     free(args->srt_alines);
2833     if ( args->tgt_idx )
2834     {
2835         regidx_destroy(args->tgt_idx);
2836         regitr_destroy(args->tgt_itr);
2837     }
2838     if ( args->tgts ) bcf_sr_regions_destroy(args->tgts);
2839     free(args->tmpks.s);
2840     free(args->tmpi);
2841     free(args->tmpf);
2842     free(args->tmps);
2843     free(args->tmpp);
2844     free(args->tmpi2);
2845     free(args->tmpf2);
2846     free(args->tmps2);
2847     free(args->tmpp2);
2848     free(args->tmpi3);
2849     free(args->tmpf3);
2850     free(args->src_smpl_pld);
2851     free(args->dst_smpl_pld);
2852     if ( args->set_ids )
2853         convert_destroy(args->set_ids);
2854     if ( args->filter )
2855         filter_destroy(args->filter);
2856     if (args->out_fh) hts_close(args->out_fh);
2857     free(args->sample_map);
2858     free(args->merge_method_str.s);
2859 }
2860 
parse_annot_line(args_t * args,char * str,annot_line_t * tmp)2861 static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp)
2862 {
2863     tmp->line.l = 0;
2864     kputs(str, &tmp->line);
2865     char *s = tmp->line.s;
2866     tmp->ncols = 1;
2867     hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols);
2868     tmp->cols[0] = s;
2869     while ( *s )
2870     {
2871         if ( *s=='\t' )
2872         {
2873             tmp->ncols++;
2874             hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols);
2875             tmp->cols[tmp->ncols-1] = s+1;
2876             *s = 0;
2877         }
2878         s++;
2879     }
2880     if ( args->ref_idx != -1 )
2881     {
2882         if ( args->ref_idx >= tmp->ncols )
2883             error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,str);
2884         if ( args->alt_idx >= tmp->ncols )
2885             error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,str);
2886         tmp->nals = 2;
2887         hts_expand(char*,tmp->nals,tmp->mals,tmp->als);
2888         tmp->als[0] = tmp->cols[args->ref_idx];
2889         tmp->als[1] = s = tmp->cols[args->alt_idx];
2890         while ( *s )
2891         {
2892             if ( *s==',' )
2893             {
2894                 tmp->nals++;
2895                 hts_expand(char*,tmp->nals,tmp->mals,tmp->als);
2896                 tmp->als[tmp->nals-1] = s+1;
2897                 *s = 0;
2898             }
2899             s++;
2900         }
2901     }
2902 }
buffer_annot_lines(args_t * args,bcf1_t * line,int start_pos,int end_pos)2903 static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int end_pos)
2904 {
2905     if ( args->nalines && args->alines[0].rid != line->rid ) args->nalines = 0;
2906 
2907     int i = 0;
2908     while ( i<args->nalines )
2909     {
2910         if ( line->pos > args->alines[i].end )
2911         {
2912             args->nalines--;
2913             if ( args->nalines && i<args->nalines )
2914             {
2915                 annot_line_t tmp = args->alines[i];
2916                 memmove(&args->alines[i],&args->alines[i+1],(args->nalines-i)*sizeof(annot_line_t));
2917                 args->alines[args->nalines] = tmp;
2918             }
2919         }
2920         else i++;
2921     }
2922     if ( args->ref_idx==-1 && args->nalines ) return;
2923 
2924     while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
2925     {
2926         if ( args->nalines + 1 == 0xffff ) break;   // likely a symbolic allele, don't let the buffer overflow
2927         args->nalines++;
2928         hts_expand0(annot_line_t,args->nalines,args->malines,args->alines);
2929         annot_line_t *tmp = &args->alines[args->nalines-1];
2930         tmp->rid   = line->rid;
2931         tmp->start = args->tgts->start;
2932         tmp->end   = args->tgts->end;
2933         parse_annot_line(args, args->tgts->line.s, tmp);
2934         if ( args->ref_idx != -1 )
2935         {
2936             int iseq = args->tgts->iseq;
2937             if ( bcf_sr_regions_next(args->tgts)<0 || args->tgts->iseq!=iseq ) break;
2938         }
2939         else break;
2940     }
2941 }
2942 
2943 // search string in semicolon separated strings (xx vs aa;bb)
str_match(char * needle,char * haystack)2944 static int str_match(char *needle, char *haystack)
2945 {
2946     int len = strlen(needle);
2947     char *ptr = haystack;
2948     while ( *ptr && (ptr=strstr(ptr,needle)) )
2949     {
2950         if ( ptr[len]!=0 && ptr[len]!=';' ) ptr++;          // a prefix, not a match
2951         else if ( ptr==haystack || ptr[-1]==';' ) return 1; // a match
2952         ptr++;  // a suffix, not a match
2953     }
2954     return 0;
2955 }
2956 // search common string in semicolon separated strings (xx;yy;zz vs aa;bb)
strstr_match(char * a,char * b)2957 static int strstr_match(char *a, char *b)
2958 {
2959     char *beg = a;
2960     while ( *beg )
2961     {
2962         char *end = beg;
2963         while ( *end && *end!=';' ) end++;
2964         char tmp = *end;
2965         if ( *end==';' ) *end = 0;
2966         int ret = str_match(beg,b);
2967         *end = tmp;
2968         if ( ret || !*end ) return ret;
2969         beg = end + 1;
2970     }
2971     return 0;
2972 }
annotate(args_t * args,bcf1_t * line)2973 static void annotate(args_t *args, bcf1_t *line)
2974 {
2975     int i, j;
2976     for (i=0; i<args->nrm; i++)
2977         args->rm[i].handler(args, line, &args->rm[i]);
2978 
2979     int has_overlap = 0;
2980     if ( args->tgt_idx )
2981     {
2982         for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
2983         if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
2984         {
2985             while ( regitr_overlap(args->tgt_itr) )
2986             {
2987                 annot_line_t *tmp = &args->alines[0];
2988                 tmp->rid   = line->rid;
2989                 tmp->start = args->tgt_itr->beg;
2990                 tmp->end   = args->tgt_itr->end;
2991                 parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp);
2992                 for (j=0; j<args->ncols; j++)
2993                 {
2994                     if ( args->cols[j].done==1 ) continue;
2995                     int ret = args->cols[j].setter(args,line,&args->cols[j],tmp);
2996                     if ( ret < 0 )
2997                         error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
2998                     if ( ret==0 )
2999                         args->cols[j].done = 1;
3000                 }
3001             }
3002             has_overlap = 1;
3003         }
3004         for (j=0; j<args->ncols; j++)
3005         {
3006             if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
3007             if ( !args->cols[j].setter ) continue;
3008             if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
3009                 error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3010         }
3011     }
3012     else if ( args->tgts )
3013     {
3014         // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one
3015         // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the
3016         // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found
3017         // for an ALT, missing value is appended instead.
3018         int end_pos = line->pos + line->rlen - 1;
3019         buffer_annot_lines(args, line, line->pos, end_pos);
3020 
3021         args->nsrt_alines = 0;
3022         hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines);
3023         if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
3024             error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3025 
3026         kstring_t match_end = {0,0,0};
3027         if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 )
3028             kputw(args->tmpi[0],&match_end);
3029 
3030         // Find matching lines
3031         for (i=0; i<args->nalines; i++)
3032         {
3033             if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
3034             if ( args->ref_idx != -1 )  // REF+ALT matching requested
3035             {
3036                 if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue;   // refs are not compatible
3037                 for (j=1; j<args->alines[i].nals; j++)
3038                 {
3039                     int ialt;
3040                     if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 )  // match: no ALT allele in VCF and annot file has "."
3041                         ialt = 0;
3042                     else
3043                     {
3044                         ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]);
3045                         if ( ialt < 0 ) continue;
3046                         ialt++;
3047                     }
3048                     if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
3049                     if ( match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
3050                     args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
3051                     has_overlap = 1;
3052                     break;
3053                 }
3054             }
3055             else    // overlap, REF+ALT matching not requested
3056             {
3057                 args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i;
3058                 has_overlap = 1;
3059             }
3060         }
3061 
3062         free(match_end.s);
3063 
3064         // Sort lines if needed
3065         if ( args->has_append_mode )
3066         {
3067             // insertion sort by VCF ALT index (top bits) and alines index (low bits)
3068             uint32_t tmp;
3069             for (i=1; i<args->nsrt_alines; i++)
3070                 for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--)
3071                     tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp;
3072         }
3073         // Annotate
3074         for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
3075         int ialt_exp = 1;
3076         for (i=0; i<args->nsrt_alines; i++)
3077         {
3078             int ialt = args->srt_alines[i] >> 16;
3079             int ilin = args->srt_alines[i] & 0xffff;
3080             if ( args->has_append_mode )
3081             {
3082                 if ( ialt_exp > ialt ) continue;    // multiple annotation lines for the same position
3083                 if ( ialt_exp < ialt )
3084                 {
3085                     // REF+ALT matching requested, append-missing mode: insert "." if no annotation line was found for the ALT
3086                     while ( ialt_exp++ < ialt )
3087                     {
3088                         for (j=0; j<args->ncols; j++)
3089                         {
3090                             if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
3091                             if ( args->cols[j].done==1 ) continue;
3092                             if ( !args->cols[j].setter ) continue;
3093                             int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
3094                             if ( ret < 0 )
3095                                 error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3096                             if ( ret==0 )
3097                                 args->cols[j].done = 1;
3098                         }
3099                     }
3100                 }
3101             }
3102             for (j=0; j<args->ncols; j++)
3103             {
3104                 if ( args->cols[j].done==1 ) continue;
3105                 if ( !args->cols[j].setter ) continue;
3106                 int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
3107                 if ( ret < 0 )
3108                     error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3109                 if ( ret==0 )
3110                     args->cols[j].done = 1;
3111             }
3112             ialt_exp = ialt + 1;
3113         }
3114         if ( args->nsrt_alines )
3115         {
3116             // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one
3117             // record was found. Otherwise leave the row will be left without annotation.
3118             if ( args->has_append_mode && ialt_exp < line->n_allele )
3119             {
3120                 while ( ialt_exp++ < line->n_allele )
3121                 {
3122                     for (j=0; j<args->ncols; j++)
3123                     {
3124                         if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
3125                         if ( args->cols[j].done==1 ) continue;
3126                         if ( !args->cols[j].setter ) continue;
3127                         int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
3128                         if ( ret < 0 )
3129                             error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3130                         if ( ret==0 )
3131                             args->cols[j].done = 1;
3132                     }
3133                 }
3134             }
3135             // Flush
3136             for (j=0; j<args->ncols; j++)
3137             {
3138                 if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
3139                 if ( !args->cols[j].setter ) continue;
3140                 int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
3141                 if ( ret < 0 )
3142                     error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3143             }
3144         }
3145     }
3146     else if ( args->files->nreaders == 2 )
3147     {
3148         if ( bcf_sr_has_line(args->files,1) )
3149         {
3150             bcf1_t *aline = bcf_sr_get_line(args->files,1);
3151             for (j=0; j<args->ncols; j++)
3152             {
3153                 if ( !args->cols[j].setter ) continue;
3154                 if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
3155                     error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3156             }
3157 
3158             has_overlap = 1;
3159         }
3160     }
3161     if ( args->set_ids )
3162     {
3163         args->tmpks.l = 0;
3164         convert_line(args->set_ids, line, &args->tmpks);
3165         if ( args->tmpks.l )
3166         {
3167             int replace = 0;
3168             if ( args->set_ids_replace ) replace = 1;
3169             else if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) ) replace = 1;
3170             if ( replace )
3171                 bcf_update_id(args->hdr_out,line,args->tmpks.s);
3172         }
3173     }
3174 
3175     if ( args->mark_sites )
3176     {
3177         // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87
3178         if ( args->mark_sites_logic==MARK_LISTED )
3179             bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0);
3180         else
3181             bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?0:1);
3182     }
3183 }
3184 
usage(args_t * args)3185 static void usage(args_t *args)
3186 {
3187     fprintf(stderr, "\n");
3188     fprintf(stderr, "About:   Annotate and edit VCF/BCF files.\n");
3189     fprintf(stderr, "Usage:   bcftools annotate [options] <in.vcf.gz>\n");
3190     fprintf(stderr, "\n");
3191     fprintf(stderr, "Options:\n");
3192     fprintf(stderr, "   -a, --annotations FILE          VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n");
3193     fprintf(stderr, "       --collapse STR              Matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
3194     fprintf(stderr, "   -c, --columns LIST              List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
3195     fprintf(stderr, "   -C, --columns-file FILE         Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n");
3196     fprintf(stderr, "   -e, --exclude EXPR              Exclude sites for which the expression is true (see man page for details)\n");
3197     fprintf(stderr, "       --force                     Continue despite parsing error (at your own risk!)\n");
3198     fprintf(stderr, "   -h, --header-lines FILE         Lines which should be appended to the VCF header\n");
3199     fprintf(stderr, "   -I, --set-id [+]FORMAT          Set ID column using a `bcftools query`-like expression, see man page for details\n");
3200     fprintf(stderr, "   -i, --include EXPR              Select sites for which the expression is true (see man page for details)\n");
3201     fprintf(stderr, "   -k, --keep-sites                Leave -i/-e sites unchanged instead of discarding them\n");
3202     fprintf(stderr, "   -l, --merge-logic TAG:TYPE      Merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
3203     fprintf(stderr, "   -m, --mark-sites [+-]TAG        Add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
3204     fprintf(stderr, "       --no-version                Do not append version and command line to the header\n");
3205     fprintf(stderr, "   -o, --output FILE               Write output to a file [standard output]\n");
3206     fprintf(stderr, "   -O, --output-type u|b|v|z[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
3207     fprintf(stderr, "   -r, --regions REGION            Restrict to comma-separated list of regions\n");
3208     fprintf(stderr, "   -R, --regions-file FILE         Restrict to regions listed in FILE\n");
3209     fprintf(stderr, "       --regions-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
3210     fprintf(stderr, "       --rename-annots FILE        Rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n");
3211     fprintf(stderr, "       --rename-chrs FILE          Rename sequences according to the mapping: old\\tnew\n");
3212     fprintf(stderr, "   -s, --samples [^]LIST           Comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
3213     fprintf(stderr, "   -S, --samples-file [^]FILE      File of samples to annotate (or exclude with \"^\" prefix)\n");
3214     fprintf(stderr, "       --single-overlaps           Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
3215     fprintf(stderr, "   -x, --remove LIST               List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
3216     fprintf(stderr, "       --threads INT               Number of extra output compression threads [0]\n");
3217     fprintf(stderr, "\n");
3218     fprintf(stderr, "Examples:\n");
3219     fprintf(stderr, "   http://samtools.github.io/bcftools/howtos/annotate.html\n");
3220     fprintf(stderr, "\n");
3221     exit(1);
3222 }
3223 
main_vcfannotate(int argc,char * argv[])3224 int main_vcfannotate(int argc, char *argv[])
3225 {
3226     int c;
3227     args_t *args  = (args_t*) calloc(1,sizeof(args_t));
3228     args->argc    = argc; args->argv = argv;
3229     args->files   = bcf_sr_init();
3230     args->output_fname = "-";
3231     args->output_type = FT_VCF;
3232     args->n_threads = 0;
3233     args->record_cmd_line = 1;
3234     args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1;
3235     args->set_ids_replace = 1;
3236     args->match_id = -1;
3237     args->clevel = -1;
3238     int regions_is_file = 0, collapse = 0;
3239     int regions_overlap = 1;
3240 
3241     static struct option loptions[] =
3242     {
3243         {"keep-sites",no_argument,NULL,'k'},
3244         {"mark-sites",required_argument,NULL,'m'},
3245         {"set-id",required_argument,NULL,'I'},
3246         {"output",required_argument,NULL,'o'},
3247         {"output-type",required_argument,NULL,'O'},
3248         {"threads",required_argument,NULL,9},
3249         {"annotations",required_argument,NULL,'a'},
3250         {"merge-logic",required_argument,NULL,'l'},
3251         {"collapse",required_argument,NULL,2},
3252         {"include",required_argument,NULL,'i'},
3253         {"exclude",required_argument,NULL,'e'},
3254         {"regions",required_argument,NULL,'r'},
3255         {"regions-file",required_argument,NULL,'R'},
3256         {"regions-overlap",required_argument,NULL,3},
3257         {"remove",required_argument,NULL,'x'},
3258         {"columns-file",required_argument,NULL,'C'},
3259         {"columns",required_argument,NULL,'c'},
3260         {"rename-annots",required_argument,NULL,11},
3261         {"rename-chrs",required_argument,NULL,1},
3262         {"header-lines",required_argument,NULL,'h'},
3263         {"samples",required_argument,NULL,'s'},
3264         {"samples-file",required_argument,NULL,'S'},
3265         {"single-overlaps",no_argument,NULL,10},
3266         {"no-version",no_argument,NULL,8},
3267         {"force",no_argument,NULL,'f'},
3268         {NULL,0,NULL,0}
3269     };
3270     char *tmp;
3271     while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
3272     {
3273         switch (c) {
3274             case 'f': args->force = 1; break;
3275             case 'k': args->keep_sites = 1; break;
3276             case 'm':
3277                 args->mark_sites_logic = MARK_LISTED;
3278                 if ( optarg[0]=='+' ) args->mark_sites = optarg+1;
3279                 else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; }
3280                 else args->mark_sites = optarg;
3281                 break;
3282             case 'l':
3283                 if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
3284                 kputs(optarg,&args->merge_method_str);
3285                 break;
3286             case 'I': args->set_ids_fmt = optarg; break;
3287             case 's': args->sample_names = optarg; break;
3288             case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
3289             case 'c': args->columns = strdup(optarg); break;
3290             case 'C': args->columns = strdup(optarg); args->columns_is_file = 1; break;
3291             case 'o': args->output_fname = optarg; break;
3292             case 'O':
3293                 switch (optarg[0]) {
3294                     case 'b': args->output_type = FT_BCF_GZ; break;
3295                     case 'u': args->output_type = FT_BCF; break;
3296                     case 'z': args->output_type = FT_VCF_GZ; break;
3297                     case 'v': args->output_type = FT_VCF; break;
3298                     default:
3299                     {
3300                         args->clevel = strtol(optarg,&tmp,10);
3301                         if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
3302                     }
3303                 };
3304                 if ( optarg[1] )
3305                 {
3306                     args->clevel = strtol(optarg+1,&tmp,10);
3307                     if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
3308                 }
3309                 break;
3310             case 'e':
3311                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
3312                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
3313             case 'i':
3314                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
3315                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
3316             case 'x': args->remove_annots = optarg; break;
3317             case 'a': args->targets_fname = optarg; break;
3318             case 'r': args->regions_list = optarg; break;
3319             case 'R': args->regions_list = optarg; regions_is_file = 1; break;
3320             case 'h': args->header_fname = optarg; break;
3321             case  1 : args->rename_chrs = optarg; break;
3322             case  2 :
3323                 if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
3324                 else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
3325                 else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
3326                 else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
3327                 else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
3328                 else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
3329                 else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE;
3330                 else error("The --collapse string \"%s\" not recognised.\n", optarg);
3331                 break;
3332             case  3 :
3333                 if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
3334                 else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
3335                 else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
3336                 else error("Could not parse: --regions-overlap %s\n",optarg);
3337                 break;
3338             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
3339             case  8 : args->record_cmd_line = 0; break;
3340             case 10 : args->single_overlaps = 1; break;
3341             case 11 : args->rename_annots = optarg; break;
3342             case '?': usage(args); break;
3343             default: error("Unknown argument: %s\n", optarg);
3344         }
3345     }
3346 
3347     char *fname = NULL;
3348     if ( optind>=argc )
3349     {
3350         if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
3351         else usage(args);
3352     }
3353     else fname = argv[optind];
3354 
3355     if ( args->regions_list )
3356     {
3357         bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
3358         if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
3359             error("Failed to read the regions: %s\n", args->regions_list);
3360     }
3361     if ( args->targets_fname )
3362     {
3363         htsFile *fp = hts_open(args->targets_fname,"r");
3364         if ( !fp ) error("Failed to open %s\n", args->targets_fname);
3365         htsFormat type = *hts_get_format(fp);
3366         hts_close(fp);
3367 
3368         if ( type.format==vcf || type.format==bcf )
3369         {
3370             args->tgts_is_vcf = 1;
3371             args->files->require_index = 1;
3372             args->files->collapse = collapse ? collapse : COLLAPSE_SOME;
3373         }
3374     }
3375     if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
3376     if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
3377 
3378     static int line_errcode_warned = 0;
3379     init_data(args);
3380     while ( bcf_sr_next_line(args->files) )
3381     {
3382         if ( !bcf_sr_has_line(args->files,0) ) continue;
3383         bcf1_t *line = bcf_sr_get_line(args->files,0);
3384         if ( line->errcode )
3385         {
3386             if ( !args->force )
3387                 error("Encountered an error, cannot proceed. Please check the error output above.\n"
3388                       "If feeling adventurous, use the --force option. (At your own risk!)\n");
3389             else if ( !line_errcode_warned )
3390             {
3391                 fprintf(stderr,
3392                     "Warning: Encountered an error, proceeding only because --force was given.\n"
3393                     "         Note that this can result in a segfault or a silent corruption of the output file!\n");
3394                 line_errcode_warned = 1;
3395                 line->errcode = 0;
3396             }
3397         }
3398         if ( args->filter )
3399         {
3400             int pass = filter_test(args->filter, line, NULL);
3401             if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
3402             if ( !pass )
3403             {
3404                 if ( args->keep_sites && bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname);
3405                 continue;
3406             }
3407         }
3408         annotate(args, line);
3409         if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname);
3410     }
3411     destroy_data(args);
3412     bcf_sr_destroy(args->files);
3413     free(args);
3414     return 0;
3415 }
3416