1 /* vcfannotate.c -- Annotate and edit VCF/BCF files.
2
3 Copyright (C) 2013-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE. */
24
25 #include <stdio.h>
26 #include <strings.h>
27 #include <unistd.h>
28 #include <getopt.h>
29 #include <assert.h>
30 #include <ctype.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <sys/stat.h>
34 #include <sys/types.h>
35 #include <dirent.h>
36 #include <math.h>
37 #include <inttypes.h>
38 #include <htslib/vcf.h>
39 #include <htslib/synced_bcf_reader.h>
40 #include <htslib/kseq.h>
41 #include <htslib/khash_str2int.h>
42 #include "bcftools.h"
43 #include "vcmp.h"
44 #include "filter.h"
45 #include "convert.h"
46 #include "smpl_ilist.h"
47 #include "regidx.h"
48
49 struct _args_t;
50
51 typedef struct _rm_tag_t
52 {
53 char *key;
54 int hdr_id;
55 void (*handler)(struct _args_t *, bcf1_t *, struct _rm_tag_t *);
56 }
57 rm_tag_t;
58
59 typedef struct
60 {
61 char **cols;
62 int ncols, mcols;
63 char **als;
64 int nals, mals;
65 kstring_t line;
66 int rid, start, end;
67 }
68 annot_line_t;
69
70 #define REPLACE_MISSING (1<<0) // -c +TAG .. replace only missing values
71 #define REPLACE_ALL (1<<1) // -c TAG .. replace both missing and existing values
72 #define REPLACE_NON_MISSING (1<<2) // -c -TAG .. replace only if tgt is not missing
73 #define SET_OR_APPEND (1<<3) // -c =TAG .. set new value if missing or non-existent, append otherwise
74 #define MATCH_VALUE (1<<4) // -c ~ID .. do not set, just match the value
75 #define CARRY_OVER_MISSING (1<<5) // -c .TAG .. carry over source missing values as well
76 #define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest
77 #define MM_APPEND 1 // append, possibly multiple times
78 #define MM_UNIQUE 2 // append, only unique values
79 #define MM_SUM 3
80 #define MM_AVG 4
81 #define MM_MIN 5
82 #define MM_MAX 6
83 #define MM_APPEND_MISSING 7 // missing values will be transferred as well
84 typedef struct _annot_col_t
85 {
86 int icol, replace, number; // number: one of BCF_VL_* types
87 char *hdr_key_src, *hdr_key_dst;
88 // The setters return 0 on successful update of the bcf record, negative value (bcf_update_* return status) on errors,
89 // or 1 on (repeated partial updates) concluded with a src=NULL call
90 int (*setter)(struct _args_t *, bcf1_t *dst, struct _annot_col_t *, void *src); // the last is the annotation line, either src bcf1_t or annot_line_t
91 int (*getter)(struct _args_t *, bcf1_t *src, struct _annot_col_t *, void **ptr, int *mptr);
92 int merge_method; // one of the MM_* defines
93 khash_t(str2int) *mm_str_hash; // lookup table to ensure uniqueness of added string values
94 kstring_t mm_kstr;
95 size_t
96 mm_dbl_nalloc, // the allocated size --merge-logic values array
97 mm_dbl_nused, // the number of used elements in the mm_dbl array
98 mm_dbl_ndat; // the number of merged rows (for calculating the average)
99 double
100 *mm_dbl;
101 void *ptr;
102 int mptr, done;
103 }
104 annot_col_t;
105
106 // Logic of the filters: include or exclude sites which match the filters?
107 #define FLT_INCLUDE 1
108 #define FLT_EXCLUDE 2
109
110 #define MARK_LISTED 1
111 #define MARK_UNLISTED 2
112
113 typedef struct _args_t
114 {
115 bcf_srs_t *files;
116 bcf_hdr_t *hdr, *hdr_out, *tgts_hdr;
117 htsFile *out_fh;
118 int output_type, n_threads, clevel;
119 bcf_sr_regions_t *tgts;
120
121 regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns
122 regitr_t *tgt_itr;
123 int tgt_is_bed;
124
125 filter_t *filter;
126 char *filter_str;
127 int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
128 int keep_sites;
129
130 rm_tag_t *rm; // tags scheduled for removal
131 int nrm;
132 int flt_keep_pass; // when all filters removed, reset to PASS
133
134 vcmp_t *vcmp; // for matching annotation and VCF lines by allele
135 annot_line_t *alines; // buffered annotation lines
136 annot_line_t *aline_missing;
137 uint32_t *srt_alines; // sorted indexes (iALT<<16 || iAline)
138 int nalines, malines, nsrt_alines, msrt_alines;
139 int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present
140 annot_col_t *cols; // column indexes and setters
141 int ncols;
142 int match_id; // set iff `-c ~ID` given
143 int match_end; // set iff `-c ~INFO/END` is given
144
145 char *set_ids_fmt;
146 convert_t *set_ids;
147 int set_ids_replace;
148
149 int nsmpl_annot;
150 int *sample_map, nsample_map, sample_is_file; // map[idst] -> isrc
151 uint8_t *src_smpl_pld, *dst_smpl_pld; // for Number=G format fields
152 int mtmpi, mtmpf, mtmps;
153 int mtmpi2, mtmpf2, mtmps2;
154 int mtmpi3, mtmpf3, mtmps3;
155 int32_t *tmpi, *tmpi2, *tmpi3;
156 float *tmpf, *tmpf2, *tmpf3;
157 char *tmps, *tmps2, **tmpp, **tmpp2;
158 kstring_t tmpks;
159
160 char **argv, *output_fname, *targets_fname, *regions_list, *header_fname;
161 char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites;
162 kstring_t merge_method_str;
163 int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps;
164 int columns_is_file, has_append_mode;
165 }
166 args_t;
167
168 char *msprintf(const char *fmt, ...);
169
parse_with_payload(const char * line,char ** chr_beg,char ** chr_end,uint32_t * beg,uint32_t * end,void * payload,void * usr)170 int parse_with_payload(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
171 {
172 args_t *args = (args_t*) usr;
173 int ret = args->tgt_is_bed ? regidx_parse_bed(line, chr_beg, chr_end, beg, end, NULL, NULL) : regidx_parse_tab(line, chr_beg, chr_end, beg, end, NULL, NULL);
174 if ( ret<0 ) return ret;
175 *((char **)payload) = strdup(line);
176 return 0;
177 }
free_payload(void * payload)178 void free_payload(void *payload)
179 {
180 char *str = *((char**)payload);
181 free(str);
182 }
183
remove_id(args_t * args,bcf1_t * line,rm_tag_t * tag)184 void remove_id(args_t *args, bcf1_t *line, rm_tag_t *tag)
185 {
186 bcf_update_id(args->hdr,line,NULL);
187 }
remove_filter(args_t * args,bcf1_t * line,rm_tag_t * tag)188 void remove_filter(args_t *args, bcf1_t *line, rm_tag_t *tag)
189 {
190 if ( tag->key && tag->hdr_id<0 )
191 {
192 error("Error: Cannot proceed, not even with the --force option, bad things could happen.\n"
193 " Note that \"bcftools annotate -x FILTER\" can be used to remove ALL filters.\n"
194 " Even better, use \"bcftools view -h\" and \"bcftools reheader\" to fix the header!\n"
195 );
196 }
197 if ( !tag->key ) bcf_update_filter(args->hdr, line, NULL, args->flt_keep_pass);
198 else bcf_remove_filter(args->hdr, line, tag->hdr_id, args->flt_keep_pass);
199 }
remove_qual(args_t * args,bcf1_t * line,rm_tag_t * tag)200 void remove_qual(args_t *args, bcf1_t *line, rm_tag_t *tag)
201 {
202 bcf_float_set_missing(line->qual);
203 }
remove_info(args_t * args,bcf1_t * line,rm_tag_t * tag)204 void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag)
205 {
206 // remove all INFO fields
207 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
208
209 int i;
210 for (i=0; i<line->n_info; i++)
211 {
212 bcf_info_t *inf = &line->d.info[i];
213 if ( !strcmp("END",bcf_hdr_int2id(args->hdr,BCF_DT_ID,inf->key)) )
214 line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
215 if ( inf->vptr_free )
216 {
217 free(inf->vptr - inf->vptr_off);
218 inf->vptr_free = 0;
219 }
220 line->d.shared_dirty |= BCF1_DIRTY_INF;
221 inf->vptr = NULL;
222 inf->vptr_off = inf->vptr_len = 0;
223 }
224 }
remove_info_tag(args_t * args,bcf1_t * line,rm_tag_t * tag)225 void remove_info_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
226 {
227 bcf_update_info(args->hdr, line, tag->key, NULL, 0, BCF_HT_INT); // the type does not matter with n=0
228 }
remove_format_tag(args_t * args,bcf1_t * line,rm_tag_t * tag)229 void remove_format_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
230 {
231 bcf_update_format(args->hdr, line, tag->key, NULL, 0, BCF_HT_INT); // the type does not matter with n=0
232 }
remove_format(args_t * args,bcf1_t * line,rm_tag_t * tag)233 void remove_format(args_t *args, bcf1_t *line, rm_tag_t *tag)
234 {
235 // remove all FORMAT fields except GT
236 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
237
238 int i;
239 for (i=0; i<line->n_fmt; i++)
240 {
241 bcf_fmt_t *fmt = &line->d.fmt[i];
242 const char *key = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id);
243 if ( key[0]=='G' && key[1]=='T' && !key[2] ) continue;
244
245 if ( fmt->p_free )
246 {
247 free(fmt->p - fmt->p_off);
248 fmt->p_free = 0;
249 }
250 line->d.indiv_dirty = 1;
251 fmt->p = NULL;
252 }
253 }
254
255 #include "htslib/khash.h"
256 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
257 typedef khash_t(vdict) vdict_t;
258
remove_hdr_lines(bcf_hdr_t * hdr,int type)259 static void remove_hdr_lines(bcf_hdr_t *hdr, int type)
260 {
261 int i = 0, nrm = 0;
262 while ( i<hdr->nhrec )
263 {
264 if ( hdr->hrec[i]->type!=type ) { i++; continue; }
265 bcf_hrec_t *hrec = hdr->hrec[i];
266 if ( type==BCF_HL_FMT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
267 {
268 // everything except FORMAT/GT
269 int id = bcf_hrec_find_key(hrec, "ID");
270 if ( id>=0 )
271 {
272 if ( type==BCF_HL_FMT && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
273 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
274 khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[id]);
275 kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
276 kh_val(d, k).info[type] |= 0xf;
277 }
278 }
279 nrm++;
280 hdr->nhrec--;
281 if ( i < hdr->nhrec )
282 memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
283 bcf_hrec_destroy(hrec);
284 }
285 if ( nrm ) {
286 if (bcf_hdr_sync(hdr) < 0)
287 error_errno("[%s] Failed to update header", __func__);
288 }
289 }
290
init_remove_annots(args_t * args)291 static void init_remove_annots(args_t *args)
292 {
293 int keep_info = 0, keep_fmt = 0, keep_flt = 0;
294 void *keep = khash_str2int_init();
295 kstring_t str = {0,0,0};
296 char *ss = args->remove_annots;
297
298 int i, ntags, needs_info = 0;
299 if ( args->set_ids )
300 {
301 const char **tags = convert_list_used_tags(args->set_ids,&ntags);
302 for (i=0; i<ntags; i++)
303 if ( !strncmp("INFO/",tags[i],4) ) needs_info = 1;
304 }
305
306 while ( *ss )
307 {
308 args->nrm++;
309 args->rm = (rm_tag_t*) realloc(args->rm,sizeof(rm_tag_t)*args->nrm);
310 rm_tag_t *tag = &args->rm[args->nrm-1];
311 tag->key = NULL;
312
313 int type = BCF_HL_GEN;
314 if ( !strncasecmp("INFO/",ss,5) ) { type = BCF_HL_INFO; ss += 5; }
315 else if ( !strncasecmp("INF/",ss,4) ) { type = BCF_HL_INFO; ss += 4; }
316 else if ( !strncasecmp("FORMAT/",ss,7) ) { type = BCF_HL_FMT; ss += 7; }
317 else if ( !strncasecmp("FMT/",ss,4) ) { type = BCF_HL_FMT; ss += 4; }
318 else if ( !strncasecmp("FILTER/",ss,7) ) { type = BCF_HL_FLT; ss += 7; }
319 else if ( !strncasecmp("^INFO/",ss,6) ) { type = BCF_HL_INFO; ss += 6; keep_info = 1; }
320 else if ( !strncasecmp("^INF/",ss,5) ) { type = BCF_HL_INFO; ss += 5; keep_info = 1; }
321 else if ( !strncasecmp("^FORMAT/",ss,8) ) { type = BCF_HL_FMT; ss += 8; keep_fmt = 1; }
322 else if ( !strncasecmp("^FMT/",ss,5) ) { type = BCF_HL_FMT; ss += 5; keep_fmt = 1; }
323 else if ( !strncasecmp("^FILTER/",ss,8) ) { type = BCF_HL_FLT; ss += 8; keep_flt = 1; }
324
325 char *se = ss;
326 while ( *se && *se!=',' ) se++;
327 str.l = 0;
328 kputsn(ss, se-ss, &str);
329
330 if ( type==BCF_HL_FLT )
331 {
332 if ( !keep_flt )
333 {
334 args->flt_keep_pass = 1;
335 tag->handler = remove_filter;
336 tag->key = strdup(str.s);
337 tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key);
338 if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) )
339 {
340 if ( args->keep_sites )
341 error("Error: The filter \"%s\" is not defined in the header, cannot use the -k option\n", str.s);
342 else
343 fprintf(stderr,"Warning: The filter \"%s\" is not defined in the header\n", str.s);
344 }
345 else if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
346 }
347 else
348 {
349 int value, ret = khash_str2int_get(keep, str.s, &value);
350 if ( ret==-1 ) khash_str2int_set(keep, strdup(str.s),1<<BCF_HL_FLT);
351 else khash_str2int_set(keep, str.s, value | 1<<BCF_HL_FLT);
352 args->nrm--;
353 }
354 }
355 else if ( type!=BCF_HL_GEN )
356 {
357 int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,str.s);
358 if ( !bcf_hdr_idinfo_exists(args->hdr,type,id) )
359 {
360 if ( args->keep_sites )
361 error("Error: The tag \"%s\" is not defined in the header, cannot use the -k option\n", str.s);
362 else
363 fprintf(stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s);
364
365 tag->key = strdup(str.s);
366 if ( type==BCF_HL_INFO )
367 {
368 tag->handler = remove_info_tag;
369 if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot combine with `--set-id %s`\n",tag->key,args->set_ids_fmt);
370 }
371 else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
372 }
373 else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) )
374 {
375 int value, ret = khash_str2int_get(keep, str.s, &value);
376 if ( ret==-1 ) khash_str2int_set(keep, strdup(str.s),1<<type);
377 else khash_str2int_set(keep, str.s, value | 1<<type);
378 args->nrm--;
379 }
380 else
381 {
382 tag->key = strdup(str.s);
383 if ( type==BCF_HL_INFO )
384 {
385 tag->handler = remove_info_tag;
386 if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot combine with `--set-id %s`\n",tag->key,args->set_ids_fmt);
387 }
388 else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
389 if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key);
390 }
391 }
392 else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id;
393 else if ( !strcasecmp("FILTER",str.s) )
394 {
395 tag->handler = remove_filter;
396 if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
397 }
398 else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
399 else if ( !strcasecmp("INFO",str.s) )
400 {
401 if ( needs_info ) error("Error: `--remove INFO` is executed first, cannot combine with `--set-id %s`\n",args->set_ids_fmt);
402 tag->handler = remove_info;
403 if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
404 }
405 else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) )
406 {
407 tag->handler = remove_format;
408 if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
409 }
410 else if ( str.l )
411 {
412 int id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, str.s);
413 if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,id) ) error("Error: did you mean INFO/%s?\n",str.s);
414 if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) error("Error: did you mean FORMAT/%s?\n",str.s);
415
416 if ( !args->keep_sites )
417 {
418 if ( str.s[0]=='#' && str.s[1]=='#' )
419 bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
420 else
421 bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
422 }
423 args->nrm--;
424 }
425
426 ss = *se ? se+1 : se;
427 }
428 free(str.s);
429 if ( keep_flt || keep_info || keep_fmt )
430 {
431 int j;
432 for (j=0; j<args->hdr->nhrec; j++)
433 {
434 bcf_hrec_t *hrec = args->hdr->hrec[j];
435 if ( hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FMT ) continue;
436 if ( !keep_flt && hrec->type==BCF_HL_FLT ) continue;
437 if ( !keep_info && hrec->type==BCF_HL_INFO ) continue;
438 if ( !keep_fmt && hrec->type==BCF_HL_FMT ) continue;
439 int k = bcf_hrec_find_key(hrec,"ID");
440 assert( k>=0 ); // this should always be true for valid VCFs
441 int value, ret = khash_str2int_get(keep,hrec->vals[k],&value);
442 if ( ret==0 && value>>hrec->type ) // keep
443 {
444 if ( hrec->type==BCF_HL_FLT && !strcmp("PASS",hrec->vals[k]) ) args->flt_keep_pass = 1;
445 continue;
446 }
447 args->nrm++;
448 args->rm = (rm_tag_t*) realloc(args->rm,sizeof(rm_tag_t)*args->nrm);
449 rm_tag_t *tag = &args->rm[args->nrm-1];
450 if ( hrec->type==BCF_HL_INFO ) tag->handler = remove_info_tag;
451 else if ( hrec->type==BCF_HL_FMT ) tag->handler = remove_format_tag;
452 else
453 {
454 tag->handler = remove_filter;
455 tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]);
456 }
457 tag->key = strdup(hrec->vals[k]);
458 if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
459 }
460 }
461 khash_str2int_destroy_free(keep);
462 if ( !args->nrm ) error("No matching tag in -x %s\n", args->remove_annots);
463 if (bcf_hdr_sync(args->hdr_out) < 0)
464 error_errno("[%s] Failed to update header", __func__);
465 }
init_header_lines(args_t * args)466 static void init_header_lines(args_t *args)
467 {
468 htsFile *file = hts_open(args->header_fname, "rb");
469 if ( !file ) error("Error reading %s\n", args->header_fname);
470 kstring_t str = {0,0,0};
471 while ( hts_getline(file, KS_SEP_LINE, &str) > 0 )
472 {
473 if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s);
474 bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else)
475 }
476 if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->header_fname);
477 free(str.s);
478 if (bcf_hdr_sync(args->hdr_out) < 0)
479 error_errno("[%s] Failed to update output header", __func__);
480 if (bcf_hdr_sync(args->hdr) < 0)
481 error_errno("[%s] Failed to update input header", __func__);
482 }
vcf_getter_info_str2str(args_t * args,bcf1_t * rec,annot_col_t * col,void ** ptr,int * mptr)483 static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
484 {
485 return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr);
486 }
vcf_getter_id2str(args_t * args,bcf1_t * rec,annot_col_t * col,void ** ptr,int * mptr)487 static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
488 {
489 char *str = *((char**)ptr);
490 int len = strlen(rec->d.id);
491 if ( len >= *mptr ) str = realloc(str, len+1);
492 strcpy(str, rec->d.id);
493 *((char**)ptr) = str;
494 *mptr = len+1;
495 return len;
496 }
vcf_getter_filter2str(args_t * args,bcf1_t * rec,annot_col_t * col,void ** ptr,int * mptr)497 static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
498 {
499 kstring_t str;
500 str.s = *((char**)ptr);
501 str.m = *mptr;
502 str.l = 0;
503
504 int i;
505 if ( rec->d.n_flt )
506 {
507 for (i=0; i<rec->d.n_flt; i++)
508 {
509 if (i) kputc(';', &str);
510 kputs(bcf_hdr_int2id(args->tgts_hdr,BCF_DT_ID,rec->d.flt[i]), &str);
511 }
512 }
513 else kputc('.', &str);
514
515 *((char**)ptr) = str.s;
516 *mptr = str.m;
517 return str.l;
518 }
setter_filter(args_t * args,bcf1_t * line,annot_col_t * col,void * data)519 static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
520 {
521 if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n");
522
523 // note: so far this works only with one filter, not a list of filters
524 annot_line_t *tab = (annot_line_t*) data;
525 if ( tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) // don't overwrite with a missing value unless asked
526 {
527 if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0);
528 return 0;
529 }
530 hts_expand(int,1,args->mtmpi,args->tmpi);
531 args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]);
532 if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]);
533 if ( col->replace & SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]);
534 if ( !(col->replace & REPLACE_MISSING) )
535 {
536 bcf_update_filter(args->hdr_out,line,NULL,0);
537 return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
538 }
539
540 // only update missing FILTER
541 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
542 if ( !line->d.n_flt )
543 return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
544
545 return 0;
546 }
vcf_setter_filter(args_t * args,bcf1_t * line,annot_col_t * col,void * data)547 static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
548 {
549 int i, ret = 0;
550 bcf1_t *rec = (bcf1_t*) data;
551 if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
552 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
553 if ( !rec->d.n_flt ) // don't overwrite with a missing value unless asked
554 {
555 if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0);
556 return 0;
557 }
558 if ( col->replace & (SET_OR_APPEND|REPLACE_MISSING) )
559 {
560 if ( (col->replace & REPLACE_MISSING) && line->d.n_flt ) return 0; // only update missing FILTER
561 for (i=0; i<rec->d.n_flt; i++)
562 {
563 const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
564 if ( bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)) < 0 ) ret = -1;
565 }
566 return ret;
567 }
568 hts_expand(int,rec->d.n_flt,args->mtmpi,args->tmpi);
569 for (i=0; i<rec->d.n_flt; i++)
570 {
571 const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
572 args->tmpi[i] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt);
573 }
574 bcf_update_filter(args->hdr_out,line,NULL,0);
575 return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
576 }
setter_pos(args_t * args,bcf1_t * line,annot_col_t * col,void * data)577 static int setter_pos(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
578 {
579 annot_line_t *tab = (annot_line_t*) data;
580 if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
581 char *tmp;
582 int pos = strtol(tab->cols[col->icol], &tmp, 10);
583 if ( tmp==tab->cols[col->icol] )
584 error("Could not parse ~POS at %s:%"PRId64" .. [%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,tab->cols[col->icol]);
585 line->pos = pos - 1;
586 return 0;
587 }
setter_id(args_t * args,bcf1_t * line,annot_col_t * col,void * data)588 static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
589 {
590 if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n");
591 if ( col->replace & MATCH_VALUE ) return 0;
592
593 // possible cases:
594 // IN ANNOT OUT ACHIEVED_BY
595 // x y x -c +ID
596 // x y y -c ID
597 // x y x,y -c =ID
598 // x . x -c +ID, ID
599 // x . . -x ID
600 // . y y -c +ID, -c ID
601 //
602 annot_line_t *tab = (annot_line_t*) data;
603 if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
604 if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]);
605 if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
606
607 // running with +ID, only update missing ids
608 if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
609 return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
610 return 0;
611 }
vcf_setter_id(args_t * args,bcf1_t * line,annot_col_t * col,void * data)612 static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
613 {
614 if ( col->replace & MATCH_VALUE ) return 0;
615
616 bcf1_t *rec = (bcf1_t*) data;
617
618 char *id;
619 if ( col->getter )
620 {
621 int nret = col->getter(args,rec,col,&col->ptr,&col->mptr);
622 id = (char*) col->ptr;
623 if ( nret<=0 || (nret==1 && *id=='.') ) return 0; // don't replace with "."
624 }
625 else
626 {
627 if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "."
628 id = rec->d.id;
629 }
630 if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id);
631 if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,id);
632
633 // running with +ID, only update missing ids
634 if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
635 return bcf_update_id(args->hdr_out,line,id);
636 return 0;
637 }
vcf_setter_ref(args_t * args,bcf1_t * line,annot_col_t * col,void * data)638 static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
639 {
640 bcf1_t *rec = (bcf1_t*) data;
641 if ( !strcmp(rec->d.allele[0],line->d.allele[0]) ) return 0; // no update necessary
642 const char **als = (const char**) malloc(sizeof(char*)*line->n_allele);
643 als[0] = rec->d.allele[0];
644 int i;
645 for (i=1; i<line->n_allele; i++) als[i] = line->d.allele[i];
646 int ret = bcf_update_alleles(args->hdr_out, line, als, line->n_allele);
647 free(als);
648 return ret;
649 }
vcf_setter_alt(args_t * args,bcf1_t * line,annot_col_t * col,void * data)650 static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
651 {
652 bcf1_t *rec = (bcf1_t*) data;
653 int i;
654 if ( rec->n_allele==line->n_allele )
655 {
656 for (i=1; i<rec->n_allele; i++) if ( strcmp(rec->d.allele[i],line->d.allele[i]) ) break;
657 if ( i==rec->n_allele ) return 0; // no update necessary
658 }
659 const char **als = (const char**) malloc(sizeof(char*)*rec->n_allele);
660 als[0] = line->d.allele[0];
661 for (i=1; i<rec->n_allele; i++) als[i] = rec->d.allele[i];
662 int ret = bcf_update_alleles(args->hdr_out, line, als, rec->n_allele);
663 free(als);
664 return ret;
665 }
setter_qual(args_t * args,bcf1_t * line,annot_col_t * col,void * data)666 static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
667 {
668 if ( !data ) error("Error: the --merge-logic option cannot be used with QUAL (yet?)\n");
669
670 annot_line_t *tab = (annot_line_t*) data;
671 char *str = tab->cols[col->icol];
672 if ( str[0]=='.' && str[1]==0 ) // don't overwrite with a missing value unless asked
673 {
674 if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual);
675 return 0;
676 }
677 if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0;
678
679 line->qual = strtod(str, &str);
680 if ( str == tab->cols[col->icol] )
681 error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
682 return 0;
683 }
vcf_setter_qual(args_t * args,bcf1_t * line,annot_col_t * col,void * data)684 static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
685 {
686 bcf1_t *rec = (bcf1_t*) data;
687 if ( bcf_float_is_missing(rec->qual) ) // don't overwrite with a missing value unless asked
688 {
689 if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual);
690 return 0;
691 }
692 if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0;
693 line->qual = rec->qual;
694 return 0;
695 }
setter_info_flag(args_t * args,bcf1_t * line,annot_col_t * col,void * data)696 static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
697 {
698 if ( !data ) error("Error: the --merge-logic option cannot be used with INFO type=Flag (yet?)\n");
699
700 annot_line_t *tab = (annot_line_t*) data;
701 char *str = tab->cols[col->icol];
702 if ( str[0]=='.' && str[1]==0 ) // don't overwrite with a missing value unless asked
703 {
704 if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
705 return 0;
706 }
707
708 if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1);
709 if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
710 error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
711 return -1;
712 }
vcf_setter_info_flag(args_t * args,bcf1_t * line,annot_col_t * col,void * data)713 static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
714 {
715 bcf1_t *rec = (bcf1_t*) data;
716 int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key_src,NULL,NULL);
717 bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,flag);
718 return 0;
719 }
setter_ARinfo_int32(args_t * args,bcf1_t * line,annot_col_t * col,int nals,char ** als,int ntmpi)720 static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi)
721 {
722 if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
723 error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
724 else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
725 error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
726
727 int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
728 int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
729 if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
730
731 // fill in any missing values in the target VCF (or all, if not present)
732 int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
733 if ( ntmpi2 < ndst ) hts_expand(int32_t,ndst,args->mtmpi2,args->tmpi2);
734
735 int i;
736 for (i=0; i<ndst; i++)
737 {
738 if ( map[i]<0 )
739 {
740 if ( ntmpi2 < ndst ) args->tmpi2[i] = bcf_int32_missing;
741 continue;
742 }
743 if ( ntmpi2==ndst && (col->replace & REPLACE_MISSING)
744 && args->tmpi2[i]!=bcf_int32_missing
745 && args->tmpi2[i]!=bcf_int32_vector_end ) continue;
746
747 args->tmpi2[i] = args->tmpi[ map[i] ];
748 }
749 return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
750 }
setter_info_int(args_t * args,bcf1_t * line,annot_col_t * col,void * data)751 static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
752 {
753 annot_line_t *tab = (annot_line_t*) data;
754
755 // This is a bit hacky, only to reuse existing code with minimal changes:
756 // -c =TAG will now behave as -l TAG:APPEND for integers
757 if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND;
758
759 if ( !tab )
760 {
761 if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
762 col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
763 col->merge_method!=MM_APPEND &&
764 col->merge_method!=MM_APPEND_MISSING )
765 error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n");
766 }
767
768 int i,ntmpi = 0;
769 if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused )
770 {
771 ntmpi = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi, &args->mtmpi);
772 if ( ntmpi>0 && (args->tmpi[0]!=bcf_int32_missing || (col->replace & CARRY_OVER_MISSING)) )
773 {
774 col->mm_dbl_nused = col->mm_dbl_ndat = ntmpi;
775 hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
776 for (i=0; i<ntmpi; i++)
777 col->mm_dbl[i] = args->tmpi[i];
778 col->mm_dbl_ndat = 1;
779 }
780 ntmpi = 0;
781 }
782 if ( tab ) // has data, not flushing yet
783 {
784 char *str = tab->cols[col->icol], *end = str;
785 if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
786
787 while ( *end )
788 {
789 ntmpi++;
790 hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
791 if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
792 {
793 if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) )
794 args->tmpi[ntmpi-1] = bcf_int32_missing;
795 else
796 ntmpi--;
797 if ( str[1]==0 ) end = str+1;
798 str += 2;
799 }
800 else
801 {
802 args->tmpi[ntmpi-1] = strtol(str, &end, 10);
803 if ( end==str )
804 error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
805 str = end+1;
806 }
807 }
808 if ( col->merge_method!=MM_FIRST )
809 {
810 if ( !col->mm_dbl_nused )
811 {
812 col->mm_dbl_nused = ntmpi;
813 hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
814 for (i=0; i<ntmpi; i++)
815 col->mm_dbl[i] = args->tmpi[i];
816 }
817 else
818 {
819 if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
820 {
821 int nori = col->mm_dbl_nused;
822 col->mm_dbl_nused += ntmpi;
823 hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
824 for (i=0; i<ntmpi; i++)
825 col->mm_dbl[i+nori] = args->tmpi[i];
826 }
827 else
828 {
829 if ( ntmpi!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n");
830 if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG )
831 for (i=0; i<ntmpi; i++) col->mm_dbl[i] += args->tmpi[i];
832 else if ( col->merge_method==MM_MIN )
833 for (i=0; i<ntmpi; i++) { if ( col->mm_dbl[i] > args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; }
834 else if ( col->merge_method==MM_MAX )
835 for (i=0; i<ntmpi; i++) { if ( col->mm_dbl[i] < args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; }
836 }
837 }
838 col->mm_dbl_ndat++;
839 return 1;
840 }
841 }
842 else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
843 {
844 ntmpi = col->mm_dbl_nused;
845 hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
846 for (i=0; i<ntmpi; i++) args->tmpi[i] = col->mm_dbl[i];
847 col->mm_dbl_nused = col->mm_dbl_ndat = 0;
848 }
849 else if ( col->merge_method==MM_AVG )
850 {
851 ntmpi = col->mm_dbl_nused;
852 hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
853 for (i=0; i<ntmpi; i++) args->tmpi[i] = col->mm_dbl[i]/col->mm_dbl_ndat;
854 col->mm_dbl_nused = col->mm_dbl_ndat = 0;
855 }
856
857 if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
858 return setter_ARinfo_int32(args,line,col,tab->nals,tab->als,ntmpi);
859
860 if ( col->replace & REPLACE_MISSING )
861 {
862 int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
863 if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
864 }
865 return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
866 }
vcf_setter_info_int(args_t * args,bcf1_t * line,annot_col_t * col,void * data)867 static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
868 {
869 bcf1_t *rec = (bcf1_t*) data;
870 int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
871 if ( ntmpi < 0 ) return 0; // nothing to add
872
873 if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
874 return setter_ARinfo_int32(args,line,col,rec->n_allele,rec->d.allele,ntmpi);
875
876 if ( col->replace & REPLACE_MISSING )
877 {
878 int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
879 if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
880 }
881
882 return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
883 }
setter_ARinfo_real(args_t * args,bcf1_t * line,annot_col_t * col,int nals,char ** als,int ntmpf)884 static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
885 {
886 if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
887 error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
888 else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
889 error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
890
891 int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
892 int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
893 if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
894
895 // fill in any missing values in the target VCF (or all, if not present)
896 int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
897 if ( ntmpf2 < ndst ) hts_expand(float,ndst,args->mtmpf2,args->tmpf2);
898
899 int i;
900 for (i=0; i<ndst; i++)
901 {
902 if ( map[i]<0 )
903 {
904 if ( ntmpf2 < ndst ) bcf_float_set_missing(args->tmpf2[i]);
905 continue;
906 }
907 if ( ntmpf2==ndst && (col->replace & REPLACE_MISSING)
908 && !bcf_float_is_missing(args->tmpf2[i])
909 && !bcf_float_is_vector_end(args->tmpf2[i]) ) continue;
910
911 args->tmpf2[i] = args->tmpf[ map[i] ];
912 }
913 return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
914 }
setter_info_real(args_t * args,bcf1_t * line,annot_col_t * col,void * data)915 static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
916 {
917 annot_line_t *tab = (annot_line_t*) data;
918
919 // This is a bit hacky, only to reuse existing code with minimal changes:
920 // -c =TAG will now behave as -l TAG:APPEND for floats
921 if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND;
922
923 if ( !tab )
924 {
925 if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
926 col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
927 col->merge_method!=MM_APPEND &&
928 col->merge_method!=MM_APPEND_MISSING )
929 error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Float\n");
930 }
931
932 int i,ntmpf = 0;
933 if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused )
934 {
935 ntmpf = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf, &args->mtmpf);
936 if ( ntmpf>0 && (!bcf_float_is_missing(args->tmpf[0]) || (col->replace & CARRY_OVER_MISSING)) )
937 {
938 col->mm_dbl_nused = ntmpf;
939 hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
940 for (i=0; i<ntmpf; i++)
941 if ( bcf_float_is_missing(args->tmpf[i]) )
942 bcf_double_set_missing(col->mm_dbl[i]);
943 else
944 col->mm_dbl[i] = args->tmpf[i];
945 col->mm_dbl_ndat = 1;
946 }
947 ntmpf = 0;
948 }
949 if ( tab ) // data row, not just flushing
950 {
951 char *str = tab->cols[col->icol], *end = str;
952 if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
953
954 while ( *end )
955 {
956 ntmpf++;
957 hts_expand(float,ntmpf,args->mtmpf,args->tmpf);
958 if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
959 {
960 if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) )
961 bcf_float_set_missing(args->tmpf[ntmpf-1]);
962 else
963 ntmpf--;
964 if ( str[1]==0 ) end = str+1;
965 str += 2;
966 }
967 else
968 {
969 args->tmpf[ntmpf-1] = strtod(str, &end);
970 if ( end==str )
971 error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
972 str = end+1;
973 }
974 }
975 if ( col->merge_method!=MM_FIRST )
976 {
977 if ( !col->mm_dbl_nused )
978 {
979 col->mm_dbl_nused = ntmpf;
980 hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
981 for (i=0; i<ntmpf; i++)
982 {
983 if ( bcf_float_is_missing(args->tmpf[i]) )
984 bcf_double_set_missing(col->mm_dbl[i]);
985 else
986 col->mm_dbl[i] = args->tmpf[i];
987 }
988 }
989 else
990 {
991 if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
992 {
993 int nori = col->mm_dbl_nused;
994 col->mm_dbl_nused += ntmpf;
995 hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
996 for (i=0; i<ntmpf; i++)
997 {
998 if ( bcf_float_is_missing(args->tmpf[i]) )
999 bcf_double_set_missing(col->mm_dbl[i+nori]);
1000 else
1001 col->mm_dbl[i+nori] = args->tmpf[i];
1002 }
1003 }
1004 else
1005 {
1006 if ( ntmpf!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n");
1007 if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG )
1008 for (i=0; i<ntmpf; i++) col->mm_dbl[i] += args->tmpf[i];
1009 else if ( col->merge_method==MM_MIN )
1010 for (i=0; i<ntmpf; i++) { if ( col->mm_dbl[i] > args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; }
1011 else if ( col->merge_method==MM_MAX )
1012 for (i=0; i<ntmpf; i++) { if ( col->mm_dbl[i] < args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; }
1013 }
1014 }
1015 col->mm_dbl_ndat++;
1016 return 1;
1017 }
1018 }
1019 else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
1020 {
1021 ntmpf = col->mm_dbl_nused;
1022 hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf);
1023 for (i=0; i<ntmpf; i++)
1024 {
1025 if ( bcf_double_is_missing(col->mm_dbl[i]) )
1026 bcf_float_set_missing(args->tmpf[i]);
1027 else
1028 args->tmpf[i] = col->mm_dbl[i];
1029 }
1030 col->mm_dbl_nused = col->mm_dbl_ndat = 0;
1031 }
1032 else if ( col->merge_method==MM_AVG )
1033 {
1034 ntmpf = col->mm_dbl_nused;
1035 hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf);
1036 for (i=0; i<ntmpf; i++) args->tmpf[i] = col->mm_dbl[i]/col->mm_dbl_ndat;
1037 col->mm_dbl_nused = col->mm_dbl_ndat = 0;
1038 }
1039
1040 if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
1041 return setter_ARinfo_real(args,line,col,tab->nals,tab->als,ntmpf);
1042
1043 if ( col->replace & REPLACE_MISSING )
1044 {
1045 int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
1046 if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
1047 }
1048
1049 return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
1050 }
vcf_setter_info_real(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1051 static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1052 {
1053 bcf1_t *rec = (bcf1_t*) data;
1054 int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
1055 if ( ntmpf < 0 ) return 0; // nothing to add
1056
1057 if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
1058 return setter_ARinfo_real(args,line,col,rec->n_allele,rec->d.allele,ntmpf);
1059
1060 if ( col->replace & REPLACE_MISSING )
1061 {
1062 int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
1063 if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
1064 }
1065
1066 return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
1067 }
1068 int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
setter_ARinfo_string(args_t * args,bcf1_t * line,annot_col_t * col,int nals,char ** als)1069 static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als)
1070 {
1071 assert( col->merge_method==MM_FIRST );
1072
1073 int nsrc = 1, lsrc = 0;
1074 while ( args->tmps[lsrc] )
1075 {
1076 if ( args->tmps[lsrc]==',' ) nsrc++;
1077 lsrc++;
1078 }
1079 if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
1080 error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1081 else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
1082 error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1083
1084 int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
1085 int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
1086 if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1087
1088 // fill in any missing values in the target VCF (or all, if not present)
1089 int i, empty = 0, nstr, mstr = args->tmpks.m;
1090 nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr);
1091 args->tmpks.m = mstr;
1092 if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' && args->tmpks.s[1]==0) )
1093 {
1094 empty = 0;
1095 args->tmpks.l = 0;
1096 kputc('.',&args->tmpks);
1097 for (i=1; i<ndst; i++) kputs(",.",&args->tmpks);
1098 }
1099 else args->tmpks.l = nstr;
1100 for (i=0; i<ndst; i++)
1101 {
1102 if ( map[i]<0 )
1103 {
1104 if ( empty ) copy_string_field(".",0,1,&args->tmpks,i);
1105 continue;
1106 }
1107 if ( col->replace & REPLACE_MISSING )
1108 {
1109 // Do not replace filled values. The field must be looked up again because
1110 // of realloc in copy_string_field
1111 int n = 0;
1112 char *str = args->tmpks.s;
1113 while ( *str && n<i )
1114 {
1115 if ( *str==',' ) n++;
1116 str++;
1117 }
1118 if ( str[0]!='.' || (str[1]!=',' && str[1]!=0) ) continue; // value already set
1119 }
1120 int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
1121 if ( ret!=0 ) error("[%s:%d %s] Failed to copy a string field\n", __FILE__,__LINE__,__func__);
1122 }
1123 return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
1124 }
khash_str2int_clear_free(void * _hash)1125 void khash_str2int_clear_free(void *_hash)
1126 {
1127 khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
1128 khint_t k;
1129 if (hash == 0) return;
1130 for (k = 0; k < kh_end(hash); ++k)
1131 if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
1132 kh_clear(str2int, hash);
1133 }
setter_info_str(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1134 static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1135 {
1136 if ( (col->replace & REPLACE_MISSING) && col->number!=BCF_VL_A && col->number!=BCF_VL_R )
1137 {
1138 int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
1139 if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
1140 }
1141
1142 // This is a bit hacky, only to reuse existing code with minimal changes:
1143 // -c =TAG will now behave as -l TAG:unique for strings
1144 if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
1145
1146 annot_line_t *tab = (annot_line_t*) data;
1147
1148 int len = 0;
1149 if ( tab )
1150 {
1151 len = strlen(tab->cols[col->icol]);
1152 if ( !len ) return 0;
1153 if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
1154 }
1155
1156 if ( col->merge_method!=MM_FIRST )
1157 {
1158 if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
1159 error("Error: the --merge-logic option cannot be used with INFO tags Type=String,Number={A,R,G}\n");
1160
1161 if ( data )
1162 {
1163 assert( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING || col->merge_method==MM_UNIQUE );
1164 if ( col->merge_method==MM_UNIQUE )
1165 {
1166 if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init();
1167 if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1;
1168 khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol]));
1169 }
1170
1171 if ( (col->replace & SET_OR_APPEND) && !col->mm_kstr.l )
1172 {
1173 int m = col->mm_kstr.m;
1174 int n = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &col->mm_kstr.s, &m);
1175 col->mm_kstr.m = m;
1176 if ( n>0 && ((col->replace & CARRY_OVER_MISSING) || col->mm_kstr.s[0]!='.' || col->mm_kstr.s[1]) ) col->mm_kstr.l = n;
1177 }
1178
1179 if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr);
1180 kputs(tab->cols[col->icol], &col->mm_kstr);
1181 return 1;
1182 }
1183
1184 if ( col->mm_kstr.l )
1185 {
1186 hts_expand(char,col->mm_kstr.l+1,args->mtmps,args->tmps);
1187 memcpy(args->tmps,col->mm_kstr.s,col->mm_kstr.l+1);
1188 }
1189 else
1190 return 0;
1191
1192 // flush the line
1193 if ( col->merge_method==MM_UNIQUE )
1194 khash_str2int_clear_free(col->mm_str_hash);
1195 col->mm_kstr.l = 0;
1196 }
1197 else
1198 {
1199 assert(tab);
1200 hts_expand(char,len+1,args->mtmps,args->tmps);
1201 memcpy(args->tmps,tab->cols[col->icol],len+1);
1202
1203 if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
1204 return setter_ARinfo_string(args,line,col,tab->nals,tab->als);
1205 }
1206
1207 return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
1208 }
vcf_setter_info_str(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1209 static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1210 {
1211 bcf1_t *rec = (bcf1_t*) data;
1212
1213 if ( col->getter )
1214 col->getter(args,rec,col,(void**)&args->tmps, &args->mtmps);
1215 else
1216 {
1217 int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
1218 if ( ntmps < 0 ) return 0; // nothing to add
1219 }
1220
1221 if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
1222 return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele);
1223
1224 if ( col->replace & REPLACE_MISSING )
1225 {
1226 int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
1227 if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
1228 }
1229
1230 return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
1231 }
genotypes_to_string(args_t * args,int nsrc1,int32_t * src,int nsmpl_dst,kstring_t * str)1232 static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str)
1233 {
1234 int i, isrc, idst;
1235 int blen = nsrc1 > 1 ? nsrc1 + 1 : 1; // typically the genotypes take three bytes 0/1, no 0-termination is needed
1236
1237 gt_length_too_big:
1238 str->l = 0;
1239 for (idst=0; idst<nsmpl_dst; idst++)
1240 {
1241 isrc = args->sample_map ? args->sample_map[idst] : idst;
1242 if ( isrc==-1 )
1243 {
1244 kputc_('.', str);
1245 for (i=1; i < blen; i++) kputc_(0, str);
1246 continue;
1247 }
1248
1249 size_t plen = str->l;
1250 int32_t *ptr = src + isrc*nsrc1;
1251 for (i=0; i<nsrc1 && ptr[i]!=bcf_int32_vector_end; i++)
1252 {
1253 if ( i ) kputc("/|"[bcf_gt_is_phased(ptr[i])], str);
1254 if ( bcf_gt_is_missing(ptr[i]) ) kputc('.', str);
1255 else kputw(bcf_gt_allele(ptr[i]), str);
1256 }
1257 if ( i==0 ) kputc('.', str);
1258 if ( str->l - plen > blen )
1259 {
1260 // too many alternate alleles or ploidy is too large, the genotype does not fit
1261 // three characters ("0/0" vs "10/10").
1262 blen *= 2;
1263 goto gt_length_too_big;
1264 }
1265 plen = str->l - plen;
1266 while ( plen < blen )
1267 {
1268 kputc_(0, str);
1269 plen++;
1270 }
1271 }
1272 return 0;
1273 }
vcf_setter_format_gt(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1274 static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1275 {
1276 bcf1_t *rec = (bcf1_t*) data;
1277 int nsrc = bcf_get_genotypes(args->files->readers[1].header,rec,&args->tmpi,&args->mtmpi);
1278 if ( nsrc==-3 ) return 0; // the tag is not present
1279 if ( nsrc<=0 ) return 1; // error
1280
1281 // Genotypes are internally represented as integers. This is a complication when
1282 // adding as a different Type=String field, such as FMT/newGT:=GT
1283 if ( strcmp(col->hdr_key_src,col->hdr_key_dst) )
1284 {
1285 int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out);
1286 int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header);
1287 genotypes_to_string(args,nsrc/nsmpl_src,args->tmpi,nsmpl_dst,&args->tmpks);
1288 return bcf_update_format_char(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s,args->tmpks.l);
1289 }
1290
1291 if ( !args->sample_map )
1292 return bcf_update_genotypes(args->hdr_out,line,args->tmpi,nsrc);
1293
1294 int i, j, ndst = bcf_get_genotypes(args->hdr,line,&args->tmpi2,&args->mtmpi2);
1295 if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
1296 nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
1297 if ( ndst<=0 ) // field not present in dst file
1298 {
1299 if ( col->replace & REPLACE_NON_MISSING ) return 0;
1300 hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
1301 for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1302 {
1303 int32_t *dst = args->tmpi2 + nsrc*i;
1304 if ( args->sample_map[i]==-1 )
1305 {
1306 dst[0] = bcf_gt_missing;
1307 for (j=1; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
1308 }
1309 else
1310 {
1311 int32_t *src = args->tmpi + nsrc*args->sample_map[i];
1312 for (j=0; j<nsrc; j++) dst[j] = src[j];
1313 }
1314 }
1315 return bcf_update_genotypes(args->hdr_out,line,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out));
1316 }
1317 else if ( ndst >= nsrc )
1318 {
1319 for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1320 {
1321 if ( args->sample_map[i]==-1 ) continue;
1322 int32_t *src = args->tmpi + nsrc*args->sample_map[i];
1323 int32_t *dst = args->tmpi2 + ndst*i;
1324 if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(dst[0]) ) continue;
1325 if ( (col->replace & REPLACE_MISSING) && !bcf_gt_is_missing(dst[0]) ) continue;
1326 for (j=0; j<nsrc; j++) dst[j] = src[j];
1327 for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
1328 }
1329 return bcf_update_genotypes(args->hdr_out,line,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
1330 }
1331 else // ndst < nsrc
1332 {
1333 hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
1334 for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1335 {
1336 int32_t *ori = args->tmpi2 + ndst*i;
1337 int32_t *dst = args->tmpi3 + nsrc*i;
1338 int keep_ori = 0;
1339 if ( args->sample_map[i]==-1 ) keep_ori = 1;
1340 else if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
1341 else if ( (col->replace & REPLACE_MISSING) && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
1342 if ( keep_ori )
1343 {
1344 for (j=0; j<ndst; j++) dst[j] = ori[j];
1345 for (; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
1346 }
1347 else
1348 {
1349 int32_t *src = args->tmpi + nsrc*args->sample_map[i];
1350 for (j=0; j<nsrc; j++) dst[j] = src[j];
1351 }
1352 }
1353 return bcf_update_genotypes(args->hdr_out,line,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
1354 }
1355 }
count_vals(annot_line_t * tab,int icol_beg,int icol_end)1356 static int count_vals(annot_line_t *tab, int icol_beg, int icol_end)
1357 {
1358 int i, nmax = 1;
1359 for (i=icol_beg; i<icol_end; i++)
1360 {
1361 char *str = tab->cols[i], *end = str;
1362 if ( str[0]=='.' && !str[1] )
1363 {
1364 // missing value
1365 if ( !nmax ) nmax = 1;
1366 continue;
1367 }
1368 int n = 1;
1369 while ( *end )
1370 {
1371 if ( *end==',' ) n++;
1372 end++;
1373 }
1374 if ( nmax<n ) nmax = n;
1375 }
1376 return nmax;
1377 }
core_setter_format_int(args_t * args,bcf1_t * line,annot_col_t * col,int32_t * vals,int nvals)1378 static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, int32_t *vals, int nvals)
1379 {
1380 if ( !args->sample_map )
1381 return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
1382
1383 int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2);
1384 if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
1385 if ( ndst<=0 )
1386 {
1387 if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
1388 hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
1389 for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1390 {
1391 int32_t *dst = args->tmpi2 + nvals*i;
1392 if ( args->sample_map[i]==-1 )
1393 {
1394 dst[0] = bcf_int32_missing;
1395 for (j=1; j<nvals; j++) dst[j] = bcf_int32_vector_end;
1396 }
1397 else
1398 {
1399 int32_t *src = vals + nvals*args->sample_map[i];
1400 for (j=0; j<nvals; j++) dst[j] = src[j];
1401 }
1402 }
1403 return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nvals*bcf_hdr_nsamples(args->hdr_out));
1404 }
1405 else if ( ndst >= nvals )
1406 {
1407 for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1408 {
1409 if ( args->sample_map[i]==-1 ) continue;
1410 int32_t *src = vals + nvals*args->sample_map[i];
1411 int32_t *dst = args->tmpi2 + ndst*i;
1412 // possible cases:
1413 // in annot out
1414 // x y x TAG,-TAG,=TAG .. REPLACE_ALL, REPLACE_NON_MISSING, SET_OR_APPEND
1415 // x y y +TAG .. REPLACE_MISSING
1416 // . y . =TAG .. SET_OR_APPEND
1417 // . y y TAG,+TAG,-TAG .. REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING
1418 // x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING
1419 // x . . -TAG .. REPLACE_NON_MISSING
1420 if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; }
1421 else if ( col->replace & REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; }
1422 else if ( col->replace & REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; }
1423 for (j=0; j<nvals; j++) dst[j] = src[j];
1424 for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
1425 }
1426 return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
1427 }
1428 else // ndst < nvals
1429 {
1430 hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
1431 for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1432 {
1433 int32_t *ann = vals + nvals*args->sample_map[i];
1434 int32_t *ori = args->tmpi2 + ndst*i; // ori vcf line
1435 int32_t *dst = args->tmpi3 + nvals*i; // expanded buffer
1436 int use_new_ann = 1;
1437 if ( args->sample_map[i]==-1 ) use_new_ann = 0;
1438 else if ( col->replace & REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; }
1439 else if ( col->replace & REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; }
1440 else if ( col->replace & REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; }
1441 if ( !use_new_ann )
1442 {
1443 for (j=0; j<ndst; j++) dst[j] = ori[j];
1444 for (; j<nvals; j++) dst[j] = bcf_int32_vector_end;
1445 }
1446 else
1447 for (j=0; j<nvals; j++) dst[j] = ann[j];
1448 }
1449 return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi3,nvals*bcf_hdr_nsamples(args->hdr_out));
1450 }
1451 }
core_setter_format_real(args_t * args,bcf1_t * line,annot_col_t * col,float * vals,int nvals)1452 static int core_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, float *vals, int nvals)
1453 {
1454 if ( !args->sample_map )
1455 return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
1456
1457 int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2);
1458 if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
1459 if ( ndst<=0 )
1460 {
1461 if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
1462 hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
1463 for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1464 {
1465 float *dst = args->tmpf2 + nvals*i;
1466 if ( args->sample_map[i]==-1 )
1467 {
1468 bcf_float_set_missing(dst[0]);
1469 for (j=1; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
1470 }
1471 else
1472 {
1473 float *src = vals + nvals*args->sample_map[i];
1474 for (j=0; j<nvals; j++) dst[j] = src[j];
1475 }
1476 }
1477 return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nvals*bcf_hdr_nsamples(args->hdr_out));
1478 }
1479 else if ( ndst >= nvals )
1480 {
1481 for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1482 {
1483 if ( args->sample_map[i]==-1 ) continue;
1484 float *src = vals + nvals*args->sample_map[i];
1485 float *dst = args->tmpf2 + ndst*i;
1486 if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; }
1487 else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; }
1488 else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; }
1489 for (j=0; j<nvals; j++) dst[j] = src[j];
1490 for (; j<ndst; j++) bcf_float_set_vector_end(dst[j]);
1491 }
1492 return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
1493 }
1494 else // ndst < nvals
1495 {
1496 hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
1497 for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
1498 {
1499 float *ann = vals + nvals*args->sample_map[i];
1500 float *ori = args->tmpf2 + ndst*i; // ori vcf line
1501 float *dst = args->tmpf3 + nvals*i; // expanded buffer
1502 int use_new_ann = 1;
1503 if ( args->sample_map[i]==-1 ) use_new_ann = 0;
1504 else if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
1505 else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
1506 else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; }
1507 if ( !use_new_ann )
1508 {
1509 for (j=0; j<ndst; j++) dst[j] = ori[j];
1510 for (; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
1511 }
1512 else
1513 for (j=0; j<nvals; j++) dst[j] = ann[j];
1514 }
1515 return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf3,nvals*bcf_hdr_nsamples(args->hdr_out));
1516 }
1517 }
core_setter_format_str(args_t * args,bcf1_t * line,annot_col_t * col,char ** vals)1518 static int core_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, char **vals)
1519 {
1520 if ( !args->sample_map )
1521 return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)vals,args->nsmpl_annot);
1522
1523 int i;
1524 args->tmpp2[0] = args->tmps2;
1525 int ret = bcf_get_format_string(args->hdr,line,col->hdr_key_dst,&args->tmpp2,&args->mtmps2);
1526 args->tmps2 = args->tmpp2[0]; // tmps2 might be realloced
1527
1528 int nsmpl = bcf_hdr_nsamples(args->hdr_out);
1529 if ( ret<=0 ) // not present in dst
1530 {
1531 hts_expand(char,bcf_hdr_nsamples(args->hdr_out)*2,args->mtmps2,args->tmps2);
1532 char *tmp = args->tmps2;
1533 for (i=0; i<nsmpl; i++)
1534 {
1535 tmp[0] = '.';
1536 tmp[1] = 0;
1537 args->tmpp2[i] = tmp;
1538 tmp += 2;
1539 }
1540 }
1541 for (i=0; i<nsmpl; i++)
1542 {
1543 if ( args->sample_map[i]==-1 ) continue;
1544 char **src = vals + args->sample_map[i];
1545 char **dst = args->tmpp2 + i;
1546
1547 if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; }
1548 else if ( col->replace & REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; }
1549 else if ( col->replace & REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; }
1550 *dst = *src;
1551 }
1552 return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl);
1553 }
setter_format_int(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1554 static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1555 {
1556 if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n");
1557
1558 annot_line_t *tab = (annot_line_t*) data;
1559 if ( col->icol+args->nsmpl_annot > tab->ncols )
1560 error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1561 int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
1562 hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi);
1563
1564 int icol = col->icol, ismpl;
1565 for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
1566 {
1567 int32_t *ptr = args->tmpi + ismpl*nvals;
1568 int ival = 0;
1569
1570 char *str = tab->cols[icol];
1571 while ( *str )
1572 {
1573 if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
1574 {
1575 ptr[ival++] = bcf_int32_missing;
1576 str += str[1] ? 2 : 1;
1577 continue;
1578 }
1579
1580 char *end = str;
1581 ptr[ival] = strtol(str, &end, 10);
1582 if ( end==str )
1583 error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
1584
1585 ival++;
1586 str = *end ? end+1 : end;
1587 }
1588 while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
1589 icol++;
1590 }
1591 return core_setter_format_int(args,line,col,args->tmpi,nvals);
1592 }
setter_format_real(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1593 static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1594 {
1595 if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n");
1596
1597 annot_line_t *tab = (annot_line_t*) data;
1598 if ( col->icol+args->nsmpl_annot > tab->ncols )
1599 error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1600 int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
1601 hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf);
1602
1603 int icol = col->icol, ismpl;
1604 for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
1605 {
1606 float *ptr = args->tmpf + ismpl*nvals;
1607 int ival = 0;
1608
1609 char *str = tab->cols[icol];
1610 while ( *str )
1611 {
1612 if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
1613 {
1614 bcf_float_set_missing(ptr[ival]);
1615 ival++;
1616 str += str[1] ? 2 : 1;
1617 continue;
1618 }
1619
1620 char *end = str;
1621 ptr[ival] = strtod(str, &end);
1622 if ( end==str )
1623 error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
1624
1625 ival++;
1626 str = *end ? end+1 : end;
1627 }
1628 while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
1629 icol++;
1630 }
1631 return core_setter_format_real(args,line,col,args->tmpf,nvals);
1632 }
setter_format_str(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1633 static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1634 {
1635 if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n");
1636
1637 annot_line_t *tab = (annot_line_t*) data;
1638 if ( col->icol+args->nsmpl_annot > tab->ncols )
1639 error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1640
1641 int ismpl;
1642 for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
1643 args->tmpp[ismpl] = tab->cols[col->icol + ismpl];
1644
1645 return core_setter_format_str(args,line,col,args->tmpp);
1646 }
determine_ploidy(int nals,int * vals,int nvals1,uint8_t * smpl,int nsmpl)1647 static int determine_ploidy(int nals, int *vals, int nvals1, uint8_t *smpl, int nsmpl)
1648 {
1649 int i, j, ndip = nals*(nals+1)/2, max_ploidy = 0;
1650 for (i=0; i<nsmpl; i++)
1651 {
1652 int *ptr = vals + i*nvals1;
1653 int has_value = 0;
1654 for (j=0; j<nvals1; j++)
1655 {
1656 if ( ptr[j]==bcf_int32_vector_end ) break;
1657 if ( ptr[j]!=bcf_int32_missing ) has_value = 1;
1658 }
1659 if ( has_value )
1660 {
1661 if ( j==ndip )
1662 {
1663 smpl[i] = 2;
1664 max_ploidy = 2;
1665 }
1666 else if ( j==nals )
1667 {
1668 smpl[i] = 1;
1669 if ( !max_ploidy ) max_ploidy = 1;
1670 }
1671 else return -j;
1672 }
1673 else smpl[i] = 0;
1674 }
1675 return max_ploidy;
1676 }
vcf_setter_format_int(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1677 static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1678 {
1679 bcf1_t *rec = (bcf1_t*) data;
1680 int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
1681 if ( nsrc==-3 ) return 0; // the tag is not present
1682 if ( nsrc<=0 ) return 1; // error
1683 int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header);
1684 int nsrc1 = nsrc / nsmpl_src;
1685 if ( col->number!=BCF_VL_G && col->number!=BCF_VL_R && col->number!=BCF_VL_A )
1686 return core_setter_format_int(args,line,col,args->tmpi,nsrc1);
1687
1688 // create mapping from src to dst genotypes, haploid and diploid version
1689 int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1;
1690 int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele);
1691 if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1692
1693 int i, j;
1694 if ( rec->n_allele==line->n_allele )
1695 {
1696 // alleles unchanged?
1697 for (i=0; i<rec->n_allele; i++) if ( map_hap[i]!=i ) break;
1698 if ( i==rec->n_allele )
1699 return core_setter_format_int(args,line,col,args->tmpi,nsrc1);
1700 }
1701
1702 int nsmpl_dst = rec->n_sample;
1703 int ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2);
1704 int ndst1 = ndst / nsmpl_dst;
1705 if ( ndst <= 0 )
1706 {
1707 if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
1708 if ( col->number==BCF_VL_G )
1709 ndst1 = line->n_allele*(line->n_allele+1)/2;
1710 else
1711 ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
1712 hts_expand(int, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2);
1713 for (i=0; i<nsmpl_dst; i++)
1714 {
1715 int32_t *dst = args->tmpi2 + i*ndst1;
1716 for (j=0; j<ndst1; j++) dst[j] = bcf_int32_missing;
1717 }
1718 }
1719
1720 int nmap_dip = 0, *map_dip = NULL;
1721 if ( col->number==BCF_VL_G )
1722 {
1723 map_dip = vcmp_map_dipGvalues(args->vcmp, &nmap_dip);
1724 if ( !args->src_smpl_pld )
1725 {
1726 args->src_smpl_pld = (uint8_t*) malloc(nsmpl_src);
1727 args->dst_smpl_pld = (uint8_t*) malloc(nsmpl_dst);
1728 }
1729 int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src);
1730 if ( pld_src<0 )
1731 error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1);
1732 int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst);
1733 if ( pld_dst<0 )
1734 error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1735
1736 int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2;
1737 if ( ndst1_new != ndst1 )
1738 {
1739 if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1740 ndst1 = ndst1_new;
1741 hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2);
1742 }
1743 }
1744 else if ( !ndst1 )
1745 {
1746 ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
1747 hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2);
1748 }
1749
1750 for (i=0; i<nsmpl_dst; i++)
1751 {
1752 int ii = args->sample_map ? args->sample_map[i] : i;
1753 int32_t *ptr_src = args->tmpi + i*nsrc1;
1754 int32_t *ptr_dst = args->tmpi2 + ii*ndst1;
1755
1756 if ( col->number==BCF_VL_G )
1757 {
1758 if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] )
1759 error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1760 if ( !args->dst_smpl_pld[i] )
1761 for (j=0; j<ndst1; j++) ptr_dst[j] = bcf_int32_missing;
1762 }
1763 if ( col->number!=BCF_VL_G || args->src_smpl_pld[i]==1 )
1764 {
1765 for (j=0; j<nmap_hap; j++)
1766 {
1767 int k = map_hap[j];
1768 if ( k>=0 ) ptr_dst[k] = ptr_src[j];
1769 }
1770 if ( col->number==BCF_VL_G )
1771 for (j=line->n_allele; j<ndst1; j++) ptr_dst[j++] = bcf_int32_vector_end;
1772 }
1773 else
1774 {
1775 for (j=0; j<nmap_dip; j++)
1776 {
1777 int k = map_dip[j];
1778 if ( k>=0 ) ptr_dst[k] = ptr_src[j];
1779 }
1780 }
1781 }
1782 return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nsmpl_dst*ndst1);
1783 }
vcf_setter_format_real(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1784 static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1785 {
1786 bcf1_t *rec = (bcf1_t*) data;
1787 int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
1788 if ( nsrc==-3 ) return 0; // the tag is not present
1789 if ( nsrc<=0 ) return 1; // error
1790 int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header);
1791 int nsrc1 = nsrc / nsmpl_src;
1792 if ( col->number!=BCF_VL_G && col->number!=BCF_VL_R && col->number!=BCF_VL_A )
1793 return core_setter_format_real(args,line,col,args->tmpf,nsrc1);
1794
1795 // create mapping from src to dst genotypes, haploid and diploid version
1796 int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1;
1797 int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele);
1798 if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1799
1800 int i, j;
1801 if ( rec->n_allele==line->n_allele )
1802 {
1803 // alleles unchanged?
1804 for (i=0; i<rec->n_allele; i++) if ( map_hap[i]!=i ) break;
1805 if ( i==rec->n_allele )
1806 return core_setter_format_real(args,line,col,args->tmpf,nsrc1);
1807 }
1808
1809 int nsmpl_dst = rec->n_sample;
1810 int ndst = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2);
1811 int ndst1 = ndst / nsmpl_dst;
1812 if ( ndst <= 0 )
1813 {
1814 if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
1815 if ( col->number==BCF_VL_G )
1816 ndst1 = line->n_allele*(line->n_allele+1)/2;
1817 else
1818 ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
1819 hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2);
1820 for (i=0; i<nsmpl_dst; i++)
1821 {
1822 float *dst = args->tmpf2 + i*ndst1;
1823 for (j=0; j<ndst1; j++) bcf_float_set_missing(dst[j]);
1824 }
1825 }
1826
1827 int nmap_dip = 0, *map_dip = NULL;
1828 if ( col->number==BCF_VL_G )
1829 {
1830 map_dip = vcmp_map_dipGvalues(args->vcmp, &nmap_dip);
1831 if ( !args->src_smpl_pld )
1832 {
1833 args->src_smpl_pld = (uint8_t*) malloc(nsmpl_src);
1834 args->dst_smpl_pld = (uint8_t*) malloc(nsmpl_dst);
1835 }
1836 int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src);
1837 if ( pld_src<0 )
1838 error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1);
1839 int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst);
1840 if ( pld_dst<0 )
1841 error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1842
1843 int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2;
1844 if ( ndst1_new != ndst1 )
1845 {
1846 if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1847 ndst1 = ndst1_new;
1848 hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2);
1849 }
1850 }
1851 else if ( !ndst1 )
1852 {
1853 ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
1854 hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2);
1855 }
1856
1857 for (i=0; i<nsmpl_dst; i++)
1858 {
1859 int ii = args->sample_map ? args->sample_map[i] : i;
1860 float *ptr_src = args->tmpf + i*nsrc1;
1861 float *ptr_dst = args->tmpf2 + ii*ndst1;
1862
1863 if ( col->number==BCF_VL_G )
1864 {
1865 if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] )
1866 error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
1867 if ( !args->dst_smpl_pld[i] )
1868 for (j=0; j<ndst1; j++) bcf_float_set_missing(ptr_dst[j]);
1869 }
1870 if ( col->number!=BCF_VL_G || args->src_smpl_pld[i]==1 )
1871 {
1872 for (j=0; j<nmap_hap; j++)
1873 {
1874 int k = map_hap[j];
1875 if ( k>=0 )
1876 {
1877 if ( bcf_float_is_missing(ptr_src[j]) ) bcf_float_set_missing(ptr_dst[k]);
1878 else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]);
1879 else ptr_dst[k] = ptr_src[j];
1880 }
1881 }
1882 if ( col->number==BCF_VL_G )
1883 for (j=line->n_allele; j<ndst1; j++) bcf_float_set_vector_end(ptr_dst[j]);
1884 }
1885 else
1886 {
1887 for (j=0; j<nmap_dip; j++)
1888 {
1889 int k = map_dip[j];
1890 if ( k>=0 )
1891 {
1892 if ( bcf_float_is_missing(ptr_src[j]) ) bcf_float_set_missing(ptr_dst[k]);
1893 else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]);
1894 else ptr_dst[k] = ptr_src[j];
1895 }
1896 }
1897 }
1898 }
1899 return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nsmpl_dst*ndst1);
1900 }
1901
vcf_setter_format_str(args_t * args,bcf1_t * line,annot_col_t * col,void * data)1902 static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
1903 {
1904 bcf1_t *rec = (bcf1_t*) data;
1905 args->tmpp[0] = args->tmps;
1906 int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpp,&args->mtmps);
1907 args->tmps = args->tmpp[0]; // tmps might be realloced
1908 if ( ret==-3 ) return 0; // the tag is not present
1909 if ( ret<=0 ) return 1; // error
1910 if ( strcmp("GT",col->hdr_key_dst) )
1911 return core_setter_format_str(args,line,col,args->tmpp);
1912
1913 // Genotypes are internally represented as integers. This is a complication for FMT/GT:=oldGT
1914 // First determine the maximum number of alleles per-sample ndst1
1915 int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header);
1916 int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out);
1917 int isrc,idst, ndst1 = 0, nsrc1 = ret / nsmpl_src;
1918 char *ptr = args->tmps, *ptr_end = ptr + ret;
1919 while ( ptr < ptr_end )
1920 {
1921 char *smpl_end = ptr + nsrc1;
1922 int n = 1;
1923 while ( ptr < smpl_end )
1924 {
1925 if ( *ptr=='/' || *ptr=='|' ) n++;
1926 ptr++;
1927 }
1928 if ( ndst1 < n ) ndst1 = n;
1929 }
1930 assert( ndst1 );
1931
1932 int ndst = ndst1*nsmpl_dst;
1933 hts_expand(int32_t,ndst,args->mtmpi,args->tmpi);
1934 hts_expand(char,ret+1,args->mtmps,args->tmps); args->tmps[ret] = 0; // the FORMAT string may not be 0-terminated
1935 for (idst=0; idst<nsmpl_dst; idst++)
1936 {
1937 int i = 0, is_phased = 0;
1938 int32_t *dst = args->tmpi + idst*ndst1;
1939 isrc = args->sample_map ? args->sample_map[idst] : idst;
1940 if ( isrc==-1 )
1941 {
1942 dst[0] = bcf_gt_missing;
1943 for (i=1; i<ndst1; i++) dst[i] = bcf_int32_vector_end;
1944 continue;
1945 }
1946 char *beg = args->tmps + isrc*nsrc1, *tmp;
1947 char *keep_ptr = beg+nsrc1, keep = *keep_ptr; *keep_ptr = 0;
1948 while ( *beg )
1949 {
1950 char *end = beg;
1951 while ( *end && *end!='/' && *end!='|' ) end++;
1952 if ( *beg=='.' && end-beg==1 ) dst[i] = bcf_gt_missing;
1953 else
1954 {
1955 if ( *end=='|' ) is_phased = 1;
1956 dst[i] = strtol(beg, &tmp, 10);
1957 if ( tmp!=end )
1958 error("Could not parse the %s field at %s:%"PRId64" in %s\n", col->hdr_key_src,bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1,args->targets_fname);
1959 if ( dst[i] >= line->n_allele )
1960 error("The source allele index is bigger than the number of destination alleles at %s:%"PRId64"\n", bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1);
1961 dst[i] = is_phased ? bcf_gt_phased(dst[i]) : bcf_gt_unphased(dst[i]);
1962 }
1963 beg = *end ? end+1 : end;
1964 i++;
1965 }
1966 *keep_ptr = keep;
1967 for (; i<ndst1; i++) dst[i] = bcf_int32_vector_end;
1968 }
1969 return bcf_update_genotypes(args->hdr_out,line,args->tmpi,ndst);
1970 }
init_sample_map(args_t * args,bcf_hdr_t * src,bcf_hdr_t * dst)1971 static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst)
1972 {
1973 int i;
1974 if ( !args->sample_names )
1975 {
1976 args->nsmpl_annot = bcf_hdr_nsamples(dst);
1977
1978 // tab annotation file, expecting that all samples are present: sample map not needed
1979 if ( !src ) return 0;
1980
1981 int nmatch = 0;
1982 for (i=0; i<bcf_hdr_nsamples(src); i++)
1983 {
1984 int id = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, src->samples[i]);
1985 if ( id!=-1 ) nmatch++;
1986 }
1987 if ( !nmatch ) return -1; // No matching samples found in the source and the destination file
1988
1989 args->nsample_map = bcf_hdr_nsamples(dst);
1990 args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
1991 for (i=0; i<args->nsample_map; i++)
1992 {
1993 int id = bcf_hdr_id2int(src, BCF_DT_SAMPLE, dst->samples[i]);
1994 args->sample_map[i] = id; // idst -> isrc, -1 if not present
1995 }
1996 return 1;
1997 }
1998
1999 args->nsample_map = bcf_hdr_nsamples(dst);
2000 args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
2001 for (i=0; i<args->nsample_map; i++) args->sample_map[i] = -1;
2002
2003 int flags = !src ? SMPL_STRICT|SMPL_SINGLE : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is vcf vs tab annotation file
2004 smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, flags); // gives mapping dst->src
2005 if ( !ilist || !ilist->n ) error("Could not parse the samples: %s\n", args->sample_names);
2006 args->nsmpl_annot = ilist->n;
2007 int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1;
2008 for (i=0; i<args->nsmpl_annot; i++)
2009 {
2010 int idst = ilist->idx[i];
2011 const char *src_name = ilist->pair && ilist->pair[i] ? ilist->pair[i] : bcf_hdr_int2id(dst, BCF_DT_SAMPLE, idst);
2012 int isrc = i;
2013 if ( src ) // the annotation file is a VCF, not a tab-delimited file
2014 {
2015 isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE, src_name);
2016 if ( isrc==-1 ) error("Sample \"%s\" not found in the annotation file\n", src_name);
2017 }
2018 if ( isrc!=idst ) need_sample_map = 1;
2019 args->sample_map[idst] = isrc;
2020 }
2021 smpl_ilist_destroy(ilist);
2022 return need_sample_map;
2023 }
columns_complement(char * columns,void ** skip_info,void ** skip_fmt)2024 static char *columns_complement(char *columns, void **skip_info, void **skip_fmt)
2025 {
2026 kstring_t str = {0,0,0};
2027 char *ss = columns, *se = ss;
2028 while ( *ss )
2029 {
2030 if ( *se && *se!=',' ) { se++; continue; }
2031 if ( *ss!='^' )
2032 {
2033 if ( str.l ) kputc(',',&str);
2034 kputsn(ss, se-ss, &str);
2035 if ( !*se ) break;
2036 ss = ++se;
2037 continue;
2038 }
2039
2040 if ( !strncasecmp("^INFO/",ss,6) )
2041 {
2042 if ( !*skip_info )
2043 {
2044 *skip_info = khash_str2int_init();
2045 if ( str.l ) kputc(',',&str);
2046 kputs("INFO",&str);
2047 }
2048 char tmp = *se; *se = 0;
2049 khash_str2int_inc(*skip_info, strdup(ss+6));
2050 *se = tmp;
2051 }
2052 else if ( !strncasecmp("^FORMAT/",ss,8) || !strncasecmp("^FMT/",ss,5) )
2053 {
2054 int n = !strncasecmp("^FMT/",ss,5) ? 5 : 8;
2055 if ( !*skip_fmt )
2056 {
2057 *skip_fmt = khash_str2int_init();
2058 if ( str.l ) kputc(',',&str);
2059 kputs("FORMAT",&str);
2060 }
2061 char tmp = *se; *se = 0;
2062 khash_str2int_inc(*skip_fmt, strdup(ss+n));
2063 *se = tmp;
2064 }
2065 else
2066 {
2067 if ( !*skip_info )
2068 {
2069 *skip_info = khash_str2int_init();
2070 if ( str.l ) kputc(',',&str);
2071 kputs("INFO",&str);
2072 }
2073 char tmp = *se; *se = 0;
2074 khash_str2int_inc(*skip_info, strdup(ss+1));
2075 *se = tmp;
2076 }
2077
2078 if ( !*se ) break;
2079 ss = ++se;
2080 }
2081 free(columns);
2082 return str.s;
2083 }
bcf_hrec_format_rename(bcf_hrec_t * hrec,char * tag,kstring_t * str)2084 static void bcf_hrec_format_rename(bcf_hrec_t *hrec, char *tag, kstring_t *str)
2085 {
2086 int j, nout = 0;
2087 ksprintf(str, "##%s=<", hrec->key);
2088 for (j=0; j<hrec->nkeys; j++)
2089 {
2090 if ( !strcmp("IDX",hrec->keys[j]) ) continue;
2091 if ( nout ) kputc(',',str);
2092 if ( !strcmp("ID", hrec->keys[j]) )
2093 ksprintf(str,"%s=%s", hrec->keys[j], tag);
2094 else
2095 ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
2096 nout++;
2097 }
2098 ksprintf(str,">\n");
2099 }
set_replace_mode(char * ss,int * replace)2100 static char *set_replace_mode(char *ss, int *replace)
2101 {
2102 int mode = 0;
2103 while (*ss)
2104 {
2105 if ( *ss=='+' ) mode |= REPLACE_MISSING;
2106 else if ( *ss=='-' ) mode |= REPLACE_NON_MISSING;
2107 else if ( *ss=='=' ) mode |= SET_OR_APPEND;
2108 else if ( *ss=='.' ) mode |= CARRY_OVER_MISSING;
2109 else break;
2110 ss++;
2111 }
2112 if ( !mode ) mode = REPLACE_ALL;
2113 // is exactly one bit set?
2114 // if ( mode && !(mode && ((mode & mode-1) == 0)) )
2115 *replace = mode;
2116 return ss;
2117 }
init_columns(args_t * args)2118 static void init_columns(args_t *args)
2119 {
2120 int need_sample_map = 0;
2121 int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr);
2122
2123 kstring_t tmp = {0,0,0};
2124 if ( args->columns_is_file )
2125 {
2126 int i,n;
2127 char **str = hts_readlist(args->columns, args->columns_is_file, &n);
2128 if ( !str ) error("Could not parse %s\n", args->columns);
2129 for (i=0; i<n; i++)
2130 {
2131 char *ptr = str[i];
2132 while ( *ptr && !isspace(*ptr) ) ptr++;
2133 if ( *ptr )
2134 {
2135 *ptr = 0;
2136 ptr++;
2137 while ( *ptr && isspace(*ptr) ) ptr++;
2138 if ( *ptr )
2139 {
2140 if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
2141 kputs(str[i],&args->merge_method_str);
2142 kputc(':',&args->merge_method_str);
2143 kputs(ptr,&args->merge_method_str);
2144 }
2145 }
2146 if ( tmp.l ) kputc(',',&tmp);
2147 kputs(str[i],&tmp);
2148 free(str[i]);
2149 }
2150 free(str);
2151 free(args->columns);
2152 args->columns = tmp.s;
2153 tmp.l = tmp.m = 0;
2154 tmp.s = NULL;
2155 }
2156
2157 void *skip_fmt = NULL, *skip_info = NULL;
2158 if ( args->tgts_is_vcf )
2159 args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
2160
2161 kstring_t str = {0,0,0};
2162 char *ss = args->columns, *se = ss;
2163 args->ncols = 0;
2164 int icol = -1, has_fmt_str = 0;
2165 while ( *ss )
2166 {
2167 if ( *se && *se!=',' ) { se++; continue; }
2168 int replace;
2169 ss = set_replace_mode(ss, &replace);
2170 icol++;
2171 str.l = 0;
2172 kputsn(ss, se-ss, &str);
2173 if ( !str.s[0] || !strcasecmp("-",str.s) ) ;
2174 else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol;
2175 else if ( !strcasecmp("POS",str.s) ) args->beg_idx = icol;
2176 else if ( !strcasecmp("FROM",str.s) || !strcasecmp("BEG",str.s) ) args->beg_idx = icol;
2177 else if ( !strcasecmp("TO",str.s) || !strcasecmp("END",str.s) ) args->end_idx = icol;
2178 else if ( !strcasecmp("REF",str.s) )
2179 {
2180 if ( args->tgts_is_vcf )
2181 {
2182 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2183 annot_col_t *col = &args->cols[args->ncols-1];
2184 memset(col,0,sizeof(*col));
2185 col->setter = vcf_setter_ref;
2186 col->hdr_key_src = strdup(str.s);
2187 col->hdr_key_dst = strdup(str.s);
2188 }
2189 else args->ref_idx = icol;
2190 }
2191 else if ( !strcasecmp("ALT",str.s) )
2192 {
2193 if ( args->tgts_is_vcf )
2194 {
2195 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2196 annot_col_t *col = &args->cols[args->ncols-1];
2197 memset(col,0,sizeof(*col));
2198 col->setter = vcf_setter_alt;
2199 col->hdr_key_src = strdup(str.s);
2200 col->hdr_key_dst = strdup(str.s);
2201 }
2202 else args->alt_idx = icol;
2203 }
2204 else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) )
2205 {
2206 if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
2207 if ( str.s[0]=='~' ) replace = MATCH_VALUE;
2208 if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n");
2209 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2210 annot_col_t *col = &args->cols[args->ncols-1];
2211 memset(col,0,sizeof(*col));
2212 col->icol = icol;
2213 col->replace = replace;
2214 col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
2215 col->hdr_key_src = strdup(str.s);
2216 col->hdr_key_dst = strdup(str.s);
2217 if ( replace & MATCH_VALUE ) args->match_id = icol;
2218 }
2219 else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf )
2220 {
2221 replace = MATCH_VALUE;
2222 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2223 annot_col_t *col = &args->cols[args->ncols-1];
2224 memset(col,0,sizeof(*col));
2225 col->icol = icol;
2226 col->replace = replace;
2227 col->setter = NULL;
2228 col->hdr_key_src = strdup(str.s);
2229 col->hdr_key_dst = strdup(str.s);
2230 args->match_end = icol;
2231 }
2232 else if ( !strcasecmp("~POS",str.s) && !args->tgts_is_vcf )
2233 {
2234 if ( args->tgts_is_vcf ) error("Error: cannot use ~POS, position can be replaced only from a tab-delimited file\n");
2235 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2236 annot_col_t *col = &args->cols[args->ncols-1];
2237 memset(col,0,sizeof(*col));
2238 col->icol = icol;
2239 col->replace = replace;
2240 col->setter = setter_pos;
2241 col->hdr_key_src = strdup(str.s);
2242 col->hdr_key_dst = strdup(str.s);
2243 args->match_end = icol;
2244 }
2245 else if ( !strncasecmp("ID:=",str.s,4) ) // transfer a tag from INFO to ID column
2246 {
2247 if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s);
2248 if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
2249 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2250 annot_col_t *col = &args->cols[args->ncols-1];
2251 memset(col,0,sizeof(*col));
2252 col->icol = icol;
2253 col->replace = replace;
2254 col->setter = vcf_setter_id;
2255 col->getter = vcf_getter_info_str2str;
2256 str.s[2] = 0;
2257 col->hdr_key_dst = strdup(str.s);
2258 col->hdr_key_src = strncasecmp("INFO/",str.s+4,5) ? strdup(str.s+4) : strdup(str.s+4+5);
2259 int hdr_id = bcf_hdr_id2int(args->tgts_hdr, BCF_DT_ID,col->hdr_key_src);
2260 if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) )
2261 error("The INFO tag \"%s\" is not defined in %s\n", col->hdr_key_src, args->targets_fname);
2262 if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR )
2263 error("Only Type=String tags can be used to annotate the ID column\n");
2264 }
2265 else if ( !strcasecmp("FILTER",str.s) )
2266 {
2267 if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
2268 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2269 annot_col_t *col = &args->cols[args->ncols-1];
2270 memset(col,0,sizeof(*col));
2271 col->icol = icol;
2272 col->replace = replace;
2273 col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
2274 col->hdr_key_src = strdup(str.s);
2275 col->hdr_key_dst = strdup(str.s);
2276 if ( args->tgts_is_vcf )
2277 {
2278 bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
2279 int j;
2280 for (j=0; j<tgts_hdr->nhrec; j++)
2281 {
2282 bcf_hrec_t *hrec = tgts_hdr->hrec[j];
2283 if ( hrec->type!=BCF_HL_FLT ) continue;
2284 int k = bcf_hrec_find_key(hrec,"ID");
2285 if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__);
2286 tmp.l = 0;
2287 bcf_hrec_format(hrec, &tmp);
2288 bcf_hdr_append(args->hdr_out, tmp.s);
2289 }
2290 if (bcf_hdr_sync(args->hdr_out) < 0)
2291 error_errno("[%s] Failed to update header", __func__);
2292 }
2293 }
2294 else if ( !strcasecmp("QUAL",str.s) )
2295 {
2296 if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
2297 if ( replace & SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
2298 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2299 annot_col_t *col = &args->cols[args->ncols-1];
2300 memset(col,0,sizeof(*col));
2301 col->icol = icol;
2302 col->replace = replace;
2303 col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
2304 col->hdr_key_src = strdup(str.s);
2305 col->hdr_key_dst = strdup(str.s);
2306 }
2307 else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
2308 {
2309 if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
2310 if ( replace & SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n");
2311 bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
2312 int j;
2313 for (j=0; j<tgts_hdr->nhrec; j++)
2314 {
2315 bcf_hrec_t *hrec = tgts_hdr->hrec[j];
2316 if ( hrec->type!=BCF_HL_INFO ) continue;
2317 int k = bcf_hrec_find_key(hrec,"ID");
2318 assert( k>=0 ); // this should always be true for valid VCFs
2319 if ( skip_info && khash_str2int_has_key(skip_info,hrec->vals[k]) ) continue;
2320 tmp.l = 0;
2321 bcf_hrec_format(hrec, &tmp);
2322 bcf_hdr_append(args->hdr_out, tmp.s);
2323 if (bcf_hdr_sync(args->hdr_out) < 0)
2324 error_errno("[%s] Failed to update header", __func__);
2325 int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
2326 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2327 annot_col_t *col = &args->cols[args->ncols-1];
2328 memset(col,0,sizeof(*col));
2329 col->icol = -1;
2330 col->replace = replace;
2331 col->hdr_key_src = strdup(hrec->vals[k]);
2332 col->hdr_key_dst = strdup(hrec->vals[k]);
2333 col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
2334 switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
2335 {
2336 case BCF_HT_FLAG: col->setter = vcf_setter_info_flag; break;
2337 case BCF_HT_INT: col->setter = vcf_setter_info_int; break;
2338 case BCF_HT_REAL: col->setter = vcf_setter_info_real; break;
2339 case BCF_HT_STR: col->setter = vcf_setter_info_str; break;
2340 default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
2341 }
2342 }
2343 }
2344 else if ( args->tgts_is_vcf && (!strcasecmp("FORMAT",str.s) || !strcasecmp("FMT",str.s)) ) // All FORMAT fields
2345 {
2346 bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
2347 need_sample_map = 1;
2348 int j;
2349 for (j=0; j<tgts_hdr->nhrec; j++)
2350 {
2351 bcf_hrec_t *hrec = tgts_hdr->hrec[j];
2352 if ( hrec->type!=BCF_HL_FMT) continue;
2353 int k = bcf_hrec_find_key(hrec,"ID");
2354 assert( k>=0 ); // this should always be true for valid VCFs
2355 if ( skip_fmt && khash_str2int_has_key(skip_fmt,hrec->vals[k]) ) continue;
2356 tmp.l = 0;
2357 bcf_hrec_format(hrec, &tmp);
2358 bcf_hdr_append(args->hdr_out, tmp.s);
2359 if (bcf_hdr_sync(args->hdr_out) < 0)
2360 error_errno("[%s] Failed to update header", __func__);
2361 int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
2362 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2363 annot_col_t *col = &args->cols[args->ncols-1];
2364 memset(col,0,sizeof(*col));
2365 col->icol = -1;
2366 col->replace = replace;
2367 col->hdr_key_src = strdup(hrec->vals[k]);
2368 col->hdr_key_dst = strdup(hrec->vals[k]);
2369 if ( !strcasecmp("GT",col->hdr_key_src) )
2370 {
2371 if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
2372 col->setter = vcf_setter_format_gt;
2373 }
2374 else
2375 switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
2376 {
2377 case BCF_HT_INT: col->setter = vcf_setter_format_int; break;
2378 case BCF_HT_REAL: col->setter = vcf_setter_format_real; break;
2379 case BCF_HT_STR: col->setter = vcf_setter_format_str; has_fmt_str = 1; break;
2380 default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
2381 }
2382 hdr_id = bcf_hdr_id2int(tgts_hdr, BCF_DT_ID, hrec->vals[k]);
2383 col->number = bcf_hdr_id2length(tgts_hdr,BCF_HL_FMT,hdr_id);
2384 }
2385 }
2386 else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
2387 {
2388 char *key_dst = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
2389 char *key_src = strstr(key_dst,":=");
2390 if ( key_src )
2391 {
2392 *key_src = 0;
2393 key_src += 2;
2394 if ( !strncasecmp("FORMAT/",key_src,7) ) key_src += 7;
2395 else if ( !strncasecmp("FMT/",key_src,4) ) key_src += 4;
2396 }
2397 else
2398 key_src = key_dst;
2399 need_sample_map = 1;
2400 if ( args->tgts_is_vcf )
2401 {
2402 bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL);
2403 if ( !hrec ) error("No such annotation \"%s\" in %s\n", key_src,args->targets_fname);
2404 tmp.l = 0;
2405 bcf_hrec_format_rename(hrec, key_dst, &tmp);
2406 bcf_hdr_append(args->hdr_out, tmp.s);
2407 if (bcf_hdr_sync(args->hdr_out) < 0)
2408 error_errno("[%s] Failed to update header", __func__);
2409 }
2410 int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
2411 if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
2412 error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname);
2413 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2414 annot_col_t *col = &args->cols[args->ncols-1];
2415 memset(col,0,sizeof(*col));
2416 if ( !args->tgts_is_vcf )
2417 {
2418 col->icol = icol;
2419 icol += args->nsmpl_annot - 1;
2420 }
2421 else
2422 col->icol = -1;
2423 col->replace = replace;
2424 col->hdr_key_src = strdup(key_src);
2425 col->hdr_key_dst = strdup(key_dst);
2426 if ( !strcasecmp("GT",key_src) )
2427 {
2428 if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
2429 col->setter = vcf_setter_format_gt;
2430 }
2431 else
2432 switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
2433 {
2434 case BCF_HT_INT: col->setter = args->tgts_is_vcf ? vcf_setter_format_int : setter_format_int; break;
2435 case BCF_HT_REAL: col->setter = args->tgts_is_vcf ? vcf_setter_format_real : setter_format_real; break;
2436 case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_format_str : setter_format_str; has_fmt_str = 1; break;
2437 default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
2438 }
2439 if ( args->tgts_is_vcf )
2440 {
2441 bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
2442 hdr_id = bcf_hdr_id2int(tgts_hdr, BCF_DT_ID, col->hdr_key_src);
2443 col->number = bcf_hdr_id2length(tgts_hdr,BCF_HL_FMT,hdr_id);
2444 }
2445 }
2446 else
2447 {
2448 if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
2449 if ( replace & SET_OR_APPEND )
2450 {
2451 if ( args->tgts_is_vcf )
2452 error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n"
2453 " (the annotation type is modified to \"Number=.\" and allele ordering is disregarded)\n");
2454 fprintf(stderr,"Warning: the =INFO/TAG feature modifies the annotation to \"Number=.\" and disregards allele ordering\n");
2455 }
2456 int explicit_src_info = 0;
2457 int explicit_dst_info = 0;
2458 char *key_dst;
2459 if ( !strncasecmp("INFO/",str.s,5) )
2460 {
2461 key_dst = str.s + 5;
2462 explicit_dst_info = 1;
2463 }
2464 else if ( !strcasecmp("~INFO/END",str.s) )
2465 {
2466 key_dst = str.s + 6;
2467 explicit_dst_info = 1;
2468 }
2469 else
2470 key_dst = str.s;
2471 char *key_src = strstr(key_dst,":=");
2472 if ( key_src )
2473 {
2474 *key_src = 0;
2475 key_src += 2;
2476 if ( !strncasecmp("INFO/",key_src,5) )
2477 {
2478 key_src += 5;
2479 explicit_src_info = 1;
2480 }
2481 else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) )
2482 {
2483 key_src[-2] = ':';
2484 error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
2485 }
2486 }
2487 else
2488 key_src = key_dst;
2489
2490 args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
2491 annot_col_t *col = &args->cols[args->ncols-1];
2492 memset(col,0,sizeof(*col));
2493 col->icol = icol;
2494 col->replace = replace;
2495 col->hdr_key_src = strdup(key_src);
2496 col->hdr_key_dst = strdup(key_dst);
2497
2498 int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
2499 if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
2500 {
2501 if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
2502 {
2503 if ( !strcasecmp("ID",key_src) && !explicit_src_info )
2504 {
2505 // transferring ID column into a new INFO tag
2506 tmp.l = 0;
2507 ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred ID column\">",key_dst);
2508 }
2509 else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info )
2510 {
2511 // transferring FILTER column into a new INFO tag
2512 tmp.l = 0;
2513 ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred FILTER column\">",key_dst);
2514 }
2515 else
2516 {
2517 bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
2518 if ( !hrec )
2519 {
2520 if ( explicit_dst_info+explicit_src_info==0 && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) )
2521 error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
2522 char *ptr = strchr(key_src,'=');
2523 if ( ptr )
2524 {
2525 *ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '=';
2526 error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s);
2527 }
2528 error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname);
2529 }
2530 tmp.l = 0;
2531 bcf_hrec_format_rename(hrec, key_dst, &tmp);
2532 }
2533 bcf_hdr_append(args->hdr_out, tmp.s);
2534 if (bcf_hdr_sync(args->hdr_out) < 0)
2535 error_errno("[%s] Failed to update header", __func__);
2536 hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
2537 }
2538 else
2539 error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname);
2540 assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
2541 }
2542 if ( args->tgts_is_vcf )
2543 {
2544 if ( !strcasecmp("ID",key_src) && !explicit_src_info ) col->getter = vcf_getter_id2str;
2545 else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) col->getter = vcf_getter_filter2str;
2546 }
2547 col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
2548 switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
2549 {
2550 case BCF_HT_FLAG: col->setter = args->tgts_is_vcf ? vcf_setter_info_flag : setter_info_flag; break;
2551 case BCF_HT_INT: col->setter = args->tgts_is_vcf ? vcf_setter_info_int : setter_info_int; break;
2552 case BCF_HT_REAL: col->setter = args->tgts_is_vcf ? vcf_setter_info_real : setter_info_real; break;
2553 case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_info_str : setter_info_str; break;
2554 default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
2555 }
2556 if ( replace & SET_OR_APPEND ) // change to Number=.
2557 {
2558 bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL);
2559 if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst);
2560 hrec = bcf_hrec_dup(hrec);
2561 int j = bcf_hrec_find_key(hrec, "Number");
2562 if ( j<0 ) error("Uh, could not find the entry Number in the header record of %s\n",key_dst);
2563 free(hrec->vals[j]);
2564 hrec->vals[j] = strdup(".");
2565 bcf_hdr_remove(args->hdr_out,BCF_HL_INFO, key_dst);
2566 bcf_hdr_add_hrec(args->hdr_out, hrec);
2567 }
2568 }
2569 if ( !*se ) break;
2570 ss = ++se;
2571 }
2572 free(str.s);
2573 free(tmp.s);
2574 free(args->columns);
2575 if ( skip_info ) khash_str2int_destroy_free(skip_info);
2576 if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt);
2577 if ( has_fmt_str )
2578 {
2579 int n = bcf_hdr_nsamples(args->hdr_out);
2580 if ( args->tgts_is_vcf && n<bcf_hdr_nsamples(args->files->readers[1].header) ) n = bcf_hdr_nsamples(args->files->readers[1].header);
2581 args->tmpp = (char**)malloc(sizeof(char*)*n);
2582 args->tmpp2 = (char**)malloc(sizeof(char*)*n);
2583 }
2584 if ( !need_sample_map )
2585 {
2586 free(args->sample_map);
2587 args->sample_map = NULL;
2588 }
2589 else if ( sample_map_ok<0 )
2590 error("No matching samples in source and destination file?\n");
2591 }
init_merge_method(args_t * args)2592 static void init_merge_method(args_t *args)
2593 {
2594 int i;
2595 for (i=0; i<args->ncols; i++)
2596 {
2597 args->cols[i].merge_method = MM_FIRST;
2598 args->cols[i].mm_str_hash = NULL;
2599 args->cols[i].mm_dbl = NULL;
2600 args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0;
2601 memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr));
2602 }
2603 if ( !args->merge_method_str.l ) return;
2604 if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n");
2605 if ( !args->tgt_idx && !args->tgts ) error("Error: BEG,END (or FROM,TO) columns or REF,ALT columns are expected with the --merge-logic option.\n");
2606 char *sb = args->merge_method_str.s;
2607 while ( *sb )
2608 {
2609 char *se = sb;
2610 while ( *se && *se!=',' ) se++;
2611 args->tmpks.l = 0;
2612 kputsn(sb, se-sb, &args->tmpks);
2613 kputc(0, &args->tmpks);
2614 char *mm_type_str = args->tmpks.s + args->tmpks.l;
2615 while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--;
2616 if ( *mm_type_str!=':' )
2617 error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str.s);
2618 *mm_type_str = 0;
2619 mm_type_str++;
2620 int mm_type = MM_FIRST;
2621 if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE;
2622 else if ( !strcasecmp("first",mm_type_str) ) mm_type = MM_FIRST;
2623 else if ( !strcasecmp("append",mm_type_str) ) mm_type = MM_APPEND;
2624 else if ( !strcasecmp("append-missing",mm_type_str) )
2625 {
2626 mm_type = MM_APPEND_MISSING;
2627 if ( args->ref_idx!=-1 ) args->has_append_mode = 1;
2628 }
2629 else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM;
2630 else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG;
2631 else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN;
2632 else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX;
2633 else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str.s,mm_type_str);
2634 for (i=0; i<args->ncols; i++)
2635 {
2636 if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue;
2637 if ( (mm_type==MM_APPEND || mm_type==MM_APPEND_MISSING) && args->cols[i].number!=BCF_VL_VAR )
2638 error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n");
2639 args->cols[i].merge_method = mm_type;
2640 break;
2641 }
2642 if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s);
2643 sb = *se ? se + 1 : se;
2644 }
2645 if ( args->has_append_mode )
2646 {
2647 // create a missing line to insert missing values when VCF ALT finds no match in the annotation file
2648 args->aline_missing = (annot_line_t*)calloc(1,sizeof(*args->aline_missing));
2649 int ncol = 0;
2650 for (i=0; i<args->ncols; i++)
2651 if ( ncol < args->cols[i].icol + 1 ) ncol = args->cols[i].icol + 1;
2652 if ( ncol < args->ref_idx + 1 ) ncol = args->ref_idx + 1;
2653 args->aline_missing->mcols = ncol;
2654 args->aline_missing->ncols = ncol;
2655 args->aline_missing->cols = (char**) malloc(ncol*sizeof(char*));
2656 for (i=0; i<ncol; i++)
2657 args->aline_missing->cols[i] = strdup(".");
2658 }
2659 }
2660
rename_chrs(args_t * args,char * fname)2661 static void rename_chrs(args_t *args, char *fname)
2662 {
2663 int n, i;
2664 char **map = hts_readlist(fname, 1, &n);
2665 if ( !map ) error("Could not read: %s\n", fname);
2666 for (i=0; i<n; i++)
2667 {
2668 char *ss = map[i];
2669 while ( *ss && !isspace(*ss) ) ss++;
2670 if ( !*ss ) error("Could not parse: %s\n", fname);
2671 *ss = 0;
2672 int rid = bcf_hdr_name2id(args->hdr_out, map[i]);
2673 bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_CTG, "ID", map[i], NULL);
2674 if ( !hrec ) continue; // the sequence not present
2675 int j = bcf_hrec_find_key(hrec, "ID");
2676 assert( j>=0 );
2677 free(hrec->vals[j]);
2678 ss++;
2679 while ( *ss && isspace(*ss) ) ss++;
2680 char *se = ss;
2681 while ( *se && !isspace(*se) ) se++;
2682 *se = 0;
2683 hrec->vals[j] = strdup(ss);
2684 args->hdr_out->id[BCF_DT_CTG][rid].key = hrec->vals[j];
2685 }
2686 for (i=0; i<n; i++) free(map[i]);
2687 free(map);
2688 }
2689
rename_annots(args_t * args,char * fname)2690 static void rename_annots(args_t *args, char *fname)
2691 {
2692 int n, i;
2693 char **map = hts_readlist(fname, 1, &n);
2694 if ( !map ) error("Could not read: %s\n", fname);
2695 for (i=0; i<n; i++)
2696 {
2697 char *sb = NULL, *ss = map[i];
2698 while ( *ss && !isspace(*ss) ) ss++;
2699 if ( !*ss ) error("Could not parse: %s\n", fname);
2700 *ss = 0;
2701 int type;
2702 if ( !strncasecmp("info/",map[i],5) ) type = BCF_HL_INFO, sb = map[i] + 5;
2703 else if ( !strncasecmp("format/",map[i],7) ) type = BCF_HL_FMT, sb = map[i] + 7;
2704 else if ( !strncasecmp("fmt/",map[i],4) ) type = BCF_HL_FMT, sb = map[i] + 4;
2705 else if ( !strncasecmp("filter/",map[i],7) ) type = BCF_HL_FLT, sb = map[i] + 7;
2706 else error("Could not parse \"%s\", expected INFO, FORMAT, or FILTER prefix for each line: %s\n",map[i],fname);
2707 int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, sb);
2708 if ( id<0 ) continue;
2709 bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL);
2710 if ( !hrec ) continue; // the sequence not present
2711 int j = bcf_hrec_find_key(hrec, "ID");
2712 assert( j>=0 );
2713 free(hrec->vals[j]);
2714 ss++;
2715 while ( *ss && isspace(*ss) ) ss++;
2716 char *se = ss;
2717 while ( *se && !isspace(*se) ) se++;
2718 *se = 0;
2719 hrec->vals[j] = strdup(ss);
2720 args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j];
2721 }
2722 for (i=0; i<n; i++) free(map[i]);
2723 free(map);
2724 }
2725
init_data(args_t * args)2726 static void init_data(args_t *args)
2727 {
2728 args->hdr = args->files->readers[0].header;
2729 args->hdr_out = bcf_hdr_dup(args->hdr);
2730
2731 if ( args->set_ids_fmt )
2732 {
2733 if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; }
2734 args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt);
2735 }
2736 if ( args->remove_annots ) init_remove_annots(args);
2737 if ( args->header_fname ) init_header_lines(args);
2738 if ( args->targets_fname && args->tgts_is_vcf )
2739 {
2740 // reading annots from a VCF
2741 if ( !bcf_sr_add_reader(args->files, args->targets_fname) )
2742 error("Failed to open %s: %s\n", args->targets_fname,bcf_sr_strerror(args->files->errnum));
2743 args->tgts_hdr = args->files->readers[1].header;
2744 }
2745 if ( args->columns ) init_columns(args);
2746 if ( args->targets_fname && !args->tgts_is_vcf )
2747 {
2748 if ( !args->columns ) error("The -c option not given\n");
2749 if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n");
2750 if ( args->beg_idx==-1 ) error("The -c POS option not given\n");
2751 if ( args->single_overlaps && args->merge_method_str.l ) error("The options --merge-logic and --single-overlaps cannot be combined\n");
2752 if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str.l) )
2753 {
2754 args->end_idx = -args->beg_idx - 1;
2755 args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx);
2756 if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", args->targets_fname);
2757 if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname);
2758 }
2759 else
2760 {
2761 if ( args->ref_idx!=-1 ) error("Error: the REF columns will be ignored when BEG,END (or FROM,TO) is present. Replace END (or TO) with \"-\".\n");
2762 int len = strlen(args->targets_fname);
2763 if ( len>=7 && !strcasecmp(".bed.gz",args->targets_fname+len-7) ) args->tgt_is_bed = 1;
2764 else if ( len>=8 && !strcasecmp(".bed.bgz",args->targets_fname+len-8) ) args->tgt_is_bed = 1;
2765 else if ( len>=4 && !strcasecmp(".bed",args->targets_fname+len-4) ) args->tgt_is_bed = 1;
2766 args->tgt_idx = regidx_init(args->targets_fname,parse_with_payload,free_payload,sizeof(char*),args);
2767 if ( !args->tgt_idx ) error("Failed to parse: %s\n", args->targets_fname);
2768 args->tgt_itr = regitr_init(args->tgt_idx);
2769 args->nalines++;
2770 hts_expand0(annot_line_t,args->nalines,args->malines,args->alines);
2771 }
2772 }
2773 init_merge_method(args);
2774 args->vcmp = vcmp_init();
2775
2776 if ( args->filter_str )
2777 args->filter = filter_init(args->hdr, args->filter_str);
2778
2779 if ( args->mark_sites )
2780 {
2781 if ( !args->targets_fname ) error("The -a option not given\n");
2782 bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
2783 args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
2784 }
2785
2786 if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
2787 if ( !args->drop_header )
2788 {
2789 if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
2790 if ( args->rename_annots ) rename_annots(args, args->rename_annots);
2791
2792 char wmode[8];
2793 set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
2794 args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
2795 if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno));
2796 if ( args->n_threads )
2797 hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
2798 if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname);
2799 }
2800 }
2801
destroy_data(args_t * args)2802 static void destroy_data(args_t *args)
2803 {
2804 int i;
2805 for (i=0; i<args->nrm; i++) free(args->rm[i].key);
2806 free(args->rm);
2807 if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
2808 if (args->vcmp) vcmp_destroy(args->vcmp);
2809 for (i=0; i<args->ncols; i++)
2810 {
2811 free(args->cols[i].hdr_key_src);
2812 free(args->cols[i].hdr_key_dst);
2813 free(args->cols[i].mm_kstr.s);
2814 if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash);
2815 free(args->cols[i].mm_dbl);
2816 free(args->cols[i].ptr);
2817 }
2818 free(args->cols);
2819 if ( args->aline_missing )
2820 {
2821 for (i=0; i<args->aline_missing->ncols; i++) free(args->aline_missing->cols[i]);
2822 free(args->aline_missing->cols);
2823 free(args->aline_missing);
2824 }
2825 for (i=0; i<args->malines; i++)
2826 {
2827 free(args->alines[i].cols);
2828 free(args->alines[i].als);
2829 free(args->alines[i].line.s);
2830 }
2831 free(args->alines);
2832 free(args->srt_alines);
2833 if ( args->tgt_idx )
2834 {
2835 regidx_destroy(args->tgt_idx);
2836 regitr_destroy(args->tgt_itr);
2837 }
2838 if ( args->tgts ) bcf_sr_regions_destroy(args->tgts);
2839 free(args->tmpks.s);
2840 free(args->tmpi);
2841 free(args->tmpf);
2842 free(args->tmps);
2843 free(args->tmpp);
2844 free(args->tmpi2);
2845 free(args->tmpf2);
2846 free(args->tmps2);
2847 free(args->tmpp2);
2848 free(args->tmpi3);
2849 free(args->tmpf3);
2850 free(args->src_smpl_pld);
2851 free(args->dst_smpl_pld);
2852 if ( args->set_ids )
2853 convert_destroy(args->set_ids);
2854 if ( args->filter )
2855 filter_destroy(args->filter);
2856 if (args->out_fh) hts_close(args->out_fh);
2857 free(args->sample_map);
2858 free(args->merge_method_str.s);
2859 }
2860
parse_annot_line(args_t * args,char * str,annot_line_t * tmp)2861 static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp)
2862 {
2863 tmp->line.l = 0;
2864 kputs(str, &tmp->line);
2865 char *s = tmp->line.s;
2866 tmp->ncols = 1;
2867 hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols);
2868 tmp->cols[0] = s;
2869 while ( *s )
2870 {
2871 if ( *s=='\t' )
2872 {
2873 tmp->ncols++;
2874 hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols);
2875 tmp->cols[tmp->ncols-1] = s+1;
2876 *s = 0;
2877 }
2878 s++;
2879 }
2880 if ( args->ref_idx != -1 )
2881 {
2882 if ( args->ref_idx >= tmp->ncols )
2883 error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,str);
2884 if ( args->alt_idx >= tmp->ncols )
2885 error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,str);
2886 tmp->nals = 2;
2887 hts_expand(char*,tmp->nals,tmp->mals,tmp->als);
2888 tmp->als[0] = tmp->cols[args->ref_idx];
2889 tmp->als[1] = s = tmp->cols[args->alt_idx];
2890 while ( *s )
2891 {
2892 if ( *s==',' )
2893 {
2894 tmp->nals++;
2895 hts_expand(char*,tmp->nals,tmp->mals,tmp->als);
2896 tmp->als[tmp->nals-1] = s+1;
2897 *s = 0;
2898 }
2899 s++;
2900 }
2901 }
2902 }
buffer_annot_lines(args_t * args,bcf1_t * line,int start_pos,int end_pos)2903 static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int end_pos)
2904 {
2905 if ( args->nalines && args->alines[0].rid != line->rid ) args->nalines = 0;
2906
2907 int i = 0;
2908 while ( i<args->nalines )
2909 {
2910 if ( line->pos > args->alines[i].end )
2911 {
2912 args->nalines--;
2913 if ( args->nalines && i<args->nalines )
2914 {
2915 annot_line_t tmp = args->alines[i];
2916 memmove(&args->alines[i],&args->alines[i+1],(args->nalines-i)*sizeof(annot_line_t));
2917 args->alines[args->nalines] = tmp;
2918 }
2919 }
2920 else i++;
2921 }
2922 if ( args->ref_idx==-1 && args->nalines ) return;
2923
2924 while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
2925 {
2926 if ( args->nalines + 1 == 0xffff ) break; // likely a symbolic allele, don't let the buffer overflow
2927 args->nalines++;
2928 hts_expand0(annot_line_t,args->nalines,args->malines,args->alines);
2929 annot_line_t *tmp = &args->alines[args->nalines-1];
2930 tmp->rid = line->rid;
2931 tmp->start = args->tgts->start;
2932 tmp->end = args->tgts->end;
2933 parse_annot_line(args, args->tgts->line.s, tmp);
2934 if ( args->ref_idx != -1 )
2935 {
2936 int iseq = args->tgts->iseq;
2937 if ( bcf_sr_regions_next(args->tgts)<0 || args->tgts->iseq!=iseq ) break;
2938 }
2939 else break;
2940 }
2941 }
2942
2943 // search string in semicolon separated strings (xx vs aa;bb)
str_match(char * needle,char * haystack)2944 static int str_match(char *needle, char *haystack)
2945 {
2946 int len = strlen(needle);
2947 char *ptr = haystack;
2948 while ( *ptr && (ptr=strstr(ptr,needle)) )
2949 {
2950 if ( ptr[len]!=0 && ptr[len]!=';' ) ptr++; // a prefix, not a match
2951 else if ( ptr==haystack || ptr[-1]==';' ) return 1; // a match
2952 ptr++; // a suffix, not a match
2953 }
2954 return 0;
2955 }
2956 // search common string in semicolon separated strings (xx;yy;zz vs aa;bb)
strstr_match(char * a,char * b)2957 static int strstr_match(char *a, char *b)
2958 {
2959 char *beg = a;
2960 while ( *beg )
2961 {
2962 char *end = beg;
2963 while ( *end && *end!=';' ) end++;
2964 char tmp = *end;
2965 if ( *end==';' ) *end = 0;
2966 int ret = str_match(beg,b);
2967 *end = tmp;
2968 if ( ret || !*end ) return ret;
2969 beg = end + 1;
2970 }
2971 return 0;
2972 }
annotate(args_t * args,bcf1_t * line)2973 static void annotate(args_t *args, bcf1_t *line)
2974 {
2975 int i, j;
2976 for (i=0; i<args->nrm; i++)
2977 args->rm[i].handler(args, line, &args->rm[i]);
2978
2979 int has_overlap = 0;
2980 if ( args->tgt_idx )
2981 {
2982 for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
2983 if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
2984 {
2985 while ( regitr_overlap(args->tgt_itr) )
2986 {
2987 annot_line_t *tmp = &args->alines[0];
2988 tmp->rid = line->rid;
2989 tmp->start = args->tgt_itr->beg;
2990 tmp->end = args->tgt_itr->end;
2991 parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp);
2992 for (j=0; j<args->ncols; j++)
2993 {
2994 if ( args->cols[j].done==1 ) continue;
2995 int ret = args->cols[j].setter(args,line,&args->cols[j],tmp);
2996 if ( ret < 0 )
2997 error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
2998 if ( ret==0 )
2999 args->cols[j].done = 1;
3000 }
3001 }
3002 has_overlap = 1;
3003 }
3004 for (j=0; j<args->ncols; j++)
3005 {
3006 if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
3007 if ( !args->cols[j].setter ) continue;
3008 if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
3009 error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3010 }
3011 }
3012 else if ( args->tgts )
3013 {
3014 // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one
3015 // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the
3016 // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found
3017 // for an ALT, missing value is appended instead.
3018 int end_pos = line->pos + line->rlen - 1;
3019 buffer_annot_lines(args, line, line->pos, end_pos);
3020
3021 args->nsrt_alines = 0;
3022 hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines);
3023 if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
3024 error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3025
3026 kstring_t match_end = {0,0,0};
3027 if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 )
3028 kputw(args->tmpi[0],&match_end);
3029
3030 // Find matching lines
3031 for (i=0; i<args->nalines; i++)
3032 {
3033 if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
3034 if ( args->ref_idx != -1 ) // REF+ALT matching requested
3035 {
3036 if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs are not compatible
3037 for (j=1; j<args->alines[i].nals; j++)
3038 {
3039 int ialt;
3040 if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) // match: no ALT allele in VCF and annot file has "."
3041 ialt = 0;
3042 else
3043 {
3044 ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]);
3045 if ( ialt < 0 ) continue;
3046 ialt++;
3047 }
3048 if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
3049 if ( match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
3050 args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
3051 has_overlap = 1;
3052 break;
3053 }
3054 }
3055 else // overlap, REF+ALT matching not requested
3056 {
3057 args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i;
3058 has_overlap = 1;
3059 }
3060 }
3061
3062 free(match_end.s);
3063
3064 // Sort lines if needed
3065 if ( args->has_append_mode )
3066 {
3067 // insertion sort by VCF ALT index (top bits) and alines index (low bits)
3068 uint32_t tmp;
3069 for (i=1; i<args->nsrt_alines; i++)
3070 for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--)
3071 tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp;
3072 }
3073 // Annotate
3074 for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
3075 int ialt_exp = 1;
3076 for (i=0; i<args->nsrt_alines; i++)
3077 {
3078 int ialt = args->srt_alines[i] >> 16;
3079 int ilin = args->srt_alines[i] & 0xffff;
3080 if ( args->has_append_mode )
3081 {
3082 if ( ialt_exp > ialt ) continue; // multiple annotation lines for the same position
3083 if ( ialt_exp < ialt )
3084 {
3085 // REF+ALT matching requested, append-missing mode: insert "." if no annotation line was found for the ALT
3086 while ( ialt_exp++ < ialt )
3087 {
3088 for (j=0; j<args->ncols; j++)
3089 {
3090 if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
3091 if ( args->cols[j].done==1 ) continue;
3092 if ( !args->cols[j].setter ) continue;
3093 int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
3094 if ( ret < 0 )
3095 error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3096 if ( ret==0 )
3097 args->cols[j].done = 1;
3098 }
3099 }
3100 }
3101 }
3102 for (j=0; j<args->ncols; j++)
3103 {
3104 if ( args->cols[j].done==1 ) continue;
3105 if ( !args->cols[j].setter ) continue;
3106 int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
3107 if ( ret < 0 )
3108 error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3109 if ( ret==0 )
3110 args->cols[j].done = 1;
3111 }
3112 ialt_exp = ialt + 1;
3113 }
3114 if ( args->nsrt_alines )
3115 {
3116 // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one
3117 // record was found. Otherwise leave the row will be left without annotation.
3118 if ( args->has_append_mode && ialt_exp < line->n_allele )
3119 {
3120 while ( ialt_exp++ < line->n_allele )
3121 {
3122 for (j=0; j<args->ncols; j++)
3123 {
3124 if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
3125 if ( args->cols[j].done==1 ) continue;
3126 if ( !args->cols[j].setter ) continue;
3127 int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
3128 if ( ret < 0 )
3129 error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3130 if ( ret==0 )
3131 args->cols[j].done = 1;
3132 }
3133 }
3134 }
3135 // Flush
3136 for (j=0; j<args->ncols; j++)
3137 {
3138 if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
3139 if ( !args->cols[j].setter ) continue;
3140 int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
3141 if ( ret < 0 )
3142 error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3143 }
3144 }
3145 }
3146 else if ( args->files->nreaders == 2 )
3147 {
3148 if ( bcf_sr_has_line(args->files,1) )
3149 {
3150 bcf1_t *aline = bcf_sr_get_line(args->files,1);
3151 for (j=0; j<args->ncols; j++)
3152 {
3153 if ( !args->cols[j].setter ) continue;
3154 if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
3155 error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
3156 }
3157
3158 has_overlap = 1;
3159 }
3160 }
3161 if ( args->set_ids )
3162 {
3163 args->tmpks.l = 0;
3164 convert_line(args->set_ids, line, &args->tmpks);
3165 if ( args->tmpks.l )
3166 {
3167 int replace = 0;
3168 if ( args->set_ids_replace ) replace = 1;
3169 else if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) ) replace = 1;
3170 if ( replace )
3171 bcf_update_id(args->hdr_out,line,args->tmpks.s);
3172 }
3173 }
3174
3175 if ( args->mark_sites )
3176 {
3177 // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87
3178 if ( args->mark_sites_logic==MARK_LISTED )
3179 bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0);
3180 else
3181 bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?0:1);
3182 }
3183 }
3184
usage(args_t * args)3185 static void usage(args_t *args)
3186 {
3187 fprintf(stderr, "\n");
3188 fprintf(stderr, "About: Annotate and edit VCF/BCF files.\n");
3189 fprintf(stderr, "Usage: bcftools annotate [options] <in.vcf.gz>\n");
3190 fprintf(stderr, "\n");
3191 fprintf(stderr, "Options:\n");
3192 fprintf(stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n");
3193 fprintf(stderr, " --collapse STR Matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
3194 fprintf(stderr, " -c, --columns LIST List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
3195 fprintf(stderr, " -C, --columns-file FILE Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n");
3196 fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
3197 fprintf(stderr, " --force Continue despite parsing error (at your own risk!)\n");
3198 fprintf(stderr, " -h, --header-lines FILE Lines which should be appended to the VCF header\n");
3199 fprintf(stderr, " -I, --set-id [+]FORMAT Set ID column using a `bcftools query`-like expression, see man page for details\n");
3200 fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
3201 fprintf(stderr, " -k, --keep-sites Leave -i/-e sites unchanged instead of discarding them\n");
3202 fprintf(stderr, " -l, --merge-logic TAG:TYPE Merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
3203 fprintf(stderr, " -m, --mark-sites [+-]TAG Add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
3204 fprintf(stderr, " --no-version Do not append version and command line to the header\n");
3205 fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
3206 fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
3207 fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
3208 fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n");
3209 fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
3210 fprintf(stderr, " --rename-annots FILE Rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n");
3211 fprintf(stderr, " --rename-chrs FILE Rename sequences according to the mapping: old\\tnew\n");
3212 fprintf(stderr, " -s, --samples [^]LIST Comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
3213 fprintf(stderr, " -S, --samples-file [^]FILE File of samples to annotate (or exclude with \"^\" prefix)\n");
3214 fprintf(stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
3215 fprintf(stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
3216 fprintf(stderr, " --threads INT Number of extra output compression threads [0]\n");
3217 fprintf(stderr, "\n");
3218 fprintf(stderr, "Examples:\n");
3219 fprintf(stderr, " http://samtools.github.io/bcftools/howtos/annotate.html\n");
3220 fprintf(stderr, "\n");
3221 exit(1);
3222 }
3223
main_vcfannotate(int argc,char * argv[])3224 int main_vcfannotate(int argc, char *argv[])
3225 {
3226 int c;
3227 args_t *args = (args_t*) calloc(1,sizeof(args_t));
3228 args->argc = argc; args->argv = argv;
3229 args->files = bcf_sr_init();
3230 args->output_fname = "-";
3231 args->output_type = FT_VCF;
3232 args->n_threads = 0;
3233 args->record_cmd_line = 1;
3234 args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1;
3235 args->set_ids_replace = 1;
3236 args->match_id = -1;
3237 args->clevel = -1;
3238 int regions_is_file = 0, collapse = 0;
3239 int regions_overlap = 1;
3240
3241 static struct option loptions[] =
3242 {
3243 {"keep-sites",no_argument,NULL,'k'},
3244 {"mark-sites",required_argument,NULL,'m'},
3245 {"set-id",required_argument,NULL,'I'},
3246 {"output",required_argument,NULL,'o'},
3247 {"output-type",required_argument,NULL,'O'},
3248 {"threads",required_argument,NULL,9},
3249 {"annotations",required_argument,NULL,'a'},
3250 {"merge-logic",required_argument,NULL,'l'},
3251 {"collapse",required_argument,NULL,2},
3252 {"include",required_argument,NULL,'i'},
3253 {"exclude",required_argument,NULL,'e'},
3254 {"regions",required_argument,NULL,'r'},
3255 {"regions-file",required_argument,NULL,'R'},
3256 {"regions-overlap",required_argument,NULL,3},
3257 {"remove",required_argument,NULL,'x'},
3258 {"columns-file",required_argument,NULL,'C'},
3259 {"columns",required_argument,NULL,'c'},
3260 {"rename-annots",required_argument,NULL,11},
3261 {"rename-chrs",required_argument,NULL,1},
3262 {"header-lines",required_argument,NULL,'h'},
3263 {"samples",required_argument,NULL,'s'},
3264 {"samples-file",required_argument,NULL,'S'},
3265 {"single-overlaps",no_argument,NULL,10},
3266 {"no-version",no_argument,NULL,8},
3267 {"force",no_argument,NULL,'f'},
3268 {NULL,0,NULL,0}
3269 };
3270 char *tmp;
3271 while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
3272 {
3273 switch (c) {
3274 case 'f': args->force = 1; break;
3275 case 'k': args->keep_sites = 1; break;
3276 case 'm':
3277 args->mark_sites_logic = MARK_LISTED;
3278 if ( optarg[0]=='+' ) args->mark_sites = optarg+1;
3279 else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; }
3280 else args->mark_sites = optarg;
3281 break;
3282 case 'l':
3283 if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
3284 kputs(optarg,&args->merge_method_str);
3285 break;
3286 case 'I': args->set_ids_fmt = optarg; break;
3287 case 's': args->sample_names = optarg; break;
3288 case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
3289 case 'c': args->columns = strdup(optarg); break;
3290 case 'C': args->columns = strdup(optarg); args->columns_is_file = 1; break;
3291 case 'o': args->output_fname = optarg; break;
3292 case 'O':
3293 switch (optarg[0]) {
3294 case 'b': args->output_type = FT_BCF_GZ; break;
3295 case 'u': args->output_type = FT_BCF; break;
3296 case 'z': args->output_type = FT_VCF_GZ; break;
3297 case 'v': args->output_type = FT_VCF; break;
3298 default:
3299 {
3300 args->clevel = strtol(optarg,&tmp,10);
3301 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
3302 }
3303 };
3304 if ( optarg[1] )
3305 {
3306 args->clevel = strtol(optarg+1,&tmp,10);
3307 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
3308 }
3309 break;
3310 case 'e':
3311 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
3312 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
3313 case 'i':
3314 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
3315 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
3316 case 'x': args->remove_annots = optarg; break;
3317 case 'a': args->targets_fname = optarg; break;
3318 case 'r': args->regions_list = optarg; break;
3319 case 'R': args->regions_list = optarg; regions_is_file = 1; break;
3320 case 'h': args->header_fname = optarg; break;
3321 case 1 : args->rename_chrs = optarg; break;
3322 case 2 :
3323 if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
3324 else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
3325 else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
3326 else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
3327 else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
3328 else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
3329 else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE;
3330 else error("The --collapse string \"%s\" not recognised.\n", optarg);
3331 break;
3332 case 3 :
3333 if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
3334 else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
3335 else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
3336 else error("Could not parse: --regions-overlap %s\n",optarg);
3337 break;
3338 case 9 : args->n_threads = strtol(optarg, 0, 0); break;
3339 case 8 : args->record_cmd_line = 0; break;
3340 case 10 : args->single_overlaps = 1; break;
3341 case 11 : args->rename_annots = optarg; break;
3342 case '?': usage(args); break;
3343 default: error("Unknown argument: %s\n", optarg);
3344 }
3345 }
3346
3347 char *fname = NULL;
3348 if ( optind>=argc )
3349 {
3350 if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
3351 else usage(args);
3352 }
3353 else fname = argv[optind];
3354
3355 if ( args->regions_list )
3356 {
3357 bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
3358 if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
3359 error("Failed to read the regions: %s\n", args->regions_list);
3360 }
3361 if ( args->targets_fname )
3362 {
3363 htsFile *fp = hts_open(args->targets_fname,"r");
3364 if ( !fp ) error("Failed to open %s\n", args->targets_fname);
3365 htsFormat type = *hts_get_format(fp);
3366 hts_close(fp);
3367
3368 if ( type.format==vcf || type.format==bcf )
3369 {
3370 args->tgts_is_vcf = 1;
3371 args->files->require_index = 1;
3372 args->files->collapse = collapse ? collapse : COLLAPSE_SOME;
3373 }
3374 }
3375 if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
3376 if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
3377
3378 static int line_errcode_warned = 0;
3379 init_data(args);
3380 while ( bcf_sr_next_line(args->files) )
3381 {
3382 if ( !bcf_sr_has_line(args->files,0) ) continue;
3383 bcf1_t *line = bcf_sr_get_line(args->files,0);
3384 if ( line->errcode )
3385 {
3386 if ( !args->force )
3387 error("Encountered an error, cannot proceed. Please check the error output above.\n"
3388 "If feeling adventurous, use the --force option. (At your own risk!)\n");
3389 else if ( !line_errcode_warned )
3390 {
3391 fprintf(stderr,
3392 "Warning: Encountered an error, proceeding only because --force was given.\n"
3393 " Note that this can result in a segfault or a silent corruption of the output file!\n");
3394 line_errcode_warned = 1;
3395 line->errcode = 0;
3396 }
3397 }
3398 if ( args->filter )
3399 {
3400 int pass = filter_test(args->filter, line, NULL);
3401 if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
3402 if ( !pass )
3403 {
3404 if ( args->keep_sites && bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname);
3405 continue;
3406 }
3407 }
3408 annotate(args, line);
3409 if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname);
3410 }
3411 destroy_data(args);
3412 bcf_sr_destroy(args->files);
3413 free(args);
3414 return 0;
3415 }
3416