1 /* filter.c -- filter expressions.
2
3 Copyright (C) 2013-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE. */
24
25 #include <ctype.h>
26 #include <stdlib.h>
27 #include <strings.h>
28 #include <assert.h>
29 #include <errno.h>
30 #include <math.h>
31 #include <sys/types.h>
32 #include <inttypes.h>
33 #ifndef _WIN32
34 #include <pwd.h>
35 #endif
36 #include <regex.h>
37 #include <htslib/khash_str2int.h>
38 #include <htslib/hts_defs.h>
39 #include <htslib/vcfutils.h>
40 #include <htslib/kfunc.h>
41 #include "config.h"
42 #include "filter.h"
43 #include "bcftools.h"
44
45 #if ENABLE_PERL_FILTERS
46 # define filter_t perl_filter_t
47 # include <EXTERN.h>
48 # include <perl.h>
49 # undef filter_t
50 # define my_perl perl
51
52 static int filter_ninit = 0;
53 #endif
54
55
56 #ifndef __FUNCTION__
57 # define __FUNCTION__ __func__
58 #endif
59
60 typedef struct _token_t
61 {
62 // read-only values, same for all VCF lines
63 int tok_type; // one of the TOK_* keys below
64 int nargs; // with TOK_PERLSUB the first argument is the name of the subroutine
65 char *key; // set only for string constants, otherwise NULL
66 char *tag; // for debugging and printout only, VCF tag name
67 double threshold; // filtering threshold
68 int is_constant; // the threshold is set
69 int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types
70 int idx; // 0-based index to VCF vectors,
71 // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
72 int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only
73 int nidxs, nuidxs; // size of idxs array and the number of elements set to 1
74 uint8_t *usmpl; // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise
75 int nsamples; // number of samples for format fields, 0 for info and other fields
76 void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
77 int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack);
78 void (*comparator)(struct _token_t *, struct _token_t *, struct _token_t *rtok, bcf1_t *);
79 void *hash; // test presence of str value in the hash via comparator
80 regex_t *regex; // precompiled regex for string comparison
81
82 // modified on filter evaluation at each VCF line
83 double *values;
84 kstring_t str_value;
85 int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
86 int pass_site; // -1 not applicable, 0 fails, >0 pass
87 uint8_t *pass_samples; // status of individual samples
88 int nvalues, mvalues; // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
89 int nval1; // number of per-sample fields or string length
90 }
91 token_t;
92
93 struct _filter_t
94 {
95 bcf_hdr_t *hdr;
96 char *str;
97 int nfilters;
98 token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack
99 int32_t *tmpi;
100 float *tmpf;
101 kstring_t tmps;
102 int max_unpack, mtmpi, mtmpf, nsamples;
103 #if ENABLE_PERL_FILTERS
104 PerlInterpreter *perl;
105 #endif
106 };
107
108
109 #define TOK_VAL 0
110 #define TOK_LFT 1 // (
111 #define TOK_RGT 2 // )
112 #define TOK_LE 3 // less or equal
113 #define TOK_LT 4 // less than
114 #define TOK_EQ 5 // equal
115 #define TOK_BT 6 // bigger than
116 #define TOK_BE 7 // bigger or equal
117 #define TOK_NE 8 // not equal
118 #define TOK_OR 9 // |
119 #define TOK_AND 10 // &
120 #define TOK_ADD 11 // +
121 #define TOK_SUB 12 // -
122 #define TOK_MULT 13 // *
123 #define TOK_DIV 14 // /
124 #define TOK_MAX 15
125 #define TOK_MIN 16
126 #define TOK_AVG 17
127 #define TOK_AND_VEC 18 // && (operator applied in samples)
128 #define TOK_OR_VEC 19 // || (operator applied in samples)
129 #define TOK_LIKE 20 // ~ regular expression
130 #define TOK_NLIKE 21 // !~ regular expression
131 #define TOK_SUM 22
132 #define TOK_ABS 23
133 #define TOK_LEN 24
134 #define TOK_FUNC 25
135 #define TOK_CNT 26
136 #define TOK_PERLSUB 27
137 #define TOK_BINOM 28
138 #define TOK_PHRED 29
139 #define TOK_MEDIAN 30
140 #define TOK_STDEV 31
141 #define TOK_sMAX 32
142 #define TOK_sMIN 33
143 #define TOK_sAVG 34
144 #define TOK_sMEDIAN 35
145 #define TOK_sSTDEV 36
146 #define TOK_sSUM 37
147 #define TOK_IN 38 // contains, e.g. FILTER~"A"
148 #define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A"
149
150 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
151 // ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s
152 static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
153 #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently
154
155 // Return negative values if it is a function with variable number of arguments
filters_next_token(char ** str,int * len)156 static int filters_next_token(char **str, int *len)
157 {
158 char *tmp = *str;
159 while ( *tmp && isspace(*tmp) ) tmp++;
160 *str = tmp;
161 *len = 0;
162
163 // test for doubles: d.ddde[+-]dd
164 if ( isdigit(*str[0]) || *str[0]=='.' ) // strtod would eat +/-
165 {
166 double HTS_UNUSED v = strtod(*str, &tmp);
167 if ( *str!=tmp && (!tmp[0] || !isalnum(tmp[0])) )
168 {
169 *len = tmp - (*str);
170 return TOK_VAL;
171 }
172 tmp = *str;
173 }
174
175 if ( !strncasecmp(tmp,"SMPL_MAX(",9) ) { (*str) += 8; return TOK_sMAX; }
176 if ( !strncasecmp(tmp,"SMPL_MIN(",9) ) { (*str) += 8; return TOK_sMIN; }
177 if ( !strncasecmp(tmp,"SMPL_MEAN(",10) ) { (*str) += 9; return TOK_sAVG; }
178 if ( !strncasecmp(tmp,"SMPL_MEDIAN(",12) ) { (*str) += 11; return TOK_sMEDIAN; }
179 if ( !strncasecmp(tmp,"SMPL_AVG(",9) ) { (*str) += 8; return TOK_sAVG; }
180 if ( !strncasecmp(tmp,"SMPL_STDEV(",11) ) { (*str) += 10; return TOK_sSTDEV; }
181 if ( !strncasecmp(tmp,"SMPL_SUM(",9) ) { (*str) += 8; return TOK_sSUM; }
182 if ( !strncasecmp(tmp,"sMAX(",5) ) { (*str) += 4; return TOK_sMAX; }
183 if ( !strncasecmp(tmp,"sMIN(",5) ) { (*str) += 4; return TOK_sMIN; }
184 if ( !strncasecmp(tmp,"sMEAN(",6) ) { (*str) += 5; return TOK_sAVG; }
185 if ( !strncasecmp(tmp,"sMEDIAN(",8) ) { (*str) += 7; return TOK_sMEDIAN; }
186 if ( !strncasecmp(tmp,"sAVG(",5) ) { (*str) += 4; return TOK_sAVG; }
187 if ( !strncasecmp(tmp,"sSTDEV(",7) ) { (*str) += 6; return TOK_sSTDEV; }
188 if ( !strncasecmp(tmp,"sSUM(",5) ) { (*str) += 4; return TOK_sSUM; }
189 if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; }
190 if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; }
191 if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; }
192 if ( !strncasecmp(tmp,"MEDIAN(",7) ) { (*str) += 6; return TOK_MEDIAN; }
193 if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; }
194 if ( !strncasecmp(tmp,"STDEV(",6) ) { (*str) += 5; return TOK_STDEV; }
195 if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; }
196 if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; }
197 if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; }
198 if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; }
199 if ( !strncasecmp(tmp,"BINOM(",6) ) { (*str) += 5; return -TOK_BINOM; }
200 if ( !strncasecmp(tmp,"PHRED(",6) ) { (*str) += 5; return TOK_PHRED; }
201 if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility
202 if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility
203 if ( !strncasecmp(tmp,"%AVG(",5) ) { (*str) += 4; return TOK_AVG; } // for backward compatibility
204 if ( !strncasecmp(tmp,"%SUM(",5) ) { (*str) += 4; return TOK_SUM; } // for backward compatibility
205 if ( !strncasecmp(tmp,"INFO/",5) ) tmp += 5;
206 if ( !strncasecmp(tmp,"FORMAT/",7) ) tmp += 7;
207 if ( !strncasecmp(tmp,"FMT/",4) ) tmp += 4;
208 if ( !strncasecmp(tmp,"PERL.",5) ) { (*str) += 5; return -TOK_PERLSUB; }
209 if ( !strncasecmp(tmp,"N_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; }
210 if ( !strncasecmp(tmp,"F_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; }
211 if ( !strncasecmp(tmp,"%ILEN",5) ) { *len = 5; return TOK_VAL; } // to be able to distinguish between INFO/ILEN and on-the-fly ILEN
212
213 if ( tmp[0]=='@' ) // file name
214 {
215 while ( *tmp && !isspace(*tmp) && *tmp!='=' && *tmp!='!' ) tmp++;
216 *len = tmp - (*str);
217 return TOK_VAL;
218 }
219
220 int square_brackets = 0;
221 while ( tmp[0] )
222 {
223 if ( !square_brackets )
224 {
225 if ( tmp[0]=='"' ) break;
226 if ( tmp[0]=='\'' ) break;
227 if ( isspace(tmp[0]) ) break;
228 if ( tmp[0]=='<' ) break;
229 if ( tmp[0]=='>' ) break;
230 if ( tmp[0]=='=' ) break;
231 if ( tmp[0]=='!' ) break;
232 if ( tmp[0]=='&' ) break;
233 if ( tmp[0]=='|' ) break;
234 if ( tmp[0]=='(' ) break;
235 if ( tmp[0]==')' ) break;
236 if ( tmp[0]=='+' ) break;
237 if ( tmp[0]=='*' ) break;
238 if ( tmp[0]=='-' ) break;
239 if ( tmp[0]=='/' ) break;
240 if ( tmp[0]=='~' ) break;
241 }
242 if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; }
243 if ( tmp[0]=='[' ) square_brackets++;
244 tmp++;
245 }
246 if ( tmp > *str )
247 {
248 *len = tmp - (*str);
249 return TOK_VAL;
250 }
251 if ( tmp[0]=='"' || tmp[0]=='\'' )
252 {
253 int quote = tmp[0];
254 tmp++;
255 while ( *tmp && tmp[0]!=quote ) tmp++;
256 if ( !*tmp ) return -1; // missing quotes
257 *len = tmp - (*str) + 1;
258 return TOK_VAL;
259 }
260 if ( tmp[0]=='!' )
261 {
262 if ( tmp[1]=='=' ) { (*str) += 2; return TOK_NE; }
263 if ( tmp[1]=='~' ) { (*str) += 2; return TOK_NLIKE; }
264 }
265 if ( tmp[0]=='<' )
266 {
267 if ( tmp[1]=='=' ) { (*str) += 2; return TOK_LE; }
268 (*str) += 1; return TOK_LT;
269 }
270 if ( tmp[0]=='>' )
271 {
272 if ( tmp[1]=='=' ) { (*str) += 2; return TOK_BE; }
273 (*str) += 1; return TOK_BT;
274 }
275 if ( tmp[0]=='=' )
276 {
277 if ( tmp[1]=='=' ) { (*str) += 2; return TOK_EQ; }
278 (*str) += 1; return TOK_EQ;
279 }
280 if ( tmp[0]=='(' ) { (*str) += 1; return TOK_LFT; }
281 if ( tmp[0]==')' ) { (*str) += 1; return TOK_RGT; }
282 if ( tmp[0]=='&' && tmp[1]=='&' ) { (*str) += 2; return TOK_AND_VEC; }
283 if ( tmp[0]=='|' && tmp[1]=='|' ) { (*str) += 2; return TOK_OR_VEC; }
284 if ( tmp[0]=='&' ) { (*str) += 1; return TOK_AND; }
285 if ( tmp[0]=='|' ) { (*str) += 1; return TOK_OR; }
286 if ( tmp[0]=='+' ) { (*str) += 1; return TOK_ADD; }
287 if ( tmp[0]=='-' ) { (*str) += 1; return TOK_SUB; }
288 if ( tmp[0]=='*' ) { (*str) += 1; return TOK_MULT; }
289 if ( tmp[0]=='/' ) { (*str) += 1; return TOK_DIV; }
290 if ( tmp[0]=='~' ) { (*str) += 1; return TOK_LIKE; }
291
292 *len = tmp - (*str);
293 return TOK_VAL;
294 }
295
296
297 /*
298 Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller.
299
300 Based on jkb's staden code with some adjustments.
301 https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123
302 */
expand_path(char * path)303 char *expand_path(char *path)
304 {
305 kstring_t str = {0,0,0};
306
307 if ( path[0] == '~' )
308 {
309 if ( !path[1] || path[1] == '/' )
310 {
311 #ifdef _WIN32
312 kputs(getenv("HOMEDRIVE"), &str);
313 kputs(getenv("HOMEPATH"), &str);
314 #else
315 // ~ or ~/path
316 kputs(getenv("HOME"), &str);
317 if ( path[1] ) kputs(path+1, &str);
318 #endif
319 }
320 #ifndef _WIN32
321 else
322 {
323 // user name: ~pd3/path
324 char *end = path;
325 while ( *end && *end!='/' ) end++;
326 kputsn(path+1, end-path-1, &str);
327 struct passwd *pwentry = getpwnam(str.s);
328 str.l = 0;
329
330 if ( !pwentry ) kputsn(path, end-path, &str);
331 else kputs(pwentry->pw_dir, &str);
332 kputs(end, &str);
333 }
334 #endif
335 return ks_release(&str);
336 }
337 if ( path[0] == '$' )
338 {
339 char *var = getenv(path+1);
340 if ( var ) {
341 kputs(var, &str);
342 return ks_release(&str);
343 }
344 }
345
346 return strdup(path);
347 }
348
filters_set_qual(filter_t * flt,bcf1_t * line,token_t * tok)349 static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
350 {
351 float *ptr = &line->qual;
352 if ( bcf_float_is_missing(*ptr) )
353 bcf_double_set_missing(tok->values[0]);
354 else
355 tok->values[0] = (double)line->qual;
356 tok->nvalues = 1;
357 }
filters_set_type(filter_t * flt,bcf1_t * line,token_t * tok)358 static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
359 {
360 tok->values[0] = bcf_get_variant_types(line);
361 if ( !tok->values[0] ) tok->values[0] = 1; // mistake in htslib: VCF_* should start with 1
362 else tok->values[0] = ((int)tok->values[0]) << 1;
363 tok->nvalues = 1;
364 }
filters_set_info(filter_t * flt,bcf1_t * line,token_t * tok)365 static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
366 {
367 assert( tok->hdr_id >=0 );
368 int i;
369 for (i=0; i<line->n_info; i++)
370 if ( line->d.info[i].key == tok->hdr_id ) break;
371
372 if ( i==line->n_info )
373 tok->nvalues = tok->str_value.l = 0;
374 else if ( line->d.info[i].type==BCF_BT_CHAR )
375 {
376 int n = line->d.info[i].len;
377 if ( n >= tok->str_value.m )
378 {
379 tok->str_value.m = n + 1;
380 tok->str_value.s = (char*) realloc(tok->str_value.s, tok->str_value.m);
381 if ( !tok->str_value.s ) error("Failed to alloc %d bytes\n", (int)tok->str_value.m);
382 }
383 memcpy(tok->str_value.s, line->d.info[i].vptr, n);
384 tok->str_value.s[n] = 0;
385 tok->nvalues = tok->str_value.l = n;
386 }
387 else if ( line->d.info[i].type==BCF_BT_FLOAT )
388 {
389 if ( bcf_float_is_missing(line->d.info[i].v1.f) ) tok->nvalues = 0;
390 else
391 {
392 tok->values[0] = line->d.info[i].v1.f;
393 tok->nvalues = 1;
394 }
395 tok->str_value.l = 0;
396 }
397 else
398 {
399 tok->str_value.l = 0;
400 if ( line->d.info[i].type==BCF_BT_INT8 && line->d.info[i].v1.i==bcf_int8_missing ) tok->nvalues = 0;
401 else if ( line->d.info[i].type==BCF_BT_INT16 && line->d.info[i].v1.i==bcf_int16_missing ) tok->nvalues = 0;
402 else if ( line->d.info[i].type==BCF_BT_INT32 && line->d.info[i].v1.i==bcf_int32_missing ) tok->nvalues = 0;
403 else
404 {
405 tok->values[0] = line->d.info[i].v1.i;
406 tok->nvalues = 1;
407 }
408 }
409 }
filters_cmp_bit_and(token_t * atok,token_t * btok,token_t * rtok,bcf1_t * line)410 static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
411 {
412 int a = (int)(atok->nvalues?atok->values[0]:atok->threshold);
413 int b = (int)(btok->nvalues?btok->values[0]:btok->threshold);
414 if ( rtok->tok_type==TOK_LIKE )
415 rtok->pass_site = a&b ? 1 : 0;
416 else
417 rtok->pass_site = a&b ? 0 : 1;
418 }
filters_cmp_filter(token_t * atok,token_t * btok,token_t * rtok,bcf1_t * line)419 static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
420 {
421 int i;
422 if ( rtok->tok_type==TOK_NOT_IN )
423 {
424 if ( !line->d.n_flt )
425 {
426 if ( atok->hdr_id==-1 ) return; // missing value
427 rtok->pass_site = 1;
428 return; // no filter present, eval to true
429 }
430 for (i=0; i<line->d.n_flt; i++)
431 if ( atok->hdr_id==line->d.flt[i] ) return;
432 rtok->pass_site = 1;
433 return;
434 }
435 else if ( rtok->tok_type==TOK_IN )
436 {
437 if ( !line->d.n_flt )
438 {
439 if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
440 return; // no filter present, eval to false
441 }
442 for (i=0; i<line->d.n_flt; i++)
443 if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
444 return;
445 }
446 else if ( rtok->tok_type==TOK_NE ) // exact match
447 {
448 if ( !line->d.n_flt )
449 {
450 if ( atok->hdr_id==-1 ) return; // missing value
451 rtok->pass_site = 1;
452 return; // no filter present, eval to true
453 }
454 if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return; // exact match, fail iff a single matching value is present
455 rtok->pass_site = 1;
456 return;
457 }
458 else if ( rtok->tok_type==TOK_EQ ) // exact match, pass iff a single matching value is present
459 {
460 if ( !line->d.n_flt )
461 {
462 if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
463 return; // no filter present, eval to false
464 }
465 if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1;
466 return;
467 }
468 else
469 error("Only ==, !=, ~, and !~ operators are supported for FILTER\n");
470 return;
471 }
filters_cmp_id(token_t * atok,token_t * btok,token_t * rtok,bcf1_t * line)472 static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
473 {
474 // multiple IDs not supported yet (easy to add though)
475 if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
476 error("Only == and != operators are supported for ID\n");
477
478 if ( btok->hash )
479 {
480 token_t *tmp = atok; atok = btok; btok = tmp;
481 }
482 if ( atok->hash )
483 {
484 int ret = khash_str2int_has_key(atok->hash, line->d.id);
485 if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
486 rtok->pass_site = ret;
487 return;
488 }
489
490 if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n");
491
492 if ( rtok->tok_type==TOK_EQ )
493 rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
494 else
495 rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
496 }
497
498 /**
499 * bcf_get_info_value() - get single INFO value, int64_t or double
500 * @line: BCF line
501 * @info_id: tag ID, as returned by bcf_hdr_id2int
502 * @ivec: 0-based index to retrieve, -1 when single value is expected
503 * @vptr: pointer to memory location of sufficient size to accomodate
504 * info_id's type
505 *
506 * The returned value is -1 if tag is not present, 0 if present but
507 * values is missing or ivec is out of range, and 1 on success.
508 */
bcf_get_info_value(bcf1_t * line,int info_id,int ivec,void * value)509 static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value)
510 {
511 int j;
512 for (j=0; j<line->n_info; j++)
513 if ( line->d.info[j].key == info_id ) break;
514 if ( j==line->n_info ) return -1;
515
516 bcf_info_t *info = &line->d.info[j];
517 if ( info->len == 1 )
518 {
519 if ( info->type==BCF_BT_FLOAT ) *((double*)value) = info->v1.f;
520 else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int64_t*)value) = info->v1.i;
521 return 1;
522 }
523
524 if ( ivec<0 ) ivec = 0;
525
526 #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
527 type_t *p = (type_t *) info->vptr; \
528 for (j=0; j<ivec && j<info->len; j++) \
529 { \
530 if ( is_vector_end ) return 0; \
531 } \
532 if ( is_missing ) return 0; \
533 *((out_type_t*)value) = p[j]; \
534 return 1; \
535 }
536 switch (info->type) {
537 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int64_t); break;
538 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break;
539 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break;
540 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break;
541 default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
542 }
543 #undef BRANCH
544 return -1; // this shouldn't happen
545 }
546
filters_set_chrom(filter_t * flt,bcf1_t * line,token_t * tok)547 static void filters_set_chrom(filter_t *flt, bcf1_t *line, token_t *tok)
548 {
549 tok->str_value.l = 0;
550 kputs(bcf_seqname(flt->hdr,line), &tok->str_value);
551 tok->nvalues = tok->str_value.l;
552 tok->is_str = 1;
553 }
554
filters_set_pos(filter_t * flt,bcf1_t * line,token_t * tok)555 static void filters_set_pos(filter_t *flt, bcf1_t *line, token_t *tok)
556 {
557 tok->values[0] = line->pos+1;
558 tok->nvalues = 1;
559 }
560
filters_set_info_int(filter_t * flt,bcf1_t * line,token_t * tok)561 static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
562 {
563 if ( tok->idx==-2 )
564 {
565 tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
566 if ( tok->nvalues<=0 ) tok->nvalues = 0;
567 else
568 {
569 hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
570 int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
571 if ( end >= tok->nvalues ) end = tok->nvalues - 1;
572 for (i=0; i<=end; i++)
573 if ( i>=tok->nidxs || tok->idxs[i] ) tok->values[j++] = flt->tmpi[i];
574 tok->nvalues = j;
575 }
576 }
577 else
578 {
579 int64_t value = 0;
580 if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
581 tok->nvalues = 0;
582 else
583 {
584 tok->values[0] = value;
585 tok->nvalues = 1;
586 }
587 }
588 }
589
filters_set_info_float(filter_t * flt,bcf1_t * line,token_t * tok)590 static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
591 {
592 if ( tok->idx==-2 )
593 {
594 tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf);
595 if ( tok->nvalues<=0 ) tok->nvalues = 0;
596 else
597 {
598 hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
599 int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
600 if ( end >= tok->nvalues ) end = tok->nvalues - 1;
601 for (i=0; i<=end; i++)
602 if ( i>=tok->nidxs || tok->idxs[i] )
603 {
604 if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[j]);
605 else tok->values[j] = flt->tmpf[i];
606 j++;
607 }
608 tok->nvalues = j;
609 }
610 }
611 else
612 {
613 double value;
614 if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
615 tok->nvalues = 0;
616 else
617 {
618 tok->values[0] = value;
619 tok->nvalues = 1;
620 }
621 }
622 }
623
filters_set_info_string(filter_t * flt,bcf1_t * line,token_t * tok)624 static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
625 {
626 int32_t m = tok->str_value.m;
627 int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value.s,&m);
628 tok->str_value.m = m;
629 if ( n<0 ) { tok->nvalues = tok->str_value.l = 0; return; }
630
631 if ( tok->idx>=0 )
632 {
633 // get ith field (i=tok->idx)
634 int i = 0;
635 char *ss = tok->str_value.s, *se = tok->str_value.s + n;
636 while ( ss<se && i<tok->idx )
637 {
638 if ( *ss==',' ) i++;
639 ss++;
640 }
641 if ( ss==se || i!=tok->idx ) { tok->nvalues = tok->str_value.l = 0; return; }
642 se = ss;
643 while ( se - tok->str_value.s < n && *se!=',' ) se++;
644 if ( ss==tok->str_value.s ) *se = 0;
645 else
646 {
647 memmove(tok->str_value.s, ss, se-ss);
648 tok->str_value.s[se-ss] = 0;
649 }
650 tok->str_value.l = se - ss;
651 }
652 else if ( tok->idx==-2 && tok->idxs[0]==-1 ) // keep all values, TAG[*]
653 tok->str_value.l = n;
654 else if ( tok->idx==-2 )
655 {
656 flt->tmps.l = 0;
657 ks_resize(&flt->tmps, n);
658 int i, iend = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
659 if ( iend >= n ) iend = n - 1;
660 char *beg = tok->str_value.s, *dst = flt->tmps.s;
661 for (i=0; i<=iend; i++)
662 {
663 char *end = beg;
664 while ( *end && *end!=',' ) end++;
665 if ( i>=tok->nidxs || tok->idxs[i] )
666 {
667 memcpy(dst, beg, end - beg);
668 dst += end - beg;
669 dst[0] = ',';
670 dst++;
671 }
672 beg = end+1;
673 }
674 dst[0] = 0;
675 tok->str_value.l = dst - flt->tmps.s;
676
677 #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
678 SWAP(char *, flt->tmps.s, tok->str_value.s);
679 SWAP(size_t, flt->tmps.m, tok->str_value.m);
680 }
681 tok->nvalues = tok->str_value.l;
682 }
683
filters_set_info_flag(filter_t * flt,bcf1_t * line,token_t * tok)684 static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok)
685 {
686 int j;
687 for (j=0; j<line->n_info; j++)
688 if ( line->d.info[j].key == tok->hdr_id ) break;
689 tok->values[0] = j==line->n_info ? 0 : 1;
690 tok->nvalues = 1;
691 }
692
filters_set_format_int(filter_t * flt,bcf1_t * line,token_t * tok)693 static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
694 {
695 if ( line->n_sample != tok->nsamples )
696 error("Incorrect number of FORMAT fields at %s:%"PRId64" .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples);
697
698 int nvals;
699 if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
700 {
701 tok->nvalues = 0;
702 return;
703 }
704 int i, nsrc1 = nvals / tok->nsamples;
705 tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
706 tok->nvalues = tok->nval1*tok->nsamples;
707 hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
708
709 if ( tok->idx >= 0 ) // scalar or vector index
710 {
711 for (i=0; i<tok->nsamples; i++)
712 {
713 if ( !tok->usmpl[i] ) continue;
714 int32_t *ptr = flt->tmpi + i*nsrc1;
715 if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing )
716 bcf_double_set_missing(tok->values[i]);
717 else if ( ptr[tok->idx]==bcf_int32_vector_end )
718 bcf_double_set_vector_end(tok->values[i]);
719 else
720 tok->values[i] = ptr[tok->idx];
721 }
722 }
723 else
724 {
725 int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
726 for (i=0; i<tok->nsamples; i++)
727 {
728 if ( !tok->usmpl[i] ) continue;
729 int32_t *src = flt->tmpi + i*nsrc1;
730 double *dst = tok->values + i*tok->nval1;
731 int k, j = 0;
732 for (k=0; k<kend; k++)
733 {
734 if ( k<tok->nidxs && !tok->idxs[k] ) continue;
735 if ( src[k]==bcf_int32_missing )
736 bcf_double_set_missing(dst[j]);
737 else if ( src[k]==bcf_int32_vector_end )
738 bcf_double_set_vector_end(dst[j]);
739 else
740 dst[j] = src[k];
741 j++;
742 }
743 if ( j==0 )
744 {
745 bcf_double_set_missing(dst[j]);
746 j++;
747 }
748 while (j < tok->nval1)
749 {
750 bcf_double_set_vector_end(dst[j]);
751 j++;
752 }
753 }
754 }
755 }
filters_set_format_float(filter_t * flt,bcf1_t * line,token_t * tok)756 static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
757 {
758 if ( line->n_sample != tok->nsamples )
759 error("Incorrect number of FORMAT fields at %s:%"PRId64" .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples);
760
761 int nvals;
762 if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
763 {
764 tok->nvalues = 0;
765 return;
766 }
767 int i, nsrc1 = nvals / tok->nsamples;
768 tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
769 tok->nvalues = tok->nval1*tok->nsamples;
770 hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
771
772 if ( tok->idx >= 0 ) // scalar or vector index
773 {
774 for (i=0; i<tok->nsamples; i++)
775 {
776 if ( !tok->usmpl[i] ) continue;
777 float *ptr = flt->tmpf + i*nsrc1;
778 if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) )
779 bcf_double_set_missing(tok->values[i]);
780 else if ( bcf_float_is_vector_end(ptr[tok->idx]) )
781 bcf_double_set_vector_end(tok->values[i]);
782 else
783 tok->values[i] = ptr[tok->idx];
784 }
785 }
786 else
787 {
788 int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
789 for (i=0; i<tok->nsamples; i++)
790 {
791 if ( !tok->usmpl[i] ) continue;
792 float *src = flt->tmpf + i*nsrc1;
793 double *dst = tok->values + i*tok->nval1;
794 int k, j = 0;
795 for (k=0; k<kend; k++)
796 {
797 if ( k<tok->nidxs && !tok->idxs[k] ) continue;
798 if ( bcf_float_is_missing(src[k]) )
799 bcf_double_set_missing(dst[j]);
800 else if ( bcf_float_is_vector_end(src[k]) )
801 bcf_double_set_vector_end(dst[j]);
802 else
803 dst[j] = src[k];
804 j++;
805 }
806 if ( j==0 )
807 {
808 bcf_double_set_missing(dst[j]);
809 j++;
810 }
811 while (j < tok->nval1)
812 {
813 bcf_double_set_vector_end(dst[j]);
814 j++;
815 }
816 }
817 }
818 }
filters_set_format_string(filter_t * flt,bcf1_t * line,token_t * tok)819 static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
820 {
821 if ( line->n_sample != tok->nsamples )
822 error("Incorrect number of FORMAT fields at %s:%"PRId64" .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples);
823
824 int i, ndim = tok->str_value.m;
825 int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
826 tok->str_value.m = ndim;
827 tok->str_value.l = tok->nvalues = 0;
828
829 if ( nstr<0 ) return;
830
831 tok->nvalues = tok->str_value.l = nstr;
832 tok->nval1 = nstr / tok->nsamples;
833 for (i=0; i<tok->nsamples; i++)
834 {
835 if ( !tok->usmpl[i] ) continue;
836 char *src = tok->str_value.s + i*tok->nval1, *dst = src;
837 int ibeg = 0, idx = 0;
838 while ( ibeg < tok->nval1 )
839 {
840 int iend = ibeg;
841 while ( iend < tok->nval1 && src[iend] && src[iend]!=',' ) iend++;
842
843 int keep = 0;
844 if ( tok->idx >= 0 )
845 {
846 if ( tok->idx==idx ) keep = 1;
847 }
848 else if ( idx < tok->nidxs )
849 {
850 if ( tok->idxs[idx] != 0 ) keep = 1;
851 }
852 else if ( tok->idxs[tok->nidxs-1] < 0 )
853 keep = 1;
854
855 if ( keep )
856 {
857 if ( ibeg!=0 ) memmove(dst, src+ibeg, iend-ibeg+1);
858 dst += iend - ibeg + 1;
859 if ( tok->idx>=0 ) break;
860 }
861 if ( !src[iend] ) break;
862 ibeg = iend + 1;
863 idx++;
864 }
865 if ( dst==src ) { dst[0] = '.'; dst+=2; }
866 if ( dst - src < tok->nval1 ) memset(dst-1, 0, tok->nval1 - (dst - src));
867 }
868 }
_filters_set_genotype(filter_t * flt,bcf1_t * line,token_t * tok,int type)869 static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
870 {
871 bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
872 if ( !fmt )
873 {
874 tok->nvalues = tok->str_value.l = 0;
875 return;
876 }
877
878 int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4;
879 if ( tok->str_value.m <= nvals1*nsmpl )
880 {
881 tok->str_value.m = nvals1*nsmpl + 1;
882 tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
883 }
884
885 #define BRANCH_INT(type_t,vector_end) \
886 { \
887 for (i=0; i<line->n_sample; i++) \
888 { \
889 type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
890 int is_het = 0, has_ref = 0, missing = 0; \
891 for (j=0; j<fmt->n; j++) \
892 { \
893 if ( ptr[j]==vector_end ) break; /* smaller ploidy */ \
894 if ( bcf_gt_is_missing(ptr[j]) ) { missing=1; break; } /* missing allele */ \
895 int ial = ptr[j]; \
896 if ( bcf_gt_allele(ial)==0 ) has_ref = 1; \
897 if ( j>0 ) \
898 { \
899 int jal = ptr[j-1]; \
900 if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
901 } \
902 } \
903 char *dst = &tok->str_value.s[nvals1*i]; \
904 if ( type==4 ) \
905 { \
906 if ( !j || missing ) dst[0]='m', dst[1]='i', dst[2]='s', dst[3] = 0; /* mis, missing genotype */ \
907 else if ( !has_ref ) dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, no ref, must have alt allele */ \
908 else if ( !is_het ) dst[0]='r', dst[1]='e', dst[2]='f', dst[3] = 0; /* ref, must be ref-only, no alt alelle */ \
909 else dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, is het, has alt allele */ \
910 } \
911 else if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
912 else if ( type==3 ) \
913 { \
914 if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \
915 else if ( !is_het ) dst[0]='h', dst[1]='o', dst[2]='m', dst[3] = 0; /* hom */ \
916 else dst[0]='h', dst[1]='e', dst[2]='t', dst[3] = 0; /* het */ \
917 } \
918 else \
919 { \
920 if ( j==1 ) \
921 { \
922 if ( has_ref ) dst[0]='r', dst[1]=0; /* r, haploid */ \
923 else dst[0]='a', dst[1]=0; /* a, haploid */ \
924 } \
925 else if ( !is_het ) \
926 { \
927 if ( has_ref ) dst[0]='r', dst[1]='r', dst[2] = 0; /* rr */ \
928 else dst[0]='a', dst[1]='a', dst[2] = 0; /* aa */ \
929 } \
930 else \
931 { \
932 if ( has_ref ) dst[0]='r', dst[1]='a', dst[2] = 0; /* ra */ \
933 else dst[0]='a', dst[1]='A', dst[2] = 0; /* aA */ \
934 } \
935 } \
936 } \
937 }
938 switch (fmt->type) {
939 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
940 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
941 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
942 default: error("The GT type is not lineognised: %d at %s:%"PRId64"\n",fmt->type, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); break;
943 }
944 #undef BRANCH_INT
945 assert( tok->nsamples == nsmpl );
946 tok->nvalues = tok->str_value.l = nvals1*nsmpl;
947 tok->str_value.s[tok->str_value.l] = 0;
948 tok->nval1 = nvals1;
949 }
filters_set_genotype2(filter_t * flt,bcf1_t * line,token_t * tok)950 static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
filters_set_genotype3(filter_t * flt,bcf1_t * line,token_t * tok)951 static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
filters_set_genotype4(filter_t * flt,bcf1_t * line,token_t * tok)952 static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); }
953
filters_set_genotype_string(filter_t * flt,bcf1_t * line,token_t * tok)954 static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
955 {
956 bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
957 if ( !fmt )
958 {
959 tok->nvalues = 0;
960 return;
961 }
962 int i, blen = 4, nsmpl = line->n_sample;
963
964 gt_length_too_big:
965 tok->str_value.l = 0;
966 for (i=0; i<nsmpl; i++)
967 {
968 size_t plen = tok->str_value.l;
969 bcf_format_gt(fmt, i, &tok->str_value);
970 kputc_(0, &tok->str_value);
971 if ( tok->str_value.l - plen > blen )
972 {
973 // too many alternate alleles or ploidy is too large, the genotype does not fit
974 // three characters ("0/0" vs "10/10").
975 blen *= 2;
976 goto gt_length_too_big;
977 }
978
979 plen = tok->str_value.l - plen;
980 while ( plen < blen )
981 {
982 kputc_(0, &tok->str_value);
983 plen++;
984 }
985 }
986 assert( tok->nsamples == nsmpl );
987 tok->nvalues = tok->str_value.l;
988 tok->nval1 = blen;
989 }
filters_set_ilen(filter_t * flt,bcf1_t * line,token_t * tok)990 static void filters_set_ilen(filter_t *flt, bcf1_t *line, token_t *tok)
991 {
992 tok->nvalues = line->n_allele - 1;
993 hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
994
995 int i, rlen = strlen(line->d.allele[0]);
996 for (i=1; i<line->n_allele; i++)
997 {
998 int alen = strlen(line->d.allele[i]);
999 if ( rlen==alen ) bcf_double_set_missing(tok->values[i-1]);
1000 else tok->values[i-1] = alen - rlen;
1001 }
1002 }
filters_set_ref_string(filter_t * flt,bcf1_t * line,token_t * tok)1003 static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
1004 {
1005 tok->str_value.l = 0;
1006 kputs(line->d.allele[0], &tok->str_value);
1007 tok->nvalues = tok->str_value.l;
1008 }
filters_set_alt_string(filter_t * flt,bcf1_t * line,token_t * tok)1009 static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
1010 {
1011 tok->str_value.l = 0;
1012 if ( tok->idx>=0 )
1013 {
1014 if ( line->n_allele > tok->idx + 1 )
1015 kputs(line->d.allele[tok->idx + 1], &tok->str_value);
1016 else
1017 kputc('.', &tok->str_value);
1018 }
1019 else if ( tok->idx==-2 )
1020 {
1021 int i, end = tok->nuidxs ? tok->nuidxs : line->n_allele - 1;
1022 if ( end >= line->n_allele - 1 ) end = line->n_allele - 2;
1023 for (i=0; i<=end; i++)
1024 if ( i>=tok->nidxs || tok->idxs[i] )
1025 {
1026 if ( tok->str_value.l ) kputc(',', &tok->str_value);
1027 kputs(line->d.allele[i+1], &tok->str_value);
1028 }
1029 }
1030 else if ( line->n_allele>1 )
1031 {
1032 kputs(line->d.allele[1], &tok->str_value);
1033 int i;
1034 for (i=2; i<line->n_allele; i++)
1035 {
1036 kputc(',', &tok->str_value);
1037 kputs(line->d.allele[i], &tok->str_value);
1038 }
1039 }
1040 else if ( line->n_allele==1 )
1041 kputc('.', &tok->str_value);
1042 tok->nvalues = tok->str_value.l;
1043 }
filters_set_nmissing(filter_t * flt,bcf1_t * line,token_t * tok)1044 static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
1045 {
1046 bcf_unpack(line, BCF_UN_FMT);
1047 if ( !line->n_sample )
1048 {
1049 tok->nvalues = 1;
1050 tok->values[0] = 0;
1051 return;
1052 }
1053
1054 int i,igt = bcf_hdr_id2int(flt->hdr, BCF_DT_ID, "GT");
1055 bcf_fmt_t *fmt = NULL;
1056 for (i=0; i<line->n_fmt; i++)
1057 if ( line->d.fmt[i].id==igt ) { fmt = &line->d.fmt[i]; break; }
1058 if ( !fmt )
1059 {
1060 tok->nvalues = 0;
1061 return;
1062 }
1063
1064 int j,nmissing = 0;
1065 #define BRANCH(type_t, is_vector_end) { \
1066 for (i=0; i<line->n_sample; i++) \
1067 { \
1068 type_t *ptr = (type_t *) (fmt->p + i*fmt->size); \
1069 for (j=0; j<fmt->n; j++) \
1070 { \
1071 if ( ptr[j]==is_vector_end ) break; \
1072 if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } \
1073 } \
1074 } \
1075 }
1076 switch (fmt->type) {
1077 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break;
1078 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
1079 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
1080 default: fprintf(stderr,"todo: type %d\n", fmt->type); exit(1); break;
1081 }
1082 #undef BRANCH
1083 tok->nvalues = 1;
1084 tok->values[0] = tok->tag[0]=='N' ? nmissing : (double)nmissing / line->n_sample;
1085 }
func_npass(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1086 static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1087 {
1088 if ( nstack==0 ) error("Error parsing the expression\n");
1089 token_t *tok = stack[nstack - 1];
1090 if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag);
1091 assert(tok->usmpl);
1092
1093 int i, npass = 0;
1094 for (i=0; i<tok->nsamples; i++)
1095 {
1096 if ( !tok->usmpl[i] ) continue;
1097 if ( tok->pass_samples[i] ) npass++;
1098 }
1099 hts_expand(double,1,rtok->mvalues,rtok->values);
1100 rtok->nsamples = 0;
1101 rtok->nvalues = 1;
1102 rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0);
1103
1104 return 1;
1105 }
filters_set_nalt(filter_t * flt,bcf1_t * line,token_t * tok)1106 static void filters_set_nalt(filter_t *flt, bcf1_t *line, token_t *tok)
1107 {
1108 tok->nvalues = 1;
1109 tok->values[0] = line->n_allele - 1;
1110 }
filters_set_ac(filter_t * flt,bcf1_t * line,token_t * tok)1111 static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok)
1112 {
1113 hts_expand(int32_t, line->n_allele, flt->mtmpi, flt->tmpi);
1114 if ( !bcf_calc_ac(flt->hdr, line, flt->tmpi, BCF_UN_INFO|BCF_UN_FMT) )
1115 {
1116 tok->nvalues = 0;
1117 return;
1118 }
1119 int i, an = flt->tmpi[0];
1120 for (i=1; i<line->n_allele; i++) an += flt->tmpi[i];
1121 if ( !an )
1122 {
1123 tok->nvalues = 0;
1124 return;
1125 }
1126 flt->tmpi[0] = an; // for filters_set_[mac|af|maf]
1127 if ( tok->idx>=0 )
1128 {
1129 tok->nvalues = 1;
1130 tok->values[0] = tok->idx+1<line->n_allele ? flt->tmpi[tok->idx+1] : 0;
1131 }
1132 else if ( line->n_allele==1 ) // no ALT
1133 {
1134 tok->nvalues = 1;
1135 tok->values[0] = 0;
1136 }
1137 else
1138 {
1139 hts_expand(double,line->n_allele,tok->mvalues,tok->values);
1140 for (i=1; i<line->n_allele; i++)
1141 tok->values[i-1] = flt->tmpi[i];
1142 tok->nvalues = line->n_allele - 1;
1143 }
1144 }
filters_set_an(filter_t * flt,bcf1_t * line,token_t * tok)1145 static void filters_set_an(filter_t *flt, bcf1_t *line, token_t *tok)
1146 {
1147 filters_set_ac(flt,line,tok);
1148 tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0;
1149 tok->nvalues = 1;
1150 }
filters_set_mac(filter_t * flt,bcf1_t * line,token_t * tok)1151 static void filters_set_mac(filter_t *flt, bcf1_t *line, token_t *tok)
1152 {
1153 filters_set_ac(flt,line,tok);
1154 if ( !tok->nvalues ) return;
1155 int i, an = flt->tmpi[0];
1156 for (i=0; i<tok->nvalues; i++)
1157 if ( tok->values[i] > an*0.5 ) tok->values[i] = an - tok->values[i];
1158 }
filters_set_af(filter_t * flt,bcf1_t * line,token_t * tok)1159 static void filters_set_af(filter_t *flt, bcf1_t *line, token_t *tok)
1160 {
1161 filters_set_ac(flt,line,tok);
1162 if ( !tok->nvalues ) return;
1163 int i, an = flt->tmpi[0];
1164 for (i=0; i<tok->nvalues; i++)
1165 tok->values[i] /= (double)an;
1166 }
filters_set_maf(filter_t * flt,bcf1_t * line,token_t * tok)1167 static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
1168 {
1169 filters_set_ac(flt,line,tok);
1170 if ( !tok->nvalues ) return;
1171 int i, an = flt->tmpi[0];
1172 for (i=0; i<tok->nvalues; i++)
1173 {
1174 tok->values[i] /= (double)an;
1175 if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i];
1176 }
1177 }
1178
func_max(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1179 static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1180 {
1181 token_t *tok = stack[nstack - 1];
1182 rtok->nvalues = 0;
1183 if ( !tok->nvalues ) return 1;
1184 double *ptr, val = -HUGE_VAL;
1185 int i,j, has_value = 0;
1186 if ( tok->nsamples )
1187 {
1188 for (i=0; i<tok->nsamples; i++)
1189 {
1190 if ( !tok->usmpl[i] ) continue;
1191 ptr = tok->values + i*tok->nval1;
1192 for (j=0; j<tok->nval1; j++)
1193 {
1194 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1195 has_value = 1;
1196 if ( val < ptr[j] ) val = ptr[j];
1197 }
1198 }
1199 }
1200 else
1201 {
1202 for (i=0; i<tok->nvalues; i++)
1203 {
1204 if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
1205 has_value = 1;
1206 if ( val < tok->values[i] ) val = tok->values[i];
1207 }
1208 }
1209 if ( has_value )
1210 {
1211 rtok->values[0] = val;
1212 rtok->nvalues = has_value;
1213 }
1214 return 1;
1215 }
func_smpl_max(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1216 static int func_smpl_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1217 {
1218 token_t *tok = stack[nstack - 1];
1219 if ( !tok->nsamples ) return func_max(flt,line,rtok,stack,nstack);
1220 rtok->nsamples = tok->nsamples;
1221 rtok->nvalues = tok->nsamples;
1222 rtok->nval1 = 1;
1223 hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1224 assert(tok->usmpl);
1225 if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1226 memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1227 int i, j, has_value;
1228 double val, *ptr;
1229 for (i=0; i<tok->nsamples; i++)
1230 {
1231 if ( !rtok->usmpl[i] ) continue;
1232 val = -HUGE_VAL;
1233 has_value = 0;
1234 ptr = tok->values + i*tok->nval1;
1235 for (j=0; j<tok->nval1; j++)
1236 {
1237 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1238 has_value = 1;
1239 if ( val < ptr[j] ) val = ptr[j];
1240 }
1241 if ( has_value ) rtok->values[i] = val;
1242 else bcf_double_set_missing(rtok->values[i]);
1243 }
1244 return 1;
1245 }
func_min(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1246 static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1247 {
1248 token_t *tok = stack[nstack - 1];
1249 rtok->nvalues = 0;
1250 if ( !tok->nvalues ) return 1;
1251 double *ptr, val = HUGE_VAL;
1252 int i,j, has_value = 0;
1253 if ( tok->nsamples )
1254 {
1255 for (i=0; i<tok->nsamples; i++)
1256 {
1257 if ( !tok->usmpl[i] ) continue;
1258 ptr = tok->values + i*tok->nval1;
1259 for (j=0; j<tok->nval1; j++)
1260 {
1261 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1262 has_value = 1;
1263 if ( val > ptr[j] ) val = ptr[j];
1264 }
1265 }
1266 }
1267 else
1268 {
1269 for (i=0; i<tok->nvalues; i++)
1270 {
1271 if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
1272 has_value = 1;
1273 if ( val > tok->values[i] ) val = tok->values[i];
1274 }
1275 }
1276 if ( has_value )
1277 {
1278 rtok->values[0] = val;
1279 rtok->nvalues = has_value;
1280 }
1281 return 1;
1282 }
func_smpl_min(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1283 static int func_smpl_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1284 {
1285 token_t *tok = stack[nstack - 1];
1286 if ( !tok->nsamples ) return func_min(flt,line,rtok,stack,nstack);
1287 rtok->nsamples = tok->nsamples;
1288 rtok->nvalues = tok->nsamples;
1289 rtok->nval1 = 1;
1290 hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1291 assert(tok->usmpl);
1292 if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1293 memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1294 int i, j, has_value;
1295 double val, *ptr;
1296 for (i=0; i<tok->nsamples; i++)
1297 {
1298 if ( !rtok->usmpl[i] ) continue;
1299 val = HUGE_VAL;
1300 has_value = 0;
1301 ptr = tok->values + i*tok->nval1;
1302 for (j=0; j<tok->nval1; j++)
1303 {
1304 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1305 has_value = 1;
1306 if ( val > ptr[j] ) val = ptr[j];
1307 }
1308 if ( has_value ) rtok->values[i] = val;
1309 else bcf_double_set_missing(rtok->values[i]);
1310 }
1311 return 1;
1312 }
func_avg(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1313 static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1314 {
1315 token_t *tok = stack[nstack - 1];
1316 rtok->nvalues = 0;
1317 if ( !tok->nvalues ) return 1;
1318 double *ptr, val = 0;
1319 int i,j, n = 0;
1320 if ( tok->nsamples )
1321 {
1322 for (i=0; i<tok->nsamples; i++)
1323 {
1324 if ( !tok->usmpl[i] ) continue;
1325 ptr = tok->values + i*tok->nval1;
1326 for (j=0; j<tok->nval1; j++)
1327 {
1328 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1329 val += ptr[j];
1330 n++;
1331 }
1332 }
1333 }
1334 else
1335 {
1336 for (i=0; i<tok->nvalues; i++)
1337 if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
1338 }
1339 if ( n )
1340 {
1341 rtok->values[0] = val / n;
1342 rtok->nvalues = 1;
1343 }
1344 return 1;
1345 }
func_smpl_avg(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1346 static int func_smpl_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1347 {
1348 token_t *tok = stack[nstack - 1];
1349 if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack);
1350 rtok->nsamples = tok->nsamples;
1351 rtok->nvalues = tok->nsamples;
1352 rtok->nval1 = 1;
1353 hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1354 assert(tok->usmpl);
1355 if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1356 memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1357 int i, j, n;
1358 double val, *ptr;
1359 for (i=0; i<tok->nsamples; i++)
1360 {
1361 if ( !rtok->usmpl[i] ) continue;
1362 val = 0;
1363 n = 0;
1364 ptr = tok->values + i*tok->nval1;
1365 for (j=0; j<tok->nval1; j++)
1366 {
1367 if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) { val += ptr[j]; n++; }
1368 }
1369 if ( n ) rtok->values[i] = val / n;
1370 else bcf_double_set_missing(rtok->values[i]);
1371 }
1372 return 1;
1373 }
compare_doubles(const void * lhs,const void * rhs)1374 static int compare_doubles(const void *lhs, const void *rhs)
1375 {
1376 double arg1 = *(const double*) lhs;
1377 double arg2 = *(const double*) rhs;
1378 if (arg1 < arg2) return -1;
1379 if (arg1 > arg2) return 1;
1380 return 0;
1381 }
func_median(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1382 static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1383 {
1384 token_t *tok = stack[nstack - 1];
1385 rtok->nvalues = 0;
1386 if ( !tok->nvalues ) return 1;
1387 // sweep through all tok->values and while excluding all missing values reuse the very same array
1388 int i,j,k = 0, n = 0;
1389 if ( tok->nsamples )
1390 {
1391 for (i=0; i<tok->nsamples; i++)
1392 {
1393 if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }
1394 for (j=0; j<tok->nval1; k++,j++)
1395 {
1396 if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
1397 if ( n < k ) tok->values[n] = tok->values[k];
1398 n++;
1399 }
1400 }
1401 }
1402 else
1403 {
1404 for (i=0; i<tok->nvalues; i++)
1405 {
1406 if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
1407 if ( n < i ) tok->values[n] = tok->values[i];
1408 n++;
1409 }
1410 }
1411 if ( !n ) return 1;
1412 if ( n==1 ) rtok->values[0] = tok->values[0];
1413 else
1414 {
1415 qsort(tok->values, n, sizeof(double), compare_doubles);
1416 rtok->values[0] = n % 2 ? tok->values[n/2] : (tok->values[n/2-1] + tok->values[n/2]) * 0.5;
1417 }
1418 rtok->nvalues = 1;
1419 return 1;
1420 }
func_smpl_median(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1421 static int func_smpl_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1422 {
1423 token_t *tok = stack[nstack - 1];
1424 if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack);
1425 rtok->nsamples = tok->nsamples;
1426 rtok->nvalues = tok->nsamples;
1427 rtok->nval1 = 1;
1428 hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1429 assert(tok->usmpl);
1430 if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1431 memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1432 int i, j, n;
1433 double *ptr;
1434 for (i=0; i<tok->nsamples; i++)
1435 {
1436 if ( !rtok->usmpl[i] ) continue;
1437 n = 0;
1438 ptr = tok->values + i*tok->nval1;
1439 for (j=0; j<tok->nval1; j++)
1440 {
1441 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1442 if ( n < j ) ptr[n] = ptr[j];
1443 n++;
1444 }
1445 if ( n==0 )
1446 bcf_double_set_missing(rtok->values[i]);
1447 else if ( n==1 )
1448 rtok->values[i] = ptr[0];
1449 else
1450 {
1451 qsort(ptr, n, sizeof(double), compare_doubles);
1452 rtok->values[i] = n % 2 ? ptr[n/2] : (ptr[n/2-1] + ptr[n/2]) * 0.5;
1453 }
1454 }
1455 return 1;
1456 }
func_stddev(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1457 static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1458 {
1459 token_t *tok = stack[nstack - 1];
1460 rtok->nvalues = 0;
1461 if ( !tok->nvalues ) return 1;
1462 // sweep through all tok->values and while excluding all missing values reuse the very same array
1463 int i,j,k = 0, n = 0;
1464 if ( tok->nsamples )
1465 {
1466 for (i=0; i<tok->nsamples; i++)
1467 {
1468 if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }
1469 for (j=0; j<tok->nval1; k++,j++)
1470 {
1471 if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
1472 if ( n < k ) tok->values[n] = tok->values[k];
1473 n++;
1474 }
1475 }
1476 }
1477 else
1478 {
1479 for (i=0; i<tok->nvalues; i++)
1480 {
1481 if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
1482 if ( n < i ) tok->values[n] = tok->values[i];
1483 n++;
1484 }
1485 }
1486 if ( !n ) return 1;
1487 if ( n==1 ) rtok->values[0] = 0;
1488 else
1489 {
1490 double sdev = 0, avg = 0;
1491 for (i=0; i<n; i++) avg += tok->values[i];
1492 avg /= n;
1493 for (i=0; i<n; i++) sdev += (tok->values[i] - avg) * (tok->values[i] - avg);
1494 rtok->values[0] = sqrt(sdev/n);
1495 }
1496 rtok->nvalues = 1;
1497 return 1;
1498 }
func_smpl_stddev(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1499 static int func_smpl_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1500 {
1501 token_t *tok = stack[nstack - 1];
1502 if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack);
1503 rtok->nsamples = tok->nsamples;
1504 rtok->nvalues = tok->nsamples;
1505 rtok->nval1 = 1;
1506 hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1507 assert(tok->usmpl);
1508 if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1509 memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1510 int i, j, n;
1511 double *ptr;
1512 for (i=0; i<tok->nsamples; i++)
1513 {
1514 if ( !rtok->usmpl[i] ) continue;
1515 n = 0;
1516 ptr = tok->values + i*tok->nval1;
1517 for (j=0; j<tok->nval1; j++)
1518 {
1519 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1520 if ( n < j ) ptr[n] = ptr[j];
1521 n++;
1522 }
1523 if ( n==0 )
1524 bcf_double_set_missing(rtok->values[i]);
1525 else if ( n==1 )
1526 rtok->values[i] = 0;
1527 else
1528 {
1529 double sdev = 0, avg = 0;
1530 for (j=0; j<n; j++) avg += ptr[j];
1531 avg /= n;
1532 for (j=0; j<n; j++) sdev += (ptr[j] - avg) * (ptr[j] - avg);
1533 rtok->values[i] = sqrt(sdev/n);
1534 }
1535 }
1536 return 1;
1537 }
func_sum(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1538 static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1539 {
1540 rtok->nvalues = 0;
1541 token_t *tok = stack[nstack - 1];
1542 if ( !tok->nvalues ) return 1;
1543 double *ptr, val = 0;
1544 int i,j, n = 0;
1545 if ( tok->nsamples )
1546 {
1547 for (i=0; i<tok->nsamples; i++)
1548 {
1549 if ( !tok->usmpl[i] ) continue;
1550 ptr = tok->values + i*tok->nval1;
1551 for (j=0; j<tok->nval1; j++)
1552 {
1553 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1554 val += ptr[j];
1555 n++;
1556 }
1557 }
1558 }
1559 else
1560 {
1561 for (i=0; i<tok->nvalues; i++)
1562 if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
1563 }
1564 if ( n )
1565 {
1566 rtok->values[0] = val;
1567 rtok->nvalues = 1;
1568 }
1569 return 1;
1570 }
func_smpl_sum(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1571 static int func_smpl_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1572 {
1573 token_t *tok = stack[nstack - 1];
1574 if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack);
1575 rtok->nsamples = tok->nsamples;
1576 rtok->nvalues = tok->nsamples;
1577 rtok->nval1 = 1;
1578 hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1579 assert(tok->usmpl);
1580 if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1581 memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1582 int i, j, has_value;
1583 double val, *ptr;
1584 for (i=0; i<tok->nsamples; i++)
1585 {
1586 if ( !rtok->usmpl[i] ) continue;
1587 val = 0;
1588 has_value = 0;
1589 ptr = tok->values + i*tok->nval1;
1590 for (j=0; j<tok->nval1; j++)
1591 {
1592 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1593 has_value = 1;
1594 val += ptr[j];
1595 }
1596 if ( has_value ) rtok->values[i] = val;
1597 else bcf_double_set_missing(rtok->values[i]);
1598 }
1599 return 1;
1600 }
func_abs(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1601 static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1602 {
1603 token_t *tok = stack[nstack - 1];
1604 if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
1605 rtok->nsamples = tok->nsamples;
1606 rtok->nvalues = tok->nvalues;
1607 rtok->nval1 = tok->nval1;
1608 hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1609 if ( tok->usmpl )
1610 {
1611 if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1612 memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1613 }
1614 if ( !tok->nvalues ) return 1;
1615 hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
1616 int i,j,k = 0;
1617 if ( tok->usmpl )
1618 {
1619 for (i=0; i<tok->nsamples; i++)
1620 {
1621 if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; }
1622 for (j=0; j<tok->nval1; k++,j++)
1623 {
1624 if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]);
1625 else rtok->values[k] = fabs(tok->values[k]);
1626 }
1627 }
1628 }
1629 else
1630 {
1631 for (i=0; i<tok->nvalues; i++)
1632 {
1633 if ( tok->usmpl && !tok->usmpl[i] ) continue;
1634 if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
1635 else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]);
1636 }
1637 }
1638 return 1;
1639 }
func_count(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1640 static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1641 {
1642 token_t *tok = stack[nstack - 1];
1643 int i,j, cnt = 0;
1644 if ( tok->tag && tok->nsamples )
1645 {
1646 // raw number of values in a FMT tag, e.g. COUNT(FMT/TAG)
1647 if ( tok->is_str ) error("todo: Type=String for COUNT on FORMAT fields?\n");
1648 for (i=0; i<tok->nsamples; i++)
1649 {
1650 if ( !tok->usmpl[i] ) continue;
1651 double *ptr = tok->values + i*tok->nval1;
1652 for (j=0; j<tok->nval1; j++)
1653 if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) cnt++;
1654 }
1655 }
1656 else if ( tok->nsamples )
1657 {
1658 // number of samples that pass a processed FMT tag
1659 for (i=0; i<tok->nsamples; i++)
1660 if ( tok->pass_samples[i] ) cnt++;
1661 }
1662 else if ( tok->is_str )
1663 {
1664 if ( tok->str_value.l ) cnt = 1;
1665 for (i=0; i<tok->str_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++;
1666 }
1667 else
1668 cnt = tok->nvalues;
1669
1670 rtok->nvalues = 1;
1671 rtok->values[0] = cnt;
1672 return 1;
1673 }
func_strlen(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1674 static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1675 {
1676 token_t *tok = stack[nstack - 1];
1677 rtok->nvalues = rtok->str_value.l = 0;
1678 if ( !tok->str_value.l ) return 1;
1679
1680 if ( tok->idx==-2 )
1681 {
1682 int i = 0;
1683 char *ss = tok->str_value.s;
1684 while ( *ss )
1685 {
1686 char *se = ss;
1687 while ( *se && *se!=',' ) se++;
1688 hts_expand(double, i+1, rtok->mvalues, rtok->values);
1689 if ( !*se ) rtok->values[i] = strlen(ss);
1690 else
1691 {
1692 *se = 0;
1693 rtok->values[i] = strlen(ss);
1694 *se = ',';
1695 }
1696 ss = *se ? se + 1 : se;
1697 i++;
1698 }
1699 rtok->nvalues = i;
1700 }
1701 else
1702 {
1703 if ( !tok->str_value.s[1] && tok->str_value.s[0]=='.' )
1704 rtok->values[0] = 0;
1705 else
1706 rtok->values[0] = strlen(tok->str_value.s);
1707 rtok->nvalues = 1;
1708 }
1709 return 1;
1710 }
calc_binom(int na,int nb)1711 static inline double calc_binom(int na, int nb)
1712 {
1713 if ( na==0 && nb==0 ) return -1;
1714 if ( na==nb ) return 1;
1715
1716 // kfunc.h implements kf_betai, which is the regularized beta function P(X<=k/N;p) = I_{1-p}(N-k,k+1)
1717
1718 double pval = na < nb ? kf_betai(nb, na + 1, 0.5) : kf_betai(na, nb + 1, 0.5);
1719 pval *= 2;
1720 assert( pval-1 < 1e-10 );
1721 if ( pval>1 ) pval = 1; // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
1722
1723 return pval;
1724 }
func_binom(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1725 static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1726 {
1727 int i, istack = nstack - rtok->nargs;
1728
1729 if ( rtok->nargs!=2 && rtok->nargs!=1 ) error("Error: binom() takes one or two arguments\n");
1730 assert( istack>=0 );
1731
1732 // The expected mean is 0.5. Should we support also prob!=0.5?
1733 //
1734 // double prob = 0.5;
1735 // if ( nstack==3 )
1736 // {
1737 // // three parameters, the first one must be a scalar: binom(0.25,AD[0],AD[1])
1738 // if ( !stack[istack]->is_constant )
1739 // error("The first argument of binom() must be a numeric constant if three parameters are given\n");
1740 // prob = stack[istack]->threshold;
1741 // istack++;
1742 // }
1743 // else if ( nstack==2 && stack[istack]->is_constant )
1744 // {
1745 // // two parameters, the first can be a scalar: binom(0.25,AD) or binom(AD[0],AD[1])
1746 // prob = stack[istack]->threshold;
1747 // istack++;
1748 // }
1749
1750 token_t *tok = stack[istack];
1751 if ( tok->nsamples )
1752 {
1753 // working with a FORMAT tag
1754 rtok->nval1 = 1;
1755 rtok->nvalues = tok->nsamples;
1756 rtok->nsamples = tok->nsamples;
1757 hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
1758 assert(tok->usmpl);
1759 if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1760 memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1761
1762 if ( istack+1==nstack )
1763 {
1764 // determine the index from the GT field: binom(AD)
1765 int ngt = bcf_get_genotypes(flt->hdr, line, &flt->tmpi, &flt->mtmpi);
1766 int max_ploidy = ngt/line->n_sample;
1767 if ( ngt <= 0 || max_ploidy < 2 ) // GT not present or not diploid, cannot set
1768 {
1769 for (i=0; i<rtok->nsamples; i++)
1770 if ( rtok->usmpl[i] ) bcf_double_set_missing(rtok->values[i]);
1771 return rtok->nargs;
1772 }
1773 for (i=0; i<rtok->nsamples; i++)
1774 {
1775 if ( !rtok->usmpl[i] ) continue;
1776 int32_t *ptr = flt->tmpi + i*max_ploidy;
1777 if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end )
1778 {
1779 bcf_double_set_missing(rtok->values[i]);
1780 continue;
1781 }
1782 int idx1 = bcf_gt_allele(ptr[0]);
1783 int idx2 = bcf_gt_allele(ptr[1]);
1784 if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]);
1785 if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]);
1786 double *vals = tok->values + tok->nval1*i;
1787 if ( bcf_double_is_missing_or_vector_end(vals[idx1]) || bcf_double_is_missing_or_vector_end(vals[idx2]) )
1788 {
1789 bcf_double_set_missing(rtok->values[i]);
1790 continue;
1791 }
1792 rtok->values[i] = calc_binom(vals[idx1],vals[idx2]);
1793 if ( rtok->values[i] < 0 )
1794 {
1795 bcf_double_set_missing(rtok->values[i]);
1796 continue;
1797 }
1798 }
1799 }
1800 else
1801 {
1802 // the fields given explicitly: binom(AD[:0],AD[:1])
1803 token_t *tok2 = stack[istack+1];
1804 if ( tok->nval1!=1 || tok2->nval1!=1 )
1805 error("Expected one value per binom() argument, found %d and %d at %s:%"PRId64"\n",tok->nval1,tok2->nval1, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1);
1806 for (i=0; i<rtok->nsamples; i++)
1807 {
1808 if ( !rtok->usmpl[i] ) continue;
1809 double *ptr1 = tok->values + tok->nval1*i;
1810 double *ptr2 = tok2->values + tok2->nval1*i;
1811 if ( bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) )
1812 {
1813 bcf_double_set_missing(rtok->values[i]);
1814 continue;
1815 }
1816 rtok->values[i] = calc_binom(ptr1[0],ptr2[0]);
1817 if ( rtok->values[i] < 0 )
1818 {
1819 bcf_double_set_missing(rtok->values[i]);
1820 continue;
1821 }
1822 }
1823 }
1824 }
1825 else
1826 {
1827 // working with an INFO tag
1828 rtok->nvalues = 1;
1829 hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
1830
1831 double *ptr1 = NULL, *ptr2 = NULL;
1832 if ( istack+1==nstack )
1833 {
1834 // only one tag, expecting two values: binom(INFO/AD)
1835 if ( tok->nvalues==2 )
1836 {
1837 ptr1 = &tok->values[0];
1838 ptr2 = &tok->values[1];
1839 }
1840 }
1841 else
1842 {
1843 // two tags, expecting one value in each: binom(INFO/AD[0],INFO/AD[1])
1844 token_t *tok2 = stack[istack+1];
1845 if ( tok->nvalues==1 && tok2->nvalues==1 )
1846 {
1847 ptr1 = &tok->values[0];
1848 ptr2 = &tok2->values[0];
1849 }
1850 }
1851 if ( !ptr1 || !ptr2 || bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) )
1852 bcf_double_set_missing(rtok->values[0]);
1853 else
1854 {
1855 rtok->values[0] = calc_binom(ptr1[0],ptr2[0]);
1856 if ( rtok->values[0] < 0 )
1857 bcf_double_set_missing(rtok->values[0]);
1858 }
1859 }
1860 return rtok->nargs;
1861 }
func_phred(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1862 static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1863 {
1864 token_t *tok = stack[nstack - 1];
1865 if ( tok->is_str ) error("PHRED() can be applied only on numeric values\n");
1866
1867 rtok->nsamples = tok->nsamples;
1868 rtok->nval1 = tok->nval1;
1869 memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples));
1870 assert(tok->usmpl);
1871 if ( !rtok->usmpl )
1872 {
1873 rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl));
1874 memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl));
1875 }
1876 rtok->nvalues = tok->nvalues;
1877 if ( !tok->nvalues ) return 1;
1878
1879 hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
1880 int i,j,k = 0;
1881 if ( tok->usmpl )
1882 {
1883 for (i=0; i<tok->nsamples; i++)
1884 {
1885 if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; }
1886 for (j=0; j<tok->nval1; k++,j++)
1887 {
1888 if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]);
1889 else rtok->values[k] = -4.34294481903*log(tok->values[k]);
1890 }
1891 }
1892 }
1893 else
1894 {
1895 for (i=0; i<tok->nvalues; i++)
1896 {
1897 if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
1898 else rtok->values[i] = -4.34294481903*log(tok->values[i]);
1899 }
1900 }
1901 return 1;
1902 }
tok_init_values(token_t * atok,token_t * btok,token_t * rtok)1903 inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok)
1904 {
1905 token_t *tok = atok->nvalues > btok->nvalues ? atok : btok;
1906 rtok->nvalues = tok->nvalues;
1907 rtok->nval1 = tok->nval1;
1908 hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
1909 }
tok_init_samples(token_t * atok,token_t * btok,token_t * rtok)1910 inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok)
1911 {
1912 if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples )
1913 {
1914 rtok->nsamples = atok->nsamples ? atok->nsamples : btok->nsamples;
1915 rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1);
1916 int i;
1917 for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
1918 for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
1919 }
1920 if (rtok->nsamples)
1921 memset(rtok->pass_samples, 0, rtok->nsamples);
1922 }
1923
1924 #define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
1925 { \
1926 token_t *rtok = _rtok; \
1927 int i, has_values = 0; \
1928 if ( atok->nvalues && btok->nvalues ) \
1929 { \
1930 tok_init_values(atok, btok, rtok); \
1931 tok_init_samples(atok, btok, rtok); \
1932 if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \
1933 { \
1934 assert( atok->nsamples==btok->nsamples ); \
1935 for (i=0; i<atok->nvalues; i++) \
1936 { \
1937 if ( bcf_double_is_missing_or_vector_end(atok->values[i]) || bcf_double_is_missing_or_vector_end(btok->values[i]) ) \
1938 { \
1939 bcf_double_set_missing(rtok->values[i]); \
1940 continue; \
1941 } \
1942 has_values = 1; \
1943 rtok->values[i] = atok->values[i] AOP btok->values[i]; \
1944 } \
1945 } \
1946 else if ( atok->nsamples ) \
1947 { \
1948 assert( btok->nvalues==1 ); \
1949 if ( !bcf_double_is_missing_or_vector_end(btok->values[0]) ) \
1950 { \
1951 for (i=0; i<atok->nvalues; i++) \
1952 { \
1953 if ( bcf_double_is_missing_or_vector_end(atok->values[i]) ) \
1954 { \
1955 bcf_double_set_missing(rtok->values[i]); \
1956 continue; \
1957 } \
1958 has_values = 1; \
1959 rtok->values[i] = atok->values[i] AOP btok->values[0]; \
1960 } \
1961 } \
1962 } \
1963 else \
1964 { \
1965 assert( atok->nvalues==1 ); \
1966 if ( !bcf_double_is_missing_or_vector_end(atok->values[0]) ) \
1967 { \
1968 for (i=0; i<btok->nvalues; i++) \
1969 { \
1970 if ( bcf_double_is_missing_or_vector_end(btok->values[i]) ) \
1971 { \
1972 bcf_double_set_missing(rtok->values[i]); \
1973 continue; \
1974 } \
1975 has_values = 1; \
1976 rtok->values[i] = atok->values[0] AOP btok->values[i]; \
1977 } \
1978 } \
1979 } \
1980 } \
1981 if ( !has_values ) rtok->nvalues = 0; \
1982 }
1983
vector_logic_or(filter_t * filter,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1984 static int vector_logic_or(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1985 {
1986 if ( nstack < 2 ) error("Error occurred while processing the filter \"%s\"\n", filter->str);
1987
1988 token_t *atok = stack[nstack-2];
1989 token_t *btok = stack[nstack-1];
1990 tok_init_samples(atok, btok, rtok);
1991
1992 if ( !atok->pass_site && !btok->pass_site ) return 2;
1993
1994 rtok->pass_site = 1;
1995 if ( !atok->nsamples && !btok->nsamples ) return 2;
1996
1997 int i;
1998 if ( rtok->tok_type==TOK_OR_VEC ) // ||, select all samples if one is true
1999 {
2000 if ( (!atok->nsamples && !atok->pass_site) || (!btok->nsamples && !btok->pass_site) )
2001 {
2002 // These two conditions are to ensure the following does not set all samples
2003 // at sites with QUAL<=30:
2004 // QUAL>30 || FMT/GQ>30
2005
2006 token_t *tok = atok->nsamples ? atok : btok;
2007 for (i=0; i<rtok->nsamples; i++)
2008 {
2009 if ( !rtok->usmpl[i] ) continue;
2010 rtok->pass_samples[i] = tok->pass_samples[i];
2011 }
2012 }
2013 else
2014 {
2015 for (i=0; i<rtok->nsamples; i++)
2016 {
2017 if ( !rtok->usmpl[i] ) continue;
2018 rtok->pass_samples[i] = 1;
2019 }
2020 }
2021 return 2;
2022 }
2023
2024 // |, only select samples which are actually true
2025
2026 if ( !atok->nsamples || !btok->nsamples )
2027 {
2028 token_t *tok = atok->nsamples ? atok : btok;
2029 for (i=0; i<rtok->nsamples; i++)
2030 {
2031 if ( !rtok->usmpl[i] ) continue;
2032 rtok->pass_samples[i] = tok->pass_samples[i];
2033 }
2034 return 2;
2035 }
2036
2037 assert( atok->nsamples==btok->nsamples );
2038
2039 for (i=0; i<rtok->nsamples; i++)
2040 {
2041 if ( !rtok->usmpl[i] ) continue;
2042 rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
2043 }
2044 return 2;
2045 }
vector_logic_and(filter_t * filter,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)2046 static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
2047 {
2048 if ( nstack < 2 ) error("Error occurred while processing the filter \"%s\". (nstack=%d)\n", filter->str,nstack);
2049
2050 token_t *atok = stack[nstack-2];
2051 token_t *btok = stack[nstack-1];
2052 tok_init_samples(atok, btok, rtok);
2053
2054 if ( !atok->pass_site || !btok->pass_site ) return 2;
2055 if ( !atok->nsamples && !btok->nsamples ) { rtok->pass_site = 1; return 2; }
2056
2057 int i;
2058 if ( !atok->nsamples || !btok->nsamples )
2059 {
2060 token_t *tok = atok->nsamples ? atok : btok;
2061 for (i=0; i<rtok->nsamples; i++)
2062 {
2063 if ( !rtok->usmpl[i] ) continue;
2064 rtok->pass_samples[i] = tok->pass_samples[i];
2065 }
2066 rtok->pass_site = 1;
2067 return 2;
2068 }
2069
2070 assert( atok->nsamples==btok->nsamples );
2071 if ( rtok->tok_type==TOK_AND_VEC ) // &&, can be true in different samples
2072 {
2073 for (i=0; i<rtok->nsamples; i++)
2074 {
2075 if ( !rtok->usmpl[i] ) continue;
2076 rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
2077 }
2078 rtok->pass_site = 1;
2079 }
2080 else // &, must be true within one sample
2081 {
2082 for (i=0; i<rtok->nsamples; i++)
2083 {
2084 if ( !rtok->usmpl[i] ) continue;
2085 rtok->pass_samples[i] = atok->pass_samples[i] & btok->pass_samples[i];
2086 if ( rtok->pass_samples[i] ) rtok->pass_site = 1;
2087 }
2088 }
2089 return 2;
2090 }
2091
2092 #define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
2093 { \
2094 token_t *rtok = _rtok; \
2095 int i, j, k; \
2096 tok_init_samples(atok, btok, rtok); \
2097 if ( !atok->nsamples && !btok->nsamples ) \
2098 { \
2099 if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; } \
2100 else if ( !atok->nvalues || !btok->nvalues ) \
2101 { \
2102 token_t *tok = atok->nvalues ? atok : btok; \
2103 for (j=0; j<tok->nvalues; j++) \
2104 { \
2105 if ( bcf_double_is_missing_or_vector_end(tok->values[j]) ) \
2106 { \
2107 if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \
2108 } \
2109 else if ( missing_logic[1] ) { rtok->pass_site = 1; break; } \
2110 } \
2111 } \
2112 else \
2113 { \
2114 for (i=0; i<atok->nvalues; i++) \
2115 { \
2116 int amiss = bcf_double_is_missing_or_vector_end(atok->values[i]) ? 1 : 0; \
2117 for (j=0; j<btok->nvalues; j++) \
2118 { \
2119 int nmiss = amiss + (bcf_double_is_missing_or_vector_end(btok->values[j]) ? 1 : 0); \
2120 if ( nmiss ) \
2121 { \
2122 if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
2123 } \
2124 else if ( atok->values[i] > 16777216 || btok->values[j] > 16777216 ) /* Ugly, see #871 */ \
2125 { \
2126 if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
2127 } \
2128 else if ( (float)atok->values[i] CMP_OP (float)btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
2129 } \
2130 } \
2131 } \
2132 } \
2133 else \
2134 { \
2135 if ( !atok->nvalues && !btok->nvalues ) \
2136 { \
2137 if ( missing_logic[2] ) \
2138 { \
2139 for (i=0; i<rtok->nsamples; i++) \
2140 if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } \
2141 } \
2142 } \
2143 else if ( !atok->nvalues || !btok->nvalues ) \
2144 { \
2145 token_t *tok = atok->nvalues ? atok : btok; \
2146 if ( !tok->nsamples ) \
2147 { \
2148 int miss = 0; \
2149 for (j=0; j<tok->nvalues; j++) \
2150 miss |= bcf_double_is_missing_or_vector_end(tok->values[j]) ? 1 : 0; \
2151 if ( missing_logic[++miss] ) \
2152 { \
2153 for (i=0; i<rtok->nsamples; i++) \
2154 if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
2155 } \
2156 } \
2157 else \
2158 for (i=0; i<tok->nsamples; i++) \
2159 { \
2160 if ( !rtok->usmpl[i] ) continue; \
2161 double *ptr = tok->values + i*tok->nval1; \
2162 int miss = 0; \
2163 for (j=0; j<tok->nval1; j++) \
2164 miss |= bcf_double_is_missing_or_vector_end(ptr[j]) ? 1 : 0; \
2165 if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
2166 } \
2167 } \
2168 else if ( atok->nsamples && btok->nsamples ) \
2169 { \
2170 if ( atok->nval1!=btok->nval1 ) error("Incompatible number of per-sample values in comparison: %d vs %d\n",atok->nval1,btok->nval1); \
2171 if ( atok->nsamples!=btok->nsamples ) error("Incompatible number samples in comparison: %d vs %d\n",atok->nsamples,btok->nsamples); \
2172 for (i=0; i<atok->nsamples; i++) \
2173 { \
2174 if ( !atok->usmpl[i] || !btok->usmpl[i] ) { rtok->usmpl[i] = 0; continue; } \
2175 double *aptr = atok->values + i*atok->nval1; \
2176 double *bptr = btok->values + i*btok->nval1; \
2177 for (j=0; j<atok->nval1; j++) \
2178 { \
2179 int nmiss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
2180 if ( nmiss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
2181 nmiss += (bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0); \
2182 if ( nmiss ) \
2183 { \
2184 if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \
2185 } \
2186 else if ( aptr[j] > 16777216 || bptr[j] > 16777216 ) /* Ugly, see #871 */ \
2187 { \
2188 if ( aptr[j] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \
2189 } \
2190 else if ( (float)aptr[j] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \
2191 } \
2192 } \
2193 } \
2194 else if ( atok->nsamples )\
2195 { \
2196 for (i=0; i<atok->nsamples; i++) \
2197 { \
2198 if ( !rtok->usmpl[i] ) continue; \
2199 double *aptr = atok->values + i*atok->nval1; \
2200 double *bptr = btok->values + i*btok->nval1; \
2201 for (j=0; j<atok->nval1; j++) \
2202 { \
2203 int miss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
2204 if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
2205 for (k=0; k<btok->nvalues; k++) \
2206 { \
2207 int nmiss = miss + (bcf_double_is_missing_or_vector_end(bptr[k]) ? 1 : 0); \
2208 if ( nmiss ) \
2209 { \
2210 if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
2211 } \
2212 else if ( aptr[j] > 16777216 || bptr[k] > 16777216 ) /* Ugly, see #871 */ \
2213 { \
2214 if ( aptr[j] CMP_OP bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
2215 } \
2216 else if ( (float)aptr[j] CMP_OP (float)bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
2217 } \
2218 } \
2219 } \
2220 } \
2221 else /* btok->nsamples */ \
2222 { \
2223 for (i=0; i<btok->nsamples; i++) \
2224 { \
2225 if ( !rtok->usmpl[i] ) continue; \
2226 double *aptr = atok->values + i*atok->nval1; \
2227 double *bptr = btok->values + i*btok->nval1; \
2228 for (j=0; j<btok->nval1; j++) \
2229 { \
2230 int miss = bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0; \
2231 if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
2232 for (k=0; k<atok->nvalues; k++) \
2233 { \
2234 int nmiss = miss + (bcf_double_is_missing_or_vector_end(aptr[k]) ? 1 : 0); \
2235 if ( nmiss ) \
2236 { \
2237 if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
2238 } \
2239 else if ( bptr[j] > 16777216 || aptr[k] > 16777216 ) /* Ugly, see #871 */ \
2240 { \
2241 if ( aptr[k] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
2242 } \
2243 else if ( (float)aptr[k] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
2244 } \
2245 } \
2246 } \
2247 } \
2248 } \
2249 }
_regex_vector_strings(regex_t * regex,char * str,size_t len,int logic,int * missing_logic)2250 static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logic, int *missing_logic)
2251 {
2252 char *end = str + len;
2253 while ( str < end && *str )
2254 {
2255 char *mid = str;
2256 while ( mid < end && *mid && *mid!=',' ) mid++;
2257 int miss = mid - str == 1 && str[0]=='.' ? 1 : 0;
2258 if ( miss && missing_logic[miss] ) return 1;
2259 char tmp = *mid; *mid = 0;
2260 int match = regexec(regex, str, 0,NULL,0) ? 0 : 1;
2261 *mid = tmp;
2262 if ( logic==TOK_NLIKE ) match = match ? 0 : 1;
2263 if ( match ) return 1;
2264 if ( !*mid ) break;
2265 str = mid + 1;
2266 }
2267 return 0;
2268 }
_has_missing_string(char * beg)2269 static inline int _has_missing_string(char *beg)
2270 {
2271 while ( *beg )
2272 {
2273 char *end = beg;
2274 while ( *end && *end!=',' ) end++;
2275 if ( end-beg==1 && beg[0]=='.' ) return 1;
2276 if ( !*end ) break;
2277 beg = end + 1;
2278 }
2279 return 0;
2280 }
2281
2282 // Compare two strings with multiple fields, for example "A,B,.,C"=="X,Y,A".
2283 // Returns 1 if any field matches, otherwise returns 0
_match_vector_strings(char * abeg,size_t alen,char * bstr,size_t blen,int logic,int * missing_logic)2284 static inline int _match_vector_strings(char *abeg, size_t alen, char *bstr, size_t blen, int logic, int *missing_logic)
2285 {
2286 char *aend = abeg + alen;
2287 char *bend = bstr + blen;
2288 while ( abeg < aend && *abeg )
2289 {
2290 char *amid = abeg;
2291 while ( amid < aend && *amid && *amid!=',' ) amid++;
2292 int miss = amid - abeg == 1 && abeg[0]=='.' ? 1 : 0;
2293 char *bbeg = bstr;
2294 while ( bbeg < bend && *bbeg )
2295 {
2296 char *bmid = bbeg;
2297 while ( bmid < bend && *bmid && *bmid!=',' ) bmid++;
2298 int nmiss = miss + (bmid - bbeg == 1 && bbeg[0]=='.' ? 1 : 0);
2299 if ( nmiss )
2300 {
2301 if ( missing_logic[nmiss] ) return 1;
2302 }
2303 else
2304 {
2305 int match = amid-abeg==bmid-bbeg && !strncmp(abeg,bbeg,amid-abeg) ? 1 : 0;
2306 if ( logic==TOK_NE ) match = match==1 ? 0 : 1;
2307 if ( match ) return 1;
2308 }
2309 if ( !*bmid ) break;
2310 bbeg = bmid + 1;
2311 }
2312 if ( !*amid ) break;
2313 abeg = amid + 1;
2314 }
2315 return 0;
2316 }
cmp_vector_strings(token_t * atok,token_t * btok,token_t * rtok)2317 static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok)
2318 {
2319 tok_init_samples(atok, btok, rtok);
2320
2321 int i, logic = rtok->tok_type; // TOK_EQ, TOK_NE, TOK_LIKE, TOK_NLIKE
2322 regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL);
2323
2324 assert( atok->nvalues==atok->str_value.l && btok->nvalues==btok->str_value.l );
2325 assert( !atok->nsamples || !btok->nsamples );
2326 assert( (!regex && (logic==TOK_EQ || logic==TOK_NE)) || (regex && (logic==TOK_LIKE || logic==TOK_NLIKE)) );
2327
2328 int missing_logic[] = {0,0,0};
2329 if ( logic==TOK_EQ || logic==TOK_LIKE ) missing_logic[0] = missing_logic[2] = 1;
2330 else if ( logic==TOK_NE || logic==TOK_NLIKE ) missing_logic[0] = missing_logic[1] = 1;
2331
2332 if ( !atok->nsamples && !btok->nsamples )
2333 {
2334 if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; return; }
2335 if ( !atok->nvalues || !btok->nvalues )
2336 {
2337 int miss = _has_missing_string(atok->nvalues ? atok->str_value.s : btok->str_value.s);
2338 if ( missing_logic[miss+1] ) rtok->pass_site = 1;
2339 return;
2340 }
2341 if ( !regex )
2342 rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic);
2343 else
2344 {
2345 token_t *tok = atok->regex ? btok : atok;
2346 rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic);
2347 }
2348 return;
2349 }
2350
2351 // The case of (!atok->nsamples || !btok->nsamples)
2352
2353 if ( !atok->nvalues && !btok->nvalues )
2354 {
2355 if ( missing_logic[2] )
2356 {
2357 for (i=0; i<rtok->nsamples; i++)
2358 if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; }
2359 }
2360 return;
2361 }
2362 if ( !atok->nvalues || !btok->nvalues )
2363 {
2364 token_t *tok = atok->nvalues ? atok : btok;
2365 if ( !tok->nsamples )
2366 {
2367 int miss = _has_missing_string(tok->str_value.s);
2368 if ( !missing_logic[miss+1] ) return;
2369 for (i=0; i<rtok->nsamples; i++)
2370 if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
2371 }
2372 else
2373 for (i=0; i<tok->nsamples; i++)
2374 {
2375 if ( !rtok->usmpl[i] ) continue;
2376 int miss = _has_missing_string(tok->str_value.s + i*tok->nval1);
2377 if ( missing_logic[miss+1] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
2378 }
2379 return;
2380 }
2381
2382 // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues)
2383 token_t *xtok = atok->nsamples ? atok : btok;
2384 token_t *ytok = atok->nsamples ? btok : atok;
2385 assert( regex==ytok->regex );
2386 for (i=0; i<xtok->nsamples; i++)
2387 {
2388 if ( !rtok->usmpl[i] ) continue;
2389 int match;
2390 if ( regex )
2391 match = _regex_vector_strings(regex, xtok->str_value.s + i*xtok->nval1, xtok->nval1, logic, missing_logic);
2392 else
2393 match = _match_vector_strings(xtok->str_value.s + i*xtok->nval1, xtok->nval1, ytok->str_value.s, ytok->str_value.l, logic, missing_logic);
2394 if ( match ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
2395 }
2396 }
2397
parse_idxs(char * tag_idx,int ** idxs,int * nidxs,int * idx)2398 static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx)
2399 {
2400 // TAG[], TAG[*] .. any field; sets idx=-2, idxs[0]=-1
2401 if ( *tag_idx==0 || !strcmp("*", tag_idx) )
2402 {
2403 *idxs = (int*) malloc(sizeof(int));
2404 (*idxs)[0] = -1;
2405 *nidxs = 1;
2406 *idx = -2;
2407 return 0;
2408 }
2409
2410 // TAG[integer] .. one field; idx positive
2411 char *end, *beg = tag_idx;
2412 *idx = strtol(tag_idx, &end, 10);
2413 if ( *idx >= 0 && *end==0 ) return 0;
2414
2415 // TAG[0,1] or TAG[0-2] or [1-] etc; idx=-2, idxs[...]=0,0,1,1,..
2416 int i, ibeg = -1;
2417 while ( *beg )
2418 {
2419 int num = strtol(beg, &end, 10);
2420 if ( end[0]==',' ) beg = end + 1;
2421 else if ( end[0]==0 ) beg = end;
2422 else if ( end[0]=='-' ) { beg = end + 1; ibeg = num; continue; }
2423 else return -1;
2424 if ( num >= *nidxs )
2425 {
2426 *idxs = (int*) realloc(*idxs, sizeof(int)*(num+1));
2427 memset(*idxs + *nidxs, 0, sizeof(int)*(num - *nidxs + 1));
2428 *nidxs = num + 1;
2429 }
2430 if ( ibeg>=0 )
2431 {
2432 for (i=ibeg; i<=num; i++) (*idxs)[i] = 1;
2433 ibeg = -1;
2434 }
2435 (*idxs)[num] = 1;
2436 }
2437 if ( ibeg >=0 )
2438 {
2439 if ( ibeg >= *nidxs )
2440 {
2441 *idxs = (int*) realloc(*idxs, sizeof(int)*(ibeg+1));
2442 memset(*idxs + *nidxs, 0, sizeof(int)*(ibeg - *nidxs + 1));
2443 *nidxs = ibeg + 1;
2444 }
2445 (*idxs)[ibeg] = -1;
2446 }
2447 *idx = -2;
2448 return 0;
2449 }
2450
parse_tag_idx(bcf_hdr_t * hdr,int is_fmt,char * tag,char * tag_idx,token_t * tok)2451 static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG["
2452 {
2453 int i, len = strlen(tag_idx);
2454 if ( tag_idx[len-1] == ']' ) tag_idx[len-1] = 0;
2455 char *ori = strdup(tag_idx);
2456
2457 assert( !tok->idxs && !tok->usmpl );
2458 int *idxs1 = NULL, nidxs1 = 0, idx1 = 0;
2459 int *idxs2 = NULL, nidxs2 = 0, idx2 = 0;
2460
2461 int set_samples = 0;
2462 char *colon = strrchr(tag_idx, ':');
2463 if ( tag_idx[0]=='@' ) // file list with sample names
2464 {
2465 if ( !is_fmt ) error("Could not parse \"%s\". (Not a FORMAT tag yet a sample list provided.)\n", ori);
2466 char *fname = expand_path(tag_idx+1);
2467 #ifdef _WIN32
2468 if (fname && strlen(fname) > 2 && fname[1] == ':') // Deal with Windows paths, such as 'C:\..'
2469 colon = strrchr(fname+2, ':');
2470 #endif
2471 int nsmpl;
2472 char **list = hts_readlist(fname, 1, &nsmpl);
2473 if ( !list && colon )
2474 {
2475 if ( parse_idxs(colon+1, &idxs2, &nidxs2, &idx2) != 0 ) error("Could not parse the index: %s\n", ori);
2476 tok->idxs = idxs2;
2477 tok->nidxs = nidxs2;
2478 tok->idx = idx2;
2479 colon = strrchr(fname, ':');
2480 *colon = 0;
2481 list = hts_readlist(fname, 1, &nsmpl);
2482 }
2483 if ( !list ) error("Could not read: %s\n", fname);
2484 free(fname);
2485 tok->nsamples = bcf_hdr_nsamples(hdr);
2486 tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
2487 for (i=0; i<nsmpl; i++)
2488 {
2489 int ismpl = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,list[i]);
2490 if ( ismpl<0 ) error("No such sample in the VCF: \"%s\"\n", list[i]);
2491 free(list[i]);
2492 tok->usmpl[ismpl] = 1;
2493 }
2494 free(list);
2495 if ( !colon )
2496 {
2497 tok->idxs = (int*) malloc(sizeof(int));
2498 tok->idxs[0] = -1;
2499 tok->nidxs = 1;
2500 tok->idx = -2;
2501 }
2502 }
2503 else if ( colon )
2504 {
2505 if ( !is_fmt ) error("Could not parse the index \"%s\". (Not a FORMAT tag yet sample index implied.)\n", ori);
2506 *colon = 0;
2507 if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
2508 if ( parse_idxs(colon+1, &idxs2, &nidxs2, &idx2) != 0 ) error("Could not parse the index: %s\n", ori);
2509 tok->idxs = idxs2;
2510 tok->nidxs = nidxs2;
2511 tok->idx = idx2;
2512 set_samples = 1;
2513 }
2514 else
2515 {
2516 if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
2517 if ( is_fmt )
2518 {
2519 if ( nidxs1==1 && idxs1[0]==-1 )
2520 {
2521 tok->idxs = (int*) malloc(sizeof(int));
2522 tok->idxs[0] = -1;
2523 tok->nidxs = 1;
2524 tok->idx = -2;
2525 }
2526 else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 )
2527 error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag);
2528 else
2529 tok->idx = 0;
2530 set_samples = 1;
2531 }
2532 else
2533 {
2534 tok->idxs = idxs1;
2535 tok->nidxs = nidxs1;
2536 tok->idx = idx1;
2537 }
2538 }
2539
2540 if ( set_samples )
2541 {
2542 tok->nsamples = bcf_hdr_nsamples(hdr);
2543 tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
2544 if ( idx1>=0 )
2545 {
2546 if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
2547 tok->usmpl[idx1] = 1;
2548 }
2549 else if ( idx1==-2 )
2550 {
2551 for (i=0; i<nidxs1; i++)
2552 {
2553 if ( idxs1[i]==0 ) continue;
2554 if ( idxs1[i]==-1 ) break;
2555 if ( i >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
2556 tok->usmpl[i] = 1;
2557 }
2558 if ( nidxs1 && idxs1[nidxs1-1]==-1 ) // open range, such as "7-"
2559 {
2560 for (; i<tok->nsamples; i++) tok->usmpl[i] = 1;
2561 }
2562 }
2563 else error("todo: %s:%d .. %d\n", __FILE__,__LINE__, idx2);
2564 free(idxs1);
2565 }
2566 free(ori);
2567
2568 if ( tok->nidxs && tok->idxs[tok->nidxs-1]!=-1 )
2569 {
2570 for (i=0; i<tok->nidxs; i++) if ( tok->idxs[i] ) tok->nuidxs++;
2571 }
2572 }
max_ac_an_unpack(bcf_hdr_t * hdr)2573 static int max_ac_an_unpack(bcf_hdr_t *hdr)
2574 {
2575 int hdr_id = bcf_hdr_id2int(hdr,BCF_DT_ID,"AC");
2576 if ( hdr_id<0 ) return BCF_UN_FMT;
2577 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,hdr_id) ) return BCF_UN_FMT;
2578
2579 hdr_id = bcf_hdr_id2int(hdr,BCF_DT_ID,"AN");
2580 if ( hdr_id<0 ) return BCF_UN_FMT;
2581 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,hdr_id) ) return BCF_UN_FMT;
2582
2583 return BCF_UN_INFO;
2584 }
filters_init1(filter_t * filter,char * str,int len,token_t * tok)2585 static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
2586 {
2587 tok->tag_type = -1;
2588 tok->tok_type = TOK_VAL;
2589 tok->hdr_id = -1;
2590 tok->pass_site = -1;
2591 tok->idx = 0;
2592
2593 // is this a string constant?
2594 if ( str[0]=='"' || str[0]=='\'' )
2595 {
2596 int quote = str[0];
2597 if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str);
2598 tok->key = (char*) calloc(len-1,sizeof(char));
2599 hts_expand(double,1,tok->mvalues,tok->values);
2600 tok->values[0] = len-2;
2601 memcpy(tok->key,str+1,len-2);
2602 tok->key[len-2] = 0;
2603 tok->is_str = 1;
2604 tok->nvalues = len-2;
2605 if ( !strcmp(".",tok->key) ) tok->is_missing = 1;
2606 return 0;
2607 }
2608
2609 // is it a file?
2610 if ( str[0]=='@' )
2611 {
2612 tok->tag = (char*) calloc(len+1,sizeof(char));
2613 memcpy(tok->tag,str,len);
2614 tok->tag[len] = 0;
2615 char *fname = expand_path(tok->tag+1);
2616 int i, n;
2617 char **list = hts_readlist(fname, 1, &n);
2618 if ( !list ) error("Could not read: %s\n", fname);
2619 free(fname);
2620 tok->hash = khash_str2int_init();
2621 for (i=0; i<n; i++)
2622 {
2623 char *se = list[i];
2624 while ( *se && !isspace(*se) ) se++;
2625 *se = 0;
2626 if ( !khash_str2int_has_key(tok->hash,list[i]) )
2627 khash_str2int_inc(tok->hash,list[i]);
2628 else
2629 free(list[i]);
2630 }
2631 free(list);
2632 return 0;
2633 }
2634
2635 int is_fmt = -1;
2636 if ( !strncasecmp(str,"FMT/",4) ) { str += 4; len -= 4; is_fmt = 1; }
2637 else if ( !strncasecmp(str,"FORMAT/",7) ) { str += 7; len -= 7; is_fmt = 1; }
2638 else
2639 {
2640 if ( !strncasecmp(str,"INFO/",5) ) { is_fmt = 0; str += 5; len -= 5; }
2641 else if ( !strncasecmp(str,"QUAL",len) || !strncmp(str,"%QUAL",len) /* for backward compatibility */ )
2642 {
2643 tok->setter = filters_set_qual;
2644 tok->tag = strdup("QUAL");
2645 return 0;
2646 }
2647 else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ )
2648 {
2649 tok->setter = filters_set_type;
2650 tok->tag = strdup("TYPE");
2651 return 0;
2652 }
2653 else if ( !strncasecmp(str,"FILTER",len) || !strncmp(str,"%FILTER",len) /* for backward compatibility */ )
2654 {
2655 tok->comparator = filters_cmp_filter;
2656 tok->tag = strdup("FILTER");
2657 filter->max_unpack |= BCF_UN_FLT;
2658 tok->tag_type = BCF_HL_FLT;
2659 return 0;
2660 }
2661 else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ )
2662 {
2663 tok->comparator = filters_cmp_id;
2664 tok->tag = strdup("ID");
2665 return 0;
2666 }
2667 else if ( !strncasecmp(str,"CHROM",len) )
2668 {
2669 tok->setter = &filters_set_chrom;
2670 tok->tag = strdup("CHROM");
2671 return 0;
2672 }
2673 else if ( !strncasecmp(str,"POS",len) )
2674 {
2675 tok->setter = &filters_set_pos;
2676 tok->tag = strdup("POS");
2677 return 0;
2678 }
2679 else if ( !strncasecmp(str,"REF",len) )
2680 {
2681 tok->setter = &filters_set_ref_string;
2682 tok->is_str = 1;
2683 tok->tag = strdup("REF");
2684 return 0;
2685 }
2686 else if ( !strncasecmp(str,"ALT",len) )
2687 {
2688 tok->setter = &filters_set_alt_string;
2689 tok->is_str = 1;
2690 tok->tag = strdup("ALT");
2691 tok->idxs = (int*) malloc(sizeof(int));
2692 tok->idxs[0] = -1;
2693 tok->nidxs = 1;
2694 tok->idx = -2;
2695 return 0;
2696 }
2697 else if ( !strncasecmp(str,"N_ALT",len) )
2698 {
2699 tok->setter = &filters_set_nalt;
2700 tok->tag = strdup("N_ALT");
2701 return 0;
2702 }
2703 else if ( !strncasecmp(str,"N_SAMPLES",len) )
2704 {
2705 tok->tok_type = TOK_VAL;
2706 tok->threshold = bcf_hdr_nsamples(filter->hdr);
2707 return 0;
2708 }
2709 else if ( !strncasecmp(str,"N_MISSING",len) )
2710 {
2711 filter->max_unpack |= BCF_UN_FMT;
2712 tok->setter = &filters_set_nmissing;
2713 tok->tag = strdup("N_MISSING");
2714 return 0;
2715 }
2716 else if ( !strncasecmp(str,"F_MISSING",len) )
2717 {
2718 filter->max_unpack |= BCF_UN_FMT;
2719 tok->setter = &filters_set_nmissing;
2720 tok->tag = strdup("F_MISSING");
2721 return 0;
2722 }
2723 }
2724
2725 // does it have array subscript?
2726 int is_array = 0;
2727 kstring_t tmp = {0,0,0};
2728 kputsn(str, len, &tmp);
2729 if ( tmp.s[tmp.l-1] == ']' )
2730 {
2731 int i;
2732 for (i=0; i<tmp.l; i++)
2733 if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
2734 }
2735 tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
2736 if ( is_fmt==-1 )
2737 {
2738 if ( tok->hdr_id >=0 )
2739 {
2740 int is_info = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ? 1 : 0;
2741 is_fmt = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ? 1 : 0;
2742 if ( is_info && is_fmt )
2743 error("Error: ambiguous filtering expression, both INFO/%s and FORMAT/%s are defined in the VCF header.\n" , tmp.s,tmp.s);
2744 }
2745 if ( is_fmt==-1 ) is_fmt = 0;
2746 }
2747 if ( is_array )
2748 parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok);
2749 else if ( is_fmt && !tok->nsamples )
2750 {
2751 int i;
2752 tok->nsamples = bcf_hdr_nsamples(filter->hdr);
2753 tok->usmpl = (uint8_t*) malloc(tok->nsamples);
2754 for (i=0; i<tok->nsamples; i++) tok->usmpl[i] = 1;
2755 }
2756
2757 tok->tag_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
2758 if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
2759 if ( tok->hdr_id>=0 )
2760 {
2761 if ( is_fmt && !strcmp("GT",tmp.s) )
2762 {
2763 tok->setter = &filters_set_genotype_string; tok->is_str = 1;
2764 }
2765 else if ( is_fmt )
2766 {
2767 if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
2768 error("No such FORMAT field: %s\n", tmp.s);
2769 if ( bcf_hdr_id2number(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=1 && !is_array )
2770 {
2771 tok->idxs = (int*) malloc(sizeof(int));
2772 tok->idxs[0] = -1;
2773 tok->nidxs = 1;
2774 tok->idx = -2;
2775 }
2776 switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
2777 {
2778 case BCF_HT_INT: tok->setter = &filters_set_format_int; break;
2779 case BCF_HT_REAL: tok->setter = &filters_set_format_float; break;
2780 case BCF_HT_STR: tok->setter = &filters_set_format_string; tok->is_str = 1; break;
2781 default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
2782 }
2783 }
2784 else if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) )
2785 error("No such INFO field: %s\n", tmp.s);
2786 else
2787 {
2788 if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_FLAG )
2789 tok->setter = filters_set_info_flag;
2790 else
2791 {
2792 if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_STR ) tok->is_str = 1;
2793 if ( bcf_hdr_id2number(filter->hdr,BCF_HL_INFO,tok->hdr_id)==1 )
2794 tok->setter = filters_set_info;
2795 else
2796 {
2797 switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) )
2798 {
2799 case BCF_HT_INT: tok->setter = &filters_set_info_int; break;
2800 case BCF_HT_REAL: tok->setter = &filters_set_info_float; break;
2801 case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break;
2802 default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
2803 }
2804 if (!is_array)
2805 {
2806 tok->idx = -2;
2807 tok->idxs = (int*) malloc(sizeof(int));
2808 tok->idxs[0] = -1;
2809 tok->nidxs = 1;
2810 }
2811 }
2812 }
2813 filter->max_unpack |= BCF_UN_INFO;
2814 }
2815 tok->tag = strdup(tmp.s);
2816 if ( tmp.s ) free(tmp.s);
2817 return 0;
2818 }
2819 else if ( !strcasecmp(tmp.s,"ALT") )
2820 {
2821 tok->setter = &filters_set_alt_string;
2822 tok->is_str = 1;
2823 tok->tag = strdup(tmp.s);
2824 free(tmp.s);
2825 return 0;
2826 }
2827 else if ( !strcasecmp(tmp.s,"AN") )
2828 {
2829 filter->max_unpack |= BCF_UN_FMT;
2830 tok->setter = &filters_set_an;
2831 tok->tag = strdup("AN");
2832 free(tmp.s);
2833 return 0;
2834 }
2835 else if ( !strcasecmp(tmp.s,"AC") )
2836 {
2837 filter->max_unpack |= BCF_UN_FMT;
2838 tok->setter = &filters_set_ac;
2839 tok->tag = strdup("AC");
2840 free(tmp.s);
2841 return 0;
2842 }
2843 else if ( !strcasecmp(tmp.s,"MAC") )
2844 {
2845 filter->max_unpack |= max_ac_an_unpack(filter->hdr);
2846 tok->setter = &filters_set_mac;
2847 tok->tag = strdup("MAC");
2848 free(tmp.s);
2849 return 0;
2850 }
2851 else if ( !strcasecmp(tmp.s,"AF") )
2852 {
2853 filter->max_unpack |= max_ac_an_unpack(filter->hdr);
2854 tok->setter = &filters_set_af;
2855 tok->tag = strdup("AF");
2856 free(tmp.s);
2857 return 0;
2858 }
2859 else if ( !strcasecmp(tmp.s,"MAF") )
2860 {
2861 filter->max_unpack |= max_ac_an_unpack(filter->hdr);
2862 tok->setter = &filters_set_maf;
2863 tok->tag = strdup("MAF");
2864 free(tmp.s);
2865 return 0;
2866 }
2867 else if ( !strcasecmp(tmp.s,"ILEN") || !strcasecmp(tmp.s,"%ILEN") )
2868 {
2869 filter->max_unpack |= BCF_UN_STR;
2870 tok->setter = &filters_set_ilen;
2871 tok->tag = strdup("ILEN");
2872 free(tmp.s);
2873 return 0;
2874 }
2875
2876 // is it a value? Here we parse as integer/float separately and use strtof
2877 // rather than strtod, because the more accurate double representation
2878 // would invalidate floating point comparisons like QUAL=59.2, obtained via
2879 // htslib/vcf parser.
2880 // Update: use strtod() and force floats only in comparisons
2881 char *end;
2882 tok->threshold = strtol(tmp.s, &end, 10); // integer?
2883 if ( end - tmp.s != strlen(tmp.s) )
2884 {
2885 errno = 0;
2886 tok->threshold = strtod(tmp.s, &end); // float?
2887 if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
2888 }
2889 tok->is_constant = 1;
2890
2891 if ( tmp.s ) free(tmp.s);
2892 return 0;
2893 }
2894
2895
filter_debug_print(token_t * toks,token_t ** tok_ptrs,int ntoks)2896 static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks)
2897 {
2898 int i;
2899 for (i=0; i<ntoks; i++)
2900 {
2901 token_t *tok = toks ? &toks[i] : tok_ptrs[i];
2902 if ( tok->tok_type==TOK_VAL )
2903 {
2904 if ( tok->key )
2905 fprintf(stderr,"%s", tok->key);
2906 else if ( tok->tag )
2907 fprintf(stderr,"%s", tok->tag);
2908 else
2909 fprintf(stderr,"%e", tok->threshold);
2910 }
2911 else
2912 fprintf(stderr,"%c", TOKEN_STRING[tok->tok_type]);
2913 if ( tok->setter ) fprintf(stderr,"\t[setter %p]", tok->setter);
2914 fprintf(stderr,"\n");
2915 }
2916 }
2917
str_to_lower(char * str)2918 static void str_to_lower(char *str)
2919 {
2920 while ( *str ) { *str = tolower(*str); str++; }
2921 }
perl_exec(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)2922 static int perl_exec(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
2923 {
2924 #if ENABLE_PERL_FILTERS
2925
2926 PerlInterpreter *perl = flt->perl;
2927 if ( !perl ) error("Error: perl expression without a perl script name\n");
2928
2929 dSP;
2930 ENTER;
2931 SAVETMPS;
2932
2933 PUSHMARK(SP);
2934 int i,j, istack = nstack - rtok->nargs;
2935 for (i=istack+1; i<nstack; i++)
2936 {
2937 token_t *tok = stack[i];
2938 if ( tok->is_str )
2939 XPUSHs(sv_2mortal(newSVpvn(tok->str_value.s,tok->str_value.l)));
2940 else if ( tok->nvalues==1 )
2941 XPUSHs(sv_2mortal(newSVnv(tok->values[0])));
2942 else if ( tok->nvalues>1 )
2943 {
2944 AV *av = newAV();
2945 for (j=0; j<tok->nvalues; j++) av_push(av, newSVnv(tok->values[j]));
2946 SV *rv = newRV_inc((SV*)av);
2947 XPUSHs(rv);
2948 }
2949 else
2950 {
2951 bcf_double_set_missing(tok->values[0]);
2952 XPUSHs(sv_2mortal(newSVnv(tok->values[0])));
2953 }
2954 }
2955 PUTBACK;
2956
2957 // A possible future todo: provide a means to select samples and indexes,
2958 // expressions like this don't work yet
2959 // perl.filter(FMT/AD)[1:0]
2960
2961 int nret = call_pv(stack[istack]->str_value.s, G_ARRAY);
2962
2963 SPAGAIN;
2964
2965 rtok->nvalues = nret;
2966 hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
2967 for (i=nret; i>0; i--)
2968 {
2969 rtok->values[i-1] = (double) POPn;
2970 if ( isnan(rtok->values[i-1]) ) bcf_double_set_missing(rtok->values[i-1]);
2971 }
2972
2973 PUTBACK;
2974 FREETMPS;
2975 LEAVE;
2976
2977 #else
2978 error("\nPerl filtering requires running `configure --enable-perl-filters` at compile time.\n\n");
2979 #endif
2980 return rtok->nargs;
2981 }
perl_init(filter_t * filter,char ** str)2982 static void perl_init(filter_t *filter, char **str)
2983 {
2984 char *beg = *str;
2985 while ( *beg && isspace(*beg) ) beg++;
2986 if ( !*beg ) return;
2987 if ( strncasecmp("perl:", beg, 5) ) return;
2988 #if ENABLE_PERL_FILTERS
2989 beg += 5;
2990 char *end = beg;
2991 while ( *end && *end!=';' ) end++; // for now not escaping semicolons
2992 *str = end+1;
2993
2994 if ( ++filter_ninit == 1 )
2995 {
2996 // must be executed only once, even for multiple filters; first time here
2997 int argc = 0;
2998 char **argv = NULL;
2999 char **env = NULL;
3000 PERL_SYS_INIT3(&argc, &argv, &env);
3001 }
3002
3003 filter->perl = perl_alloc();
3004 PerlInterpreter *perl = filter->perl;
3005
3006 if ( !perl ) error("perl_alloc failed\n");
3007 perl_construct(perl);
3008
3009 // name of the perl script to run
3010 char *rmme = (char*) calloc(end - beg + 1,1);
3011 memcpy(rmme, beg, end - beg);
3012 char *argv[] = { "", "" };
3013 argv[1] = expand_path(rmme);
3014 free(rmme);
3015
3016 PL_origalen = 1; // don't allow $0 change
3017 int ret = perl_parse(filter->perl, NULL, 2, argv, NULL);
3018 PL_exit_flags |= PERL_EXIT_DESTRUCT_END;
3019 if ( ret ) error("Failed to parse: %s\n", argv[1]);
3020 free(argv[1]);
3021
3022 perl_run(perl);
3023 #else
3024 error("\nPerl filtering requires running `configure --enable-perl-filters` at compile time.\n\n");
3025 #endif
3026 }
perl_destroy(filter_t * filter)3027 static void perl_destroy(filter_t *filter)
3028 {
3029 #if ENABLE_PERL_FILTERS
3030 if ( !filter->perl ) return;
3031
3032 PerlInterpreter *perl = filter->perl;
3033 perl_destruct(perl);
3034 perl_free(perl);
3035 if ( --filter_ninit <= 0 )
3036 {
3037 // similarly to PERL_SYS_INIT3, can must be executed only once? todo: test
3038 PERL_SYS_TERM();
3039 }
3040 #endif
3041 }
3042
3043
3044 // Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
filter_init(bcf_hdr_t * hdr,const char * str)3045 filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
3046 {
3047 filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t));
3048 filter->str = strdup(str);
3049 filter->hdr = hdr;
3050 filter->max_unpack |= BCF_UN_STR;
3051
3052 int nops = 0, mops = 0; // operators stack
3053 int nout = 0, mout = 0; // filter tokens, RPN
3054 token_t *out = NULL;
3055 token_t *ops = NULL;
3056 char *tmp = filter->str;
3057 perl_init(filter, &tmp);
3058
3059 int last_op = -1;
3060 while ( *tmp )
3061 {
3062 int len, ret;
3063 ret = filters_next_token(&tmp, &len);
3064 if ( ret==-1 ) error("Missing quotes in: %s\n", str);
3065
3066 // fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
3067 // int i; for (i=0; i<nops; i++) fprintf(stderr," .%c", TOKEN_STRING[ops[i].tok_type]); fprintf(stderr,"\n");
3068
3069 if ( ret==TOK_LFT ) // left bracket
3070 {
3071 nops++;
3072 hts_expand0(token_t, nops, mops, ops);
3073 ops[nops-1].tok_type = ret;
3074 }
3075 else if ( ret==TOK_RGT ) // right bracket
3076 {
3077 while ( nops>0 && ops[nops-1].tok_type!=TOK_LFT )
3078 {
3079 nout++;
3080 hts_expand0(token_t, nout, mout, out);
3081 out[nout-1] = ops[nops-1];
3082 memset(&ops[nops-1],0,sizeof(token_t));
3083 nops--;
3084 }
3085 if ( nops<=0 ) error("Could not parse: %s\n", str);
3086 memset(&ops[nops-1],0,sizeof(token_t));
3087 nops--;
3088 }
3089 else if ( ret!=TOK_VAL ) // one of the operators
3090 {
3091 // detect unary minus: replace -value with -1*(value)
3092 if ( ret==TOK_SUB && last_op!=TOK_VAL && last_op!=TOK_RGT )
3093 {
3094 nout++;
3095 hts_expand0(token_t, nout, mout, out);
3096 token_t *tok = &out[nout-1];
3097 tok->tok_type = TOK_VAL;
3098 tok->hdr_id = -1;
3099 tok->pass_site = -1;
3100 tok->threshold = -1.0;
3101 ret = TOK_MULT;
3102 }
3103 else if ( ret == -TOK_FUNC )
3104 {
3105 // this is different from TOK_PERLSUB,TOK_BINOM in that the expression inside the
3106 // brackets gets evaluated as normal expression
3107 nops++;
3108 hts_expand0(token_t, nops, mops, ops);
3109 token_t *tok = &ops[nops-1];
3110 tok->tok_type = -ret;
3111 tok->hdr_id = -1;
3112 tok->pass_site = -1;
3113 tok->threshold = -1.0;
3114 if ( !strncasecmp(tmp-len,"N_PASS",6) )
3115 {
3116 filter->max_unpack |= BCF_UN_FMT;
3117 tok->func = func_npass;
3118 tok->tag = strdup("N_PASS");
3119 }
3120 else if ( !strncasecmp(tmp-len,"F_PASS",6) )
3121 {
3122 filter->max_unpack |= BCF_UN_FMT;
3123 tok->func = func_npass;
3124 tok->tag = strdup("F_PASS");
3125 }
3126 else error("The function \"%s\" is not supported\n", tmp-len);
3127 continue;
3128 }
3129 else if ( ret < 0 ) // variable number of arguments: TOK_PERLSUB,TOK_BINOM
3130 {
3131 ret = -ret;
3132
3133 tmp += len;
3134 char *beg = tmp;
3135 kstring_t rmme = {0,0,0};
3136 int i, margs, nargs = 0;
3137
3138 if ( ret == TOK_PERLSUB )
3139 {
3140 while ( *beg && ((isalnum(*beg) && !ispunct(*beg)) || *beg=='_') ) beg++;
3141 if ( *beg!='(' ) error("Could not parse the expression: %s\n", str);
3142
3143 // the subroutine name
3144 kputc('"', &rmme);
3145 kputsn(tmp, beg-tmp, &rmme);
3146 kputc('"', &rmme);
3147 nout++;
3148 hts_expand0(token_t, nout, mout, out);
3149 filters_init1(filter, rmme.s, rmme.l, &out[nout-1]);
3150 nargs++;
3151 }
3152 char *end = beg;
3153 while ( *end && *end!=')' ) end++;
3154 if ( !*end ) error("Could not parse the expression: %s\n", str);
3155
3156 // subroutine arguments
3157 rmme.l = 0;
3158 kputsn(beg+1, end-beg-1, &rmme);
3159 char **rmme_list = hts_readlist(rmme.s, 0, &margs);
3160 for (i=0; i<margs; i++)
3161 {
3162 nargs++;
3163 nout++;
3164 hts_expand0(token_t, nout, mout, out);
3165 filters_init1(filter, rmme_list[i], strlen(rmme_list[i]), &out[nout-1]);
3166 free(rmme_list[i]);
3167 }
3168 free(rmme_list);
3169 free(rmme.s);
3170
3171 nout++;
3172 hts_expand0(token_t, nout, mout, out);
3173 token_t *tok = &out[nout-1];
3174 tok->tok_type = ret;
3175 tok->nargs = nargs;
3176 tok->hdr_id = -1;
3177 tok->pass_site = -1;
3178 tok->threshold = -1.0;
3179
3180 tmp = end + 1;
3181 continue;
3182 }
3183 else
3184 {
3185 while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1].tok_type] )
3186 {
3187 nout++;
3188 hts_expand0(token_t, nout, mout, out);
3189 out[nout-1] = ops[nops-1];
3190 memset(&ops[nops-1],0,sizeof(token_t));
3191 nops--;
3192 }
3193 }
3194 nops++;
3195 hts_expand0(token_t, nops, mops, ops);
3196 ops[nops-1].tok_type = ret;
3197 }
3198 else if ( !len )
3199 {
3200 if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str);
3201 break; // all tokens read
3202 }
3203 else // annotation name or filtering value
3204 {
3205 nout++;
3206 hts_expand0(token_t, nout, mout, out);
3207 if ( tmp[len-1]==',' )
3208 filters_init1(filter, tmp, len-1, &out[nout-1]);
3209 else
3210 filters_init1(filter, tmp, len, &out[nout-1]);
3211 tmp += len;
3212 }
3213 last_op = ret;
3214 }
3215 while ( nops>0 )
3216 {
3217 if ( ops[nops-1].tok_type==TOK_LFT || ops[nops-1].tok_type==TOK_RGT ) error("Could not parse the expression: [%s]\n", filter->str);
3218 nout++;
3219 hts_expand0(token_t, nout, mout, out);
3220 out[nout-1] = ops[nops-1];
3221 memset(&ops[nops-1],0,sizeof(token_t));
3222 nops--;
3223 }
3224
3225 // In the special cases of TYPE and FILTER the BCF header IDs are yet unknown. Walk through the
3226 // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be
3227 // just before or after the FILTER token and they must be followed with a comparison operator.
3228 // At this point we also initialize regex expressions which, in RPN, must preceed the LIKE/NLIKE operator.
3229 // Additionally, treat "." as missing value rather than a string in numeric equalities; that
3230 // @file is only used with ID; etc.
3231 // This code is fragile: improve me.
3232 static int comma_separator_warned = 0;
3233 int i;
3234 for (i=0; i<nout; i++)
3235 {
3236 if ( i+1<nout && (out[i].tok_type==TOK_LT || out[i].tok_type==TOK_BT) && out[i+1].tok_type==TOK_EQ )
3237 error("Error parsing the expression: \"%s\"\n", filter->str);
3238
3239 if ( out[i].hash )
3240 {
3241 int j = out[i+1].tok_type==TOK_VAL ? i+1 : i-1;
3242 if ( out[j].comparator!=filters_cmp_id )
3243 error("Error: could not parse the expression. Note that the \"@file_name\" syntax can be currently used with ID column only.\n");
3244 }
3245 if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC )
3246 out[i].func = vector_logic_or;
3247 if ( out[i].tok_type==TOK_AND || out[i].tok_type==TOK_AND_VEC )
3248 out[i].func = vector_logic_and;
3249 if ( out[i].tok_type==TOK_EQ || out[i].tok_type==TOK_NE )
3250 {
3251 // Look for j="." and k numeric type
3252 int j = i-1, k = i-2;
3253 if ( !out[j].is_str ) { k = i-1, j = i-2; }
3254 if ( out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
3255 {
3256 int set_missing = 0;
3257 if ( out[k].hdr_id>0 )
3258 {
3259 int type = bcf_hdr_id2type(filter->hdr,out[k].tag_type,out[k].hdr_id);
3260 if ( type==BCF_HT_INT ) set_missing = 1;
3261 else if ( type==BCF_HT_REAL ) set_missing = 1;
3262 }
3263 else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1;
3264 if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
3265 }
3266 }
3267 if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
3268 {
3269 int j = i-1;
3270 if ( !out[j].key )
3271 error("Could not parse the expression, wrong value for regex operator: %s\n", filter->str);
3272 out[j].regex = (regex_t *) malloc(sizeof(regex_t));
3273 int cflags = REG_NOSUB;
3274 int len = strlen(out[j].key);
3275 if ( len>2 && out[j].key[len-1]=='i' && out[j].key[len-2]=='/' && out[j].key[len-3]!='\\' )
3276 {
3277 out[j].key[len-2] = 0;
3278 cflags |= REG_ICASE;
3279 }
3280 if ( regcomp(out[j].regex, out[j].key, cflags) )
3281 error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
3282 }
3283 if ( out[i].is_str && out[i].tok_type==TOK_VAL && out[i].key && strchr(out[i].key,',') )
3284 {
3285 int print_note = 0;
3286 if ( out[i+1].tok_type==TOK_EQ || (out[i+1].is_str && out[i+2].tok_type==TOK_EQ) ) print_note = 1;
3287 else if ( out[i+1].tok_type==TOK_NE || (out[i+1].is_str && out[i+2].tok_type==TOK_NE) ) print_note = 1;
3288 if ( print_note && !comma_separator_warned )
3289 {
3290 comma_separator_warned = 1;
3291 fprintf(stderr,
3292 "Warning: comma is interpreted as a separator and OR logic is used in string comparisons.\n"
3293 " (Search the manual for \"Comma in strings\" to learn more.)\n");
3294 }
3295 }
3296 if ( out[i].tok_type!=TOK_VAL ) continue;
3297 if ( !out[i].tag ) continue;
3298 if ( out[i].setter==filters_set_type )
3299 {
3300 if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
3301 int itok, ival;
3302 if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1, itok = i + 1;
3303 else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1, itok = i + 1;
3304 else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) itok = i + 2, ival = i + 1;
3305 else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) itok = i + 2, ival = i + 1;
3306 else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
3307 if ( !strcasecmp(out[ival].key,"snp") || !strcasecmp(out[ival].key,"snps") ) { out[ival].threshold = VCF_SNP<<1; out[ival].is_str = 0; }
3308 else if ( !strcasecmp(out[ival].key,"indel") || !strcasecmp(out[ival].key,"indels") ) { out[ival].threshold = VCF_INDEL<<1; out[ival].is_str = 0; }
3309 else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; }
3310 else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; }
3311 else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; }
3312 else if ( !strcasecmp(out[ival].key,"overlap") ) { out[ival].threshold = VCF_OVERLAP<<1; out[ival].is_str = 0; }
3313 else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; }
3314 else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str);
3315 if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and;
3316 out[ival].tag = out[ival].key; out[ival].key = NULL;
3317 i = itok;
3318 continue;
3319 }
3320 if ( !strcmp(out[i].tag,"GT") )
3321 {
3322 if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
3323 int ival;
3324 if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
3325 else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1;
3326 else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = i + 1;
3327 else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1;
3328 else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
3329
3330 if ( !out[ival].key ) error("Comparison between samples is not supported, sorry!\n");
3331
3332 // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc
3333 if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
3334 else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
3335 else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
3336 else if ( !strcasecmp(out[ival].key,"mis") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); }
3337 else if ( !strcasecmp(out[ival].key,"ref") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); }
3338 else if ( !strcasecmp(out[ival].key,"alt") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); }
3339 else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2; str_to_lower(out[ival].key); }
3340 else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; } // ra
3341 else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; } // aA
3342 else if ( !strcasecmp(out[ival].key,"aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='a'; } // aa
3343 else if ( !strcasecmp(out[ival].key,"a") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]=0; } // a
3344 else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; } // r
3345 continue;
3346 }
3347 if ( out[i].tag_type==BCF_HL_FLT )
3348 {
3349 if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
3350 int itok = i, ival;
3351 if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
3352 else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_IN, ival = i - 1;
3353 else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NOT_IN, ival = i - 1;
3354 else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i;
3355 else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_IN, ival = ++i;
3356 else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NOT_IN, ival = ++i;
3357 else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
3358 if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
3359 error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
3360 if ( strcmp(".",out[ival].key) )
3361 {
3362 out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key);
3363 if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) )
3364 error("The filter \"%s\" not present in the VCF header\n", out[ival].key);
3365 }
3366 else
3367 out[ival].hdr_id = -1;
3368 out[ival].tag = out[ival].key; out[ival].key = NULL;
3369 out[itok].hdr_id = out[ival].hdr_id;
3370 continue;
3371 }
3372 }
3373 filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0;
3374 for (i=0; i<nout; i++)
3375 {
3376 if ( out[i].tok_type==TOK_MAX ) { out[i].func = func_max; out[i].tok_type = TOK_FUNC; }
3377 else if ( out[i].tok_type==TOK_MIN ) { out[i].func = func_min; out[i].tok_type = TOK_FUNC; }
3378 else if ( out[i].tok_type==TOK_AVG ) { out[i].func = func_avg; out[i].tok_type = TOK_FUNC; }
3379 else if ( out[i].tok_type==TOK_MEDIAN ) { out[i].func = func_median; out[i].tok_type = TOK_FUNC; }
3380 else if ( out[i].tok_type==TOK_AVG ) { out[i].func = func_avg; out[i].tok_type = TOK_FUNC; }
3381 else if ( out[i].tok_type==TOK_STDEV ) { out[i].func = func_stddev; out[i].tok_type = TOK_FUNC; }
3382 else if ( out[i].tok_type==TOK_SUM ) { out[i].func = func_sum; out[i].tok_type = TOK_FUNC; }
3383 else if ( out[i].tok_type==TOK_ABS ) { out[i].func = func_abs; out[i].tok_type = TOK_FUNC; }
3384 else if ( out[i].tok_type==TOK_CNT ) { out[i].func = func_count; out[i].tok_type = TOK_FUNC; }
3385 else if ( out[i].tok_type==TOK_LEN ) { out[i].func = func_strlen; out[i].tok_type = TOK_FUNC; }
3386 else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; }
3387 else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; }
3388 else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; }
3389 else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; }
3390 else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; }
3391 else if ( out[i].tok_type==TOK_sAVG ) { out[i].func = func_smpl_avg; out[i].tok_type = TOK_FUNC; }
3392 else if ( out[i].tok_type==TOK_sMEDIAN ) { out[i].func = func_smpl_median; out[i].tok_type = TOK_FUNC; }
3393 else if ( out[i].tok_type==TOK_sSTDEV ) { out[i].func = func_smpl_stddev; out[i].tok_type = TOK_FUNC; }
3394 else if ( out[i].tok_type==TOK_sSUM ) { out[i].func = func_smpl_sum; out[i].tok_type = TOK_FUNC; }
3395 hts_expand0(double,1,out[i].mvalues,out[i].values);
3396 if ( filter->nsamples )
3397 {
3398 out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
3399 int j;
3400 for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 0;
3401 }
3402 }
3403
3404 if (0) filter_debug_print(out, NULL, nout);
3405
3406 if ( mops ) free(ops);
3407 filter->filters = out;
3408 filter->nfilters = nout;
3409 filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout);
3410 return filter;
3411 }
3412
filter_destroy(filter_t * filter)3413 void filter_destroy(filter_t *filter)
3414 {
3415 perl_destroy(filter);
3416 int i;
3417 for (i=0; i<filter->nfilters; i++)
3418 {
3419 if ( filter->filters[i].key ) free(filter->filters[i].key);
3420 free(filter->filters[i].str_value.s);
3421 free(filter->filters[i].tag);
3422 free(filter->filters[i].idxs);
3423 free(filter->filters[i].usmpl);
3424 free(filter->filters[i].values);
3425 free(filter->filters[i].pass_samples);
3426 if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
3427 if (filter->filters[i].regex)
3428 {
3429 regfree(filter->filters[i].regex);
3430 free(filter->filters[i].regex);
3431 }
3432 }
3433 free(filter->filters);
3434 free(filter->flt_stack);
3435 free(filter->str);
3436 free(filter->tmpi);
3437 free(filter->tmpf);
3438 free(filter->tmps.s);
3439 free(filter);
3440 }
3441
filter_test(filter_t * filter,bcf1_t * line,const uint8_t ** samples)3442 int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
3443 {
3444 bcf_unpack(line, filter->max_unpack);
3445
3446 int i, nstack = 0;
3447 for (i=0; i<filter->nfilters; i++)
3448 {
3449 filter->filters[i].pass_site = 0;
3450 if ( filter->filters[i].tok_type == TOK_VAL )
3451 {
3452 if ( filter->filters[i].setter ) // variable, query the VCF line
3453 filter->filters[i].setter(filter, line, &filter->filters[i]);
3454 else if ( filter->filters[i].key ) // string constant
3455 {
3456 filter->filters[i].str_value.l = 0;
3457 kputs(filter->filters[i].key, &filter->filters[i].str_value);
3458 filter->filters[i].nvalues = filter->filters[i].str_value.l;
3459 }
3460 else // numeric constant
3461 {
3462 filter->filters[i].values[0] = filter->filters[i].threshold;
3463 filter->filters[i].nvalues = 1;
3464 }
3465 filter->flt_stack[nstack++] = &filter->filters[i];
3466 continue;
3467 }
3468 else if ( filter->filters[i].func )
3469 {
3470 int nargs = filter->filters[i].func(filter, line, &filter->filters[i], filter->flt_stack, nstack);
3471 filter->flt_stack[nstack-nargs] = &filter->filters[i];
3472 if ( --nargs > 0 ) nstack -= nargs;
3473 continue;
3474 }
3475 if ( nstack<2 )
3476 error("Error occurred while processing the filter \"%s\" (1:%d)\n", filter->str,nstack); // too few values left on the stack
3477
3478 int is_str = filter->flt_stack[nstack-1]->is_str + filter->flt_stack[nstack-2]->is_str;
3479
3480 if ( filter->filters[i].tok_type == TOK_ADD )
3481 {
3482 VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+);
3483 filter->flt_stack[nstack-2] = &filter->filters[i];
3484 nstack--;
3485 continue;
3486 }
3487 else if ( filter->filters[i].tok_type == TOK_SUB )
3488 {
3489 VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-);
3490 filter->flt_stack[nstack-2] = &filter->filters[i];
3491 nstack--;
3492 continue;
3493 }
3494 else if ( filter->filters[i].tok_type == TOK_MULT )
3495 {
3496 VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*);
3497 filter->flt_stack[nstack-2] = &filter->filters[i];
3498 nstack--;
3499 continue;
3500 }
3501 else if ( filter->filters[i].tok_type == TOK_DIV )
3502 {
3503 VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/);
3504 filter->flt_stack[nstack-2] = &filter->filters[i];
3505 nstack--;
3506 continue;
3507 }
3508
3509 // ideally, these comparators would become func, but this would require more work in init1()
3510 if ( filter->filters[i].comparator )
3511 {
3512 filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
3513 }
3514 else if ( filter->flt_stack[nstack-1]->comparator )
3515 {
3516 filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
3517 }
3518 else if ( filter->flt_stack[nstack-2]->comparator )
3519 {
3520 filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],line);
3521 }
3522 else if ( is_str==2 )
3523 {
3524 cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i]);
3525 }
3526 else
3527 {
3528 if ( is_str==1 ) error("Error: cannot use arithmetic operators to compare strings and numbers\n");
3529
3530 // Determine what to do with one [1] or both [2] sides missing. The first field [0] gives [1]|[2]
3531 int missing_logic[] = {0,0,0};
3532 if ( filter->filters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; }
3533 if ( filter->filters[i].tok_type == TOK_NE ) { missing_logic[0] = missing_logic[1] = 1; }
3534
3535 if ( filter->filters[i].tok_type == TOK_EQ )
3536 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],==,missing_logic)
3537 else if ( filter->filters[i].tok_type == TOK_NE )
3538 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],!=,missing_logic)
3539 else if ( filter->filters[i].tok_type == TOK_LE )
3540 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<=,missing_logic)
3541 else if ( filter->filters[i].tok_type == TOK_LT )
3542 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<,missing_logic)
3543 else if ( filter->filters[i].tok_type == TOK_BT )
3544 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>,missing_logic)
3545 else if ( filter->filters[i].tok_type == TOK_BE )
3546 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic)
3547 else
3548 error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type);
3549 }
3550 filter->flt_stack[nstack-2] = &filter->filters[i];
3551 nstack--;
3552 }
3553 if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack); // too few values left on the stack
3554 if ( samples )
3555 {
3556 *samples = filter->max_unpack&BCF_UN_FMT ? filter->flt_stack[0]->pass_samples : NULL;
3557 if ( *samples && !filter->flt_stack[0]->nsamples )
3558 {
3559 for (i=0; i<filter->nsamples; i++)
3560 filter->flt_stack[0]->pass_samples[i] = filter->flt_stack[0]->pass_site;
3561 }
3562 }
3563 return filter->flt_stack[0]->pass_site;
3564 }
3565
filter_max_unpack(filter_t * flt)3566 int filter_max_unpack(filter_t *flt)
3567 {
3568 return flt->max_unpack;
3569 }
3570
filter_get_doubles(filter_t * filter,int * nval,int * nval1)3571 const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1)
3572 {
3573 token_t *tok = filter->flt_stack[0];
3574 if ( tok->nvalues )
3575 {
3576 *nval = tok->nvalues;
3577 *nval1 = tok->nval1;
3578 }
3579 else
3580 {
3581 if ( !tok->values ) error("fixme in filter_get_doubles(): %s\n", filter->str);
3582 *nval = 1;
3583 *nval1 = 1;
3584 tok->values[0] = filter->flt_stack[0]->pass_site;
3585 }
3586 return tok->values;
3587 }
3588
filter_set_samples(filter_t * filter,const uint8_t * samples)3589 void filter_set_samples(filter_t *filter, const uint8_t *samples)
3590 {
3591 int i,j;
3592 for (i=0; i<filter->nfilters; i++)
3593 {
3594 if ( !filter->filters[i].nsamples ) continue;
3595 for (j=0; j<filter->filters[i].nsamples; j++) filter->filters[i].usmpl[j] = samples[j];
3596 }
3597 }
3598
3599