1 /*  filter.c -- filter expressions.
2 
3     Copyright (C) 2013-2021 Genome Research Ltd.
4 
5     Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.  */
24 
25 #include <ctype.h>
26 #include <stdlib.h>
27 #include <strings.h>
28 #include <assert.h>
29 #include <errno.h>
30 #include <math.h>
31 #include <sys/types.h>
32 #include <inttypes.h>
33 #ifndef _WIN32
34 #include <pwd.h>
35 #endif
36 #include <regex.h>
37 #include <htslib/khash_str2int.h>
38 #include <htslib/hts_defs.h>
39 #include <htslib/vcfutils.h>
40 #include <htslib/kfunc.h>
41 #include "config.h"
42 #include "filter.h"
43 #include "bcftools.h"
44 
45 #if ENABLE_PERL_FILTERS
46 #  define filter_t perl_filter_t
47 #  include <EXTERN.h>
48 #  include <perl.h>
49 #  undef filter_t
50 #  define my_perl perl
51 
52 static int filter_ninit = 0;
53 #endif
54 
55 
56 #ifndef __FUNCTION__
57 #  define __FUNCTION__ __func__
58 #endif
59 
60 typedef struct _token_t
61 {
62     // read-only values, same for all VCF lines
63     int tok_type;       // one of the TOK_* keys below
64     int nargs;          // with TOK_PERLSUB the first argument is the name of the subroutine
65     char *key;          // set only for string constants, otherwise NULL
66     char *tag;          // for debugging and printout only, VCF tag name
67     double threshold;   // filtering threshold
68     int is_constant;    // the threshold is set
69     int hdr_id, tag_type;   // BCF header lookup ID and one of BCF_HL_* types
70     int idx;            // 0-based index to VCF vectors,
71                         //  -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
72     int *idxs;          // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only
73     int nidxs, nuidxs;  // size of idxs array and the number of elements set to 1
74     uint8_t *usmpl;     // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise
75     int nsamples;       // number of samples for format fields, 0 for info and other fields
76     void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
77     int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack);
78     void (*comparator)(struct _token_t *, struct _token_t *, struct _token_t *rtok, bcf1_t *);
79     void *hash;         // test presence of str value in the hash via comparator
80     regex_t *regex;     // precompiled regex for string comparison
81 
82     // modified on filter evaluation at each VCF line
83     double *values;
84     kstring_t str_value;
85     int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
86     int pass_site;          // -1 not applicable, 0 fails, >0 pass
87     uint8_t *pass_samples;  // status of individual samples
88     int nvalues, mvalues;   // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
89     int nval1;              // number of per-sample fields or string length
90 }
91 token_t;
92 
93 struct _filter_t
94 {
95     bcf_hdr_t *hdr;
96     char *str;
97     int nfilters;
98     token_t *filters, **flt_stack;  // filtering input tokens (in RPN) and evaluation stack
99     int32_t *tmpi;
100     float   *tmpf;
101     kstring_t tmps;
102     int max_unpack, mtmpi, mtmpf, nsamples;
103 #if ENABLE_PERL_FILTERS
104     PerlInterpreter *perl;
105 #endif
106 };
107 
108 
109 #define TOK_VAL     0
110 #define TOK_LFT     1       // (
111 #define TOK_RGT     2       // )
112 #define TOK_LE      3       // less or equal
113 #define TOK_LT      4       // less than
114 #define TOK_EQ      5       // equal
115 #define TOK_BT      6       // bigger than
116 #define TOK_BE      7       // bigger or equal
117 #define TOK_NE      8       // not equal
118 #define TOK_OR      9       // |
119 #define TOK_AND     10      // &
120 #define TOK_ADD     11      // +
121 #define TOK_SUB     12      // -
122 #define TOK_MULT    13      // *
123 #define TOK_DIV     14      // /
124 #define TOK_MAX     15
125 #define TOK_MIN     16
126 #define TOK_AVG     17
127 #define TOK_AND_VEC 18      // &&   (operator applied in samples)
128 #define TOK_OR_VEC  19      // ||   (operator applied in samples)
129 #define TOK_LIKE    20      //  ~ regular expression
130 #define TOK_NLIKE   21      // !~ regular expression
131 #define TOK_SUM     22
132 #define TOK_ABS     23
133 #define TOK_LEN     24
134 #define TOK_FUNC    25
135 #define TOK_CNT     26
136 #define TOK_PERLSUB 27
137 #define TOK_BINOM   28
138 #define TOK_PHRED   29
139 #define TOK_MEDIAN  30
140 #define TOK_STDEV   31
141 #define TOK_sMAX    32
142 #define TOK_sMIN    33
143 #define TOK_sAVG    34
144 #define TOK_sMEDIAN 35
145 #define TOK_sSTDEV  36
146 #define TOK_sSUM    37
147 #define TOK_IN      38      // contains, e.g. FILTER~"A"
148 #define TOK_NOT_IN  39      // does not contain, e.g. FILTER!~"A"
149 
150 //                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
151 //                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l  f  c  p  b  P  i  s
152 static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
153 #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis"       // this is only for debugging, not maintained diligently
154 
155 // Return negative values if it is a function with variable number of arguments
filters_next_token(char ** str,int * len)156 static int filters_next_token(char **str, int *len)
157 {
158     char *tmp = *str;
159     while ( *tmp && isspace(*tmp) ) tmp++;
160     *str = tmp;
161     *len = 0;
162 
163     // test for doubles: d.ddde[+-]dd
164     if ( isdigit(*str[0]) || *str[0]=='.' )   // strtod would eat +/-
165     {
166         double HTS_UNUSED v = strtod(*str, &tmp);
167         if ( *str!=tmp && (!tmp[0] || !isalnum(tmp[0])) )
168         {
169             *len = tmp - (*str);
170             return TOK_VAL;
171         }
172         tmp = *str;
173     }
174 
175     if ( !strncasecmp(tmp,"SMPL_MAX(",9) ) { (*str) += 8; return TOK_sMAX; }
176     if ( !strncasecmp(tmp,"SMPL_MIN(",9) ) { (*str) += 8; return TOK_sMIN; }
177     if ( !strncasecmp(tmp,"SMPL_MEAN(",10) ) { (*str) += 9; return TOK_sAVG; }
178     if ( !strncasecmp(tmp,"SMPL_MEDIAN(",12) ) { (*str) += 11; return TOK_sMEDIAN; }
179     if ( !strncasecmp(tmp,"SMPL_AVG(",9) ) { (*str) += 8; return TOK_sAVG; }
180     if ( !strncasecmp(tmp,"SMPL_STDEV(",11) ) { (*str) += 10; return TOK_sSTDEV; }
181     if ( !strncasecmp(tmp,"SMPL_SUM(",9) ) { (*str) += 8; return TOK_sSUM; }
182     if ( !strncasecmp(tmp,"sMAX(",5) ) { (*str) += 4; return TOK_sMAX; }
183     if ( !strncasecmp(tmp,"sMIN(",5) ) { (*str) += 4; return TOK_sMIN; }
184     if ( !strncasecmp(tmp,"sMEAN(",6) ) { (*str) += 5; return TOK_sAVG; }
185     if ( !strncasecmp(tmp,"sMEDIAN(",8) ) { (*str) += 7; return TOK_sMEDIAN; }
186     if ( !strncasecmp(tmp,"sAVG(",5) ) { (*str) += 4; return TOK_sAVG; }
187     if ( !strncasecmp(tmp,"sSTDEV(",7) ) { (*str) += 6; return TOK_sSTDEV; }
188     if ( !strncasecmp(tmp,"sSUM(",5) ) { (*str) += 4; return TOK_sSUM; }
189     if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; }
190     if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; }
191     if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; }
192     if ( !strncasecmp(tmp,"MEDIAN(",7) ) { (*str) += 6; return TOK_MEDIAN; }
193     if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; }
194     if ( !strncasecmp(tmp,"STDEV(",6) ) { (*str) += 5; return TOK_STDEV; }
195     if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; }
196     if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; }
197     if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; }
198     if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; }
199     if ( !strncasecmp(tmp,"BINOM(",6) ) { (*str) += 5; return -TOK_BINOM; }
200     if ( !strncasecmp(tmp,"PHRED(",6) ) { (*str) += 5; return TOK_PHRED; }
201     if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility
202     if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility
203     if ( !strncasecmp(tmp,"%AVG(",5) ) { (*str) += 4; return TOK_AVG; } // for backward compatibility
204     if ( !strncasecmp(tmp,"%SUM(",5) ) { (*str) += 4; return TOK_SUM; } // for backward compatibility
205     if ( !strncasecmp(tmp,"INFO/",5) ) tmp += 5;
206     if ( !strncasecmp(tmp,"FORMAT/",7) ) tmp += 7;
207     if ( !strncasecmp(tmp,"FMT/",4) ) tmp += 4;
208     if ( !strncasecmp(tmp,"PERL.",5) ) { (*str) += 5; return -TOK_PERLSUB; }
209     if ( !strncasecmp(tmp,"N_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; }
210     if ( !strncasecmp(tmp,"F_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; }
211     if ( !strncasecmp(tmp,"%ILEN",5) ) { *len = 5; return TOK_VAL; }    // to be able to distinguish between INFO/ILEN and on-the-fly ILEN
212 
213     if ( tmp[0]=='@' )  // file name
214     {
215         while ( *tmp && !isspace(*tmp) && *tmp!='=' && *tmp!='!' ) tmp++;
216         *len = tmp - (*str);
217         return TOK_VAL;
218     }
219 
220     int square_brackets = 0;
221     while ( tmp[0] )
222     {
223         if ( !square_brackets )
224         {
225             if ( tmp[0]=='"' ) break;
226             if ( tmp[0]=='\'' ) break;
227             if ( isspace(tmp[0]) ) break;
228             if ( tmp[0]=='<' ) break;
229             if ( tmp[0]=='>' ) break;
230             if ( tmp[0]=='=' ) break;
231             if ( tmp[0]=='!' ) break;
232             if ( tmp[0]=='&' ) break;
233             if ( tmp[0]=='|' ) break;
234             if ( tmp[0]=='(' ) break;
235             if ( tmp[0]==')' ) break;
236             if ( tmp[0]=='+' ) break;
237             if ( tmp[0]=='*' ) break;
238             if ( tmp[0]=='-' ) break;
239             if ( tmp[0]=='/' ) break;
240             if ( tmp[0]=='~' ) break;
241         }
242         if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; }
243         if ( tmp[0]=='[' ) square_brackets++;
244         tmp++;
245     }
246     if ( tmp > *str )
247     {
248         *len = tmp - (*str);
249         return TOK_VAL;
250     }
251     if ( tmp[0]=='"' || tmp[0]=='\'' )
252     {
253         int quote = tmp[0];
254         tmp++;
255         while ( *tmp && tmp[0]!=quote ) tmp++;
256         if ( !*tmp ) return -1;     // missing quotes
257         *len = tmp - (*str) + 1;
258         return TOK_VAL;
259     }
260     if ( tmp[0]=='!' )
261     {
262         if ( tmp[1]=='=' ) { (*str) += 2; return TOK_NE; }
263         if ( tmp[1]=='~' ) { (*str) += 2; return TOK_NLIKE; }
264     }
265     if ( tmp[0]=='<' )
266     {
267         if ( tmp[1]=='=' ) { (*str) += 2; return TOK_LE; }
268         (*str) += 1; return TOK_LT;
269     }
270     if ( tmp[0]=='>' )
271     {
272         if ( tmp[1]=='=' ) { (*str) += 2; return TOK_BE; }
273         (*str) += 1; return TOK_BT;
274     }
275     if ( tmp[0]=='=' )
276     {
277         if ( tmp[1]=='=' ) { (*str) += 2; return TOK_EQ; }
278         (*str) += 1; return TOK_EQ;
279     }
280     if ( tmp[0]=='(' ) { (*str) += 1; return TOK_LFT; }
281     if ( tmp[0]==')' ) { (*str) += 1; return TOK_RGT; }
282     if ( tmp[0]=='&' && tmp[1]=='&' ) { (*str) += 2; return TOK_AND_VEC; }
283     if ( tmp[0]=='|' && tmp[1]=='|' ) { (*str) += 2; return TOK_OR_VEC; }
284     if ( tmp[0]=='&' ) { (*str) += 1; return TOK_AND; }
285     if ( tmp[0]=='|' ) { (*str) += 1; return TOK_OR; }
286     if ( tmp[0]=='+' ) { (*str) += 1; return TOK_ADD; }
287     if ( tmp[0]=='-' ) { (*str) += 1; return TOK_SUB; }
288     if ( tmp[0]=='*' ) { (*str) += 1; return TOK_MULT; }
289     if ( tmp[0]=='/' ) { (*str) += 1; return TOK_DIV; }
290     if ( tmp[0]=='~' ) { (*str) += 1; return TOK_LIKE; }
291 
292     *len = tmp - (*str);
293     return TOK_VAL;
294 }
295 
296 
297 /*
298     Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller.
299 
300     Based on jkb's staden code with some adjustments.
301     https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123
302 */
expand_path(char * path)303 char *expand_path(char *path)
304 {
305     kstring_t str = {0,0,0};
306 
307     if ( path[0] == '~' )
308     {
309         if ( !path[1] || path[1] == '/' )
310         {
311 #ifdef _WIN32
312             kputs(getenv("HOMEDRIVE"), &str);
313             kputs(getenv("HOMEPATH"), &str);
314 #else
315             // ~ or ~/path
316             kputs(getenv("HOME"), &str);
317             if ( path[1] ) kputs(path+1, &str);
318 #endif
319         }
320 #ifndef _WIN32
321         else
322         {
323             // user name: ~pd3/path
324             char *end = path;
325             while ( *end && *end!='/' ) end++;
326             kputsn(path+1, end-path-1, &str);
327             struct passwd *pwentry = getpwnam(str.s);
328             str.l = 0;
329 
330             if ( !pwentry ) kputsn(path, end-path, &str);
331             else kputs(pwentry->pw_dir, &str);
332             kputs(end, &str);
333         }
334 #endif
335         return ks_release(&str);
336     }
337     if ( path[0] == '$' )
338     {
339         char *var = getenv(path+1);
340         if ( var ) {
341             kputs(var, &str);
342             return ks_release(&str);
343         }
344     }
345 
346     return strdup(path);
347 }
348 
filters_set_qual(filter_t * flt,bcf1_t * line,token_t * tok)349 static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
350 {
351     float *ptr = &line->qual;
352     if ( bcf_float_is_missing(*ptr) )
353         bcf_double_set_missing(tok->values[0]);
354     else
355         tok->values[0] = (double)line->qual;
356     tok->nvalues  = 1;
357 }
filters_set_type(filter_t * flt,bcf1_t * line,token_t * tok)358 static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
359 {
360     tok->values[0] = bcf_get_variant_types(line);
361     if ( !tok->values[0] ) tok->values[0] = 1;      // mistake in htslib: VCF_* should start with 1
362     else tok->values[0] = ((int)tok->values[0]) << 1;
363     tok->nvalues = 1;
364 }
filters_set_info(filter_t * flt,bcf1_t * line,token_t * tok)365 static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
366 {
367     assert( tok->hdr_id >=0  );
368     int i;
369     for (i=0; i<line->n_info; i++)
370         if ( line->d.info[i].key == tok->hdr_id ) break;
371 
372     if ( i==line->n_info )
373         tok->nvalues = tok->str_value.l = 0;
374     else if ( line->d.info[i].type==BCF_BT_CHAR )
375     {
376         int n = line->d.info[i].len;
377         if ( n >= tok->str_value.m )
378         {
379             tok->str_value.m = n + 1;
380             tok->str_value.s = (char*) realloc(tok->str_value.s, tok->str_value.m);
381             if ( !tok->str_value.s ) error("Failed to alloc %d bytes\n", (int)tok->str_value.m);
382         }
383         memcpy(tok->str_value.s, line->d.info[i].vptr, n);
384         tok->str_value.s[n] = 0;
385         tok->nvalues = tok->str_value.l = n;
386     }
387     else if ( line->d.info[i].type==BCF_BT_FLOAT )
388     {
389         if ( bcf_float_is_missing(line->d.info[i].v1.f) ) tok->nvalues = 0;
390         else
391         {
392             tok->values[0] = line->d.info[i].v1.f;
393             tok->nvalues   = 1;
394         }
395         tok->str_value.l = 0;
396     }
397     else
398     {
399         tok->str_value.l = 0;
400         if ( line->d.info[i].type==BCF_BT_INT8 && line->d.info[i].v1.i==bcf_int8_missing ) tok->nvalues = 0;
401         else if ( line->d.info[i].type==BCF_BT_INT16 && line->d.info[i].v1.i==bcf_int16_missing ) tok->nvalues = 0;
402         else if ( line->d.info[i].type==BCF_BT_INT32 && line->d.info[i].v1.i==bcf_int32_missing ) tok->nvalues = 0;
403         else
404         {
405             tok->values[0] = line->d.info[i].v1.i;
406             tok->nvalues   = 1;
407         }
408     }
409 }
filters_cmp_bit_and(token_t * atok,token_t * btok,token_t * rtok,bcf1_t * line)410 static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
411 {
412     int a = (int)(atok->nvalues?atok->values[0]:atok->threshold);
413     int b = (int)(btok->nvalues?btok->values[0]:btok->threshold);
414     if ( rtok->tok_type==TOK_LIKE )
415         rtok->pass_site = a&b ? 1 : 0;
416     else
417         rtok->pass_site = a&b ? 0 : 1;
418 }
filters_cmp_filter(token_t * atok,token_t * btok,token_t * rtok,bcf1_t * line)419 static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
420 {
421     int i;
422     if ( rtok->tok_type==TOK_NOT_IN )
423     {
424         if ( !line->d.n_flt )
425         {
426             if ( atok->hdr_id==-1 ) return;   // missing value
427             rtok->pass_site = 1;
428             return; // no filter present, eval to true
429         }
430         for (i=0; i<line->d.n_flt; i++)
431             if ( atok->hdr_id==line->d.flt[i] ) return;
432         rtok->pass_site = 1;
433         return;
434     }
435     else if ( rtok->tok_type==TOK_IN )
436     {
437         if ( !line->d.n_flt )
438         {
439             if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
440             return; // no filter present, eval to false
441         }
442         for (i=0; i<line->d.n_flt; i++)
443             if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
444         return;
445     }
446     else if ( rtok->tok_type==TOK_NE )  // exact match
447     {
448         if ( !line->d.n_flt )
449         {
450             if ( atok->hdr_id==-1 ) return;   // missing value
451             rtok->pass_site = 1;
452             return; // no filter present, eval to true
453         }
454         if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return;    // exact match, fail iff a single matching value is present
455         rtok->pass_site = 1;
456         return;
457     }
458     else if ( rtok->tok_type==TOK_EQ )  // exact match, pass iff a single matching value is present
459     {
460         if ( !line->d.n_flt )
461         {
462             if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
463             return; // no filter present, eval to false
464         }
465         if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1;
466         return;
467     }
468     else
469         error("Only ==, !=, ~, and !~ operators are supported for FILTER\n");
470     return;
471 }
filters_cmp_id(token_t * atok,token_t * btok,token_t * rtok,bcf1_t * line)472 static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
473 {
474     // multiple IDs not supported yet (easy to add though)
475     if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
476         error("Only == and != operators are supported for ID\n");
477 
478     if ( btok->hash )
479     {
480         token_t *tmp = atok; atok = btok; btok = tmp;
481     }
482     if ( atok->hash )
483     {
484         int ret = khash_str2int_has_key(atok->hash, line->d.id);
485         if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
486         rtok->pass_site = ret;
487         return;
488     }
489 
490     if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n");
491 
492     if ( rtok->tok_type==TOK_EQ )
493         rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
494     else
495         rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
496 }
497 
498 /**
499  *  bcf_get_info_value() - get single INFO value, int64_t or double
500  *  @line:      BCF line
501  *  @info_id:   tag ID, as returned by bcf_hdr_id2int
502  *  @ivec:      0-based index to retrieve, -1 when single value is expected
503  *  @vptr:      pointer to memory location of sufficient size to accomodate
504  *              info_id's type
505  *
506  *  The returned value is -1 if tag is not present, 0 if present but
507  *  values is missing or ivec is out of range, and 1 on success.
508  */
bcf_get_info_value(bcf1_t * line,int info_id,int ivec,void * value)509 static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value)
510 {
511     int j;
512     for (j=0; j<line->n_info; j++)
513         if ( line->d.info[j].key == info_id ) break;
514     if ( j==line->n_info ) return -1;
515 
516     bcf_info_t *info = &line->d.info[j];
517     if ( info->len == 1 )
518     {
519         if ( info->type==BCF_BT_FLOAT ) *((double*)value) = info->v1.f;
520         else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int64_t*)value) = info->v1.i;
521         return 1;
522     }
523 
524     if ( ivec<0 ) ivec = 0;
525 
526     #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
527         type_t *p = (type_t *) info->vptr; \
528         for (j=0; j<ivec && j<info->len; j++) \
529         { \
530             if ( is_vector_end ) return 0; \
531         } \
532         if ( is_missing ) return 0; \
533         *((out_type_t*)value) = p[j]; \
534         return 1; \
535     }
536     switch (info->type) {
537         case BCF_BT_INT8:  BRANCH(int8_t,  p[j]==bcf_int8_missing,  p[j]==bcf_int8_vector_end,  int64_t); break;
538         case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break;
539         case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break;
540         case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break;
541         default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
542     }
543     #undef BRANCH
544     return -1;  // this shouldn't happen
545 }
546 
filters_set_chrom(filter_t * flt,bcf1_t * line,token_t * tok)547 static void filters_set_chrom(filter_t *flt, bcf1_t *line, token_t *tok)
548 {
549     tok->str_value.l = 0;
550     kputs(bcf_seqname(flt->hdr,line), &tok->str_value);
551     tok->nvalues = tok->str_value.l;
552     tok->is_str  = 1;
553 }
554 
filters_set_pos(filter_t * flt,bcf1_t * line,token_t * tok)555 static void filters_set_pos(filter_t *flt, bcf1_t *line, token_t *tok)
556 {
557     tok->values[0] = line->pos+1;
558     tok->nvalues = 1;
559 }
560 
filters_set_info_int(filter_t * flt,bcf1_t * line,token_t * tok)561 static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
562 {
563     if ( tok->idx==-2 )
564     {
565         tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
566         if ( tok->nvalues<=0 ) tok->nvalues = 0;
567         else
568         {
569             hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
570             int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
571             if ( end >= tok->nvalues ) end = tok->nvalues - 1;
572             for (i=0; i<=end; i++)
573                 if ( i>=tok->nidxs || tok->idxs[i] ) tok->values[j++] = flt->tmpi[i];
574             tok->nvalues = j;
575         }
576     }
577     else
578     {
579         int64_t value = 0;
580         if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
581             tok->nvalues = 0;
582         else
583         {
584             tok->values[0] = value;
585             tok->nvalues = 1;
586         }
587     }
588 }
589 
filters_set_info_float(filter_t * flt,bcf1_t * line,token_t * tok)590 static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
591 {
592     if ( tok->idx==-2 )
593     {
594         tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf);
595         if ( tok->nvalues<=0 ) tok->nvalues = 0;
596         else
597         {
598             hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
599             int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
600             if ( end >= tok->nvalues ) end = tok->nvalues - 1;
601             for (i=0; i<=end; i++)
602                 if ( i>=tok->nidxs || tok->idxs[i] )
603                 {
604                     if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[j]);
605                     else tok->values[j] = flt->tmpf[i];
606                     j++;
607                 }
608             tok->nvalues = j;
609         }
610     }
611     else
612     {
613         double value;
614         if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
615             tok->nvalues = 0;
616         else
617         {
618             tok->values[0] = value;
619             tok->nvalues = 1;
620         }
621     }
622 }
623 
filters_set_info_string(filter_t * flt,bcf1_t * line,token_t * tok)624 static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
625 {
626     int32_t m = tok->str_value.m;
627     int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value.s,&m);
628     tok->str_value.m = m;
629     if ( n<0 ) { tok->nvalues = tok->str_value.l = 0; return; }
630 
631     if ( tok->idx>=0 )
632     {
633         // get ith field (i=tok->idx)
634         int i = 0;
635         char *ss = tok->str_value.s, *se = tok->str_value.s + n;
636         while ( ss<se && i<tok->idx )
637         {
638             if ( *ss==',' ) i++;
639             ss++;
640         }
641         if ( ss==se || i!=tok->idx ) { tok->nvalues = tok->str_value.l = 0; return; }
642         se = ss;
643         while ( se - tok->str_value.s < n && *se!=',' ) se++;
644         if ( ss==tok->str_value.s ) *se = 0;
645         else
646         {
647             memmove(tok->str_value.s, ss, se-ss);
648             tok->str_value.s[se-ss] = 0;
649         }
650         tok->str_value.l = se - ss;
651     }
652     else if ( tok->idx==-2 && tok->idxs[0]==-1 )    // keep all values, TAG[*]
653         tok->str_value.l = n;
654     else if ( tok->idx==-2 )
655     {
656         flt->tmps.l = 0;
657         ks_resize(&flt->tmps, n);
658         int i, iend = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
659         if ( iend >= n ) iend = n - 1;
660         char *beg = tok->str_value.s, *dst = flt->tmps.s;
661         for (i=0; i<=iend; i++)
662         {
663             char *end = beg;
664             while ( *end && *end!=',' ) end++;
665             if ( i>=tok->nidxs || tok->idxs[i] )
666             {
667                 memcpy(dst, beg, end - beg);
668                 dst += end - beg;
669                 dst[0] = ',';
670                 dst++;
671             }
672             beg = end+1;
673         }
674         dst[0] = 0;
675         tok->str_value.l = dst - flt->tmps.s;
676 
677         #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
678         SWAP(char *, flt->tmps.s, tok->str_value.s);
679         SWAP(size_t, flt->tmps.m, tok->str_value.m);
680     }
681     tok->nvalues = tok->str_value.l;
682 }
683 
filters_set_info_flag(filter_t * flt,bcf1_t * line,token_t * tok)684 static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok)
685 {
686     int j;
687     for (j=0; j<line->n_info; j++)
688         if ( line->d.info[j].key == tok->hdr_id ) break;
689     tok->values[0] = j==line->n_info ? 0 : 1;
690     tok->nvalues = 1;
691 }
692 
filters_set_format_int(filter_t * flt,bcf1_t * line,token_t * tok)693 static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
694 {
695     if ( line->n_sample != tok->nsamples )
696         error("Incorrect number of FORMAT fields at %s:%"PRId64" .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples);
697 
698     int nvals;
699     if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
700     {
701         tok->nvalues = 0;
702         return;
703     }
704     int i, nsrc1 = nvals / tok->nsamples;
705     tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
706     tok->nvalues = tok->nval1*tok->nsamples;
707     hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
708 
709     if ( tok->idx >= 0 )    // scalar or vector index
710     {
711         for (i=0; i<tok->nsamples; i++)
712         {
713             if ( !tok->usmpl[i] ) continue;
714             int32_t *ptr = flt->tmpi + i*nsrc1;
715             if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing )
716                 bcf_double_set_missing(tok->values[i]);
717             else if ( ptr[tok->idx]==bcf_int32_vector_end )
718                 bcf_double_set_vector_end(tok->values[i]);
719             else
720                 tok->values[i] = ptr[tok->idx];
721         }
722     }
723     else
724     {
725         int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
726         for (i=0; i<tok->nsamples; i++)
727         {
728             if ( !tok->usmpl[i] ) continue;
729             int32_t *src = flt->tmpi + i*nsrc1;
730             double  *dst = tok->values + i*tok->nval1;
731             int k, j = 0;
732             for (k=0; k<kend; k++)
733             {
734                 if ( k<tok->nidxs && !tok->idxs[k] ) continue;
735                 if ( src[k]==bcf_int32_missing )
736                     bcf_double_set_missing(dst[j]);
737                 else if ( src[k]==bcf_int32_vector_end )
738                     bcf_double_set_vector_end(dst[j]);
739                 else
740                     dst[j] = src[k];
741                 j++;
742             }
743             if ( j==0 )
744             {
745                 bcf_double_set_missing(dst[j]);
746                 j++;
747             }
748             while (j < tok->nval1)
749             {
750                 bcf_double_set_vector_end(dst[j]);
751                 j++;
752             }
753         }
754     }
755 }
filters_set_format_float(filter_t * flt,bcf1_t * line,token_t * tok)756 static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
757 {
758     if ( line->n_sample != tok->nsamples )
759         error("Incorrect number of FORMAT fields at %s:%"PRId64" .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples);
760 
761     int nvals;
762     if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
763     {
764         tok->nvalues = 0;
765         return;
766     }
767     int i, nsrc1 = nvals / tok->nsamples;
768     tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
769     tok->nvalues = tok->nval1*tok->nsamples;
770     hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
771 
772     if ( tok->idx >= 0 )    // scalar or vector index
773     {
774         for (i=0; i<tok->nsamples; i++)
775         {
776             if ( !tok->usmpl[i] ) continue;
777             float *ptr = flt->tmpf + i*nsrc1;
778             if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) )
779                 bcf_double_set_missing(tok->values[i]);
780             else if ( bcf_float_is_vector_end(ptr[tok->idx]) )
781                 bcf_double_set_vector_end(tok->values[i]);
782             else
783                 tok->values[i] = ptr[tok->idx];
784         }
785     }
786     else
787     {
788         int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
789         for (i=0; i<tok->nsamples; i++)
790         {
791             if ( !tok->usmpl[i] ) continue;
792             float *src = flt->tmpf + i*nsrc1;
793             double  *dst = tok->values + i*tok->nval1;
794             int k, j = 0;
795             for (k=0; k<kend; k++)
796             {
797                 if ( k<tok->nidxs && !tok->idxs[k] ) continue;
798                 if ( bcf_float_is_missing(src[k]) )
799                     bcf_double_set_missing(dst[j]);
800                 else if ( bcf_float_is_vector_end(src[k]) )
801                     bcf_double_set_vector_end(dst[j]);
802                 else
803                     dst[j] = src[k];
804                 j++;
805             }
806             if ( j==0 )
807             {
808                 bcf_double_set_missing(dst[j]);
809                 j++;
810             }
811             while (j < tok->nval1)
812             {
813                 bcf_double_set_vector_end(dst[j]);
814                 j++;
815             }
816         }
817     }
818 }
filters_set_format_string(filter_t * flt,bcf1_t * line,token_t * tok)819 static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
820 {
821     if ( line->n_sample != tok->nsamples )
822         error("Incorrect number of FORMAT fields at %s:%"PRId64" .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples);
823 
824     int i, ndim = tok->str_value.m;
825     int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
826     tok->str_value.m = ndim;
827     tok->str_value.l = tok->nvalues = 0;
828 
829     if ( nstr<0 ) return;
830 
831     tok->nvalues = tok->str_value.l = nstr;
832     tok->nval1   = nstr / tok->nsamples;
833     for (i=0; i<tok->nsamples; i++)
834     {
835         if ( !tok->usmpl[i] ) continue;
836         char *src = tok->str_value.s + i*tok->nval1, *dst = src;
837         int ibeg = 0, idx = 0;
838         while ( ibeg < tok->nval1 )
839         {
840             int iend = ibeg;
841             while ( iend < tok->nval1 && src[iend] && src[iend]!=',' ) iend++;
842 
843             int keep = 0;
844             if ( tok->idx >= 0 )
845             {
846                 if ( tok->idx==idx ) keep = 1;
847             }
848             else if ( idx < tok->nidxs )
849             {
850                 if ( tok->idxs[idx] != 0 ) keep = 1;
851             }
852             else if ( tok->idxs[tok->nidxs-1] < 0 )
853                 keep = 1;
854 
855             if ( keep )
856             {
857                 if ( ibeg!=0 ) memmove(dst, src+ibeg, iend-ibeg+1);
858                 dst += iend - ibeg + 1;
859                 if ( tok->idx>=0 ) break;
860             }
861             if ( !src[iend] ) break;
862             ibeg = iend + 1;
863             idx++;
864         }
865         if ( dst==src ) { dst[0] = '.'; dst+=2; }
866         if ( dst - src < tok->nval1 ) memset(dst-1, 0, tok->nval1 - (dst - src));
867     }
868 }
_filters_set_genotype(filter_t * flt,bcf1_t * line,token_t * tok,int type)869 static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
870 {
871     bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
872     if ( !fmt )
873     {
874         tok->nvalues = tok->str_value.l = 0;
875         return;
876     }
877 
878     int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4;
879     if ( tok->str_value.m <= nvals1*nsmpl )
880     {
881         tok->str_value.m = nvals1*nsmpl + 1;
882         tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
883     }
884 
885 #define BRANCH_INT(type_t,vector_end) \
886     { \
887         for (i=0; i<line->n_sample; i++) \
888         { \
889             type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
890             int is_het = 0, has_ref = 0, missing = 0; \
891             for (j=0; j<fmt->n; j++) \
892             { \
893                 if ( ptr[j]==vector_end ) break; /* smaller ploidy */ \
894                 if ( bcf_gt_is_missing(ptr[j]) ) { missing=1; break; } /* missing allele */ \
895                 int ial = ptr[j]; \
896                 if ( bcf_gt_allele(ial)==0 ) has_ref = 1; \
897                 if ( j>0 ) \
898                 { \
899                     int jal = ptr[j-1]; \
900                     if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
901                 } \
902             } \
903             char *dst = &tok->str_value.s[nvals1*i]; \
904             if ( type==4 ) \
905             { \
906                 if ( !j || missing ) dst[0]='m', dst[1]='i', dst[2]='s', dst[3] = 0; /* mis, missing genotype */ \
907                 else if ( !has_ref ) dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, no ref, must have alt allele */ \
908                 else if ( !is_het ) dst[0]='r', dst[1]='e', dst[2]='f', dst[3] = 0; /* ref, must be ref-only, no alt alelle */ \
909                 else dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, is het, has alt allele */ \
910             } \
911             else if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
912             else if ( type==3 ) \
913             { \
914                 if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \
915                 else if ( !is_het ) dst[0]='h', dst[1]='o', dst[2]='m', dst[3] = 0; /* hom */ \
916                 else dst[0]='h', dst[1]='e', dst[2]='t', dst[3] = 0; /* het */ \
917             } \
918             else \
919             { \
920                 if ( j==1 ) \
921                 { \
922                     if ( has_ref ) dst[0]='r', dst[1]=0; /* r, haploid */ \
923                     else dst[0]='a', dst[1]=0; /* a, haploid */ \
924                 } \
925                 else if ( !is_het ) \
926                 { \
927                     if ( has_ref ) dst[0]='r', dst[1]='r', dst[2] = 0; /* rr */ \
928                     else dst[0]='a', dst[1]='a', dst[2] = 0; /* aa */ \
929                 } \
930                 else \
931                 { \
932                     if ( has_ref ) dst[0]='r', dst[1]='a', dst[2] = 0; /* ra */ \
933                     else dst[0]='a', dst[1]='A', dst[2] = 0; /* aA */ \
934                 } \
935             } \
936         } \
937     }
938     switch (fmt->type) {
939         case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
940         case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
941         case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
942         default: error("The GT type is not lineognised: %d at %s:%"PRId64"\n",fmt->type, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); break;
943     }
944 #undef BRANCH_INT
945     assert( tok->nsamples == nsmpl );
946     tok->nvalues = tok->str_value.l = nvals1*nsmpl;
947     tok->str_value.s[tok->str_value.l] = 0;
948     tok->nval1 = nvals1;
949 }
filters_set_genotype2(filter_t * flt,bcf1_t * line,token_t * tok)950 static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
filters_set_genotype3(filter_t * flt,bcf1_t * line,token_t * tok)951 static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
filters_set_genotype4(filter_t * flt,bcf1_t * line,token_t * tok)952 static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); }
953 
filters_set_genotype_string(filter_t * flt,bcf1_t * line,token_t * tok)954 static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
955 {
956     bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
957     if ( !fmt )
958     {
959         tok->nvalues = 0;
960         return;
961     }
962     int i, blen = 4, nsmpl = line->n_sample;
963 
964 gt_length_too_big:
965     tok->str_value.l = 0;
966     for (i=0; i<nsmpl; i++)
967     {
968         size_t plen = tok->str_value.l;
969         bcf_format_gt(fmt, i, &tok->str_value);
970         kputc_(0, &tok->str_value);
971         if ( tok->str_value.l - plen > blen )
972         {
973             // too many alternate alleles or ploidy is too large, the genotype does not fit
974             // three characters ("0/0" vs "10/10").
975             blen *= 2;
976             goto gt_length_too_big;
977         }
978 
979         plen = tok->str_value.l - plen;
980         while ( plen < blen )
981         {
982             kputc_(0, &tok->str_value);
983             plen++;
984         }
985     }
986     assert( tok->nsamples == nsmpl );
987     tok->nvalues = tok->str_value.l;
988     tok->nval1 = blen;
989 }
filters_set_ilen(filter_t * flt,bcf1_t * line,token_t * tok)990 static void filters_set_ilen(filter_t *flt, bcf1_t *line, token_t *tok)
991 {
992     tok->nvalues = line->n_allele - 1;
993     hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
994 
995     int i, rlen = strlen(line->d.allele[0]);
996     for (i=1; i<line->n_allele; i++)
997     {
998         int alen = strlen(line->d.allele[i]);
999         if ( rlen==alen ) bcf_double_set_missing(tok->values[i-1]);
1000         else tok->values[i-1] = alen - rlen;
1001     }
1002 }
filters_set_ref_string(filter_t * flt,bcf1_t * line,token_t * tok)1003 static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
1004 {
1005     tok->str_value.l = 0;
1006     kputs(line->d.allele[0], &tok->str_value);
1007     tok->nvalues = tok->str_value.l;
1008 }
filters_set_alt_string(filter_t * flt,bcf1_t * line,token_t * tok)1009 static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
1010 {
1011     tok->str_value.l = 0;
1012     if ( tok->idx>=0 )
1013     {
1014         if ( line->n_allele > tok->idx + 1 )
1015             kputs(line->d.allele[tok->idx + 1], &tok->str_value);
1016         else
1017             kputc('.', &tok->str_value);
1018     }
1019     else if ( tok->idx==-2 )
1020     {
1021         int i, end = tok->nuidxs ? tok->nuidxs : line->n_allele - 1;
1022         if ( end >= line->n_allele - 1 ) end = line->n_allele - 2;
1023         for (i=0; i<=end; i++)
1024             if ( i>=tok->nidxs || tok->idxs[i] )
1025             {
1026                 if ( tok->str_value.l ) kputc(',', &tok->str_value);
1027                 kputs(line->d.allele[i+1], &tok->str_value);
1028             }
1029     }
1030     else if ( line->n_allele>1 )
1031     {
1032         kputs(line->d.allele[1], &tok->str_value);
1033         int i;
1034         for (i=2; i<line->n_allele; i++)
1035         {
1036             kputc(',', &tok->str_value);
1037             kputs(line->d.allele[i], &tok->str_value);
1038         }
1039     }
1040     else if ( line->n_allele==1 )
1041         kputc('.', &tok->str_value);
1042     tok->nvalues = tok->str_value.l;
1043 }
filters_set_nmissing(filter_t * flt,bcf1_t * line,token_t * tok)1044 static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
1045 {
1046     bcf_unpack(line, BCF_UN_FMT);
1047     if ( !line->n_sample )
1048     {
1049         tok->nvalues = 1;
1050         tok->values[0] = 0;
1051         return;
1052     }
1053 
1054     int i,igt = bcf_hdr_id2int(flt->hdr, BCF_DT_ID, "GT");
1055     bcf_fmt_t *fmt = NULL;
1056     for (i=0; i<line->n_fmt; i++)
1057         if ( line->d.fmt[i].id==igt ) { fmt = &line->d.fmt[i]; break; }
1058     if ( !fmt )
1059     {
1060         tok->nvalues = 0;
1061         return;
1062     }
1063 
1064     int j,nmissing = 0;
1065     #define BRANCH(type_t, is_vector_end) { \
1066         for (i=0; i<line->n_sample; i++) \
1067         { \
1068             type_t *ptr = (type_t *) (fmt->p + i*fmt->size); \
1069             for (j=0; j<fmt->n; j++) \
1070             { \
1071                 if ( ptr[j]==is_vector_end ) break; \
1072                 if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } \
1073             } \
1074         } \
1075     }
1076     switch (fmt->type) {
1077         case BCF_BT_INT8:  BRANCH(int8_t,  bcf_int8_vector_end); break;
1078         case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
1079         case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
1080         default: fprintf(stderr,"todo: type %d\n", fmt->type); exit(1); break;
1081     }
1082     #undef BRANCH
1083     tok->nvalues = 1;
1084     tok->values[0] = tok->tag[0]=='N' ? nmissing : (double)nmissing / line->n_sample;
1085 }
func_npass(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1086 static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1087 {
1088     if ( nstack==0 ) error("Error parsing the expression\n");
1089     token_t *tok = stack[nstack - 1];
1090     if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag);
1091     assert(tok->usmpl);
1092 
1093     int i, npass = 0;
1094     for (i=0; i<tok->nsamples; i++)
1095     {
1096         if ( !tok->usmpl[i] ) continue;
1097         if ( tok->pass_samples[i] ) npass++;
1098     }
1099     hts_expand(double,1,rtok->mvalues,rtok->values);
1100     rtok->nsamples = 0;
1101     rtok->nvalues = 1;
1102     rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0);
1103 
1104     return 1;
1105 }
filters_set_nalt(filter_t * flt,bcf1_t * line,token_t * tok)1106 static void filters_set_nalt(filter_t *flt, bcf1_t *line, token_t *tok)
1107 {
1108     tok->nvalues = 1;
1109     tok->values[0] = line->n_allele - 1;
1110 }
filters_set_ac(filter_t * flt,bcf1_t * line,token_t * tok)1111 static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok)
1112 {
1113     hts_expand(int32_t, line->n_allele, flt->mtmpi, flt->tmpi);
1114     if ( !bcf_calc_ac(flt->hdr, line, flt->tmpi, BCF_UN_INFO|BCF_UN_FMT) )
1115     {
1116         tok->nvalues = 0;
1117         return;
1118     }
1119     int i, an = flt->tmpi[0];
1120     for (i=1; i<line->n_allele; i++) an += flt->tmpi[i];
1121     if ( !an )
1122     {
1123         tok->nvalues = 0;
1124         return;
1125     }
1126     flt->tmpi[0] = an;  // for filters_set_[mac|af|maf]
1127     if ( tok->idx>=0 )
1128     {
1129         tok->nvalues = 1;
1130         tok->values[0] = tok->idx+1<line->n_allele ? flt->tmpi[tok->idx+1] : 0;
1131     }
1132     else if ( line->n_allele==1 )   // no ALT
1133     {
1134         tok->nvalues = 1;
1135         tok->values[0] = 0;
1136     }
1137     else
1138     {
1139         hts_expand(double,line->n_allele,tok->mvalues,tok->values);
1140         for (i=1; i<line->n_allele; i++)
1141             tok->values[i-1] = flt->tmpi[i];
1142         tok->nvalues = line->n_allele - 1;
1143     }
1144 }
filters_set_an(filter_t * flt,bcf1_t * line,token_t * tok)1145 static void filters_set_an(filter_t *flt, bcf1_t *line, token_t *tok)
1146 {
1147     filters_set_ac(flt,line,tok);
1148     tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0;
1149     tok->nvalues = 1;
1150 }
filters_set_mac(filter_t * flt,bcf1_t * line,token_t * tok)1151 static void filters_set_mac(filter_t *flt, bcf1_t *line, token_t *tok)
1152 {
1153     filters_set_ac(flt,line,tok);
1154     if ( !tok->nvalues ) return;
1155     int i, an = flt->tmpi[0];
1156     for (i=0; i<tok->nvalues; i++)
1157         if ( tok->values[i] > an*0.5 ) tok->values[i] = an - tok->values[i];
1158 }
filters_set_af(filter_t * flt,bcf1_t * line,token_t * tok)1159 static void filters_set_af(filter_t *flt, bcf1_t *line, token_t *tok)
1160 {
1161     filters_set_ac(flt,line,tok);
1162     if ( !tok->nvalues ) return;
1163     int i, an = flt->tmpi[0];
1164     for (i=0; i<tok->nvalues; i++)
1165         tok->values[i] /= (double)an;
1166 }
filters_set_maf(filter_t * flt,bcf1_t * line,token_t * tok)1167 static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
1168 {
1169     filters_set_ac(flt,line,tok);
1170     if ( !tok->nvalues ) return;
1171     int i, an = flt->tmpi[0];
1172     for (i=0; i<tok->nvalues; i++)
1173     {
1174         tok->values[i] /= (double)an;
1175         if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i];
1176     }
1177 }
1178 
func_max(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1179 static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1180 {
1181     token_t *tok = stack[nstack - 1];
1182     rtok->nvalues = 0;
1183     if ( !tok->nvalues ) return 1;
1184     double *ptr, val = -HUGE_VAL;
1185     int i,j, has_value = 0;
1186     if ( tok->nsamples )
1187     {
1188         for (i=0; i<tok->nsamples; i++)
1189         {
1190             if ( !tok->usmpl[i] ) continue;
1191             ptr = tok->values + i*tok->nval1;
1192             for (j=0; j<tok->nval1; j++)
1193             {
1194                 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1195                 has_value = 1;
1196                 if ( val < ptr[j] ) val = ptr[j];
1197             }
1198         }
1199     }
1200     else
1201     {
1202         for (i=0; i<tok->nvalues; i++)
1203         {
1204             if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
1205             has_value = 1;
1206             if ( val < tok->values[i] ) val = tok->values[i];
1207         }
1208     }
1209     if ( has_value )
1210     {
1211         rtok->values[0] = val;
1212         rtok->nvalues = has_value;
1213     }
1214     return 1;
1215 }
func_smpl_max(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1216 static int func_smpl_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1217 {
1218     token_t *tok = stack[nstack - 1];
1219     if ( !tok->nsamples ) return func_max(flt,line,rtok,stack,nstack);
1220     rtok->nsamples = tok->nsamples;
1221     rtok->nvalues  = tok->nsamples;
1222     rtok->nval1 = 1;
1223     hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1224     assert(tok->usmpl);
1225     if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1226     memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1227     int i, j, has_value;
1228     double val, *ptr;
1229     for (i=0; i<tok->nsamples; i++)
1230     {
1231         if ( !rtok->usmpl[i] ) continue;
1232         val = -HUGE_VAL;
1233         has_value = 0;
1234         ptr = tok->values + i*tok->nval1;
1235         for (j=0; j<tok->nval1; j++)
1236         {
1237             if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1238             has_value = 1;
1239             if ( val < ptr[j] ) val = ptr[j];
1240         }
1241         if ( has_value ) rtok->values[i] = val;
1242         else bcf_double_set_missing(rtok->values[i]);
1243     }
1244     return 1;
1245 }
func_min(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1246 static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1247 {
1248     token_t *tok = stack[nstack - 1];
1249     rtok->nvalues = 0;
1250     if ( !tok->nvalues ) return 1;
1251     double *ptr, val = HUGE_VAL;
1252     int i,j, has_value = 0;
1253     if ( tok->nsamples )
1254     {
1255         for (i=0; i<tok->nsamples; i++)
1256         {
1257             if ( !tok->usmpl[i] ) continue;
1258             ptr = tok->values + i*tok->nval1;
1259             for (j=0; j<tok->nval1; j++)
1260             {
1261                 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1262                 has_value = 1;
1263                 if ( val > ptr[j] ) val = ptr[j];
1264             }
1265         }
1266     }
1267     else
1268     {
1269         for (i=0; i<tok->nvalues; i++)
1270         {
1271             if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
1272             has_value = 1;
1273             if ( val > tok->values[i] ) val = tok->values[i];
1274         }
1275     }
1276     if ( has_value )
1277     {
1278         rtok->values[0] = val;
1279         rtok->nvalues = has_value;
1280     }
1281     return 1;
1282 }
func_smpl_min(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1283 static int func_smpl_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1284 {
1285     token_t *tok = stack[nstack - 1];
1286     if ( !tok->nsamples ) return func_min(flt,line,rtok,stack,nstack);
1287     rtok->nsamples = tok->nsamples;
1288     rtok->nvalues  = tok->nsamples;
1289     rtok->nval1 = 1;
1290     hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1291     assert(tok->usmpl);
1292     if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1293     memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1294     int i, j, has_value;
1295     double val, *ptr;
1296     for (i=0; i<tok->nsamples; i++)
1297     {
1298         if ( !rtok->usmpl[i] ) continue;
1299         val = HUGE_VAL;
1300         has_value = 0;
1301         ptr = tok->values + i*tok->nval1;
1302         for (j=0; j<tok->nval1; j++)
1303         {
1304             if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1305             has_value = 1;
1306             if ( val > ptr[j] ) val = ptr[j];
1307         }
1308         if ( has_value ) rtok->values[i] = val;
1309         else bcf_double_set_missing(rtok->values[i]);
1310     }
1311     return 1;
1312 }
func_avg(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1313 static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1314 {
1315     token_t *tok = stack[nstack - 1];
1316     rtok->nvalues = 0;
1317     if ( !tok->nvalues ) return 1;
1318     double *ptr, val = 0;
1319     int i,j, n = 0;
1320     if ( tok->nsamples )
1321     {
1322         for (i=0; i<tok->nsamples; i++)
1323         {
1324             if ( !tok->usmpl[i] ) continue;
1325             ptr = tok->values + i*tok->nval1;
1326             for (j=0; j<tok->nval1; j++)
1327             {
1328                 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1329                 val += ptr[j];
1330                 n++;
1331             }
1332         }
1333     }
1334     else
1335     {
1336         for (i=0; i<tok->nvalues; i++)
1337             if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
1338     }
1339     if ( n )
1340     {
1341         rtok->values[0] = val / n;
1342         rtok->nvalues   = 1;
1343     }
1344     return 1;
1345 }
func_smpl_avg(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1346 static int func_smpl_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1347 {
1348     token_t *tok = stack[nstack - 1];
1349     if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack);
1350     rtok->nsamples = tok->nsamples;
1351     rtok->nvalues  = tok->nsamples;
1352     rtok->nval1 = 1;
1353     hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1354     assert(tok->usmpl);
1355     if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1356     memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1357     int i, j, n;
1358     double val, *ptr;
1359     for (i=0; i<tok->nsamples; i++)
1360     {
1361         if ( !rtok->usmpl[i] ) continue;
1362         val = 0;
1363         n = 0;
1364         ptr = tok->values + i*tok->nval1;
1365         for (j=0; j<tok->nval1; j++)
1366         {
1367             if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) { val += ptr[j]; n++; }
1368         }
1369         if ( n ) rtok->values[i] = val / n;
1370         else bcf_double_set_missing(rtok->values[i]);
1371     }
1372     return 1;
1373 }
compare_doubles(const void * lhs,const void * rhs)1374 static int compare_doubles(const void *lhs, const void *rhs)
1375 {
1376     double arg1 = *(const double*) lhs;
1377     double arg2 = *(const double*) rhs;
1378     if (arg1 < arg2) return -1;
1379     if (arg1 > arg2) return 1;
1380     return 0;
1381 }
func_median(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1382 static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1383 {
1384     token_t *tok = stack[nstack - 1];
1385     rtok->nvalues = 0;
1386     if ( !tok->nvalues ) return 1;
1387     // sweep through all tok->values and while excluding all missing values reuse the very same array
1388     int i,j,k = 0, n = 0;
1389     if ( tok->nsamples )
1390     {
1391         for (i=0; i<tok->nsamples; i++)
1392         {
1393             if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }
1394             for (j=0; j<tok->nval1; k++,j++)
1395             {
1396                 if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
1397                 if ( n < k ) tok->values[n] = tok->values[k];
1398                 n++;
1399             }
1400         }
1401     }
1402     else
1403     {
1404         for (i=0; i<tok->nvalues; i++)
1405         {
1406             if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
1407             if ( n < i ) tok->values[n] = tok->values[i];
1408             n++;
1409         }
1410     }
1411     if ( !n ) return 1;
1412     if ( n==1 ) rtok->values[0] = tok->values[0];
1413     else
1414     {
1415         qsort(tok->values, n, sizeof(double), compare_doubles);
1416         rtok->values[0] = n % 2 ? tok->values[n/2] : (tok->values[n/2-1] + tok->values[n/2]) * 0.5;
1417     }
1418     rtok->nvalues = 1;
1419     return 1;
1420 }
func_smpl_median(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1421 static int func_smpl_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1422 {
1423     token_t *tok = stack[nstack - 1];
1424     if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack);
1425     rtok->nsamples = tok->nsamples;
1426     rtok->nvalues  = tok->nsamples;
1427     rtok->nval1 = 1;
1428     hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1429     assert(tok->usmpl);
1430     if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1431     memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1432     int i, j, n;
1433     double *ptr;
1434     for (i=0; i<tok->nsamples; i++)
1435     {
1436         if ( !rtok->usmpl[i] ) continue;
1437         n = 0;
1438         ptr = tok->values + i*tok->nval1;
1439         for (j=0; j<tok->nval1; j++)
1440         {
1441             if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1442             if ( n < j ) ptr[n] = ptr[j];
1443             n++;
1444         }
1445         if ( n==0 )
1446             bcf_double_set_missing(rtok->values[i]);
1447         else if ( n==1 )
1448             rtok->values[i] = ptr[0];
1449         else
1450         {
1451             qsort(ptr, n, sizeof(double), compare_doubles);
1452             rtok->values[i] = n % 2 ? ptr[n/2] : (ptr[n/2-1] + ptr[n/2]) * 0.5;
1453         }
1454     }
1455     return 1;
1456 }
func_stddev(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1457 static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1458 {
1459     token_t *tok = stack[nstack - 1];
1460     rtok->nvalues = 0;
1461     if ( !tok->nvalues ) return 1;
1462     // sweep through all tok->values and while excluding all missing values reuse the very same array
1463     int i,j,k = 0, n = 0;
1464     if ( tok->nsamples )
1465     {
1466         for (i=0; i<tok->nsamples; i++)
1467         {
1468             if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }
1469             for (j=0; j<tok->nval1; k++,j++)
1470             {
1471                 if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
1472                 if ( n < k ) tok->values[n] = tok->values[k];
1473                 n++;
1474             }
1475         }
1476     }
1477     else
1478     {
1479         for (i=0; i<tok->nvalues; i++)
1480         {
1481             if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
1482             if ( n < i ) tok->values[n] = tok->values[i];
1483             n++;
1484         }
1485     }
1486     if ( !n ) return 1;
1487     if ( n==1 ) rtok->values[0] = 0;
1488     else
1489     {
1490         double sdev = 0, avg = 0;
1491         for (i=0; i<n; i++) avg += tok->values[i];
1492         avg /= n;
1493         for (i=0; i<n; i++) sdev += (tok->values[i] - avg) * (tok->values[i] - avg);
1494         rtok->values[0] = sqrt(sdev/n);
1495     }
1496     rtok->nvalues = 1;
1497     return 1;
1498 }
func_smpl_stddev(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1499 static int func_smpl_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1500 {
1501     token_t *tok = stack[nstack - 1];
1502     if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack);
1503     rtok->nsamples = tok->nsamples;
1504     rtok->nvalues  = tok->nsamples;
1505     rtok->nval1 = 1;
1506     hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1507     assert(tok->usmpl);
1508     if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1509     memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1510     int i, j, n;
1511     double *ptr;
1512     for (i=0; i<tok->nsamples; i++)
1513     {
1514         if ( !rtok->usmpl[i] ) continue;
1515         n = 0;
1516         ptr = tok->values + i*tok->nval1;
1517         for (j=0; j<tok->nval1; j++)
1518         {
1519             if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1520             if ( n < j ) ptr[n] = ptr[j];
1521             n++;
1522         }
1523         if ( n==0 )
1524             bcf_double_set_missing(rtok->values[i]);
1525         else if ( n==1 )
1526             rtok->values[i] = 0;
1527         else
1528         {
1529             double sdev = 0, avg = 0;
1530             for (j=0; j<n; j++) avg += ptr[j];
1531             avg /= n;
1532             for (j=0; j<n; j++) sdev += (ptr[j] - avg) * (ptr[j] - avg);
1533             rtok->values[i] = sqrt(sdev/n);
1534         }
1535     }
1536     return 1;
1537 }
func_sum(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1538 static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1539 {
1540     rtok->nvalues = 0;
1541     token_t *tok = stack[nstack - 1];
1542     if ( !tok->nvalues ) return 1;
1543     double *ptr, val = 0;
1544     int i,j, n = 0;
1545     if ( tok->nsamples )
1546     {
1547         for (i=0; i<tok->nsamples; i++)
1548         {
1549             if ( !tok->usmpl[i] ) continue;
1550             ptr = tok->values + i*tok->nval1;
1551             for (j=0; j<tok->nval1; j++)
1552             {
1553                 if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1554                 val += ptr[j];
1555                 n++;
1556             }
1557         }
1558     }
1559     else
1560     {
1561         for (i=0; i<tok->nvalues; i++)
1562             if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
1563     }
1564     if ( n )
1565     {
1566         rtok->values[0] = val;
1567         rtok->nvalues   = 1;
1568     }
1569     return 1;
1570 }
func_smpl_sum(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1571 static int func_smpl_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1572 {
1573     token_t *tok = stack[nstack - 1];
1574     if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack);
1575     rtok->nsamples = tok->nsamples;
1576     rtok->nvalues  = tok->nsamples;
1577     rtok->nval1 = 1;
1578     hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1579     assert(tok->usmpl);
1580     if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1581     memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1582     int i, j, has_value;
1583     double val, *ptr;
1584     for (i=0; i<tok->nsamples; i++)
1585     {
1586         if ( !rtok->usmpl[i] ) continue;
1587         val = 0;
1588         has_value = 0;
1589         ptr = tok->values + i*tok->nval1;
1590         for (j=0; j<tok->nval1; j++)
1591         {
1592             if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
1593             has_value = 1;
1594             val += ptr[j];
1595         }
1596         if ( has_value ) rtok->values[i] = val;
1597         else bcf_double_set_missing(rtok->values[i]);
1598     }
1599     return 1;
1600 }
func_abs(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1601 static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1602 {
1603     token_t *tok = stack[nstack - 1];
1604     if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
1605     rtok->nsamples = tok->nsamples;
1606     rtok->nvalues = tok->nvalues;
1607     rtok->nval1 = tok->nval1;
1608     hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
1609     if ( tok->usmpl )
1610     {
1611         if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1612         memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1613     }
1614     if ( !tok->nvalues ) return 1;
1615     hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
1616     int i,j,k = 0;
1617     if ( tok->usmpl )
1618     {
1619         for (i=0; i<tok->nsamples; i++)
1620         {
1621             if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; }
1622             for (j=0; j<tok->nval1; k++,j++)
1623             {
1624                 if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]);
1625                 else rtok->values[k] = fabs(tok->values[k]);
1626             }
1627         }
1628     }
1629     else
1630     {
1631         for (i=0; i<tok->nvalues; i++)
1632         {
1633             if ( tok->usmpl && !tok->usmpl[i] ) continue;
1634             if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
1635             else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]);
1636         }
1637     }
1638     return 1;
1639 }
func_count(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1640 static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1641 {
1642     token_t *tok = stack[nstack - 1];
1643     int i,j, cnt = 0;
1644     if ( tok->tag && tok->nsamples )
1645     {
1646         // raw number of values in a FMT tag, e.g. COUNT(FMT/TAG)
1647         if ( tok->is_str ) error("todo: Type=String for COUNT on FORMAT fields?\n");
1648         for (i=0; i<tok->nsamples; i++)
1649         {
1650             if ( !tok->usmpl[i] ) continue;
1651             double *ptr = tok->values + i*tok->nval1;
1652             for (j=0; j<tok->nval1; j++)
1653                 if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) cnt++;
1654         }
1655     }
1656     else if ( tok->nsamples )
1657     {
1658         // number of samples that pass a processed FMT tag
1659         for (i=0; i<tok->nsamples; i++)
1660             if ( tok->pass_samples[i] ) cnt++;
1661     }
1662     else if ( tok->is_str )
1663     {
1664         if ( tok->str_value.l ) cnt = 1;
1665         for (i=0; i<tok->str_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++;
1666     }
1667     else
1668         cnt = tok->nvalues;
1669 
1670     rtok->nvalues = 1;
1671     rtok->values[0] = cnt;
1672     return 1;
1673 }
func_strlen(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1674 static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1675 {
1676     token_t *tok = stack[nstack - 1];
1677     rtok->nvalues = rtok->str_value.l = 0;
1678     if ( !tok->str_value.l ) return 1;
1679 
1680     if ( tok->idx==-2 )
1681     {
1682         int i = 0;
1683         char *ss = tok->str_value.s;
1684         while ( *ss )
1685         {
1686             char *se = ss;
1687             while ( *se && *se!=',' ) se++;
1688             hts_expand(double, i+1, rtok->mvalues, rtok->values);
1689             if ( !*se ) rtok->values[i] = strlen(ss);
1690             else
1691             {
1692                 *se = 0;
1693                 rtok->values[i] = strlen(ss);
1694                 *se = ',';
1695             }
1696             ss = *se ? se + 1 : se;
1697             i++;
1698         }
1699         rtok->nvalues = i;
1700     }
1701     else
1702     {
1703         if ( !tok->str_value.s[1] && tok->str_value.s[0]=='.' )
1704             rtok->values[0] = 0;
1705         else
1706             rtok->values[0] = strlen(tok->str_value.s);
1707         rtok->nvalues = 1;
1708     }
1709     return 1;
1710 }
calc_binom(int na,int nb)1711 static inline double calc_binom(int na, int nb)
1712 {
1713     if ( na==0 && nb==0 ) return -1;
1714     if ( na==nb ) return 1;
1715 
1716     // kfunc.h implements kf_betai, which is the regularized beta function  P(X<=k/N;p) = I_{1-p}(N-k,k+1)
1717 
1718     double pval = na < nb ? kf_betai(nb, na + 1, 0.5) : kf_betai(na, nb + 1, 0.5);
1719     pval *= 2;
1720     assert( pval-1 < 1e-10 );
1721     if ( pval>1 ) pval = 1;     // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
1722 
1723     return pval;
1724 }
func_binom(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1725 static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1726 {
1727     int i, istack = nstack - rtok->nargs;
1728 
1729     if ( rtok->nargs!=2 && rtok->nargs!=1 ) error("Error: binom() takes one or two arguments\n");
1730     assert( istack>=0 );
1731 
1732     // The expected mean is 0.5. Should we support also prob!=0.5?
1733     //
1734     //  double prob = 0.5;
1735     //  if ( nstack==3 )
1736     //  {
1737     //      // three parameters, the first one must be a scalar:  binom(0.25,AD[0],AD[1])
1738     //      if ( !stack[istack]->is_constant )
1739     //          error("The first argument of binom() must be a numeric constant if three parameters are given\n");
1740     //      prob = stack[istack]->threshold;
1741     //      istack++;
1742     //  }
1743     //  else if ( nstack==2 && stack[istack]->is_constant )
1744     //  {
1745     //      // two parameters, the first can be a scalar:  binom(0.25,AD) or binom(AD[0],AD[1])
1746     //      prob = stack[istack]->threshold;
1747     //      istack++;
1748     //  }
1749 
1750     token_t *tok = stack[istack];
1751     if ( tok->nsamples )
1752     {
1753         // working with a FORMAT tag
1754         rtok->nval1    = 1;
1755         rtok->nvalues  = tok->nsamples;
1756         rtok->nsamples = tok->nsamples;
1757         hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
1758         assert(tok->usmpl);
1759         if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
1760         memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
1761 
1762         if ( istack+1==nstack )
1763         {
1764             // determine the index from the GT field: binom(AD)
1765             int ngt = bcf_get_genotypes(flt->hdr, line, &flt->tmpi, &flt->mtmpi);
1766             int max_ploidy = ngt/line->n_sample;
1767             if ( ngt <= 0 || max_ploidy < 2 ) // GT not present or not diploid, cannot set
1768             {
1769                 for (i=0; i<rtok->nsamples; i++)
1770                     if ( rtok->usmpl[i] ) bcf_double_set_missing(rtok->values[i]);
1771                 return rtok->nargs;
1772             }
1773             for (i=0; i<rtok->nsamples; i++)
1774             {
1775                 if ( !rtok->usmpl[i] ) continue;
1776                 int32_t *ptr = flt->tmpi + i*max_ploidy;
1777                 if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end )
1778                 {
1779                     bcf_double_set_missing(rtok->values[i]);
1780                     continue;
1781                 }
1782                 int idx1 = bcf_gt_allele(ptr[0]);
1783                 int idx2 = bcf_gt_allele(ptr[1]);
1784                 if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]);
1785                 if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]);
1786                 double *vals = tok->values + tok->nval1*i;
1787                 if ( bcf_double_is_missing_or_vector_end(vals[idx1]) || bcf_double_is_missing_or_vector_end(vals[idx2]) )
1788                 {
1789                     bcf_double_set_missing(rtok->values[i]);
1790                     continue;
1791                 }
1792                 rtok->values[i] = calc_binom(vals[idx1],vals[idx2]);
1793                 if ( rtok->values[i] < 0 )
1794                 {
1795                     bcf_double_set_missing(rtok->values[i]);
1796                     continue;
1797                 }
1798             }
1799         }
1800         else
1801         {
1802             // the fields given explicitly: binom(AD[:0],AD[:1])
1803             token_t *tok2 = stack[istack+1];
1804             if ( tok->nval1!=1 || tok2->nval1!=1 )
1805                 error("Expected one value per binom() argument, found %d and %d at %s:%"PRId64"\n",tok->nval1,tok2->nval1, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1);
1806             for (i=0; i<rtok->nsamples; i++)
1807             {
1808                 if ( !rtok->usmpl[i] ) continue;
1809                 double *ptr1 = tok->values + tok->nval1*i;
1810                 double *ptr2 = tok2->values + tok2->nval1*i;
1811                 if ( bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) )
1812                 {
1813                     bcf_double_set_missing(rtok->values[i]);
1814                     continue;
1815                 }
1816                 rtok->values[i] = calc_binom(ptr1[0],ptr2[0]);
1817                 if ( rtok->values[i] < 0 )
1818                 {
1819                     bcf_double_set_missing(rtok->values[i]);
1820                     continue;
1821                 }
1822             }
1823         }
1824     }
1825     else
1826     {
1827         // working with an INFO tag
1828         rtok->nvalues  = 1;
1829         hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
1830 
1831         double *ptr1 = NULL, *ptr2 = NULL;
1832         if ( istack+1==nstack )
1833         {
1834             // only one tag, expecting two values: binom(INFO/AD)
1835             if ( tok->nvalues==2 )
1836             {
1837                 ptr1 = &tok->values[0];
1838                 ptr2 = &tok->values[1];
1839             }
1840         }
1841         else
1842         {
1843             // two tags, expecting one value in each: binom(INFO/AD[0],INFO/AD[1])
1844             token_t *tok2 = stack[istack+1];
1845             if ( tok->nvalues==1 && tok2->nvalues==1 )
1846             {
1847                 ptr1 = &tok->values[0];
1848                 ptr2 = &tok2->values[0];
1849             }
1850         }
1851         if ( !ptr1 || !ptr2 || bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) )
1852             bcf_double_set_missing(rtok->values[0]);
1853         else
1854         {
1855             rtok->values[0] = calc_binom(ptr1[0],ptr2[0]);
1856             if ( rtok->values[0] < 0 )
1857                 bcf_double_set_missing(rtok->values[0]);
1858         }
1859     }
1860     return rtok->nargs;
1861 }
func_phred(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1862 static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1863 {
1864     token_t *tok = stack[nstack - 1];
1865     if ( tok->is_str ) error("PHRED() can be applied only on numeric values\n");
1866 
1867     rtok->nsamples = tok->nsamples;
1868     rtok->nval1 = tok->nval1;
1869     memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples));
1870     assert(tok->usmpl);
1871     if ( !rtok->usmpl )
1872     {
1873         rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl));
1874         memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl));
1875     }
1876     rtok->nvalues = tok->nvalues;
1877     if ( !tok->nvalues ) return 1;
1878 
1879     hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
1880     int i,j,k = 0;
1881     if ( tok->usmpl )
1882     {
1883         for (i=0; i<tok->nsamples; i++)
1884         {
1885             if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; }
1886             for (j=0; j<tok->nval1; k++,j++)
1887             {
1888                 if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]);
1889                 else rtok->values[k] = -4.34294481903*log(tok->values[k]);
1890             }
1891         }
1892     }
1893     else
1894     {
1895         for (i=0; i<tok->nvalues; i++)
1896         {
1897             if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
1898             else rtok->values[i] = -4.34294481903*log(tok->values[i]);
1899         }
1900     }
1901     return 1;
1902 }
tok_init_values(token_t * atok,token_t * btok,token_t * rtok)1903 inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok)
1904 {
1905     token_t *tok = atok->nvalues > btok->nvalues ? atok : btok;
1906     rtok->nvalues = tok->nvalues;
1907     rtok->nval1   = tok->nval1;
1908     hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
1909 }
tok_init_samples(token_t * atok,token_t * btok,token_t * rtok)1910 inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok)
1911 {
1912     if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples )
1913     {
1914         rtok->nsamples = atok->nsamples ? atok->nsamples : btok->nsamples;
1915         rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1);
1916         int i;
1917         for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
1918         for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
1919     }
1920     if (rtok->nsamples)
1921         memset(rtok->pass_samples, 0, rtok->nsamples);
1922 }
1923 
1924 #define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
1925 { \
1926     token_t *rtok = _rtok; \
1927     int i, has_values = 0; \
1928     if ( atok->nvalues && btok->nvalues ) \
1929     { \
1930         tok_init_values(atok, btok, rtok); \
1931         tok_init_samples(atok, btok, rtok); \
1932         if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \
1933         { \
1934             assert( atok->nsamples==btok->nsamples ); \
1935             for (i=0; i<atok->nvalues; i++) \
1936             { \
1937                 if ( bcf_double_is_missing_or_vector_end(atok->values[i]) || bcf_double_is_missing_or_vector_end(btok->values[i]) ) \
1938                 { \
1939                     bcf_double_set_missing(rtok->values[i]); \
1940                     continue; \
1941                 } \
1942                 has_values = 1; \
1943                 rtok->values[i] = atok->values[i] AOP btok->values[i]; \
1944             } \
1945         } \
1946         else if ( atok->nsamples ) \
1947         { \
1948             assert( btok->nvalues==1 ); \
1949             if ( !bcf_double_is_missing_or_vector_end(btok->values[0]) ) \
1950             { \
1951                 for (i=0; i<atok->nvalues; i++) \
1952                 { \
1953                     if ( bcf_double_is_missing_or_vector_end(atok->values[i]) ) \
1954                     { \
1955                         bcf_double_set_missing(rtok->values[i]); \
1956                         continue; \
1957                     } \
1958                     has_values = 1; \
1959                     rtok->values[i] = atok->values[i] AOP btok->values[0]; \
1960                 } \
1961             } \
1962         } \
1963         else \
1964         { \
1965             assert( atok->nvalues==1 ); \
1966             if ( !bcf_double_is_missing_or_vector_end(atok->values[0]) ) \
1967             { \
1968                 for (i=0; i<btok->nvalues; i++) \
1969                 { \
1970                     if ( bcf_double_is_missing_or_vector_end(btok->values[i]) ) \
1971                     { \
1972                         bcf_double_set_missing(rtok->values[i]); \
1973                         continue; \
1974                     } \
1975                     has_values = 1; \
1976                     rtok->values[i] = atok->values[0] AOP btok->values[i]; \
1977                 } \
1978             } \
1979         } \
1980     } \
1981     if ( !has_values ) rtok->nvalues = 0; \
1982 }
1983 
vector_logic_or(filter_t * filter,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)1984 static int vector_logic_or(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
1985 {
1986     if ( nstack < 2 ) error("Error occurred while processing the filter \"%s\"\n", filter->str);
1987 
1988     token_t *atok = stack[nstack-2];
1989     token_t *btok = stack[nstack-1];
1990     tok_init_samples(atok, btok, rtok);
1991 
1992     if ( !atok->pass_site && !btok->pass_site ) return 2;
1993 
1994     rtok->pass_site = 1;
1995     if ( !atok->nsamples && !btok->nsamples ) return 2;
1996 
1997     int i;
1998     if ( rtok->tok_type==TOK_OR_VEC )   // ||, select all samples if one is true
1999     {
2000         if ( (!atok->nsamples && !atok->pass_site) || (!btok->nsamples && !btok->pass_site) )
2001         {
2002             // These two conditions are to ensure the following does not set all samples
2003             // at sites with QUAL<=30:
2004             //      QUAL>30 || FMT/GQ>30
2005 
2006             token_t *tok = atok->nsamples ? atok : btok;
2007             for (i=0; i<rtok->nsamples; i++)
2008             {
2009                 if ( !rtok->usmpl[i] ) continue;
2010                 rtok->pass_samples[i] = tok->pass_samples[i];
2011             }
2012         }
2013         else
2014         {
2015             for (i=0; i<rtok->nsamples; i++)
2016             {
2017                 if ( !rtok->usmpl[i] ) continue;
2018                 rtok->pass_samples[i] = 1;
2019             }
2020         }
2021         return 2;
2022     }
2023 
2024     //  |, only select samples which are actually true
2025 
2026     if ( !atok->nsamples || !btok->nsamples )
2027     {
2028         token_t *tok = atok->nsamples ? atok : btok;
2029         for (i=0; i<rtok->nsamples; i++)
2030         {
2031             if ( !rtok->usmpl[i] ) continue;
2032             rtok->pass_samples[i] = tok->pass_samples[i];
2033         }
2034         return 2;
2035     }
2036 
2037     assert( atok->nsamples==btok->nsamples );
2038 
2039     for (i=0; i<rtok->nsamples; i++)
2040     {
2041         if ( !rtok->usmpl[i] ) continue;
2042         rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
2043     }
2044     return 2;
2045 }
vector_logic_and(filter_t * filter,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)2046 static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
2047 {
2048     if ( nstack < 2 ) error("Error occurred while processing the filter \"%s\". (nstack=%d)\n", filter->str,nstack);
2049 
2050     token_t *atok = stack[nstack-2];
2051     token_t *btok = stack[nstack-1];
2052     tok_init_samples(atok, btok, rtok);
2053 
2054     if ( !atok->pass_site || !btok->pass_site ) return 2;
2055     if ( !atok->nsamples && !btok->nsamples ) { rtok->pass_site = 1; return 2; }
2056 
2057     int i;
2058     if ( !atok->nsamples || !btok->nsamples )
2059     {
2060         token_t *tok = atok->nsamples ? atok : btok;
2061         for (i=0; i<rtok->nsamples; i++)
2062         {
2063             if ( !rtok->usmpl[i] ) continue;
2064             rtok->pass_samples[i] = tok->pass_samples[i];
2065         }
2066         rtok->pass_site = 1;
2067         return 2;
2068     }
2069 
2070     assert( atok->nsamples==btok->nsamples );
2071     if ( rtok->tok_type==TOK_AND_VEC )  // &&, can be true in different samples
2072     {
2073         for (i=0; i<rtok->nsamples; i++)
2074         {
2075             if ( !rtok->usmpl[i] ) continue;
2076             rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
2077         }
2078         rtok->pass_site = 1;
2079     }
2080     else    // &, must be true within one sample
2081     {
2082         for (i=0; i<rtok->nsamples; i++)
2083         {
2084             if ( !rtok->usmpl[i] ) continue;
2085             rtok->pass_samples[i] = atok->pass_samples[i] & btok->pass_samples[i];
2086             if ( rtok->pass_samples[i] ) rtok->pass_site = 1;
2087         }
2088     }
2089     return 2;
2090 }
2091 
2092 #define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
2093 { \
2094     token_t *rtok = _rtok; \
2095     int i, j, k; \
2096     tok_init_samples(atok, btok, rtok); \
2097     if ( !atok->nsamples && !btok->nsamples ) \
2098     { \
2099         if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; } \
2100         else if ( !atok->nvalues || !btok->nvalues ) \
2101         { \
2102             token_t *tok = atok->nvalues ? atok : btok; \
2103             for (j=0; j<tok->nvalues; j++) \
2104             { \
2105                 if ( bcf_double_is_missing_or_vector_end(tok->values[j]) ) \
2106                 { \
2107                     if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \
2108                 } \
2109                 else if ( missing_logic[1] ) {  rtok->pass_site = 1; break; } \
2110             } \
2111         } \
2112         else \
2113         { \
2114             for (i=0; i<atok->nvalues; i++) \
2115             { \
2116                 int amiss = bcf_double_is_missing_or_vector_end(atok->values[i]) ? 1 : 0; \
2117                 for (j=0; j<btok->nvalues; j++) \
2118                 { \
2119                     int nmiss = amiss + (bcf_double_is_missing_or_vector_end(btok->values[j]) ? 1 : 0); \
2120                     if ( nmiss ) \
2121                     { \
2122                         if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
2123                     } \
2124                     else if ( atok->values[i] > 16777216 || btok->values[j] > 16777216 ) /* Ugly, see #871 */ \
2125                     { \
2126                         if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
2127                     } \
2128                     else if ( (float)atok->values[i] CMP_OP (float)btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
2129                 } \
2130             } \
2131         } \
2132     } \
2133     else \
2134     { \
2135         if ( !atok->nvalues && !btok->nvalues ) \
2136         { \
2137             if ( missing_logic[2] ) \
2138             { \
2139                 for (i=0; i<rtok->nsamples; i++) \
2140                     if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } \
2141             } \
2142         } \
2143         else if ( !atok->nvalues || !btok->nvalues ) \
2144         { \
2145             token_t *tok = atok->nvalues ? atok : btok; \
2146             if ( !tok->nsamples ) \
2147             { \
2148                 int miss = 0; \
2149                 for (j=0; j<tok->nvalues; j++) \
2150                     miss |= bcf_double_is_missing_or_vector_end(tok->values[j]) ? 1 : 0; \
2151                 if ( missing_logic[++miss] ) \
2152                 { \
2153                     for (i=0; i<rtok->nsamples; i++) \
2154                         if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
2155                 } \
2156             } \
2157             else \
2158                 for (i=0; i<tok->nsamples; i++) \
2159                 { \
2160                     if ( !rtok->usmpl[i] ) continue; \
2161                     double *ptr = tok->values + i*tok->nval1; \
2162                     int miss = 0; \
2163                     for (j=0; j<tok->nval1; j++) \
2164                         miss |= bcf_double_is_missing_or_vector_end(ptr[j]) ? 1 : 0; \
2165                     if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
2166                 } \
2167         } \
2168         else if ( atok->nsamples && btok->nsamples ) \
2169         { \
2170             if ( atok->nval1!=btok->nval1 ) error("Incompatible number of per-sample values in comparison: %d vs %d\n",atok->nval1,btok->nval1); \
2171             if ( atok->nsamples!=btok->nsamples ) error("Incompatible number samples in comparison: %d vs %d\n",atok->nsamples,btok->nsamples); \
2172             for (i=0; i<atok->nsamples; i++) \
2173             { \
2174                 if ( !atok->usmpl[i] || !btok->usmpl[i] ) { rtok->usmpl[i] = 0; continue; } \
2175                 double *aptr = atok->values + i*atok->nval1; \
2176                 double *bptr = btok->values + i*btok->nval1; \
2177                 for (j=0; j<atok->nval1; j++) \
2178                 { \
2179                     int nmiss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
2180                     if ( nmiss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
2181                     nmiss += (bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0); \
2182                     if ( nmiss ) \
2183                     { \
2184                         if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \
2185                     } \
2186                     else if ( aptr[j] > 16777216 || bptr[j] > 16777216 ) /* Ugly, see #871 */ \
2187                     { \
2188                         if ( aptr[j] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \
2189                     } \
2190                     else if ( (float)aptr[j] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \
2191                 } \
2192             } \
2193         } \
2194         else if ( atok->nsamples )\
2195         { \
2196             for (i=0; i<atok->nsamples; i++) \
2197             { \
2198                 if ( !rtok->usmpl[i] ) continue; \
2199                 double *aptr = atok->values + i*atok->nval1; \
2200                 double *bptr = btok->values + i*btok->nval1; \
2201                 for (j=0; j<atok->nval1; j++) \
2202                 { \
2203                     int miss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
2204                     if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
2205                     for (k=0; k<btok->nvalues; k++) \
2206                     { \
2207                         int nmiss = miss + (bcf_double_is_missing_or_vector_end(bptr[k]) ? 1 : 0); \
2208                         if ( nmiss ) \
2209                         { \
2210                             if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
2211                         } \
2212                         else if ( aptr[j] > 16777216 || bptr[k] > 16777216 ) /* Ugly, see #871 */ \
2213                         { \
2214                             if ( aptr[j] CMP_OP bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
2215                         } \
2216                         else if ( (float)aptr[j] CMP_OP (float)bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
2217                     } \
2218                 } \
2219             } \
2220         } \
2221         else /* btok->nsamples */ \
2222         { \
2223             for (i=0; i<btok->nsamples; i++) \
2224             { \
2225                 if ( !rtok->usmpl[i] ) continue; \
2226                 double *aptr = atok->values + i*atok->nval1; \
2227                 double *bptr = btok->values + i*btok->nval1; \
2228                 for (j=0; j<btok->nval1; j++) \
2229                 { \
2230                     int miss = bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0; \
2231                     if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
2232                     for (k=0; k<atok->nvalues; k++) \
2233                     { \
2234                         int nmiss = miss + (bcf_double_is_missing_or_vector_end(aptr[k]) ? 1 : 0); \
2235                         if ( nmiss ) \
2236                         { \
2237                             if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
2238                         } \
2239                         else if ( bptr[j] > 16777216 || aptr[k] > 16777216 ) /* Ugly, see #871 */ \
2240                         { \
2241                             if ( aptr[k] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
2242                         } \
2243                         else if ( (float)aptr[k] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
2244                     } \
2245                 } \
2246             } \
2247         } \
2248     } \
2249 }
_regex_vector_strings(regex_t * regex,char * str,size_t len,int logic,int * missing_logic)2250 static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logic, int *missing_logic)
2251 {
2252     char *end = str + len;
2253     while ( str < end && *str )
2254     {
2255         char *mid = str;
2256         while ( mid < end && *mid && *mid!=',' ) mid++;
2257         int miss = mid - str == 1 && str[0]=='.' ? 1 : 0;
2258         if ( miss && missing_logic[miss] ) return 1;
2259         char tmp = *mid; *mid = 0;
2260         int match = regexec(regex, str, 0,NULL,0) ? 0 : 1;
2261         *mid = tmp;
2262         if ( logic==TOK_NLIKE ) match = match ? 0 : 1;
2263         if ( match ) return 1;
2264         if ( !*mid ) break;
2265         str = mid + 1;
2266     }
2267     return 0;
2268 }
_has_missing_string(char * beg)2269 static inline int _has_missing_string(char *beg)
2270 {
2271     while ( *beg )
2272     {
2273         char *end = beg;
2274         while ( *end && *end!=',' ) end++;
2275         if ( end-beg==1 && beg[0]=='.' ) return 1;
2276         if ( !*end ) break;
2277         beg = end + 1;
2278     }
2279     return 0;
2280 }
2281 
2282 // Compare two strings with multiple fields, for example "A,B,.,C"=="X,Y,A".
2283 // Returns 1 if any field matches, otherwise returns 0
_match_vector_strings(char * abeg,size_t alen,char * bstr,size_t blen,int logic,int * missing_logic)2284 static inline int _match_vector_strings(char *abeg, size_t alen, char *bstr, size_t blen, int logic, int *missing_logic)
2285 {
2286     char *aend = abeg + alen;
2287     char *bend = bstr + blen;
2288     while ( abeg < aend && *abeg )
2289     {
2290         char *amid = abeg;
2291         while ( amid < aend && *amid && *amid!=',' ) amid++;
2292         int miss = amid - abeg == 1 && abeg[0]=='.' ? 1 : 0;
2293         char *bbeg = bstr;
2294         while ( bbeg < bend && *bbeg )
2295         {
2296             char *bmid = bbeg;
2297             while ( bmid < bend && *bmid && *bmid!=',' ) bmid++;
2298             int nmiss = miss + (bmid - bbeg == 1 && bbeg[0]=='.' ? 1 : 0);
2299             if ( nmiss )
2300             {
2301                 if ( missing_logic[nmiss] ) return 1;
2302             }
2303             else
2304             {
2305                 int match = amid-abeg==bmid-bbeg && !strncmp(abeg,bbeg,amid-abeg) ? 1 : 0;
2306                 if ( logic==TOK_NE ) match = match==1 ? 0 : 1;
2307                 if ( match ) return 1;
2308             }
2309             if ( !*bmid ) break;
2310             bbeg = bmid + 1;
2311         }
2312         if ( !*amid ) break;
2313         abeg = amid + 1;
2314     }
2315     return 0;
2316 }
cmp_vector_strings(token_t * atok,token_t * btok,token_t * rtok)2317 static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok)
2318 {
2319     tok_init_samples(atok, btok, rtok);
2320 
2321     int i, logic = rtok->tok_type;     // TOK_EQ, TOK_NE, TOK_LIKE, TOK_NLIKE
2322     regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL);
2323 
2324     assert( atok->nvalues==atok->str_value.l && btok->nvalues==btok->str_value.l );
2325     assert( !atok->nsamples || !btok->nsamples );
2326     assert( (!regex && (logic==TOK_EQ || logic==TOK_NE)) || (regex && (logic==TOK_LIKE || logic==TOK_NLIKE)) );
2327 
2328     int missing_logic[] = {0,0,0};
2329     if ( logic==TOK_EQ || logic==TOK_LIKE ) missing_logic[0] = missing_logic[2] = 1;
2330     else if ( logic==TOK_NE || logic==TOK_NLIKE ) missing_logic[0] = missing_logic[1] = 1;
2331 
2332     if ( !atok->nsamples && !btok->nsamples )
2333     {
2334         if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; return; }
2335         if ( !atok->nvalues || !btok->nvalues )
2336         {
2337             int miss = _has_missing_string(atok->nvalues ? atok->str_value.s : btok->str_value.s);
2338             if ( missing_logic[miss+1] ) rtok->pass_site = 1;
2339             return;
2340         }
2341         if ( !regex )
2342             rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic);
2343         else
2344         {
2345             token_t *tok = atok->regex ? btok : atok;
2346             rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic);
2347         }
2348         return;
2349     }
2350 
2351     // The case of (!atok->nsamples || !btok->nsamples)
2352 
2353     if ( !atok->nvalues && !btok->nvalues )
2354     {
2355         if ( missing_logic[2] )
2356         {
2357             for (i=0; i<rtok->nsamples; i++)
2358                 if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; }
2359         }
2360         return;
2361     }
2362     if ( !atok->nvalues || !btok->nvalues )
2363     {
2364         token_t *tok = atok->nvalues ? atok : btok;
2365         if ( !tok->nsamples )
2366         {
2367             int miss = _has_missing_string(tok->str_value.s);
2368             if ( !missing_logic[miss+1] ) return;
2369             for (i=0; i<rtok->nsamples; i++)
2370                 if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
2371         }
2372         else
2373             for (i=0; i<tok->nsamples; i++)
2374             {
2375                 if ( !rtok->usmpl[i] ) continue;
2376                 int miss = _has_missing_string(tok->str_value.s + i*tok->nval1);
2377                 if ( missing_logic[miss+1] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
2378             }
2379         return;
2380     }
2381 
2382     // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues)
2383     token_t *xtok = atok->nsamples ? atok : btok;
2384     token_t *ytok = atok->nsamples ? btok : atok;
2385     assert( regex==ytok->regex );
2386     for (i=0; i<xtok->nsamples; i++)
2387     {
2388         if ( !rtok->usmpl[i] ) continue;
2389         int match;
2390         if ( regex )
2391             match = _regex_vector_strings(regex, xtok->str_value.s + i*xtok->nval1, xtok->nval1, logic, missing_logic);
2392         else
2393             match = _match_vector_strings(xtok->str_value.s + i*xtok->nval1, xtok->nval1, ytok->str_value.s, ytok->str_value.l, logic, missing_logic);
2394         if ( match ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
2395     }
2396 }
2397 
parse_idxs(char * tag_idx,int ** idxs,int * nidxs,int * idx)2398 static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx)
2399 {
2400     // TAG[], TAG[*] .. any field; sets idx=-2, idxs[0]=-1
2401     if ( *tag_idx==0 || !strcmp("*", tag_idx) )
2402     {
2403         *idxs = (int*) malloc(sizeof(int));
2404         (*idxs)[0] = -1;
2405         *nidxs     = 1;
2406         *idx       = -2;
2407         return 0;
2408     }
2409 
2410     // TAG[integer] .. one field; idx positive
2411     char *end, *beg = tag_idx;
2412     *idx = strtol(tag_idx, &end, 10);
2413     if ( *idx >= 0 && *end==0 ) return 0;
2414 
2415     // TAG[0,1] or TAG[0-2] or [1-] etc; idx=-2, idxs[...]=0,0,1,1,..
2416     int i, ibeg = -1;
2417     while ( *beg )
2418     {
2419         int num = strtol(beg, &end, 10);
2420         if ( end[0]==',' ) beg = end + 1;
2421         else if ( end[0]==0 ) beg = end;
2422         else if ( end[0]=='-' ) { beg = end + 1; ibeg = num; continue; }
2423         else return -1;
2424         if ( num >= *nidxs )
2425         {
2426             *idxs = (int*) realloc(*idxs, sizeof(int)*(num+1));
2427             memset(*idxs + *nidxs, 0, sizeof(int)*(num - *nidxs + 1));
2428             *nidxs = num + 1;
2429         }
2430         if ( ibeg>=0 )
2431         {
2432             for (i=ibeg; i<=num; i++) (*idxs)[i] = 1;
2433             ibeg = -1;
2434         }
2435         (*idxs)[num] = 1;
2436     }
2437     if ( ibeg >=0 )
2438     {
2439         if ( ibeg >= *nidxs )
2440         {
2441             *idxs = (int*) realloc(*idxs, sizeof(int)*(ibeg+1));
2442             memset(*idxs + *nidxs, 0, sizeof(int)*(ibeg - *nidxs + 1));
2443             *nidxs = ibeg + 1;
2444         }
2445         (*idxs)[ibeg] = -1;
2446     }
2447     *idx = -2;
2448     return 0;
2449 }
2450 
parse_tag_idx(bcf_hdr_t * hdr,int is_fmt,char * tag,char * tag_idx,token_t * tok)2451 static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, token_t *tok)   // tag_idx points just after "TAG["
2452 {
2453     int i, len = strlen(tag_idx);
2454     if ( tag_idx[len-1] == ']' ) tag_idx[len-1] = 0;
2455     char *ori = strdup(tag_idx);
2456 
2457     assert( !tok->idxs && !tok->usmpl );
2458     int *idxs1 = NULL, nidxs1 = 0, idx1 = 0;
2459     int *idxs2 = NULL, nidxs2 = 0, idx2 = 0;
2460 
2461     int set_samples = 0;
2462     char *colon = strrchr(tag_idx, ':');
2463     if ( tag_idx[0]=='@' )     // file list with sample names
2464     {
2465         if ( !is_fmt ) error("Could not parse \"%s\". (Not a FORMAT tag yet a sample list provided.)\n", ori);
2466         char *fname = expand_path(tag_idx+1);
2467 #ifdef _WIN32
2468         if (fname && strlen(fname) > 2 && fname[1] == ':') // Deal with Windows paths, such as 'C:\..'
2469             colon = strrchr(fname+2, ':');
2470 #endif
2471         int nsmpl;
2472         char **list = hts_readlist(fname, 1, &nsmpl);
2473         if ( !list && colon )
2474         {
2475             if ( parse_idxs(colon+1, &idxs2, &nidxs2, &idx2) != 0 ) error("Could not parse the index: %s\n", ori);
2476             tok->idxs  = idxs2;
2477             tok->nidxs = nidxs2;
2478             tok->idx   = idx2;
2479             colon = strrchr(fname, ':');
2480             *colon = 0;
2481             list = hts_readlist(fname, 1, &nsmpl);
2482         }
2483         if ( !list ) error("Could not read: %s\n", fname);
2484         free(fname);
2485         tok->nsamples = bcf_hdr_nsamples(hdr);
2486         tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
2487         for (i=0; i<nsmpl; i++)
2488         {
2489             int ismpl = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,list[i]);
2490             if ( ismpl<0 ) error("No such sample in the VCF: \"%s\"\n", list[i]);
2491             free(list[i]);
2492             tok->usmpl[ismpl] = 1;
2493         }
2494         free(list);
2495         if ( !colon )
2496         {
2497             tok->idxs = (int*) malloc(sizeof(int));
2498             tok->idxs[0] = -1;
2499             tok->nidxs   = 1;
2500             tok->idx     = -2;
2501         }
2502     }
2503     else if ( colon )
2504     {
2505         if ( !is_fmt ) error("Could not parse the index \"%s\". (Not a FORMAT tag yet sample index implied.)\n", ori);
2506         *colon = 0;
2507         if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
2508         if ( parse_idxs(colon+1, &idxs2, &nidxs2, &idx2) != 0 ) error("Could not parse the index: %s\n", ori);
2509         tok->idxs  = idxs2;
2510         tok->nidxs = nidxs2;
2511         tok->idx   = idx2;
2512         set_samples = 1;
2513     }
2514     else
2515     {
2516         if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
2517         if ( is_fmt )
2518         {
2519             if ( nidxs1==1 && idxs1[0]==-1 )
2520             {
2521                 tok->idxs = (int*) malloc(sizeof(int));
2522                 tok->idxs[0] = -1;
2523                 tok->nidxs   = 1;
2524                 tok->idx     = -2;
2525             }
2526             else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 )
2527                 error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag);
2528             else
2529                 tok->idx = 0;
2530             set_samples = 1;
2531         }
2532         else
2533         {
2534             tok->idxs  = idxs1;
2535             tok->nidxs = nidxs1;
2536             tok->idx   = idx1;
2537         }
2538     }
2539 
2540     if ( set_samples )
2541     {
2542         tok->nsamples = bcf_hdr_nsamples(hdr);
2543         tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
2544         if ( idx1>=0 )
2545         {
2546             if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
2547             tok->usmpl[idx1] = 1;
2548         }
2549         else if ( idx1==-2 )
2550         {
2551             for (i=0; i<nidxs1; i++)
2552             {
2553                 if ( idxs1[i]==0 ) continue;
2554                 if ( idxs1[i]==-1 ) break;
2555                 if ( i >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
2556                 tok->usmpl[i] = 1;
2557             }
2558             if ( nidxs1 && idxs1[nidxs1-1]==-1 )    // open range, such as "7-"
2559             {
2560                 for (; i<tok->nsamples; i++) tok->usmpl[i] = 1;
2561             }
2562         }
2563         else error("todo: %s:%d .. %d\n", __FILE__,__LINE__, idx2);
2564         free(idxs1);
2565     }
2566     free(ori);
2567 
2568     if ( tok->nidxs && tok->idxs[tok->nidxs-1]!=-1 )
2569     {
2570         for (i=0; i<tok->nidxs; i++) if ( tok->idxs[i] ) tok->nuidxs++;
2571     }
2572 }
max_ac_an_unpack(bcf_hdr_t * hdr)2573 static int max_ac_an_unpack(bcf_hdr_t *hdr)
2574 {
2575     int hdr_id = bcf_hdr_id2int(hdr,BCF_DT_ID,"AC");
2576     if ( hdr_id<0 ) return BCF_UN_FMT;
2577     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,hdr_id) ) return BCF_UN_FMT;
2578 
2579     hdr_id = bcf_hdr_id2int(hdr,BCF_DT_ID,"AN");
2580     if ( hdr_id<0 ) return BCF_UN_FMT;
2581     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,hdr_id) ) return BCF_UN_FMT;
2582 
2583     return BCF_UN_INFO;
2584 }
filters_init1(filter_t * filter,char * str,int len,token_t * tok)2585 static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
2586 {
2587     tok->tag_type  = -1;
2588     tok->tok_type  = TOK_VAL;
2589     tok->hdr_id    = -1;
2590     tok->pass_site = -1;
2591     tok->idx       = 0;
2592 
2593     // is this a string constant?
2594     if ( str[0]=='"' || str[0]=='\'' )
2595     {
2596         int quote = str[0];
2597         if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str);
2598         tok->key = (char*) calloc(len-1,sizeof(char));
2599         hts_expand(double,1,tok->mvalues,tok->values);
2600         tok->values[0] = len-2;
2601         memcpy(tok->key,str+1,len-2);
2602         tok->key[len-2] = 0;
2603         tok->is_str = 1;
2604         tok->nvalues = len-2;
2605         if ( !strcmp(".",tok->key) ) tok->is_missing = 1;
2606         return 0;
2607     }
2608 
2609     // is it a file?
2610     if ( str[0]=='@' )
2611     {
2612         tok->tag = (char*) calloc(len+1,sizeof(char));
2613         memcpy(tok->tag,str,len);
2614         tok->tag[len] = 0;
2615         char *fname = expand_path(tok->tag+1);
2616         int i, n;
2617         char **list = hts_readlist(fname, 1, &n);
2618         if ( !list ) error("Could not read: %s\n", fname);
2619         free(fname);
2620         tok->hash = khash_str2int_init();
2621         for (i=0; i<n; i++)
2622         {
2623             char *se = list[i];
2624             while ( *se && !isspace(*se) ) se++;
2625             *se = 0;
2626             if ( !khash_str2int_has_key(tok->hash,list[i]) )
2627                 khash_str2int_inc(tok->hash,list[i]);
2628             else
2629                 free(list[i]);
2630         }
2631         free(list);
2632         return 0;
2633     }
2634 
2635     int is_fmt = -1;
2636     if ( !strncasecmp(str,"FMT/",4) ) { str += 4; len -= 4; is_fmt = 1; }
2637     else if ( !strncasecmp(str,"FORMAT/",7) ) { str += 7; len -= 7; is_fmt = 1; }
2638     else
2639     {
2640         if ( !strncasecmp(str,"INFO/",5) ) { is_fmt = 0; str += 5; len -= 5; }
2641         else if ( !strncasecmp(str,"QUAL",len) || !strncmp(str,"%QUAL",len) /* for backward compatibility */ )
2642         {
2643             tok->setter = filters_set_qual;
2644             tok->tag = strdup("QUAL");
2645             return 0;
2646         }
2647         else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ )
2648         {
2649             tok->setter = filters_set_type;
2650             tok->tag = strdup("TYPE");
2651             return 0;
2652         }
2653         else if ( !strncasecmp(str,"FILTER",len) || !strncmp(str,"%FILTER",len) /* for backward compatibility */ )
2654         {
2655             tok->comparator = filters_cmp_filter;
2656             tok->tag = strdup("FILTER");
2657             filter->max_unpack |= BCF_UN_FLT;
2658             tok->tag_type = BCF_HL_FLT;
2659             return 0;
2660         }
2661         else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ )
2662         {
2663             tok->comparator = filters_cmp_id;
2664             tok->tag = strdup("ID");
2665             return 0;
2666         }
2667         else if ( !strncasecmp(str,"CHROM",len) )
2668         {
2669             tok->setter = &filters_set_chrom;
2670             tok->tag = strdup("CHROM");
2671             return 0;
2672         }
2673         else if ( !strncasecmp(str,"POS",len) )
2674         {
2675             tok->setter = &filters_set_pos;
2676             tok->tag = strdup("POS");
2677             return 0;
2678         }
2679         else if ( !strncasecmp(str,"REF",len) )
2680         {
2681             tok->setter = &filters_set_ref_string;
2682             tok->is_str = 1;
2683             tok->tag = strdup("REF");
2684             return 0;
2685         }
2686         else if ( !strncasecmp(str,"ALT",len) )
2687         {
2688             tok->setter = &filters_set_alt_string;
2689             tok->is_str = 1;
2690             tok->tag = strdup("ALT");
2691             tok->idxs = (int*) malloc(sizeof(int));
2692             tok->idxs[0] = -1;
2693             tok->nidxs   = 1;
2694             tok->idx     = -2;
2695             return 0;
2696         }
2697         else if ( !strncasecmp(str,"N_ALT",len) )
2698         {
2699             tok->setter = &filters_set_nalt;
2700             tok->tag = strdup("N_ALT");
2701             return 0;
2702         }
2703         else if ( !strncasecmp(str,"N_SAMPLES",len) )
2704         {
2705             tok->tok_type = TOK_VAL;
2706             tok->threshold = bcf_hdr_nsamples(filter->hdr);
2707             return 0;
2708         }
2709         else if ( !strncasecmp(str,"N_MISSING",len) )
2710         {
2711             filter->max_unpack |= BCF_UN_FMT;
2712             tok->setter = &filters_set_nmissing;
2713             tok->tag = strdup("N_MISSING");
2714             return 0;
2715         }
2716         else if ( !strncasecmp(str,"F_MISSING",len) )
2717         {
2718             filter->max_unpack |= BCF_UN_FMT;
2719             tok->setter = &filters_set_nmissing;
2720             tok->tag = strdup("F_MISSING");
2721             return 0;
2722         }
2723     }
2724 
2725     // does it have array subscript?
2726     int is_array = 0;
2727     kstring_t tmp = {0,0,0};
2728     kputsn(str, len, &tmp);
2729     if ( tmp.s[tmp.l-1] == ']' )
2730     {
2731         int i;
2732         for (i=0; i<tmp.l; i++)
2733             if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
2734     }
2735     tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
2736     if ( is_fmt==-1 )
2737     {
2738         if ( tok->hdr_id >=0 )
2739         {
2740             int is_info = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ? 1 : 0;
2741             is_fmt = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ? 1 : 0;
2742             if ( is_info && is_fmt )
2743                 error("Error: ambiguous filtering expression, both INFO/%s and FORMAT/%s are defined in the VCF header.\n" , tmp.s,tmp.s);
2744         }
2745         if ( is_fmt==-1 ) is_fmt = 0;
2746     }
2747     if ( is_array )
2748         parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok);
2749     else if ( is_fmt && !tok->nsamples )
2750     {
2751         int i;
2752         tok->nsamples = bcf_hdr_nsamples(filter->hdr);
2753         tok->usmpl = (uint8_t*) malloc(tok->nsamples);
2754         for (i=0; i<tok->nsamples; i++) tok->usmpl[i] = 1;
2755     }
2756 
2757     tok->tag_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
2758     if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
2759     if ( tok->hdr_id>=0 )
2760     {
2761         if ( is_fmt && !strcmp("GT",tmp.s) )
2762         {
2763             tok->setter = &filters_set_genotype_string; tok->is_str = 1;
2764         }
2765         else if ( is_fmt )
2766         {
2767             if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
2768                 error("No such FORMAT field: %s\n", tmp.s);
2769             if ( bcf_hdr_id2number(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=1 && !is_array )
2770             {
2771                 tok->idxs = (int*) malloc(sizeof(int));
2772                 tok->idxs[0] = -1;
2773                 tok->nidxs   = 1;
2774                 tok->idx     = -2;
2775             }
2776             switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
2777             {
2778                 case BCF_HT_INT:  tok->setter = &filters_set_format_int; break;
2779                 case BCF_HT_REAL: tok->setter = &filters_set_format_float; break;
2780                 case BCF_HT_STR:  tok->setter = &filters_set_format_string; tok->is_str = 1; break;
2781                 default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
2782             }
2783         }
2784         else if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) )
2785             error("No such INFO field: %s\n", tmp.s);
2786         else
2787         {
2788             if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_FLAG )
2789                 tok->setter = filters_set_info_flag;
2790             else
2791             {
2792                 if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_STR ) tok->is_str = 1;
2793                 if ( bcf_hdr_id2number(filter->hdr,BCF_HL_INFO,tok->hdr_id)==1 )
2794                     tok->setter = filters_set_info;
2795                 else
2796                 {
2797                     switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) )
2798                     {
2799                         case BCF_HT_INT:  tok->setter = &filters_set_info_int; break;
2800                         case BCF_HT_REAL: tok->setter = &filters_set_info_float; break;
2801                         case BCF_HT_STR:  tok->setter = &filters_set_info_string; tok->is_str = 1; break;
2802                         default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
2803                     }
2804                     if (!is_array)
2805                     {
2806                         tok->idx = -2;
2807                         tok->idxs = (int*) malloc(sizeof(int));
2808                         tok->idxs[0] = -1;
2809                         tok->nidxs   = 1;
2810                     }
2811                 }
2812             }
2813             filter->max_unpack |= BCF_UN_INFO;
2814         }
2815         tok->tag = strdup(tmp.s);
2816         if ( tmp.s ) free(tmp.s);
2817         return 0;
2818     }
2819     else if ( !strcasecmp(tmp.s,"ALT") )
2820     {
2821         tok->setter = &filters_set_alt_string;
2822         tok->is_str = 1;
2823         tok->tag = strdup(tmp.s);
2824         free(tmp.s);
2825         return 0;
2826     }
2827     else if ( !strcasecmp(tmp.s,"AN") )
2828     {
2829         filter->max_unpack |= BCF_UN_FMT;
2830         tok->setter = &filters_set_an;
2831         tok->tag = strdup("AN");
2832         free(tmp.s);
2833         return 0;
2834     }
2835     else if ( !strcasecmp(tmp.s,"AC") )
2836     {
2837         filter->max_unpack |= BCF_UN_FMT;
2838         tok->setter = &filters_set_ac;
2839         tok->tag = strdup("AC");
2840         free(tmp.s);
2841         return 0;
2842     }
2843     else if ( !strcasecmp(tmp.s,"MAC") )
2844     {
2845         filter->max_unpack |= max_ac_an_unpack(filter->hdr);
2846         tok->setter = &filters_set_mac;
2847         tok->tag = strdup("MAC");
2848         free(tmp.s);
2849         return 0;
2850     }
2851     else if ( !strcasecmp(tmp.s,"AF") )
2852     {
2853         filter->max_unpack |= max_ac_an_unpack(filter->hdr);
2854         tok->setter = &filters_set_af;
2855         tok->tag = strdup("AF");
2856         free(tmp.s);
2857         return 0;
2858     }
2859     else if ( !strcasecmp(tmp.s,"MAF") )
2860     {
2861         filter->max_unpack |= max_ac_an_unpack(filter->hdr);
2862         tok->setter = &filters_set_maf;
2863         tok->tag = strdup("MAF");
2864         free(tmp.s);
2865         return 0;
2866     }
2867     else if ( !strcasecmp(tmp.s,"ILEN") || !strcasecmp(tmp.s,"%ILEN") )
2868     {
2869         filter->max_unpack |= BCF_UN_STR;
2870         tok->setter = &filters_set_ilen;
2871         tok->tag = strdup("ILEN");
2872         free(tmp.s);
2873         return 0;
2874     }
2875 
2876     // is it a value? Here we parse as integer/float separately and use strtof
2877     // rather than strtod, because the more accurate double representation
2878     // would invalidate floating point comparisons like QUAL=59.2, obtained via
2879     // htslib/vcf parser.
2880     // Update: use strtod() and force floats only in comparisons
2881     char *end;
2882     tok->threshold = strtol(tmp.s, &end, 10);   // integer?
2883     if ( end - tmp.s != strlen(tmp.s) )
2884     {
2885         errno = 0;
2886         tok->threshold = strtod(tmp.s, &end);   // float?
2887         if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
2888     }
2889     tok->is_constant = 1;
2890 
2891     if ( tmp.s ) free(tmp.s);
2892     return 0;
2893 }
2894 
2895 
filter_debug_print(token_t * toks,token_t ** tok_ptrs,int ntoks)2896 static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks)
2897 {
2898     int i;
2899     for (i=0; i<ntoks; i++)
2900     {
2901         token_t *tok = toks ? &toks[i] : tok_ptrs[i];
2902         if ( tok->tok_type==TOK_VAL )
2903         {
2904             if ( tok->key )
2905                 fprintf(stderr,"%s", tok->key);
2906             else if ( tok->tag )
2907                 fprintf(stderr,"%s", tok->tag);
2908             else
2909                 fprintf(stderr,"%e", tok->threshold);
2910         }
2911         else
2912             fprintf(stderr,"%c", TOKEN_STRING[tok->tok_type]);
2913         if ( tok->setter ) fprintf(stderr,"\t[setter %p]", tok->setter);
2914         fprintf(stderr,"\n");
2915     }
2916 }
2917 
str_to_lower(char * str)2918 static void str_to_lower(char *str)
2919 {
2920     while ( *str ) { *str = tolower(*str); str++; }
2921 }
perl_exec(filter_t * flt,bcf1_t * line,token_t * rtok,token_t ** stack,int nstack)2922 static int perl_exec(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
2923 {
2924 #if ENABLE_PERL_FILTERS
2925 
2926     PerlInterpreter *perl = flt->perl;
2927     if ( !perl ) error("Error: perl expression without a perl script name\n");
2928 
2929     dSP;
2930     ENTER;
2931     SAVETMPS;
2932 
2933     PUSHMARK(SP);
2934     int i,j, istack = nstack - rtok->nargs;
2935     for (i=istack+1; i<nstack; i++)
2936     {
2937         token_t *tok = stack[i];
2938         if ( tok->is_str )
2939             XPUSHs(sv_2mortal(newSVpvn(tok->str_value.s,tok->str_value.l)));
2940         else if ( tok->nvalues==1 )
2941             XPUSHs(sv_2mortal(newSVnv(tok->values[0])));
2942         else if ( tok->nvalues>1 )
2943         {
2944             AV *av = newAV();
2945             for (j=0; j<tok->nvalues; j++) av_push(av, newSVnv(tok->values[j]));
2946             SV *rv = newRV_inc((SV*)av);
2947             XPUSHs(rv);
2948         }
2949         else
2950         {
2951             bcf_double_set_missing(tok->values[0]);
2952             XPUSHs(sv_2mortal(newSVnv(tok->values[0])));
2953         }
2954     }
2955     PUTBACK;
2956 
2957     // A possible future todo: provide a means to select samples and indexes,
2958     // expressions like this don't work yet
2959     //          perl.filter(FMT/AD)[1:0]
2960 
2961     int nret = call_pv(stack[istack]->str_value.s, G_ARRAY);
2962 
2963     SPAGAIN;
2964 
2965     rtok->nvalues = nret;
2966     hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
2967     for (i=nret; i>0; i--)
2968     {
2969         rtok->values[i-1] = (double) POPn;
2970         if ( isnan(rtok->values[i-1]) ) bcf_double_set_missing(rtok->values[i-1]);
2971     }
2972 
2973     PUTBACK;
2974     FREETMPS;
2975     LEAVE;
2976 
2977 #else
2978     error("\nPerl filtering requires running `configure --enable-perl-filters` at compile time.\n\n");
2979 #endif
2980     return rtok->nargs;
2981 }
perl_init(filter_t * filter,char ** str)2982 static void perl_init(filter_t *filter, char **str)
2983 {
2984     char *beg = *str;
2985     while ( *beg && isspace(*beg) ) beg++;
2986     if ( !*beg ) return;
2987     if ( strncasecmp("perl:", beg, 5) ) return;
2988 #if ENABLE_PERL_FILTERS
2989     beg += 5;
2990     char *end = beg;
2991     while ( *end && *end!=';' ) end++;  // for now not escaping semicolons
2992     *str = end+1;
2993 
2994     if ( ++filter_ninit == 1 )
2995     {
2996         // must be executed only once, even for multiple filters; first time here
2997         int argc = 0;
2998         char **argv = NULL;
2999         char **env  = NULL;
3000         PERL_SYS_INIT3(&argc, &argv, &env);
3001     }
3002 
3003     filter->perl = perl_alloc();
3004     PerlInterpreter *perl = filter->perl;
3005 
3006     if ( !perl ) error("perl_alloc failed\n");
3007     perl_construct(perl);
3008 
3009     // name of the perl script to run
3010     char *rmme = (char*) calloc(end - beg + 1,1);
3011     memcpy(rmme, beg, end - beg);
3012     char *argv[] = { "", "" };
3013     argv[1] = expand_path(rmme);
3014     free(rmme);
3015 
3016     PL_origalen = 1;    // don't allow $0 change
3017     int ret = perl_parse(filter->perl, NULL, 2, argv, NULL);
3018     PL_exit_flags |= PERL_EXIT_DESTRUCT_END;
3019     if ( ret ) error("Failed to parse: %s\n", argv[1]);
3020     free(argv[1]);
3021 
3022     perl_run(perl);
3023 #else
3024     error("\nPerl filtering requires running `configure --enable-perl-filters` at compile time.\n\n");
3025 #endif
3026 }
perl_destroy(filter_t * filter)3027 static void perl_destroy(filter_t *filter)
3028 {
3029 #if ENABLE_PERL_FILTERS
3030     if ( !filter->perl ) return;
3031 
3032     PerlInterpreter *perl = filter->perl;
3033     perl_destruct(perl);
3034     perl_free(perl);
3035     if ( --filter_ninit <= 0  )
3036     {
3037         // similarly to PERL_SYS_INIT3, can must be executed only once? todo: test
3038         PERL_SYS_TERM();
3039     }
3040 #endif
3041 }
3042 
3043 
3044 // Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
filter_init(bcf_hdr_t * hdr,const char * str)3045 filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
3046 {
3047     filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t));
3048     filter->str = strdup(str);
3049     filter->hdr = hdr;
3050     filter->max_unpack |= BCF_UN_STR;
3051 
3052     int nops = 0, mops = 0;    // operators stack
3053     int nout = 0, mout = 0;    // filter tokens, RPN
3054     token_t *out = NULL;
3055     token_t *ops = NULL;
3056     char *tmp = filter->str;
3057     perl_init(filter, &tmp);
3058 
3059     int last_op = -1;
3060     while ( *tmp )
3061     {
3062         int len, ret;
3063         ret = filters_next_token(&tmp, &len);
3064         if ( ret==-1 ) error("Missing quotes in: %s\n", str);
3065 
3066         // fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
3067         // int i; for (i=0; i<nops; i++) fprintf(stderr," .%c", TOKEN_STRING[ops[i].tok_type]); fprintf(stderr,"\n");
3068 
3069         if ( ret==TOK_LFT )         // left bracket
3070         {
3071             nops++;
3072             hts_expand0(token_t, nops, mops, ops);
3073             ops[nops-1].tok_type = ret;
3074         }
3075         else if ( ret==TOK_RGT )    // right bracket
3076         {
3077             while ( nops>0 && ops[nops-1].tok_type!=TOK_LFT )
3078             {
3079                 nout++;
3080                 hts_expand0(token_t, nout, mout, out);
3081                 out[nout-1] = ops[nops-1];
3082                 memset(&ops[nops-1],0,sizeof(token_t));
3083                 nops--;
3084             }
3085             if ( nops<=0 ) error("Could not parse: %s\n", str);
3086             memset(&ops[nops-1],0,sizeof(token_t));
3087             nops--;
3088         }
3089         else if ( ret!=TOK_VAL )    // one of the operators
3090         {
3091             // detect unary minus: replace -value with -1*(value)
3092             if ( ret==TOK_SUB && last_op!=TOK_VAL && last_op!=TOK_RGT )
3093             {
3094                 nout++;
3095                 hts_expand0(token_t, nout, mout, out);
3096                 token_t *tok = &out[nout-1];
3097                 tok->tok_type  = TOK_VAL;
3098                 tok->hdr_id    = -1;
3099                 tok->pass_site = -1;
3100                 tok->threshold = -1.0;
3101                 ret = TOK_MULT;
3102             }
3103             else if ( ret == -TOK_FUNC )
3104             {
3105                 // this is different from TOK_PERLSUB,TOK_BINOM in that the expression inside the
3106                 // brackets gets evaluated as normal expression
3107                 nops++;
3108                 hts_expand0(token_t, nops, mops, ops);
3109                 token_t *tok = &ops[nops-1];
3110                 tok->tok_type  = -ret;
3111                 tok->hdr_id    = -1;
3112                 tok->pass_site = -1;
3113                 tok->threshold = -1.0;
3114                 if ( !strncasecmp(tmp-len,"N_PASS",6) )
3115                 {
3116                     filter->max_unpack |= BCF_UN_FMT;
3117                     tok->func = func_npass;
3118                     tok->tag = strdup("N_PASS");
3119                 }
3120                 else if ( !strncasecmp(tmp-len,"F_PASS",6) )
3121                 {
3122                     filter->max_unpack |= BCF_UN_FMT;
3123                     tok->func = func_npass;
3124                     tok->tag = strdup("F_PASS");
3125                 }
3126                 else error("The function \"%s\" is not supported\n", tmp-len);
3127                 continue;
3128             }
3129             else if ( ret < 0 )     // variable number of arguments: TOK_PERLSUB,TOK_BINOM
3130             {
3131                 ret = -ret;
3132 
3133                 tmp += len;
3134                 char *beg = tmp;
3135                 kstring_t rmme = {0,0,0};
3136                 int i, margs, nargs = 0;
3137 
3138                 if ( ret == TOK_PERLSUB )
3139                 {
3140                     while ( *beg && ((isalnum(*beg) && !ispunct(*beg)) || *beg=='_') ) beg++;
3141                     if ( *beg!='(' ) error("Could not parse the expression: %s\n", str);
3142 
3143                     // the subroutine name
3144                     kputc('"', &rmme);
3145                     kputsn(tmp, beg-tmp, &rmme);
3146                     kputc('"', &rmme);
3147                     nout++;
3148                     hts_expand0(token_t, nout, mout, out);
3149                     filters_init1(filter, rmme.s, rmme.l, &out[nout-1]);
3150                     nargs++;
3151                 }
3152                 char *end = beg;
3153                 while ( *end && *end!=')' ) end++;
3154                 if ( !*end ) error("Could not parse the expression: %s\n", str);
3155 
3156                 // subroutine arguments
3157                 rmme.l = 0;
3158                 kputsn(beg+1, end-beg-1, &rmme);
3159                 char **rmme_list = hts_readlist(rmme.s, 0, &margs);
3160                 for (i=0; i<margs; i++)
3161                 {
3162                     nargs++;
3163                     nout++;
3164                     hts_expand0(token_t, nout, mout, out);
3165                     filters_init1(filter, rmme_list[i], strlen(rmme_list[i]), &out[nout-1]);
3166                     free(rmme_list[i]);
3167                 }
3168                 free(rmme_list);
3169                 free(rmme.s);
3170 
3171                 nout++;
3172                 hts_expand0(token_t, nout, mout, out);
3173                 token_t *tok = &out[nout-1];
3174                 tok->tok_type  = ret;
3175                 tok->nargs     = nargs;
3176                 tok->hdr_id    = -1;
3177                 tok->pass_site = -1;
3178                 tok->threshold = -1.0;
3179 
3180                 tmp = end + 1;
3181                 continue;
3182             }
3183             else
3184             {
3185                 while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1].tok_type] )
3186                 {
3187                     nout++;
3188                     hts_expand0(token_t, nout, mout, out);
3189                     out[nout-1] = ops[nops-1];
3190                     memset(&ops[nops-1],0,sizeof(token_t));
3191                     nops--;
3192                 }
3193             }
3194             nops++;
3195             hts_expand0(token_t, nops, mops, ops);
3196             ops[nops-1].tok_type = ret;
3197         }
3198         else if ( !len )
3199         {
3200             if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str);
3201             break;     // all tokens read
3202         }
3203         else           // annotation name or filtering value
3204         {
3205             nout++;
3206             hts_expand0(token_t, nout, mout, out);
3207             if ( tmp[len-1]==',' )
3208                 filters_init1(filter, tmp, len-1, &out[nout-1]);
3209             else
3210                 filters_init1(filter, tmp, len, &out[nout-1]);
3211             tmp += len;
3212         }
3213         last_op = ret;
3214     }
3215     while ( nops>0 )
3216     {
3217         if ( ops[nops-1].tok_type==TOK_LFT || ops[nops-1].tok_type==TOK_RGT ) error("Could not parse the expression: [%s]\n", filter->str);
3218         nout++;
3219         hts_expand0(token_t, nout, mout, out);
3220         out[nout-1] = ops[nops-1];
3221         memset(&ops[nops-1],0,sizeof(token_t));
3222         nops--;
3223     }
3224 
3225     // In the special cases of TYPE and FILTER the BCF header IDs are yet unknown. Walk through the
3226     // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be
3227     // just before or after the FILTER token and they must be followed with a comparison operator.
3228     // At this point we also initialize regex expressions which, in RPN, must preceed the LIKE/NLIKE operator.
3229     // Additionally, treat "." as missing value rather than a string in numeric equalities; that
3230     // @file is only used with ID; etc.
3231     // This code is fragile: improve me.
3232     static int comma_separator_warned = 0;
3233     int i;
3234     for (i=0; i<nout; i++)
3235     {
3236         if ( i+1<nout && (out[i].tok_type==TOK_LT || out[i].tok_type==TOK_BT) && out[i+1].tok_type==TOK_EQ )
3237             error("Error parsing the expression: \"%s\"\n", filter->str);
3238 
3239         if ( out[i].hash )
3240         {
3241             int j = out[i+1].tok_type==TOK_VAL ? i+1 : i-1;
3242             if ( out[j].comparator!=filters_cmp_id )
3243                 error("Error: could not parse the expression. Note that the \"@file_name\" syntax can be currently used with ID column only.\n");
3244         }
3245         if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC )
3246             out[i].func = vector_logic_or;
3247         if ( out[i].tok_type==TOK_AND || out[i].tok_type==TOK_AND_VEC )
3248             out[i].func = vector_logic_and;
3249         if ( out[i].tok_type==TOK_EQ || out[i].tok_type==TOK_NE )
3250         {
3251             // Look for j="." and k numeric type
3252             int j = i-1, k = i-2;
3253             if ( !out[j].is_str ) { k = i-1, j = i-2; }
3254             if ( out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
3255             {
3256                 int set_missing = 0;
3257                 if ( out[k].hdr_id>0 )
3258                 {
3259                     int type = bcf_hdr_id2type(filter->hdr,out[k].tag_type,out[k].hdr_id);
3260                     if ( type==BCF_HT_INT ) set_missing = 1;
3261                     else if ( type==BCF_HT_REAL ) set_missing = 1;
3262                 }
3263                 else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1;
3264                 if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
3265             }
3266         }
3267         if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
3268         {
3269             int j = i-1;
3270             if ( !out[j].key )
3271                 error("Could not parse the expression, wrong value for regex operator: %s\n", filter->str);
3272             out[j].regex = (regex_t *) malloc(sizeof(regex_t));
3273             int cflags = REG_NOSUB;
3274             int len = strlen(out[j].key);
3275             if ( len>2 && out[j].key[len-1]=='i' && out[j].key[len-2]=='/' && out[j].key[len-3]!='\\'  )
3276             {
3277                 out[j].key[len-2] = 0;
3278                 cflags |= REG_ICASE;
3279             }
3280             if ( regcomp(out[j].regex, out[j].key, cflags) )
3281                 error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
3282         }
3283         if ( out[i].is_str && out[i].tok_type==TOK_VAL && out[i].key && strchr(out[i].key,',') )
3284         {
3285             int print_note = 0;
3286             if ( out[i+1].tok_type==TOK_EQ || (out[i+1].is_str && out[i+2].tok_type==TOK_EQ) ) print_note = 1;
3287             else if ( out[i+1].tok_type==TOK_NE || (out[i+1].is_str && out[i+2].tok_type==TOK_NE) ) print_note = 1;
3288             if ( print_note && !comma_separator_warned )
3289             {
3290                 comma_separator_warned = 1;
3291                 fprintf(stderr,
3292                     "Warning: comma is interpreted as a separator and OR logic is used in string comparisons.\n"
3293                     "         (Search the manual for \"Comma in strings\" to learn more.)\n");
3294             }
3295         }
3296         if ( out[i].tok_type!=TOK_VAL ) continue;
3297         if ( !out[i].tag ) continue;
3298         if ( out[i].setter==filters_set_type )
3299         {
3300             if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
3301             int itok, ival;
3302             if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1, itok = i + 1;
3303             else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1, itok = i + 1;
3304             else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) itok = i + 2, ival = i + 1;
3305             else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) itok = i + 2, ival = i + 1;
3306             else error("[%s:%d %s] Could not parse the expression: %s\n",  __FILE__,__LINE__,__FUNCTION__, filter->str);
3307             if ( !strcasecmp(out[ival].key,"snp") || !strcasecmp(out[ival].key,"snps") ) { out[ival].threshold = VCF_SNP<<1; out[ival].is_str = 0; }
3308             else if ( !strcasecmp(out[ival].key,"indel") || !strcasecmp(out[ival].key,"indels") ) { out[ival].threshold = VCF_INDEL<<1; out[ival].is_str = 0; }
3309             else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; }
3310             else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; }
3311             else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; }
3312             else if ( !strcasecmp(out[ival].key,"overlap") ) { out[ival].threshold = VCF_OVERLAP<<1; out[ival].is_str = 0; }
3313             else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; }
3314             else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str);
3315             if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and;
3316             out[ival].tag = out[ival].key; out[ival].key = NULL;
3317             i = itok;
3318             continue;
3319         }
3320         if ( !strcmp(out[i].tag,"GT") )
3321         {
3322             if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
3323             int ival;
3324             if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
3325             else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1;
3326             else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = i + 1;
3327             else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1;
3328             else error("[%s:%d %s] Could not parse the expression: %s\n",  __FILE__,__LINE__,__FUNCTION__, filter->str);
3329 
3330             if ( !out[ival].key ) error("Comparison between samples is not supported, sorry!\n");
3331 
3332             // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc
3333             if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
3334             else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
3335             else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
3336             else if ( !strcasecmp(out[ival].key,"mis") ) { out[i].setter = filters_set_genotype4;  str_to_lower(out[ival].key); }
3337             else if ( !strcasecmp(out[ival].key,"ref") ) { out[i].setter = filters_set_genotype4;  str_to_lower(out[ival].key); }
3338             else if ( !strcasecmp(out[ival].key,"alt") ) { out[i].setter = filters_set_genotype4;  str_to_lower(out[ival].key); }
3339             else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2;  str_to_lower(out[ival].key); }
3340             else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; }   // ra
3341             else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; }   // aA
3342             else if ( !strcasecmp(out[ival].key,"aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='a'; }  // aa
3343             else if ( !strcasecmp(out[ival].key,"a") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]=0; }  // a
3344             else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; }  // r
3345             continue;
3346         }
3347         if ( out[i].tag_type==BCF_HL_FLT )
3348         {
3349             if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
3350             int itok = i, ival;
3351             if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
3352             else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_IN, ival = i - 1;
3353             else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NOT_IN, ival = i - 1;
3354             else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i;
3355             else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_IN, ival = ++i;
3356             else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NOT_IN, ival = ++i;
3357             else error("[%s:%d %s] Could not parse the expression: %s\n",  __FILE__,__LINE__,__FUNCTION__, filter->str);
3358             if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
3359                 error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
3360             if ( strcmp(".",out[ival].key) )
3361             {
3362                 out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key);
3363                 if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) )
3364                     error("The filter \"%s\" not present in the VCF header\n", out[ival].key);
3365             }
3366             else
3367                 out[ival].hdr_id = -1;
3368             out[ival].tag = out[ival].key; out[ival].key = NULL;
3369             out[itok].hdr_id = out[ival].hdr_id;
3370             continue;
3371         }
3372     }
3373     filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0;
3374     for (i=0; i<nout; i++)
3375     {
3376         if ( out[i].tok_type==TOK_MAX )      { out[i].func = func_max; out[i].tok_type = TOK_FUNC; }
3377         else if ( out[i].tok_type==TOK_MIN ) { out[i].func = func_min; out[i].tok_type = TOK_FUNC; }
3378         else if ( out[i].tok_type==TOK_AVG ) { out[i].func = func_avg; out[i].tok_type = TOK_FUNC; }
3379         else if ( out[i].tok_type==TOK_MEDIAN ) { out[i].func = func_median; out[i].tok_type = TOK_FUNC; }
3380         else if ( out[i].tok_type==TOK_AVG ) { out[i].func = func_avg; out[i].tok_type = TOK_FUNC; }
3381         else if ( out[i].tok_type==TOK_STDEV ) { out[i].func = func_stddev; out[i].tok_type = TOK_FUNC; }
3382         else if ( out[i].tok_type==TOK_SUM ) { out[i].func = func_sum; out[i].tok_type = TOK_FUNC; }
3383         else if ( out[i].tok_type==TOK_ABS ) { out[i].func = func_abs; out[i].tok_type = TOK_FUNC; }
3384         else if ( out[i].tok_type==TOK_CNT ) { out[i].func = func_count; out[i].tok_type = TOK_FUNC; }
3385         else if ( out[i].tok_type==TOK_LEN ) { out[i].func = func_strlen; out[i].tok_type = TOK_FUNC; }
3386         else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; }
3387         else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; }
3388         else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; }
3389         else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; }
3390         else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; }
3391         else if ( out[i].tok_type==TOK_sAVG ) { out[i].func = func_smpl_avg; out[i].tok_type = TOK_FUNC; }
3392         else if ( out[i].tok_type==TOK_sMEDIAN ) { out[i].func = func_smpl_median; out[i].tok_type = TOK_FUNC; }
3393         else if ( out[i].tok_type==TOK_sSTDEV ) { out[i].func = func_smpl_stddev; out[i].tok_type = TOK_FUNC; }
3394         else if ( out[i].tok_type==TOK_sSUM ) { out[i].func = func_smpl_sum; out[i].tok_type = TOK_FUNC; }
3395         hts_expand0(double,1,out[i].mvalues,out[i].values);
3396         if ( filter->nsamples )
3397         {
3398             out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
3399             int j;
3400             for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 0;
3401         }
3402     }
3403 
3404     if (0) filter_debug_print(out, NULL, nout);
3405 
3406     if ( mops ) free(ops);
3407     filter->filters   = out;
3408     filter->nfilters  = nout;
3409     filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout);
3410     return filter;
3411 }
3412 
filter_destroy(filter_t * filter)3413 void filter_destroy(filter_t *filter)
3414 {
3415     perl_destroy(filter);
3416     int i;
3417     for (i=0; i<filter->nfilters; i++)
3418     {
3419         if ( filter->filters[i].key ) free(filter->filters[i].key);
3420         free(filter->filters[i].str_value.s);
3421         free(filter->filters[i].tag);
3422         free(filter->filters[i].idxs);
3423         free(filter->filters[i].usmpl);
3424         free(filter->filters[i].values);
3425         free(filter->filters[i].pass_samples);
3426         if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
3427         if (filter->filters[i].regex)
3428         {
3429             regfree(filter->filters[i].regex);
3430             free(filter->filters[i].regex);
3431         }
3432     }
3433     free(filter->filters);
3434     free(filter->flt_stack);
3435     free(filter->str);
3436     free(filter->tmpi);
3437     free(filter->tmpf);
3438     free(filter->tmps.s);
3439     free(filter);
3440 }
3441 
filter_test(filter_t * filter,bcf1_t * line,const uint8_t ** samples)3442 int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
3443 {
3444     bcf_unpack(line, filter->max_unpack);
3445 
3446     int i, nstack = 0;
3447     for (i=0; i<filter->nfilters; i++)
3448     {
3449         filter->filters[i].pass_site = 0;
3450         if ( filter->filters[i].tok_type == TOK_VAL )
3451         {
3452             if ( filter->filters[i].setter )    // variable, query the VCF line
3453                 filter->filters[i].setter(filter, line, &filter->filters[i]);
3454             else if ( filter->filters[i].key )  // string constant
3455             {
3456                 filter->filters[i].str_value.l = 0;
3457                 kputs(filter->filters[i].key, &filter->filters[i].str_value);
3458                 filter->filters[i].nvalues = filter->filters[i].str_value.l;
3459             }
3460             else    // numeric constant
3461             {
3462                 filter->filters[i].values[0] = filter->filters[i].threshold;
3463                 filter->filters[i].nvalues   = 1;
3464             }
3465             filter->flt_stack[nstack++] = &filter->filters[i];
3466             continue;
3467         }
3468         else if ( filter->filters[i].func )
3469         {
3470             int nargs = filter->filters[i].func(filter, line, &filter->filters[i], filter->flt_stack, nstack);
3471             filter->flt_stack[nstack-nargs] = &filter->filters[i];
3472             if ( --nargs > 0 ) nstack -= nargs;
3473             continue;
3474         }
3475         if ( nstack<2 )
3476             error("Error occurred while processing the filter \"%s\" (1:%d)\n", filter->str,nstack);  // too few values left on the stack
3477 
3478         int is_str  = filter->flt_stack[nstack-1]->is_str + filter->flt_stack[nstack-2]->is_str;
3479 
3480         if ( filter->filters[i].tok_type == TOK_ADD )
3481         {
3482             VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+);
3483             filter->flt_stack[nstack-2] = &filter->filters[i];
3484             nstack--;
3485             continue;
3486         }
3487         else if ( filter->filters[i].tok_type == TOK_SUB )
3488         {
3489             VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-);
3490             filter->flt_stack[nstack-2] = &filter->filters[i];
3491             nstack--;
3492             continue;
3493         }
3494         else if ( filter->filters[i].tok_type == TOK_MULT )
3495         {
3496             VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*);
3497             filter->flt_stack[nstack-2] = &filter->filters[i];
3498             nstack--;
3499             continue;
3500         }
3501         else if ( filter->filters[i].tok_type == TOK_DIV )
3502         {
3503             VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/);
3504             filter->flt_stack[nstack-2] = &filter->filters[i];
3505             nstack--;
3506             continue;
3507         }
3508 
3509         // ideally, these comparators would become func, but this would require more work in init1()
3510         if ( filter->filters[i].comparator )
3511         {
3512             filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
3513         }
3514         else if ( filter->flt_stack[nstack-1]->comparator )
3515         {
3516             filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
3517         }
3518         else if ( filter->flt_stack[nstack-2]->comparator )
3519         {
3520             filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],line);
3521         }
3522         else if ( is_str==2 )
3523         {
3524             cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i]);
3525         }
3526         else
3527         {
3528             if ( is_str==1 ) error("Error: cannot use arithmetic operators to compare strings and numbers\n");
3529 
3530             // Determine what to do with one [1] or both [2] sides missing. The first field [0] gives [1]|[2]
3531             int missing_logic[] = {0,0,0};
3532             if ( filter->filters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; }
3533             if ( filter->filters[i].tok_type == TOK_NE ) { missing_logic[0] = missing_logic[1] = 1; }
3534 
3535             if ( filter->filters[i].tok_type == TOK_EQ )
3536                 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],==,missing_logic)
3537             else if ( filter->filters[i].tok_type == TOK_NE )
3538                 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],!=,missing_logic)
3539             else if ( filter->filters[i].tok_type == TOK_LE )
3540                 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<=,missing_logic)
3541             else if ( filter->filters[i].tok_type == TOK_LT )
3542                 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<,missing_logic)
3543             else if ( filter->filters[i].tok_type == TOK_BT )
3544                 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>,missing_logic)
3545             else if ( filter->filters[i].tok_type == TOK_BE )
3546                 CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic)
3547             else
3548                 error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type);
3549         }
3550         filter->flt_stack[nstack-2] = &filter->filters[i];
3551         nstack--;
3552     }
3553     if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack);    // too few values left on the stack
3554     if ( samples )
3555     {
3556         *samples = filter->max_unpack&BCF_UN_FMT ? filter->flt_stack[0]->pass_samples : NULL;
3557         if ( *samples && !filter->flt_stack[0]->nsamples )
3558         {
3559             for (i=0; i<filter->nsamples; i++)
3560                 filter->flt_stack[0]->pass_samples[i] = filter->flt_stack[0]->pass_site;
3561         }
3562     }
3563     return filter->flt_stack[0]->pass_site;
3564 }
3565 
filter_max_unpack(filter_t * flt)3566 int filter_max_unpack(filter_t *flt)
3567 {
3568     return flt->max_unpack;
3569 }
3570 
filter_get_doubles(filter_t * filter,int * nval,int * nval1)3571 const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1)
3572 {
3573     token_t *tok = filter->flt_stack[0];
3574     if ( tok->nvalues )
3575     {
3576         *nval  = tok->nvalues;
3577         *nval1 = tok->nval1;
3578     }
3579     else
3580     {
3581         if ( !tok->values ) error("fixme in filter_get_doubles(): %s\n", filter->str);
3582         *nval  = 1;
3583         *nval1 = 1;
3584         tok->values[0] = filter->flt_stack[0]->pass_site;
3585     }
3586     return tok->values;
3587 }
3588 
filter_set_samples(filter_t * filter,const uint8_t * samples)3589 void filter_set_samples(filter_t *filter, const uint8_t *samples)
3590 {
3591     int i,j;
3592     for (i=0; i<filter->nfilters; i++)
3593     {
3594         if ( !filter->filters[i].nsamples ) continue;
3595         for (j=0; j<filter->filters[i].nsamples; j++) filter->filters[i].usmpl[j] = samples[j];
3596     }
3597 }
3598 
3599