1 /* The MIT License
2 
3    Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "filter.h"
25 
26 /**
27  * Constructor.
28  */
Node()29 Node::Node()
30 {
31     parent = NULL;
32     left = NULL;
33     right = NULL;
34     tag = {0,0,0};
35     s = {0,0,0};
36     regex_set = false;
37 };
38 
39 /**
40  * Constructor with type initlialization.
41  */
Node(int32_t type)42 Node::Node(int32_t type)
43 {
44     parent = NULL;
45     left = NULL;
46     right = NULL;
47     tag = {0,0,0};
48     s = {0,0,0};
49     regex_set = false;
50     this->type = type;
51 };
52 
53 /**
54  * Evaluates the actions for this node.
55  */
evaluate(bcf_hdr_t * h,bcf1_t * v,Variant * variant,bool debug)56 void Node::evaluate(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug)
57 {
58     if (debug)
59         std::cerr << "evaluation  "  << type2string(type) << "\n";
60 
61     //by default
62     value_exists = true;
63 
64     if (type&VT_LOGIC_OP)
65     {
66         if (!left->value_exists && type==VT_NOT)
67         {
68             if (debug)
69                 std::cerr << "\tVT_NOT "   <<  left->b << " \n";
70 
71             b = true;
72             return;
73         }
74 
75         //think about this, what happens when INFO values are not present, do you treat as a flag?
76         if (!left->value_exists)
77         {
78             b = false;
79             value_exists = false;
80             return;
81         }
82 
83         if (type==VT_NOT)
84         {
85             if (debug)
86                 std::cerr << "\tVT_NOT "   <<  left->b << " \n";
87 
88             b = !(left->b);
89             return;
90         }
91 
92         if (!right->value_exists)
93         {
94             b = false;
95             value_exists = false;
96             return;
97         }
98 
99         if (type==VT_AND)
100         {
101             if (debug)
102                 std::cerr << "\tVT_AND "   <<  left->b << "&" << right->b    <<  " \n";
103             b = (left->b && right->b);
104             return;
105         }
106         else if (type==VT_OR)
107         {
108             b = (left->b || right->b);
109             return;
110         }
111     }
112     else if (type&VT_MATH_CMP)
113     {
114         if (!left->value_exists && !right->value_exists)
115         {
116             value_exists = false;
117             return;
118         }
119 
120         if (type==VT_EQ)
121         {
122             //if an INFO field is involved and does not exist, then we evaluate as false
123             if (!left->value_exists || !right->value_exists)
124             {
125                 value_exists = true;
126                 b = false;
127                 return;
128             }
129 
130             if ((left->type&VT_INT))
131             {
132                 if ((right->type&VT_INT))
133                 {
134                     if (debug)
135                         std::cerr << "\tVT_EQ "   <<  left->i << "==" << right->i    <<  " \n";
136                     b = (left->i==right->i);
137                     return;
138                 }
139                 else if ((right->type&VT_FLT))
140                 {
141                     if (debug)
142                         std::cerr << "\tVT_EQ "   <<  left->i << "==" << right->f    <<  " \n";
143                     b = (left->i==right->f);
144                     return;
145                 }
146             }
147             else if ((left->type&VT_FLT))
148             {
149                 if ((right->type&VT_INT))
150                 {
151                     if (debug)
152                         std::cerr << "\tVT_EQ "   <<  left->f << "==" << right->i    <<  " \n";
153                     b = (left->f==right->i);
154                     return;
155                 }
156                 else if ((right->type&VT_FLT))
157                 {
158                     if (debug)
159                         std::cerr << "\tVT_EQ "   <<  left->f << "==" << right->f    <<  " \n";
160                     b = (left->f==right->f);
161                     return;
162                 }
163             }
164             else if ((left->type&VT_STR) && (right->type&VT_STR))
165             {
166                 if (debug)
167                     std::cerr << "\tVT_EQ "   <<  left->s.s << "&" << right->s.s    <<  " \n";
168                 b = strcmp(left->s.s, right->s.s)==0 ? true : false;
169                 return;
170             }
171 
172             fprintf(stderr, "[%s:%d %s] evaluation not supported : == %s %s\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
173             exit(1);
174         }
175         else if (type==VT_MATCH)
176         {
177             //if an INFO field is involved and does not exist, then we evaluate as false
178             if (!left->value_exists || !right->value_exists)
179             {
180                 value_exists = true;
181                 b = false;
182                 return;
183             }
184 
185             if ((left->type&VT_STR) && (right->type&VT_STR))
186             {
187                 if (debug)
188                     std::cerr << "\tVT_MATCH "   <<  left->s.s << "&" << right->s.s    <<  " \n";
189 
190                 if (!regex_set)
191                 {
192                     pregex.set(right->s.s);
193                     regex_set = true;
194                 }
195 
196                 b = pregex.match(left->s.s);
197 
198                 return;
199             }
200 
201             fprintf(stderr, "[%s:%d %s] evaluation not supported : =~ %s %s\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
202             exit(1);
203         }
204         else if (type==VT_NO_MATCH)
205         {
206             //if an INFO field is involved and does not exist, then we evaluate as false
207             if (!left->value_exists || !right->value_exists)
208             {
209                 value_exists = true;
210                 b = false;
211                 return;
212             }
213 
214             if ((left->type&VT_STR) && (right->type&VT_STR))
215             {
216                 if (debug)
217                     std::cerr << "\tVT_NO_MATCH "   <<  left->s.s << "&" << right->s.s    <<  " \n";
218 
219                 if (!regex_set)
220                 {
221                     pregex.set(right->s.s);
222                     regex_set = true;
223                 }
224 
225                 b = !pregex.match(left->s.s);
226 
227                 return;
228             }
229 
230             fprintf(stderr, "[%s:%d %s] evaluation not supported : ~~ %s %s\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
231             exit(1);
232         }
233         else if (type==VT_NE)
234         {
235             //fprintf(stderr, "[%s:%d %s] check: %s %s: !=\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
236 
237             //if an INFO field is involved and does not exist, then we evaluate as false
238             if (!left->value_exists || !right->value_exists)
239             {
240                 value_exists = true;
241                 b = false;
242                 return;
243             }
244 
245             if ((left->type&VT_INT))
246             {
247                 if ((right->type&VT_INT))
248                 {
249                     b = (left->i!=right->i);
250                     return;
251                 }
252                 else if ((right->type&VT_FLT))
253                 {
254                     b = (left->i!=right->f);
255                     return;
256                 }
257             }
258             else if ((left->type&VT_FLT))
259             {
260                 if ((right->type&VT_INT))
261                 {
262                     b = (left->f!=right->i);
263                     return;
264                 }
265                 else if ((right->type&VT_FLT))
266                 {
267                     b = (left->f!=right->f);
268                     return;
269                 }
270             }
271             else if ((left->type&VT_STR) && (right->type&VT_STR))
272             {
273                 b = strcmp(left->s.s, right->s.s)==0 ? false : true;
274                 return;
275             }
276 
277             fprintf(stderr, "[%s:%d %s] evaluation not supported: %s %s: !=\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
278             exit(1);
279         }
280         else if (type==VT_LE)
281         {
282             //if an INFO field is involved and does not exist, then we evaluate as false
283             if (!left->value_exists || !right->value_exists)
284             {
285                 value_exists = true;
286                 b = false;
287                 return;
288             }
289 
290             if ((left->type&VT_INT))
291             {
292                 if ((right->type&VT_INT))
293                 {
294                     b = (left->i<=right->i);
295                     return;
296                 }
297                 else if ((right->type&VT_FLT))
298                 {
299                     b = (left->i<=right->f);
300                     return;
301                 }
302             }
303             else if ((left->type&VT_FLT))
304             {
305                 if ((right->type&VT_INT))
306                 {
307                     type |= VT_INT;
308                     b = (left->f<=right->i);
309                     return;
310                 }
311                 else if ((right->type&VT_FLT))
312                 {
313                     b = (left->f<=right->f);
314                     return;
315                 }
316             }
317             else if ((left->type&VT_STR) && (right->type&VT_STR))
318             {
319                 b = strcmp(left->s.s, right->s.s)<=0 ? true : false;
320                 return;
321             }
322 
323             fprintf(stderr, "[%s:%d %s] evaluation not supported: %s %s: <=\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
324             exit(1);
325         }
326         else if (type==VT_GE)
327         {
328             //if an INFO field is involved and does not exist, then we evaluate as false
329             if (!left->value_exists || !right->value_exists)
330             {
331                 value_exists = true;
332                 b = false;
333                 return;
334             }
335 
336             if ((left->type&VT_INT))
337             {
338                 if ((right->type&VT_INT))
339                 {
340                     b = (left->i>=right->i);
341                     return;
342                 }
343                 else if ((right->type&VT_FLT))
344                 {
345                     b = (left->i>=right->f);
346                     return;
347                 }
348             }
349             else if ((left->type&VT_FLT))
350             {
351                 if ((right->type&VT_INT))
352                 {
353                     b = (left->f>=right->i);
354                     return;
355                 }
356                 else if ((right->type&VT_FLT))
357                 {
358                     b = (left->f>=right->f);
359                     return;
360                 }
361             }
362             else if ((left->type&VT_STR) && (right->type&VT_STR))
363             {
364                 b = strcmp(left->s.s, right->s.s)>=0 ? true : false;
365                 return;
366             }
367 
368             fprintf(stderr, "[%s:%d %s] evaluation not supported: %s %s: >=\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
369             exit(1);
370         }
371         else if (type==VT_GT)
372         {
373             //if an INFO field is involved and does not exist, then we evaluate as false
374             if (!left->value_exists || !right->value_exists)
375             {
376                 value_exists = true;
377                 b = false;
378                 return;
379             }
380 
381             if ((left->type&VT_INT))
382             {
383                 if ((right->type&VT_INT))
384                 {
385                     b = (left->i>right->i);
386                     return;
387                 }
388                 else if ((right->type&VT_FLT))
389                 {
390                     b = (left->i>right->f);
391                     return;
392                 }
393             }
394             else if ((left->type&VT_FLT))
395             {
396                 if ((right->type&VT_INT))
397                 {
398                     b = (left->f>right->i);
399                     return;
400                 }
401                 else if ((right->type&VT_FLT))
402                 {
403                     b = (left->f>right->f);
404                     return;
405                 }
406             }
407             else if ((left->type&VT_STR) && (right->type&VT_STR))
408             {
409                 b = strcmp(left->s.s, right->s.s)>0 ? true : false;
410                 return;
411             }
412 
413             fprintf(stderr, "[%s:%d %s] evaluation not supported: %s %s: >\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
414             exit(1);
415         }
416         else if (type==VT_LT)
417         {
418             //if an INFO field is involved and does not exist, then we evaluate as false
419             if (!left->value_exists || !right->value_exists)
420             {
421                 value_exists = true;
422                 b = false;
423                 return;
424             }
425 
426             if ((left->type&VT_INT))
427             {
428                 if ((right->type&VT_INT))
429                 {
430                     if (debug)
431                         std::cerr << "\tVT_LT "   <<  left->i << "<" << right->i    <<  " \n";
432 
433                     b = (left->i<right->i);
434                     return;
435                 }
436                 else if ((right->type&VT_FLT))
437                 {
438                     if (debug)
439                         std::cerr << "\tVT_LT "   <<  left->i << "<" << right->f    <<  " \n";
440 
441                     b = (left->i<right->f);
442                     return;
443                 }
444             }
445             else if ((left->type&VT_FLT))
446             {
447                 if ((right->type&VT_INT))
448                 {
449                     if (debug)
450                         std::cerr << "\tVT_LT "   <<  left->f << "<" << right->i    <<  " \n";
451 
452                     b = (left->f<right->i);
453                     return;
454                 }
455                 else if ((right->type&VT_FLT))
456                 {
457                     if (debug)
458                         std::cerr << "\tVT_LT "   <<  left->f << "<" << right->f    <<  " \n";
459 
460                     b = (left->f<right->f);
461                     return;
462                 }
463             }
464             else if ((left->type&VT_STR) && (right->type&VT_STR))
465             {
466                 if (debug)
467                         std::cerr << "\tVT_LT "   <<  left->s.s << "<" << right->s.s    <<  " \n";
468 
469                 b = strcmp(left->s.s, right->s.s)<0 ? true : false;
470                 return;
471             }
472 
473             fprintf(stderr, "[%s:%d %s] evaluation not supported: %s %s: <\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
474             exit(1);
475         }
476     }
477     //this is
478     else if (type&VT_BCF_OP)
479     {
480 
481 //        std::cerr << "\tprocessing BCF OPT for the first time\n";
482 
483         value_exists = true;
484 
485         if (type==VT_REF_COL)
486         {
487             s.l = 0;
488             char* ref = bcf_get_ref(v);
489             kputs(ref, &s);
490         }
491         else if (type==VT_ALT)
492         {
493             int32_t no_allele = bcf_get_n_allele(v);
494             if (no_allele)
495             {
496                 s.l = 0;
497                 char** allele = bcf_get_allele(v);
498                 for (int32_t i=1; i<v->n_allele; ++i)
499                 {
500                     if (i>1) kputc(',', &s);
501                     kputs(allele[i], &s);
502                 }
503             }
504             else
505             {
506                 s.l = 0;
507             }
508         }
509         else if (type==VT_QUAL)
510         {
511             if (bcf_float_is_missing(bcf_get_qual(v)))
512             {
513                 f = -1;
514                 value_exists = false;
515             }
516             else
517             {
518                 f = bcf_get_qual(v);
519                 value_exists = true;
520             }
521         }
522         else if (type==VT_FILTER)
523         {
524             bcf_unpack(v, BCF_UN_FLT);
525             if (bcf_get_n_filter(v) && bcf_has_filter(h, v, tag.s))
526             {
527                 b = true;
528             }
529             else
530             {
531                 b = false;
532             }
533         }
534         else if (type==VT_N_FILTER)
535         {
536             i = bcf_get_n_filter(v);
537             f = i;
538             b = true;
539             value_exists = true;
540         }
541         //figure out type
542         //this should only be invoked once so that we can short circuit
543         //the type checking in subsequent applications of the filter
544         //expression
545         else if (type==VT_INFO)
546         {
547             if (!bcf_hdr_idinfo_exists(h, BCF_HL_INFO, bcf_hdr_id2int(h, BCF_DT_ID, tag.s)))
548             {
549                 fprintf(stderr, "[%s:%d %s] INFO tag %s does not exist in header of VCF file.\n", __FILE__, __LINE__, __FUNCTION__, tag.s);
550                 exit(1);
551             }
552 
553             int32_t int_id = bcf_hdr_id2int(h, BCF_DT_ID, tag.s);
554             int32_t info_type = bcf_hdr_id2type(h, BCF_HL_INFO, int_id);
555             var_length = bcf_hdr_id2length(h, BCF_HL_INFO, int_id);
556             number = bcf_hdr_id2number(h, BCF_HL_INFO, int_id);
557             bcf_unpack(v, BCF_UN_INFO);
558 
559             if (info_type==BCF_HT_FLAG)
560             {
561                 type |= VT_FLG;
562                 if (bcf_get_info_flag(h, v, tag.s, 0, 0)>0)
563                 {
564                     i = 1;
565                     f = 1;
566                     b = true;
567                     s.l=0;
568                 }
569                 else
570                 {
571                     b = false;
572                     value_exists = false;
573                 }
574             }
575             else if (info_type==BCF_HT_INT)
576             {
577 //                std::cerr << "\tadd INT to VT_INFO type\n";
578 
579 
580                 type |= VT_INT;
581 
582                 if (var_length==BCF_VL_FIXED)
583                 {
584 //                    std::cerr << "\tFIXED length\n";
585 
586 //                    std::cerr << "\tnumber : " << number << "\n";
587                     if (number==1)
588                     {
589                         int32_t ns = 0;
590                         int32_t *is = NULL;
591                         if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
592                         {
593 //                            std::cerr << "\tupdating field i\n";
594 
595                             i = is[0];
596                             f = (float)is[0];
597                         }
598                         else
599                         {
600                             b = false;
601                             value_exists = false;
602                         }
603 
604                         if (ns) free(is);
605                     }
606                     else if (number>1)
607                     {
608 //                        std::cerr << "\tnumber > 1 "  << "\n";
609 //                        std::cerr << "1) updating initial record\n";
610                         int32_t ns = 0;
611                         int32_t *is = NULL;
612                         if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
613                         {
614                             ivec.resize(0);
615                             fvec.resize(0);
616                             for (int32_t i=0; i<ns; ++i)
617                             {
618                                 ivec.push_back(is[i]);
619                                 fvec.push_back(is[i]);
620                             }
621                             i = is[index-1];
622                             f = (float)is[index-1];
623                         }
624                         else
625                         {
626                             b = false;
627                             value_exists = false;
628                         }
629 
630                         if (ns) free(is);
631                     }
632                     else
633                     {
634                         std::cerr << "\thmmmmmm "  << "\n";
635                         //hmmmmm.....
636                     }
637                 }
638                 else if (var_length==BCF_VL_R || var_length==BCF_VL_A || var_length==BCF_VL_G)
639                 {
640                     int32_t no_alleles = bcf_get_n_allele(v);
641 
642                     if (var_length==BCF_VL_R)
643                     {
644 //                        std::cerr << "\tR length\n";
645                         number = no_alleles;
646                     }
647                     else if (var_length==BCF_VL_A)
648                     {
649 //                        std::cerr << "\tA length\n";
650                         number = no_alleles-1;
651                     }
652                     else if (var_length==BCF_VL_G)
653                     {
654 //                        std::cerr << "\tG length\n";
655                         //assume ploidy is too for the time being
656                         //usage is not determinable for info fields because
657                         //ploidy is individual dependent
658                         number = bcf_ap2g(no_alleles, 2);
659                     }
660 
661 //                    std::cerr << "\tnumber : " << number << "\n";
662                     if (number==1)
663                     {
664                         int32_t ns = 0;
665                         int32_t *is = NULL;
666                         if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
667                         {
668 //                            std::cerr << "\tupdating field i\n";
669 
670                             i = is[0];
671                             f = (float)is[0];
672                         }
673                         else
674                         {
675                             b = false;
676                             value_exists = false;
677                         }
678 
679                         if (ns) free(is);
680                     }
681                     else if (number>1)
682                     {
683                         std::cerr << "\tnumber > 1 "  << "\n";
684 //                        std::cerr << "1) updating initial record\n";
685                         int32_t ns = 0;
686                         int32_t *is = NULL;
687                         if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
688                         {
689                             ivec.resize(0);
690                             fvec.resize(0);
691                             for (int32_t i=0; i<ns; ++i)
692                             {
693                                 ivec.push_back(is[i]);
694                                 fvec.push_back(is[i]);
695                             }
696                             i = is[index-1];
697                             f = (float)is[index-1];
698                         }
699                         else
700                         {
701                             b = false;
702                             value_exists = false;
703                         }
704 
705                         if (ns) free(is);
706                     }
707                     else
708                     {
709                         std::cerr << "\thmmmmmm "  << "\n";
710 
711                         //hmmmmm.....
712                     }
713                 }
714                 else if (var_length==BCF_VL_VAR)
715                 {
716                     std::cerr << "\tvariable length, should we treat as a blob?\n";
717                 }
718             }
719             else if (info_type==BCF_HT_REAL)
720             {
721                 type |= VT_FLT;
722 
723                 if (var_length==BCF_VL_FIXED)
724                 {
725                     if (number==1)
726                     {
727                         int32_t ns = 0;
728                         int32_t *fs = NULL;
729                         if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
730                         {
731                             i = (int32_t) fs[0];
732                             f = fs[0];
733                         }
734                         else
735                         {
736                             b = false;
737                             value_exists = false;
738                         }
739 
740                         if (ns) free(fs);
741                     }
742                     else if (number>1)
743                     {
744                         int32_t ns = 0;
745                         int32_t *fs = NULL;
746                         if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
747                         {
748                             ivec.resize(0);
749                             fvec.resize(0);
750                             for (int32_t i=0; i<ns; ++i)
751                             {
752                                 ivec.push_back(fs[i]);
753                                 fvec.push_back(fs[i]);
754                             }
755                             i = (int32_t) fs[index-1];
756                             f = fs[index-1];
757                         }
758                         else
759                         {
760                             b = false;
761                             value_exists = false;
762                         }
763 
764                         if (ns) free(fs);
765                     }
766                 }
767                 else if (var_length==BCF_VL_R || var_length==BCF_VL_A || var_length==BCF_VL_G)
768                 {
769                     int32_t no_alleles = bcf_get_n_allele(v);
770 
771                     if (var_length==BCF_VL_R)
772                     {
773                         number = no_alleles;
774                     }
775                     else if (var_length==BCF_VL_A)
776                     {
777                         number = no_alleles-1;
778                     }
779                     else if (var_length==BCF_VL_G)
780                     {
781                         //assume ploidy is too for the time being
782                         //usage is not determinable for info fields because
783                         //ploidy is individual dependent
784                         number = bcf_ap2g(no_alleles, 2);
785                     }
786 
787                     if (number==1)
788                     {
789                         int32_t ns = 0;
790                         int32_t *fs = NULL;
791                         if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
792                         {
793                             i = (int32_t)fs[0];
794                             f = fs[0];
795                         }
796                         else
797                         {
798                             b = false;
799                             value_exists = false;
800                         }
801 
802                         if (ns) free(fs);
803                     }
804                     else if (number>1)
805                     {
806                         int32_t ns = 0;
807                         int32_t *fs = NULL;
808                         if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
809                         {
810                             ivec.resize(0);
811                             fvec.resize(0);
812                             for (int32_t i=0; i<ns; ++i)
813                             {
814                                 ivec.push_back(fs[i]);
815                                 fvec.push_back(fs[i]);
816                             }
817                             i = (int32_t)fs[index-1];
818                             f = fs[index-1];
819                         }
820                         else
821                         {
822                             b = false;
823                             value_exists = false;
824                         }
825 
826                         if (ns) free(fs);
827                     }
828                 }
829                 else if (var_length==BCF_VL_VAR)
830                 {
831                     //variable length, should we treat as a blob?
832                 }
833             }
834             else if (info_type==BCF_HT_STR)
835             {
836                 type |= VT_STR;
837                 //todo: how do you handle a vector of strings?
838 
839                 int32_t n = s.m;
840                 if (bcf_get_info_string(h, v, tag.s, &s.s, &n)>0)
841                 {
842                     b = true;
843                     s.m = n;
844                 }
845                 else
846                 {
847                     b = false;
848                     value_exists = false;
849                 }
850             }
851         }
852         //fast lane for INFO-TYPEs
853         else if (type==(VT_INFO|VT_INT))
854         {
855             if (var_length==BCF_VL_FIXED)
856             {
857                 if (number==1)
858                 {
859                     int32_t n = 0;
860                     int32_t *data = NULL;
861                     if (bcf_get_info_int32(h, v, tag.s, &data, &n)>0)
862                     {
863                         b = true;
864                         i = data[0];
865                         f = (float)data[0];
866                     }
867                     else
868                     {
869                         b = false;
870                         value_exists = false;
871                     }
872 
873                     if (data) free(data);
874                 }
875                 else if (number>1)
876                 {
877                     int32_t n = 0;
878                     int32_t *data = NULL;
879                     if (bcf_get_info_int32(h, v, tag.s, &data, &n)>0)
880                     {
881                         ivec.resize(0);
882                         fvec.resize(0);
883                         for (int32_t i=0; i<n; ++i)
884                         {
885                             ivec.push_back(data[i]);
886                             fvec.push_back(data[i]);
887                         }
888                         b = true;
889                         i = data[index-1];
890                         f = (float)data[index-1];
891                     }
892                     else
893                     {
894                         b = false;
895                         value_exists = false;
896                     }
897 
898                     if (n) free(data);
899                 }
900                 else
901                 {
902                     //hmmmmm.....
903                 }
904             }
905             else if (var_length==BCF_VL_R || var_length==BCF_VL_A || var_length==BCF_VL_G)
906             {
907                 int32_t no_alleles = bcf_get_n_allele(v);
908 
909                 if (var_length==BCF_VL_R)
910                 {
911                     number = no_alleles;
912                 }
913                 else if (var_length==BCF_VL_A)
914                 {
915                     number = no_alleles-1;
916                 }
917                 else if (var_length==BCF_VL_G)
918                 {
919                     //assume ploidy is too for the time being
920                     //usage is not determinable for info fields because
921                     //ploidy is individual dependent
922                     number = bcf_ap2g(no_alleles, 2);
923                 }
924 
925                 if (number==1)
926                 {
927                     int32_t ns = 0;
928                     int32_t *is = NULL;
929                     if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
930                     {
931                         i = is[0];
932                         f = (float)is[0];
933                     }
934                     else
935                     {
936                         b = false;
937                         value_exists = false;
938                     }
939 
940                     if (ns) free(is);
941                 }
942                 else if (number>1)
943                 {
944                     int32_t ns = 0;
945                     int32_t *is = NULL;
946                     if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
947                     {
948                         ivec.resize(0);
949                         fvec.resize(0);
950                         for (int32_t i=0; i<ns; ++i)
951                         {
952                             ivec.push_back(is[i]);
953                             fvec.push_back(is[i]);
954                         }
955                         i = is[index-1];
956                         f = (float)is[index-1];
957                     }
958                     else
959                     {
960                         b = false;
961                         value_exists = false;
962                     }
963 
964                     if (ns) free(is);
965                 }
966             }
967         }
968         else if (type==(VT_INFO|VT_FLT))
969         {
970             float *data = NULL;
971             int32_t n = 0;
972 
973             if (bcf_get_info_float(h, v, tag.s, &data, &n)>0)
974             {
975                 b = true;
976                 i = (int32_t) data[0];
977                 f = data[0];
978             }
979             else
980             {
981                 b = false;
982                 value_exists = false;
983             }
984 
985             if (n) free(data);
986         }
987         else if (type==(VT_INFO|VT_FLT))
988         {
989             if (var_length==BCF_VL_FIXED)
990             {
991                 if (number==1)
992                 {
993                     int32_t ns = 0;
994                     int32_t *fs = NULL;
995                     if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
996                     {
997                         i = (int32_t) fs[0];
998                         f = fs[0];
999                     }
1000                     else
1001                     {
1002                         b = false;
1003                         value_exists = false;
1004                     }
1005 
1006                     if (ns) free(fs);
1007                 }
1008                 else if (number>1)
1009                 {
1010                     int32_t ns = 0;
1011                     int32_t *fs = NULL;
1012                     if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
1013                     {
1014                         ivec.resize(0);
1015                         fvec.resize(0);
1016                         for (int32_t i=0; i<ns; ++i)
1017                         {
1018                             ivec.push_back(fs[i]);
1019                             fvec.push_back(fs[i]);
1020                         }
1021                         i = (int32_t) fs[index-1];
1022                         f = fs[index-1];
1023                     }
1024                     else
1025                     {
1026                         b = false;
1027                         value_exists = false;
1028                     }
1029 
1030                     if (ns) free(fs);
1031                 }
1032             }
1033             else if (var_length==BCF_VL_R || var_length==BCF_VL_A || var_length==BCF_VL_G)
1034             {
1035                 int32_t no_alleles = bcf_get_n_allele(v);
1036 
1037                 if (var_length==BCF_VL_R)
1038                 {
1039                     number = no_alleles;
1040                 }
1041                 else if (var_length==BCF_VL_A)
1042                 {
1043                     number = no_alleles-1;
1044                 }
1045                 else if (var_length==BCF_VL_G)
1046                 {
1047                     //assume ploidy is too for the time being
1048                     //usage is not determinable for info fields because
1049                     //ploidy is individual dependent
1050                     number = bcf_ap2g(no_alleles, 2);
1051                 }
1052 
1053                 if (number==1)
1054                 {
1055                     int32_t ns = 0;
1056                     int32_t *fs = NULL;
1057                     if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
1058                     {
1059                         i = (int32_t)fs[0];
1060                         f = fs[0];
1061                     }
1062                     else
1063                     {
1064                         b = false;
1065                         value_exists = false;
1066                     }
1067 
1068                     if (ns) free(fs);
1069                 }
1070                 else if (number>1)
1071                 {
1072                     int32_t ns = 0;
1073                     int32_t *fs = NULL;
1074                     if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
1075                     {
1076                         ivec.resize(0);
1077                         fvec.resize(0);
1078                         for (int32_t i=0; i<ns; ++i)
1079                         {
1080                             ivec.push_back(fs[i]);
1081                             fvec.push_back(fs[i]);
1082                         }
1083                         i = (int32_t)fs[index-1];
1084                         f = fs[index-1];
1085                     }
1086                     else
1087                     {
1088                         b = false;
1089                         value_exists = false;
1090                     }
1091 
1092                     if (ns) free(fs);
1093                 }
1094             }
1095             else if (var_length==BCF_VL_VAR)
1096             {
1097                 std::cerr << "\tvariable length, should we treat as a blob?\n";
1098             }
1099         }
1100         else if (type==(VT_INFO|VT_STR))
1101         {
1102             int32_t n = s.m;
1103 
1104             if (bcf_get_info_string(h, v, tag.s, &s.s, &n)>0)
1105             {
1106                 b = true;
1107                 s.m = n;
1108             }
1109             else
1110             {
1111                 b = false;
1112                 value_exists = false;
1113             }
1114         }
1115         else if (type==(VT_INFO|VT_FLG))
1116         {
1117             if (bcf_get_info_flag(h, v, tag.s, 0, 0)>0)
1118             {
1119                 b = true;
1120             }
1121             else
1122             {
1123                 b = false;
1124             }
1125 
1126             if (debug)
1127                 std::cerr << "\tVT_INFO|VT_FLG "   << i << " " << f << " " << b << " " << s.s <<  " \n";
1128         }
1129         else if (type==VT_VARIANT_TYPE)
1130         {
1131             if (debug)
1132                 std::cerr << "\tVTYPE "   <<  Variant::vtype2string(variant->type) <<  " \n";
1133             i = variant->type;
1134             b = i;
1135         }
1136         else if (type==VT_VARIANT_DLEN)
1137         {
1138             if (debug)
1139                 std::cerr << "\tDLEN "   <<  variant->alleles[0].dlen <<  " \n";
1140             i = variant->alleles[0].dlen;
1141             f = i;
1142         }
1143         else if (type==VT_VARIANT_LEN)
1144         {
1145             if (debug)
1146                 std::cerr << "\tLEN "   <<  abs(variant->alleles[0].dlen) <<  " \n";
1147             i = abs(variant->alleles[0].dlen);
1148             f = i;
1149         }
1150         else if (type==VT_VARIANT_CONTAINS_N)
1151         {
1152             if (debug)
1153                 std::cerr << "\tVARIANT_CONTAINS_N "   <<  variant->contains_N <<  " \n";
1154 
1155             bcf_unpack(v, BCF_UN_FLT);
1156             b = variant->contains_N;
1157         }
1158         else if (type==VT_N_ALLELE)
1159         {
1160             if (debug)
1161                 std::cerr << "\tN_ALLELE "   <<  bcf_get_n_allele(v) <<  " \n";
1162             i = bcf_get_n_allele(v);
1163             f = i;
1164         }
1165     }
1166     else if (type&VT_MATH_OP)
1167     {
1168         if (!left->value_exists && !right->value_exists)
1169         {
1170             value_exists = false;
1171             return;
1172         }
1173 
1174         if ((type&15)==11) //ADD
1175         {
1176             if ((left->type&VT_INT))
1177             {
1178                 if ((right->type&VT_INT))
1179                 {
1180                     type |= VT_INT;
1181                     i = (left->i+right->i);
1182                     f = i;
1183                     return;
1184                 }
1185                 else if ((right->type&VT_FLT))
1186                 {
1187                     type |= VT_FLT;
1188                     f = (left->i+right->f);
1189                     return;
1190                 }
1191             }
1192             else if ((left->type&VT_FLT))
1193             {
1194                 if ((right->type&VT_INT))
1195                 {
1196                     type |= VT_FLT;
1197                     f = (left->f+right->i);
1198                     return;
1199                 }
1200                 else if ((right->type&VT_FLT))
1201                 {
1202                     type |= VT_FLT;
1203                     f = (left->f+right->f);
1204                     return;
1205                 }
1206             }
1207 
1208             fprintf(stderr, "[%s:%d %s] evaluation not supported : +\n", __FILE__, __LINE__, __FUNCTION__);
1209             exit(1);
1210         }
1211         else if ((type&15)==12) //SUB
1212         {
1213             if ((left->type&VT_INT))
1214             {
1215                 if ((right->type&VT_INT))
1216                 {
1217                     type |= VT_INT;
1218                     i = (left->i-right->i);
1219                     f = i;
1220                     return;
1221                 }
1222                 else if ((right->type&VT_FLT))
1223                 {
1224                     type |= VT_FLT;
1225                     f = (left->i-right->f);
1226                     return;
1227                 }
1228             }
1229             else if ((left->type&VT_FLT))
1230             {
1231                 if ((right->type&VT_INT))
1232                 {
1233                     type |= VT_FLT;
1234                     f = (left->f-right->i);
1235                     return;
1236                 }
1237                 else if ((right->type&VT_FLT))
1238                 {
1239                     type |= VT_FLT;
1240                     f = (left->f-right->f);
1241                     return;
1242                 }
1243             }
1244 
1245             fprintf(stderr, "[%s:%d %s] evaluation not supported : -\n", __FILE__, __LINE__, __FUNCTION__);
1246             exit(1);
1247         }
1248         else if ((type&15)==13) //MUL
1249         {
1250             if ((left->type&VT_INT))
1251             {
1252                 if ((right->type&VT_INT))
1253                 {
1254                     type |= VT_INT;
1255                     i = (left->i*right->i);
1256                     f = i;
1257                     return;
1258                 }
1259                 else if ((right->type&VT_FLT))
1260                 {
1261                     type |= VT_FLT;
1262                     f = (left->i*right->f);
1263                     return;
1264                 }
1265             }
1266             else if ((left->type&VT_FLT))
1267             {
1268                 if ((right->type&VT_INT))
1269                 {
1270                     type |= VT_FLT;
1271                     f = (left->f*right->i);
1272                     return;
1273                 }
1274                 else if ((right->type&VT_FLT))
1275                 {
1276                     type |= VT_FLT;
1277                     f = (left->f*right->f);
1278                     return;
1279                 }
1280             }
1281 
1282             fprintf(stderr, "[%s:%d %s] evaluation not supported : *\n", __FILE__, __LINE__, __FUNCTION__);
1283             exit(1);
1284         }
1285         else if ((type&15)==14) //DIV
1286         {
1287             if (left->type&VT_INT)
1288             {
1289                 if (right->type&VT_INT)
1290                 {
1291                     type |= VT_FLT;
1292                     f = ((float)left->i/right->i);
1293                     return;
1294                 }
1295                 else if (right->type&VT_FLT)
1296                 {
1297                     type |= VT_FLT;
1298                     f = (left->i/right->f);
1299                     return;
1300                 }
1301             }
1302             else if (left->type&VT_FLT)
1303             {
1304                 if (right->type&VT_INT)
1305                 {
1306                     type |= VT_FLT;
1307                     f = (left->f/right->i);
1308                     return;
1309                 }
1310                 else if (right->type&VT_FLT)
1311                 {
1312                     type |= VT_FLT;
1313                     f = (left->f/right->f);
1314                     return;
1315                 }
1316             }
1317 
1318             fprintf(stderr, "[%s:%d %s] evaluation not supported : /\n", __FILE__, __LINE__, __FUNCTION__);
1319             exit(1);
1320         }
1321         else if ((type&15)==15) //BIT AND
1322         {
1323             if ((left->type&VT_INT) && (right->type&VT_INT))
1324             {
1325                 i = (left->i & right->i);
1326                 b = i;
1327                 return;
1328             }
1329 
1330             fprintf(stderr, "[%s:%d %s] evaluation not supported for & :  %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
1331             exit(1);
1332         }
1333         else if ((type&15)==16) //BIT OR
1334         {
1335             if ((left->type&VT_INT) && (right->type&VT_INT))
1336             {
1337                 i = (left->i | right->i);
1338                 b = i;
1339                 return;
1340             }
1341 
1342             fprintf(stderr, "[%s:%d %s] evaluation not supported for | : %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
1343             exit(1);
1344         }
1345         else
1346         {
1347             fprintf(stderr, "[%s:%d %s] math op not supported : %d\n", __FILE__, __LINE__, __FUNCTION__, (type&15));
1348             exit(1);
1349         }
1350     }
1351 }
1352 
1353 /**
1354  * Converts type to string.
1355  */
type2string(int32_t type)1356 std::string Node::type2string(int32_t type)
1357 {
1358     std::string s = "";
1359     if (type&VT_BOOL)
1360     {
1361         s += (s==""? "" : "|");
1362         s += "BOOL";
1363     }
1364 
1365     if (type&VT_INT)
1366     {
1367         s += (s==""? "" : "|");
1368         s += "INT";
1369     }
1370 
1371     if (type&VT_FLT)
1372     {
1373         s += (s==""? "" : "|");
1374         s += "FLT";
1375     }
1376 
1377     if (type&VT_STR)
1378     {
1379         s += (s==""? "" : "|");
1380         s += "STR";
1381     }
1382 
1383     if (type&VT_FLG)
1384     {
1385         s += (s==""? "" : "|");
1386         s += "FLG";
1387     }
1388 
1389     if (type&VT_LOGIC_OP)
1390     {
1391         s += (s==""? "" : "|");
1392         s += "LOGIC";
1393     }
1394 
1395     if (type&VT_MATH_CMP)
1396     {
1397         s += (s==""? "" : "|");
1398         s += "MATH_CMP";
1399     }
1400 
1401     if (type&VT_MATH_OP)
1402     {
1403         s += (s==""? "" : "|");
1404         s += "MATH_OP";
1405     }
1406 
1407     if (type&VT_BCF_OP)
1408     {
1409         s += (s==""? "" : "|");
1410         s += "BCF_OP";
1411     }
1412 
1413     if (type==VT_QUAL)
1414     {
1415         s += (s==""? "" : "|");
1416         s += "QUAL";
1417     }
1418 
1419     if (type==VT_FILTER)
1420     {
1421         s += (s==""? "" : "|");
1422         s += "FILTER";
1423     }
1424 
1425     if (type==VT_N_FILTER)
1426     {
1427         s += (s==""? "" : "|");
1428         s += "FILTER";
1429     }
1430 
1431     if (type==VT_INFO)
1432     {
1433         s += (s==""? "" : "|");
1434         s += "INFO";
1435     }
1436 
1437     if (type==VT_N_ALLELE)
1438     {
1439         s += (s==""? "" : "|");
1440         s += "N_ALLELE";
1441     }
1442 
1443     if (type==VT_VARIANT_TYPE)
1444     {
1445         s += (s==""? "" : "|");
1446         s += "VARIANT_TYPE";
1447     }
1448 
1449     if (type==VT_VARIANT_DLEN)
1450     {
1451         s += (s==""? "" : "|");
1452         s += "VARIANT_DLEN";
1453     }
1454 
1455     if (type==VT_VARIANT_LEN)
1456     {
1457         s += (s==""? "" : "|");
1458         s += "VARIANT_LEN";
1459     }
1460 
1461     return s;
1462 };
1463 
1464 /**
1465  * Constructor.
1466  */
Filter()1467 Filter::Filter()
1468 {
1469     this->tree = NULL;
1470 };
1471 
1472 /**
1473  * Constructor with expression initialization.
1474  */
Filter(std::string exp)1475 Filter::Filter(std::string exp)
1476 {
1477     this->tree = NULL;
1478     parse(exp.c_str(), false);
1479 };
1480 
1481 /**
1482  * Parses filter expression.
1483  */
parse(const char * exp,bool debug)1484 void Filter::parse(const char* exp, bool debug)
1485 {
1486     if (strlen(exp)!=0)
1487     {
1488         //trim the white spaces out of the string
1489         std::string exp_no_space = "";
1490         for (uint32_t i=0; i<strlen(exp); ++i)
1491         {
1492             if (!isspace(exp[i]))
1493             {
1494                 exp_no_space.append(1, exp[i]);
1495             }
1496         }
1497 
1498         reset();
1499         tree = new Node();
1500         parse(exp_no_space.c_str(), exp_no_space.size(), tree, debug);
1501 
1502         if (!(tree->type&VT_BOOL))
1503         {
1504             fprintf(stderr, "[%s:%d %s] filter expression not boolean %s\n", __FILE__, __LINE__, __FUNCTION__, exp);
1505             exit(1);
1506         }
1507     }
1508     else
1509     {
1510         tree = NULL;
1511     }
1512 }
1513 
1514 /**
1515  * Applies filter to vcf record.
1516  */
apply(bcf_hdr_t * h,bcf1_t * v,Variant * variant,bool debug)1517 bool Filter::apply(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug) //recursive
1518 {
1519     if (tree==NULL)
1520     {
1521         return true;
1522     }
1523 
1524     this->h = h;
1525     this->v = v;
1526     this->variant = variant;
1527 
1528     if (debug) std::cerr << "==========\n";
1529     apply(tree, debug);
1530     if (debug) std::cerr << "==========\n";
1531 
1532     if (tree->value_exists)
1533     {
1534         return tree->b;
1535     }
1536     else
1537     {
1538         return false;
1539     }
1540 }
1541 
1542 /**
1543  * Attempts to simplify the expression tree by collapsing nodes that can be precomputed.
1544  */
simplify()1545 void Filter::simplify()
1546 {
1547 }
1548 
1549 /**
1550  * Resets filter.
1551  */
reset()1552 void Filter::reset()
1553 {
1554     if (tree!=NULL)
1555     {
1556         delete tree;
1557         tree = NULL;
1558     }
1559 }
1560 
1561 /**
1562  * Constructs the expression tree.
1563  */
parse(const char * exp,int32_t len,Node * node,bool debug)1564 void Filter::parse(const char* exp, int32_t len, Node *node, bool debug)
1565 {
1566     if (debug)
1567     {
1568         std::cerr << "parsing \"";
1569         for (int32_t i=0; i<len; ++i)
1570             std::cerr << exp[i] ;
1571         std::cerr << "\" " << len << "\n";
1572     }
1573 
1574     //******************************
1575     //trim white spaces and brackets
1576     //******************************
1577     while (*exp==' ') ++exp;
1578     while (exp[len-1]==' ') --len;
1579     trim_brackets(exp, len, debug);
1580 
1581     //this is a literal
1582     if (is_literal(exp, len, debug))
1583     {
1584         //will not recurse any further
1585         return parse_literal(exp, len, node, debug);
1586     }
1587     //unary operation
1588     else if (is_unary_op(exp, len, debug))
1589     {
1590         node->type = VT_NOT;
1591         if (debug) std::cerr << "\tis not_op\n";
1592 
1593         node->left = new Node();
1594         parse(exp+1, len-1, node->left, debug);
1595     }
1596     //binary operation
1597     else
1598     {
1599         if (debug) std::cerr << "\tis binary op\n";
1600 
1601         const char* p = exp; //points to end of first part
1602         const char* q = exp; //points to start of second part
1603         const char* r = exp; //for iteration
1604 
1605         int32_t type = INT_MAX;
1606 
1607         while(r-exp!=len)
1608         {
1609             //bypasses bracketed expressions
1610             fwd_to_closing_bracket(r, len);
1611 
1612             int32_t oplen = -1;
1613             int32_t ctype = peek_op(r, len, oplen, debug);
1614 
1615             if(ctype!=-1)
1616             {
1617                 if (debug) std::cerr<< "\ttype : " << ctype << " \n";
1618                 //this implements order of operations
1619                 if (ctype<type)
1620                 {
1621                     if (debug) std::cerr<< "\tupdating type\n";
1622                     type = ctype;
1623                     p = r-1;
1624                     q = r+oplen;
1625                 }
1626 
1627                 r += oplen-1;
1628             }
1629 
1630             ++r;
1631         }
1632 
1633         //valid binary operator not found
1634         if (type==INT_MAX)
1635         {
1636             kstring_t s = {0,0,0};
1637             kputsn(exp, len, &s);
1638             fprintf(stderr, "[%s:%d %s] binary operator not found in \"%s\". Valid operators are  ==,!=,=~,~~,&&,||,&,|,+,-,*,/\n", __FILE__, __LINE__, __FUNCTION__, s.s);
1639             if (s.m) free(s.s);
1640             exit(1);
1641         }
1642 
1643         node->type = type;
1644 
1645         node->left = new Node();
1646         parse(exp, p-exp+1, node->left, debug);
1647 
1648         node->right = new Node();
1649         parse(q, len-(q-exp), node->right, debug);
1650     }
1651 }
1652 
1653 /**
1654  * Checks if exp is a unary op.
1655  */
is_unary_op(const char * exp,int32_t len,bool debug)1656 bool Filter::is_unary_op(const char* exp, int32_t len, bool debug)
1657 {
1658     //check to make sure not a binary operator
1659 
1660     //NOT operator
1661     if (exp[0]=='~')
1662     {
1663         if (is_literal(exp+1, len-1, debug)||is_bracketed_expression(exp+1, len-1, debug))
1664         {
1665             if (debug) std::cerr << "\tis unary op\n";
1666             return true;
1667         }
1668     }
1669 
1670     if (debug) std::cerr << "\tis not unary op\n";
1671     return false;
1672 }
1673 
1674 /**
1675  * Checks is expression is bracketed.
1676  */
is_bracketed_expression(const char * exp,int32_t len,bool debug)1677 bool Filter::is_bracketed_expression(const char* exp, int32_t len, bool debug)
1678 {
1679     if (*exp=='(' && exp[len-1]==')')
1680     {
1681         int32_t opened_brackets = 1;
1682         bool nested = true;
1683         int32_t j=1;
1684         while(j<len-1)
1685         {
1686             if(exp[j]=='(')
1687             {
1688                if (opened_brackets<0)
1689                {
1690                     kstring_t s = {0,0,0};
1691                     kputsn(exp, len, &s);
1692                     fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
1693                     if (s.m) free(s.s);
1694                     exit(1);
1695                }
1696 
1697                ++opened_brackets;
1698             }
1699             else if (exp[j]==')')
1700             {
1701                if (opened_brackets<=0)
1702                {
1703                     kstring_t s = {0,0,0};
1704                     kputsn(exp, len, &s);
1705                     fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
1706                     if (s.m) free(s.s);
1707                     exit(1);
1708                }
1709 
1710                --opened_brackets;
1711             }
1712 
1713             ++j;
1714         }
1715 
1716         return opened_brackets==1;
1717     }
1718 
1719     return false;
1720 }
1721 
1722 /**
1723  * Check if exp is a literal (no binary operations, no unary operation).
1724  */
is_literal(const char * exp,int32_t len,bool debug)1725 bool Filter::is_literal(const char* exp, int32_t len, bool debug)
1726 {
1727     //checks if string literal
1728     if (exp==strchr(exp,'\'') && exp+len-1==strchr(exp+1,'\''))
1729     {
1730         return true;
1731     }
1732 
1733     const char* q = exp;
1734     while (q-exp<len)
1735     {
1736         if(*q=='=' ||
1737            *q=='&' ||
1738            *q=='|' ||
1739            *q=='>' ||
1740            *q=='<' ||
1741            *q=='!' ||
1742            *q=='+' ||
1743            *q=='*' ||
1744            *q=='/' ||
1745            *q=='~' ||
1746            (*q=='-' && exp!=q))
1747         {
1748             if (debug) std::cerr << "\tis not literal\n";
1749             return false;
1750         }
1751 
1752         ++q;
1753     }
1754 
1755     if (debug) std::cerr << "\tis literal\n";
1756     return true;
1757 }
1758 
1759 /**
1760  * Detect index width.
1761  * e.g. AC[1] => 3
1762  * e.g. EVIDENCE[12] => 4
1763  *
1764  * Populated index with the index queried.
1765  */
get_index_width(const char * exp,int32_t n,int32_t * index)1766 int32_t Filter::get_index_width(const char *exp, int32_t n, int32_t *index)
1767 {
1768     if (n<=3) return 0;
1769 
1770     int32_t width = 0;
1771     bool end_bracket = false;
1772     bool in_digits = false;
1773     bool beg_bracket = false;
1774     int32_t current_index = 0;
1775     int32_t index_weight = 1;
1776 
1777     //check end bracket and digit next to square bracket
1778     if (exp[n-1]==']' && exp[n-2]>=48&&exp[n-2]<=57)
1779     {
1780         in_digits = true;
1781         width = 2;
1782         current_index += exp[n-2]-48;
1783         index_weight *= 10;
1784     }
1785     else
1786     {
1787         *index = 0;
1788         return 0;
1789     }
1790 
1791     //check digits in square brackets
1792     for (int32_t i=n-3; i>=0; --i)
1793     {
1794         //check if digit
1795         if (exp[i]>=48&&exp[i]<=57)
1796         {
1797             //all is good, continue
1798             ++width;
1799             current_index += (exp[i]-48)*index_weight;
1800             index_weight *= 10;
1801         }
1802         else if (exp[i]=='[')
1803         {
1804             *index = current_index;
1805             return width+1;
1806         }
1807     }
1808 
1809     *index = 0;
1810     return 0;
1811 }
1812 
1813 /**
1814  * Parse literals.
1815  */
parse_literal(const char * exp,int32_t len,Node * node,bool debug)1816 void Filter::parse_literal(const char* exp, int32_t len, Node * node, bool debug)
1817 {
1818     node->type = VT_UNKNOWN;
1819 
1820     if (strncmp(exp, "PASS", len)==0)
1821     {
1822         node->type = VT_FILTER;
1823         kputsn(exp, len, &node->tag);
1824         if (debug) std::cerr << "\tis filter_op\n";
1825         return;
1826     }
1827     else if (strncmp(exp, "REF", 3)==0)
1828     {
1829         node->type = VT_REF_COL;
1830         exp += 3;
1831         if (debug) std::cerr << "\tis ref_op\n";
1832         return;
1833     }
1834     else if (strncmp(exp, "ALT", 3)==0)
1835     {
1836         node->type = VT_ALT;
1837         exp += 3;
1838         if (debug) std::cerr << "\tis alt_op\n";
1839         return;
1840     }
1841     else if (strncmp(exp, "QUAL", 4)==0)
1842     {
1843         node->type = VT_QUAL;
1844         exp += 4;
1845         if (debug) std::cerr << "\tis qual_op\n";
1846         return;
1847     }
1848     else if (strncmp(exp, "FILTER.", 7)==0)
1849     {
1850         node->type = VT_FILTER;
1851         exp += 7;
1852         kputsn(exp, len-7, &node->tag);
1853         if (debug) std::cerr << "\tis filter_op\n";
1854         return;
1855     }
1856     else if (strncmp(exp, "N_FILTER", len)==0)
1857     {
1858         node->type = VT_N_FILTER;
1859         if (debug) std::cerr << "\tis n_filter_op\n";
1860         return;
1861     }
1862     else if (strncmp(exp, "INFO.", 5)==0)
1863     {
1864         node->type = VT_INFO;
1865         exp += 5;
1866         //detect index access e.g. AC[1]
1867         int32_t index_width = get_index_width(exp, len-5, &node->index);
1868         kputsn(exp, len-5-index_width, &node->tag);
1869         if (debug) std::cerr << "\tis info_op\n";
1870         return;
1871     }
1872     else if (strncmp(exp, "VTYPE", 5)==0)
1873     {
1874         node->type = VT_VARIANT_TYPE;
1875         if (debug) std::cerr << "\tis variant_op\n";
1876         return;
1877     }
1878     else if (strncmp(exp, "N_ALLELE", len)==0)
1879     {
1880         node->type = VT_N_ALLELE;
1881         if (debug) std::cerr << "\tis nallele_op\n";
1882         return;
1883     }
1884     else if (strncmp(exp, "INDEL", len)==0)
1885     {
1886         node->type = VT_INT;
1887         node->i = VT_INDEL;
1888         node->value_exists = true;
1889         if (debug) std::cerr << "\tis INDEL\n";
1890         return;
1891     }
1892     else if (strncmp(exp, "SNP", len)==0)
1893     {
1894         node->type = VT_INT;
1895         node->i = VT_SNP;
1896         node->value_exists = true;
1897         if (debug) std::cerr << "\tis SNP\n";
1898         return;
1899     }
1900     else if (strncmp(exp, "MNP", len)==0)
1901     {
1902         node->type = VT_INT;
1903         node->i = VT_MNP;
1904         node->value_exists = true;
1905         if (debug) std::cerr << "\tis MNP\n";
1906         return;
1907     }
1908     else if (strncmp(exp, "CLUMPED", len)==0)
1909     {
1910         node->type = VT_INT;
1911         node->i = VT_CLUMPED;
1912         node->value_exists = true;
1913         if (debug) std::cerr << "\tis CLUMPED\n";
1914         return;
1915     }
1916     else if (strncmp(exp, "VNTR", len)==0)
1917     {
1918         node->type = VT_INT;
1919         node->i = VT_VNTR;
1920         if (debug) std::cerr << "\tis VNTR\n";
1921         return;
1922     }
1923     else if (strncmp(exp, "SV", len)==0)
1924     {
1925         node->type = VT_INT;
1926         node->i = VT_SV;
1927         if (debug) std::cerr << "\tis SV\n";
1928         return;
1929     }
1930     else if (strncmp(exp, "REF", len)==0)
1931     {
1932         node->type = VT_INT;
1933         node->i = VT_REF;
1934         node->value_exists = true;
1935         if (debug) std::cerr << "\tis REF\n";
1936         return;
1937     }
1938     else if (strncmp(exp, "DLEN", len)==0)
1939     {
1940         node->type = VT_VARIANT_DLEN;
1941         node->value_exists = false;
1942         if (debug) std::cerr << "\tis dlen\n";
1943         return;
1944     }
1945     else if (strncmp(exp, "LEN", len)==0)
1946     {
1947         node->type = VT_VARIANT_LEN;
1948         node->value_exists = false;
1949         if (debug) std::cerr << "\tis len\n";
1950         return;
1951     }
1952     else if (strncmp(exp, "VARIANT_CONTAINS_N", len)==0)
1953     {
1954         node->type = VT_VARIANT_CONTAINS_N;
1955         node->value_exists = false;
1956         if (debug) std::cerr << "\tis variant contains N\n";
1957         return;
1958     }
1959     else
1960     {
1961         //integer type
1962         const char* start = exp;
1963         char *end = NULL;
1964         int32_t i = std::strtoul(exp, &end, 10);
1965         if (end==exp+len)
1966         {
1967             node->type = VT_INT;
1968             node->i = i;
1969             node->f = (float)i;
1970             node->value_exists = true;
1971             if (debug) std::cerr << "\tis int\n";
1972             return;
1973         }
1974 
1975         //float type
1976         start = exp;
1977         end = NULL;
1978         float f = std::strtod(exp, &end);
1979         if (end==exp+len)
1980         {
1981             node->type = VT_FLT;
1982             node->f = f;
1983             node->value_exists = true;
1984             if (debug) std::cerr << "\tis float: " << f << "\n";
1985             return;
1986         }
1987 
1988         //string type
1989         if (exp[0]=='\'' && exp[len-1]=='\'')
1990         {
1991             node->type = VT_STR;
1992             kputsn(exp+1, len-2, &node->tag);
1993             kputc(0, &node->tag);
1994             kputsn(exp+1, len-2, &node->s);
1995             kputc(0, &node->s);
1996             node->value_exists = true;
1997             if (debug) std::cerr << "\tis string\n";
1998             return;
1999         }
2000 
2001         if (node->type==VT_UNKNOWN)
2002         {
2003             kstring_t s = {0,0,0};
2004             kputsn(exp, len, &s);
2005             fprintf(stderr, "[E:%s] %s is not recognized. If it is a string, you should place it in single inverted commas.\n", __FUNCTION__, s.s);
2006             print_filter_help();
2007             exit(1);
2008         }
2009     }
2010 
2011     if (debug)
2012     {
2013         std::cerr << "\tvalue_exists " << node->value_exists << "\n";
2014         std::cerr << "\ttag          " << node->tag.s << "\n";
2015         std::cerr << "\tb            " << node->b << "\n";
2016         std::cerr << "\ti            " << node->i << "\n";
2017         std::cerr << "\tf            " << node->f << "\n";
2018     }
2019 
2020     return;
2021 }
2022 
2023 /**
2024  * Help message on filter expressions.
2025  */
print_filter_help()2026 void Filter::print_filter_help()
2027 {
2028     fprintf(stderr, "\n");
2029     fprintf(stderr, "  Variant characteristics\n");
2030     fprintf(stderr, "    VTYPE,N_ALLELE,DLEN,LEN\n");
2031     fprintf(stderr, "\n");
2032     fprintf(stderr, "  Variant value types\n");
2033     fprintf(stderr, "    SNP,MNP,INDEL,CLUMPED\n");
2034     fprintf(stderr, "\n");
2035     fprintf(stderr, "  Biallelic SNPs only                       : VTYPE==SNP&&N_ALLELE==2\n");
2036     fprintf(stderr, "  Biallelic Indels with embedded SNP        : VTYPE==(SNP|INDEL)&&N_ALLELE==2\n");
2037     fprintf(stderr, "  Biallelic variants involving insertions   : VTYPE&INDEL&&DLEN>0&&N_ALLELE==2\n");
2038     fprintf(stderr, "  Biallelic variants involving 1bp variants : LEN==1&&N_ALLELE==2\n");
2039     fprintf(stderr, "\n");
2040     fprintf(stderr, "  QUAL field\n");
2041     fprintf(stderr, "    QUAL\n");
2042     fprintf(stderr, "\n");
2043     fprintf(stderr, "  FILTER fields\n");
2044     fprintf(stderr, "    PASS, FILTER.<tag>\n");
2045     fprintf(stderr, "\n");
2046     fprintf(stderr, "  INFO fields\n");
2047     fprintf(stderr, "    INFO.<tag>\n");
2048     fprintf(stderr, "\n");
2049     fprintf(stderr, "  Passed biallelic SNPs only                  : PASS&&VTYPE==SNP&&N_ALLELE==2\n");
2050     fprintf(stderr, "  Passed Common biallelic SNPs only           : PASS&&VTYPE==SNP&&N_ALLELE==2&&INFO.AF>0.005\n");
2051     fprintf(stderr, "  Passed Common biallelic SNPs or rare indels : (PASS&&VTYPE==SNP&&N_ALLELE==2&&INFO.AF>0.005)||(VTYPE&INDEL&&INFO.AF<=0.005)\n");
2052     fprintf(stderr, "\n");
2053     fprintf(stderr, "  Regular expressions for string fields using pcre2\n");
2054     fprintf(stderr, "\n");
2055     fprintf(stderr, "  Passed variants in intergenic regions or UTR : PASS&&INFO.ANNO=~'Intergenic|UTR'\n");
2056     fprintf(stderr, "  Passed variants in intergenic regions or UTR : PASS&&INFO.ANNO=~'(?i)Intergenic|UTR'\n");
2057     fprintf(stderr, "  ignoring case\n");
2058     fprintf(stderr, "\n");
2059     fprintf(stderr, "  Operations\n");
2060     fprintf(stderr, "     ==,!=,=~,~~,~,&&,||,&,|,+,-,*,/\n");
2061     fprintf(stderr, "\n");
2062     fprintf(stderr, "  Failed rare variants : ~PASS&&(INFO.AC/INFO.AN<0.005)\n");
2063     fprintf(stderr, "\n");
2064 }
2065 
2066 /**
2067  * Trim brackets from an expression.
2068  */
trim_brackets(const char * & exp,int32_t & len,bool debug)2069 void Filter::trim_brackets(const char* &exp, int32_t &len, bool debug)
2070 {
2071     if (*exp=='(' && exp[len-1]==')')
2072     {
2073         int32_t opened_brackets = 1;
2074         bool nested = true;
2075         int32_t j=1;
2076         while(j<len-1)
2077         {
2078             if(exp[j]=='(')
2079             {
2080                if (opened_brackets<0)
2081                {
2082                     kstring_t s = {0,0,0};
2083                     kputsn(exp, len, &s);
2084                     fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
2085                     if (s.m) free(s.s);
2086                     exit(1);
2087                }
2088 
2089                ++opened_brackets;
2090             }
2091             else if (exp[j]==')')
2092             {
2093                if (opened_brackets<=0)
2094                {
2095                     kstring_t s = {0,0,0};
2096                     kputsn(exp, len, &s);
2097                     fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
2098                     if (s.m) free(s.s);
2099                     exit(1);
2100                }
2101 
2102                --opened_brackets;
2103             }
2104 
2105             if (opened_brackets==0)
2106             {
2107                 nested = false;
2108                 break;
2109             }
2110             ++j;
2111 
2112 
2113         }
2114 
2115         if (nested)
2116         {
2117             if (opened_brackets == 1)
2118             {
2119                 ++exp;
2120                 len -=2;
2121                 if (debug) std::cerr << "\t\ttrimmed brackets\n";
2122                 trim_brackets(exp, len, debug);
2123             }
2124             else
2125             {
2126                 std::cerr << "Illegal expression: brackets not correct\n";
2127             }
2128         }
2129     }
2130 }
2131 
2132 /**
2133  * Moves r to the closing bracket if this expression starts with an open bracket.
2134  * Returns -1 if end of r else 0.
2135  */
fwd_to_closing_bracket(const char * & r,int32_t & len)2136 int32_t Filter::fwd_to_closing_bracket(const char* &r, int32_t &len)
2137 {
2138     const char* s = r;
2139     if (*r=='(')
2140     {
2141         ++s;
2142         int32_t opened_brackets = 1;
2143         bool nested = true;
2144         while(s-r!=len)
2145         {
2146             if(*s=='(')
2147             {
2148                if (opened_brackets<0)
2149                {
2150                     kstring_t s = {0,0,0};
2151                     kputsn(r, len, &s);
2152                     fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
2153                     if (s.m) free(s.s);
2154                     exit(1);
2155                }
2156 
2157                ++opened_brackets;
2158             }
2159             else if (*s==')')
2160             {
2161                if (opened_brackets<=0)
2162                {
2163                     kstring_t s = {0,0,0};
2164                     kputsn(r, len, &s);
2165                     fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
2166                     if (s.m) free(s.s);
2167                     exit(1);
2168                }
2169 
2170                --opened_brackets;
2171             }
2172 
2173             if (opened_brackets==0)
2174             {
2175                 r = s;
2176 
2177                 if (s-r==len-1)
2178                 {
2179                     return -1;
2180                 }
2181                 else
2182                 {
2183                     return 0;
2184                 }
2185             }
2186 
2187             ++s;
2188         }
2189 
2190         kstring_t s = {0,0,0};
2191         kputsn(r, len, &s);
2192         fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
2193         if (s.m) free(s.s);
2194         exit(1);
2195     }
2196 
2197     return 0;
2198 }
2199 
2200 /**
2201  * Returns -1 if no operator found. Updates oplen to be the length of the operator observed.
2202  */
peek_op(const char * & r,int32_t len,int32_t & oplen,bool debug)2203 int32_t Filter::peek_op(const char* &r, int32_t len, int32_t &oplen, bool debug)
2204 {
2205     const char* s = r;
2206     oplen = -1;
2207     if (*s=='&' && (s+1-r<len) && *(s+1)=='&')
2208     {
2209         if (debug) std::cerr << "\tis && operator\n";
2210         oplen = 2;
2211         return VT_AND;
2212     }
2213     else if (*s=='|' && (s+1-r<len) && *(s+1)=='|')
2214     {
2215         if (debug) std::cerr << "\tis || operator\n";
2216         oplen = 2;
2217         return VT_OR;
2218     }
2219     else if (*s=='=' && (s+1-r<len) && *(s+1)=='=')
2220     {
2221         if (debug) std::cerr << "\tis == operator\n";
2222         oplen = 2;
2223         return VT_EQ;
2224     }
2225     else if (*s=='=' && (s+1-r<len) && *(s+1)=='~')
2226     {
2227         if (debug) std::cerr << "\tis =~ operator\n";
2228         oplen = 2;
2229         return VT_MATCH;
2230     }
2231     else if (*s=='~' && (s+1-r<len) && *(s+1)=='~')
2232     {
2233         if (debug) std::cerr << "\tis ~~ operator\n";
2234         oplen = 2;
2235         return VT_NO_MATCH;
2236     }
2237     else if (*s=='>' && (s+1-r<len) && *(s+1)=='=')
2238     {
2239         if (debug) std::cerr << "\tis >= operator\n";
2240         oplen = 2;
2241         return VT_GE;
2242     }
2243     else if (*s=='<' && (s+1-r<len) && *(s+1)=='=')
2244     {
2245         if (debug) std::cerr << "\tis <= operator\n";
2246         oplen = 2;
2247         return VT_LE;
2248     }
2249     else if (*s=='!' && (s+1-r<len) && *(s+1)=='=')
2250     {
2251         if (debug) std::cerr << "\tis != operator\n";
2252         oplen = 2;
2253         return VT_NE;
2254     }
2255     else if (*s=='+')
2256     {
2257         if (debug) std::cerr << "\tis + operator\n";
2258         oplen = 1;
2259         return VT_ADD;
2260     }
2261     else if (*s=='-')
2262     {
2263         if (debug) std::cerr << "\tis - operator\n";
2264         oplen = 1;
2265         return VT_SUB;
2266     }
2267     else if (*s=='*')
2268     {
2269         if (debug) std::cerr << "\tis * operator\n";
2270         oplen = 1;
2271         return VT_MUL;
2272     }
2273     else if (*s=='/')
2274     {
2275         if (debug) std::cerr << "\tis / operator\n";
2276         oplen = 1;
2277         return VT_DIV;
2278     }
2279     else if (*s=='&')
2280     {
2281         if (debug) std::cerr << "\tis & operator\n";
2282         oplen = 1;
2283         return VT_BIT_AND;
2284     }
2285     else if (*s=='|')
2286     {
2287         if (debug) std::cerr << "\tis | operator\n";
2288         oplen = 1;
2289         return VT_BIT_OR;
2290     }
2291     else if (*s=='>')
2292     {
2293         if (debug) std::cerr << "\tis > operator\n";
2294         oplen = 1;
2295         return VT_GT;
2296     }
2297     else if (*s=='<')
2298     {
2299         if (debug) std::cerr << "\tis < operator\n";
2300         oplen = 1;
2301         return VT_LT;
2302     }
2303 
2304     return -1;
2305 }
2306 
2307 /**
2308  * Recursive call for apply.
2309  */
apply(Node * node,bool debug)2310 void Filter::apply(Node* node, bool debug)
2311 {
2312     node->value_exists = false;
2313 
2314     //evaluate downstream
2315     if (node->left!=NULL)
2316     {
2317         apply(node->left, debug);
2318     }
2319 
2320     //can do some lazy evaluation here for && and || types.
2321     if (node->type&VT_LOGIC_OP)
2322     {
2323         if (node->type==VT_AND)
2324         {
2325             if (node->left->value_exists && !node->left->b)
2326             {
2327                 node->b = false;
2328                 node->value_exists = true;
2329                 return;
2330             }
2331             else
2332             {
2333                  apply(node->right, debug);
2334             }
2335         }
2336         else if (node->type==VT_OR)
2337         {
2338             if (node->left->value_exists && node->left->b)
2339             {
2340                 node->b = true;
2341                 node->value_exists = true;
2342                 return;
2343             }
2344             else
2345             {
2346                  apply(node->right, debug);
2347             }
2348         }
2349     }
2350     else
2351     {
2352         if (node->right!=NULL)
2353         {
2354             apply(node->right, debug);
2355         }
2356     }
2357 
2358     node->evaluate(h, v, variant, debug);
2359 };