1 /* The MIT License
2
3 Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "filter.h"
25
26 /**
27 * Constructor.
28 */
Node()29 Node::Node()
30 {
31 parent = NULL;
32 left = NULL;
33 right = NULL;
34 tag = {0,0,0};
35 s = {0,0,0};
36 regex_set = false;
37 };
38
39 /**
40 * Constructor with type initlialization.
41 */
Node(int32_t type)42 Node::Node(int32_t type)
43 {
44 parent = NULL;
45 left = NULL;
46 right = NULL;
47 tag = {0,0,0};
48 s = {0,0,0};
49 regex_set = false;
50 this->type = type;
51 };
52
53 /**
54 * Evaluates the actions for this node.
55 */
evaluate(bcf_hdr_t * h,bcf1_t * v,Variant * variant,bool debug)56 void Node::evaluate(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug)
57 {
58 if (debug)
59 std::cerr << "evaluation " << type2string(type) << "\n";
60
61 //by default
62 value_exists = true;
63
64 if (type&VT_LOGIC_OP)
65 {
66 if (!left->value_exists && type==VT_NOT)
67 {
68 if (debug)
69 std::cerr << "\tVT_NOT " << left->b << " \n";
70
71 b = true;
72 return;
73 }
74
75 //think about this, what happens when INFO values are not present, do you treat as a flag?
76 if (!left->value_exists)
77 {
78 b = false;
79 value_exists = false;
80 return;
81 }
82
83 if (type==VT_NOT)
84 {
85 if (debug)
86 std::cerr << "\tVT_NOT " << left->b << " \n";
87
88 b = !(left->b);
89 return;
90 }
91
92 if (!right->value_exists)
93 {
94 b = false;
95 value_exists = false;
96 return;
97 }
98
99 if (type==VT_AND)
100 {
101 if (debug)
102 std::cerr << "\tVT_AND " << left->b << "&" << right->b << " \n";
103 b = (left->b && right->b);
104 return;
105 }
106 else if (type==VT_OR)
107 {
108 b = (left->b || right->b);
109 return;
110 }
111 }
112 else if (type&VT_MATH_CMP)
113 {
114 if (!left->value_exists && !right->value_exists)
115 {
116 value_exists = false;
117 return;
118 }
119
120 if (type==VT_EQ)
121 {
122 //if an INFO field is involved and does not exist, then we evaluate as false
123 if (!left->value_exists || !right->value_exists)
124 {
125 value_exists = true;
126 b = false;
127 return;
128 }
129
130 if ((left->type&VT_INT))
131 {
132 if ((right->type&VT_INT))
133 {
134 if (debug)
135 std::cerr << "\tVT_EQ " << left->i << "==" << right->i << " \n";
136 b = (left->i==right->i);
137 return;
138 }
139 else if ((right->type&VT_FLT))
140 {
141 if (debug)
142 std::cerr << "\tVT_EQ " << left->i << "==" << right->f << " \n";
143 b = (left->i==right->f);
144 return;
145 }
146 }
147 else if ((left->type&VT_FLT))
148 {
149 if ((right->type&VT_INT))
150 {
151 if (debug)
152 std::cerr << "\tVT_EQ " << left->f << "==" << right->i << " \n";
153 b = (left->f==right->i);
154 return;
155 }
156 else if ((right->type&VT_FLT))
157 {
158 if (debug)
159 std::cerr << "\tVT_EQ " << left->f << "==" << right->f << " \n";
160 b = (left->f==right->f);
161 return;
162 }
163 }
164 else if ((left->type&VT_STR) && (right->type&VT_STR))
165 {
166 if (debug)
167 std::cerr << "\tVT_EQ " << left->s.s << "&" << right->s.s << " \n";
168 b = strcmp(left->s.s, right->s.s)==0 ? true : false;
169 return;
170 }
171
172 fprintf(stderr, "[%s:%d %s] evaluation not supported : == %s %s\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
173 exit(1);
174 }
175 else if (type==VT_MATCH)
176 {
177 //if an INFO field is involved and does not exist, then we evaluate as false
178 if (!left->value_exists || !right->value_exists)
179 {
180 value_exists = true;
181 b = false;
182 return;
183 }
184
185 if ((left->type&VT_STR) && (right->type&VT_STR))
186 {
187 if (debug)
188 std::cerr << "\tVT_MATCH " << left->s.s << "&" << right->s.s << " \n";
189
190 if (!regex_set)
191 {
192 pregex.set(right->s.s);
193 regex_set = true;
194 }
195
196 b = pregex.match(left->s.s);
197
198 return;
199 }
200
201 fprintf(stderr, "[%s:%d %s] evaluation not supported : =~ %s %s\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
202 exit(1);
203 }
204 else if (type==VT_NO_MATCH)
205 {
206 //if an INFO field is involved and does not exist, then we evaluate as false
207 if (!left->value_exists || !right->value_exists)
208 {
209 value_exists = true;
210 b = false;
211 return;
212 }
213
214 if ((left->type&VT_STR) && (right->type&VT_STR))
215 {
216 if (debug)
217 std::cerr << "\tVT_NO_MATCH " << left->s.s << "&" << right->s.s << " \n";
218
219 if (!regex_set)
220 {
221 pregex.set(right->s.s);
222 regex_set = true;
223 }
224
225 b = !pregex.match(left->s.s);
226
227 return;
228 }
229
230 fprintf(stderr, "[%s:%d %s] evaluation not supported : ~~ %s %s\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
231 exit(1);
232 }
233 else if (type==VT_NE)
234 {
235 //fprintf(stderr, "[%s:%d %s] check: %s %s: !=\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
236
237 //if an INFO field is involved and does not exist, then we evaluate as false
238 if (!left->value_exists || !right->value_exists)
239 {
240 value_exists = true;
241 b = false;
242 return;
243 }
244
245 if ((left->type&VT_INT))
246 {
247 if ((right->type&VT_INT))
248 {
249 b = (left->i!=right->i);
250 return;
251 }
252 else if ((right->type&VT_FLT))
253 {
254 b = (left->i!=right->f);
255 return;
256 }
257 }
258 else if ((left->type&VT_FLT))
259 {
260 if ((right->type&VT_INT))
261 {
262 b = (left->f!=right->i);
263 return;
264 }
265 else if ((right->type&VT_FLT))
266 {
267 b = (left->f!=right->f);
268 return;
269 }
270 }
271 else if ((left->type&VT_STR) && (right->type&VT_STR))
272 {
273 b = strcmp(left->s.s, right->s.s)==0 ? false : true;
274 return;
275 }
276
277 fprintf(stderr, "[%s:%d %s] evaluation not supported: %s %s: !=\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
278 exit(1);
279 }
280 else if (type==VT_LE)
281 {
282 //if an INFO field is involved and does not exist, then we evaluate as false
283 if (!left->value_exists || !right->value_exists)
284 {
285 value_exists = true;
286 b = false;
287 return;
288 }
289
290 if ((left->type&VT_INT))
291 {
292 if ((right->type&VT_INT))
293 {
294 b = (left->i<=right->i);
295 return;
296 }
297 else if ((right->type&VT_FLT))
298 {
299 b = (left->i<=right->f);
300 return;
301 }
302 }
303 else if ((left->type&VT_FLT))
304 {
305 if ((right->type&VT_INT))
306 {
307 type |= VT_INT;
308 b = (left->f<=right->i);
309 return;
310 }
311 else if ((right->type&VT_FLT))
312 {
313 b = (left->f<=right->f);
314 return;
315 }
316 }
317 else if ((left->type&VT_STR) && (right->type&VT_STR))
318 {
319 b = strcmp(left->s.s, right->s.s)<=0 ? true : false;
320 return;
321 }
322
323 fprintf(stderr, "[%s:%d %s] evaluation not supported: %s %s: <=\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
324 exit(1);
325 }
326 else if (type==VT_GE)
327 {
328 //if an INFO field is involved and does not exist, then we evaluate as false
329 if (!left->value_exists || !right->value_exists)
330 {
331 value_exists = true;
332 b = false;
333 return;
334 }
335
336 if ((left->type&VT_INT))
337 {
338 if ((right->type&VT_INT))
339 {
340 b = (left->i>=right->i);
341 return;
342 }
343 else if ((right->type&VT_FLT))
344 {
345 b = (left->i>=right->f);
346 return;
347 }
348 }
349 else if ((left->type&VT_FLT))
350 {
351 if ((right->type&VT_INT))
352 {
353 b = (left->f>=right->i);
354 return;
355 }
356 else if ((right->type&VT_FLT))
357 {
358 b = (left->f>=right->f);
359 return;
360 }
361 }
362 else if ((left->type&VT_STR) && (right->type&VT_STR))
363 {
364 b = strcmp(left->s.s, right->s.s)>=0 ? true : false;
365 return;
366 }
367
368 fprintf(stderr, "[%s:%d %s] evaluation not supported: %s %s: >=\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
369 exit(1);
370 }
371 else if (type==VT_GT)
372 {
373 //if an INFO field is involved and does not exist, then we evaluate as false
374 if (!left->value_exists || !right->value_exists)
375 {
376 value_exists = true;
377 b = false;
378 return;
379 }
380
381 if ((left->type&VT_INT))
382 {
383 if ((right->type&VT_INT))
384 {
385 b = (left->i>right->i);
386 return;
387 }
388 else if ((right->type&VT_FLT))
389 {
390 b = (left->i>right->f);
391 return;
392 }
393 }
394 else if ((left->type&VT_FLT))
395 {
396 if ((right->type&VT_INT))
397 {
398 b = (left->f>right->i);
399 return;
400 }
401 else if ((right->type&VT_FLT))
402 {
403 b = (left->f>right->f);
404 return;
405 }
406 }
407 else if ((left->type&VT_STR) && (right->type&VT_STR))
408 {
409 b = strcmp(left->s.s, right->s.s)>0 ? true : false;
410 return;
411 }
412
413 fprintf(stderr, "[%s:%d %s] evaluation not supported: %s %s: >\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
414 exit(1);
415 }
416 else if (type==VT_LT)
417 {
418 //if an INFO field is involved and does not exist, then we evaluate as false
419 if (!left->value_exists || !right->value_exists)
420 {
421 value_exists = true;
422 b = false;
423 return;
424 }
425
426 if ((left->type&VT_INT))
427 {
428 if ((right->type&VT_INT))
429 {
430 if (debug)
431 std::cerr << "\tVT_LT " << left->i << "<" << right->i << " \n";
432
433 b = (left->i<right->i);
434 return;
435 }
436 else if ((right->type&VT_FLT))
437 {
438 if (debug)
439 std::cerr << "\tVT_LT " << left->i << "<" << right->f << " \n";
440
441 b = (left->i<right->f);
442 return;
443 }
444 }
445 else if ((left->type&VT_FLT))
446 {
447 if ((right->type&VT_INT))
448 {
449 if (debug)
450 std::cerr << "\tVT_LT " << left->f << "<" << right->i << " \n";
451
452 b = (left->f<right->i);
453 return;
454 }
455 else if ((right->type&VT_FLT))
456 {
457 if (debug)
458 std::cerr << "\tVT_LT " << left->f << "<" << right->f << " \n";
459
460 b = (left->f<right->f);
461 return;
462 }
463 }
464 else if ((left->type&VT_STR) && (right->type&VT_STR))
465 {
466 if (debug)
467 std::cerr << "\tVT_LT " << left->s.s << "<" << right->s.s << " \n";
468
469 b = strcmp(left->s.s, right->s.s)<0 ? true : false;
470 return;
471 }
472
473 fprintf(stderr, "[%s:%d %s] evaluation not supported: %s %s: <\n", __FILE__, __LINE__, __FUNCTION__, type2string(left->type).c_str(), type2string(right->type).c_str());
474 exit(1);
475 }
476 }
477 //this is
478 else if (type&VT_BCF_OP)
479 {
480
481 // std::cerr << "\tprocessing BCF OPT for the first time\n";
482
483 value_exists = true;
484
485 if (type==VT_REF_COL)
486 {
487 s.l = 0;
488 char* ref = bcf_get_ref(v);
489 kputs(ref, &s);
490 }
491 else if (type==VT_ALT)
492 {
493 int32_t no_allele = bcf_get_n_allele(v);
494 if (no_allele)
495 {
496 s.l = 0;
497 char** allele = bcf_get_allele(v);
498 for (int32_t i=1; i<v->n_allele; ++i)
499 {
500 if (i>1) kputc(',', &s);
501 kputs(allele[i], &s);
502 }
503 }
504 else
505 {
506 s.l = 0;
507 }
508 }
509 else if (type==VT_QUAL)
510 {
511 if (bcf_float_is_missing(bcf_get_qual(v)))
512 {
513 f = -1;
514 value_exists = false;
515 }
516 else
517 {
518 f = bcf_get_qual(v);
519 value_exists = true;
520 }
521 }
522 else if (type==VT_FILTER)
523 {
524 bcf_unpack(v, BCF_UN_FLT);
525 if (bcf_get_n_filter(v) && bcf_has_filter(h, v, tag.s))
526 {
527 b = true;
528 }
529 else
530 {
531 b = false;
532 }
533 }
534 else if (type==VT_N_FILTER)
535 {
536 i = bcf_get_n_filter(v);
537 f = i;
538 b = true;
539 value_exists = true;
540 }
541 //figure out type
542 //this should only be invoked once so that we can short circuit
543 //the type checking in subsequent applications of the filter
544 //expression
545 else if (type==VT_INFO)
546 {
547 if (!bcf_hdr_idinfo_exists(h, BCF_HL_INFO, bcf_hdr_id2int(h, BCF_DT_ID, tag.s)))
548 {
549 fprintf(stderr, "[%s:%d %s] INFO tag %s does not exist in header of VCF file.\n", __FILE__, __LINE__, __FUNCTION__, tag.s);
550 exit(1);
551 }
552
553 int32_t int_id = bcf_hdr_id2int(h, BCF_DT_ID, tag.s);
554 int32_t info_type = bcf_hdr_id2type(h, BCF_HL_INFO, int_id);
555 var_length = bcf_hdr_id2length(h, BCF_HL_INFO, int_id);
556 number = bcf_hdr_id2number(h, BCF_HL_INFO, int_id);
557 bcf_unpack(v, BCF_UN_INFO);
558
559 if (info_type==BCF_HT_FLAG)
560 {
561 type |= VT_FLG;
562 if (bcf_get_info_flag(h, v, tag.s, 0, 0)>0)
563 {
564 i = 1;
565 f = 1;
566 b = true;
567 s.l=0;
568 }
569 else
570 {
571 b = false;
572 value_exists = false;
573 }
574 }
575 else if (info_type==BCF_HT_INT)
576 {
577 // std::cerr << "\tadd INT to VT_INFO type\n";
578
579
580 type |= VT_INT;
581
582 if (var_length==BCF_VL_FIXED)
583 {
584 // std::cerr << "\tFIXED length\n";
585
586 // std::cerr << "\tnumber : " << number << "\n";
587 if (number==1)
588 {
589 int32_t ns = 0;
590 int32_t *is = NULL;
591 if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
592 {
593 // std::cerr << "\tupdating field i\n";
594
595 i = is[0];
596 f = (float)is[0];
597 }
598 else
599 {
600 b = false;
601 value_exists = false;
602 }
603
604 if (ns) free(is);
605 }
606 else if (number>1)
607 {
608 // std::cerr << "\tnumber > 1 " << "\n";
609 // std::cerr << "1) updating initial record\n";
610 int32_t ns = 0;
611 int32_t *is = NULL;
612 if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
613 {
614 ivec.resize(0);
615 fvec.resize(0);
616 for (int32_t i=0; i<ns; ++i)
617 {
618 ivec.push_back(is[i]);
619 fvec.push_back(is[i]);
620 }
621 i = is[index-1];
622 f = (float)is[index-1];
623 }
624 else
625 {
626 b = false;
627 value_exists = false;
628 }
629
630 if (ns) free(is);
631 }
632 else
633 {
634 std::cerr << "\thmmmmmm " << "\n";
635 //hmmmmm.....
636 }
637 }
638 else if (var_length==BCF_VL_R || var_length==BCF_VL_A || var_length==BCF_VL_G)
639 {
640 int32_t no_alleles = bcf_get_n_allele(v);
641
642 if (var_length==BCF_VL_R)
643 {
644 // std::cerr << "\tR length\n";
645 number = no_alleles;
646 }
647 else if (var_length==BCF_VL_A)
648 {
649 // std::cerr << "\tA length\n";
650 number = no_alleles-1;
651 }
652 else if (var_length==BCF_VL_G)
653 {
654 // std::cerr << "\tG length\n";
655 //assume ploidy is too for the time being
656 //usage is not determinable for info fields because
657 //ploidy is individual dependent
658 number = bcf_ap2g(no_alleles, 2);
659 }
660
661 // std::cerr << "\tnumber : " << number << "\n";
662 if (number==1)
663 {
664 int32_t ns = 0;
665 int32_t *is = NULL;
666 if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
667 {
668 // std::cerr << "\tupdating field i\n";
669
670 i = is[0];
671 f = (float)is[0];
672 }
673 else
674 {
675 b = false;
676 value_exists = false;
677 }
678
679 if (ns) free(is);
680 }
681 else if (number>1)
682 {
683 std::cerr << "\tnumber > 1 " << "\n";
684 // std::cerr << "1) updating initial record\n";
685 int32_t ns = 0;
686 int32_t *is = NULL;
687 if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
688 {
689 ivec.resize(0);
690 fvec.resize(0);
691 for (int32_t i=0; i<ns; ++i)
692 {
693 ivec.push_back(is[i]);
694 fvec.push_back(is[i]);
695 }
696 i = is[index-1];
697 f = (float)is[index-1];
698 }
699 else
700 {
701 b = false;
702 value_exists = false;
703 }
704
705 if (ns) free(is);
706 }
707 else
708 {
709 std::cerr << "\thmmmmmm " << "\n";
710
711 //hmmmmm.....
712 }
713 }
714 else if (var_length==BCF_VL_VAR)
715 {
716 std::cerr << "\tvariable length, should we treat as a blob?\n";
717 }
718 }
719 else if (info_type==BCF_HT_REAL)
720 {
721 type |= VT_FLT;
722
723 if (var_length==BCF_VL_FIXED)
724 {
725 if (number==1)
726 {
727 int32_t ns = 0;
728 int32_t *fs = NULL;
729 if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
730 {
731 i = (int32_t) fs[0];
732 f = fs[0];
733 }
734 else
735 {
736 b = false;
737 value_exists = false;
738 }
739
740 if (ns) free(fs);
741 }
742 else if (number>1)
743 {
744 int32_t ns = 0;
745 int32_t *fs = NULL;
746 if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
747 {
748 ivec.resize(0);
749 fvec.resize(0);
750 for (int32_t i=0; i<ns; ++i)
751 {
752 ivec.push_back(fs[i]);
753 fvec.push_back(fs[i]);
754 }
755 i = (int32_t) fs[index-1];
756 f = fs[index-1];
757 }
758 else
759 {
760 b = false;
761 value_exists = false;
762 }
763
764 if (ns) free(fs);
765 }
766 }
767 else if (var_length==BCF_VL_R || var_length==BCF_VL_A || var_length==BCF_VL_G)
768 {
769 int32_t no_alleles = bcf_get_n_allele(v);
770
771 if (var_length==BCF_VL_R)
772 {
773 number = no_alleles;
774 }
775 else if (var_length==BCF_VL_A)
776 {
777 number = no_alleles-1;
778 }
779 else if (var_length==BCF_VL_G)
780 {
781 //assume ploidy is too for the time being
782 //usage is not determinable for info fields because
783 //ploidy is individual dependent
784 number = bcf_ap2g(no_alleles, 2);
785 }
786
787 if (number==1)
788 {
789 int32_t ns = 0;
790 int32_t *fs = NULL;
791 if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
792 {
793 i = (int32_t)fs[0];
794 f = fs[0];
795 }
796 else
797 {
798 b = false;
799 value_exists = false;
800 }
801
802 if (ns) free(fs);
803 }
804 else if (number>1)
805 {
806 int32_t ns = 0;
807 int32_t *fs = NULL;
808 if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
809 {
810 ivec.resize(0);
811 fvec.resize(0);
812 for (int32_t i=0; i<ns; ++i)
813 {
814 ivec.push_back(fs[i]);
815 fvec.push_back(fs[i]);
816 }
817 i = (int32_t)fs[index-1];
818 f = fs[index-1];
819 }
820 else
821 {
822 b = false;
823 value_exists = false;
824 }
825
826 if (ns) free(fs);
827 }
828 }
829 else if (var_length==BCF_VL_VAR)
830 {
831 //variable length, should we treat as a blob?
832 }
833 }
834 else if (info_type==BCF_HT_STR)
835 {
836 type |= VT_STR;
837 //todo: how do you handle a vector of strings?
838
839 int32_t n = s.m;
840 if (bcf_get_info_string(h, v, tag.s, &s.s, &n)>0)
841 {
842 b = true;
843 s.m = n;
844 }
845 else
846 {
847 b = false;
848 value_exists = false;
849 }
850 }
851 }
852 //fast lane for INFO-TYPEs
853 else if (type==(VT_INFO|VT_INT))
854 {
855 if (var_length==BCF_VL_FIXED)
856 {
857 if (number==1)
858 {
859 int32_t n = 0;
860 int32_t *data = NULL;
861 if (bcf_get_info_int32(h, v, tag.s, &data, &n)>0)
862 {
863 b = true;
864 i = data[0];
865 f = (float)data[0];
866 }
867 else
868 {
869 b = false;
870 value_exists = false;
871 }
872
873 if (data) free(data);
874 }
875 else if (number>1)
876 {
877 int32_t n = 0;
878 int32_t *data = NULL;
879 if (bcf_get_info_int32(h, v, tag.s, &data, &n)>0)
880 {
881 ivec.resize(0);
882 fvec.resize(0);
883 for (int32_t i=0; i<n; ++i)
884 {
885 ivec.push_back(data[i]);
886 fvec.push_back(data[i]);
887 }
888 b = true;
889 i = data[index-1];
890 f = (float)data[index-1];
891 }
892 else
893 {
894 b = false;
895 value_exists = false;
896 }
897
898 if (n) free(data);
899 }
900 else
901 {
902 //hmmmmm.....
903 }
904 }
905 else if (var_length==BCF_VL_R || var_length==BCF_VL_A || var_length==BCF_VL_G)
906 {
907 int32_t no_alleles = bcf_get_n_allele(v);
908
909 if (var_length==BCF_VL_R)
910 {
911 number = no_alleles;
912 }
913 else if (var_length==BCF_VL_A)
914 {
915 number = no_alleles-1;
916 }
917 else if (var_length==BCF_VL_G)
918 {
919 //assume ploidy is too for the time being
920 //usage is not determinable for info fields because
921 //ploidy is individual dependent
922 number = bcf_ap2g(no_alleles, 2);
923 }
924
925 if (number==1)
926 {
927 int32_t ns = 0;
928 int32_t *is = NULL;
929 if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
930 {
931 i = is[0];
932 f = (float)is[0];
933 }
934 else
935 {
936 b = false;
937 value_exists = false;
938 }
939
940 if (ns) free(is);
941 }
942 else if (number>1)
943 {
944 int32_t ns = 0;
945 int32_t *is = NULL;
946 if (bcf_get_info_int32(h, v, tag.s, &is, &ns)>0)
947 {
948 ivec.resize(0);
949 fvec.resize(0);
950 for (int32_t i=0; i<ns; ++i)
951 {
952 ivec.push_back(is[i]);
953 fvec.push_back(is[i]);
954 }
955 i = is[index-1];
956 f = (float)is[index-1];
957 }
958 else
959 {
960 b = false;
961 value_exists = false;
962 }
963
964 if (ns) free(is);
965 }
966 }
967 }
968 else if (type==(VT_INFO|VT_FLT))
969 {
970 float *data = NULL;
971 int32_t n = 0;
972
973 if (bcf_get_info_float(h, v, tag.s, &data, &n)>0)
974 {
975 b = true;
976 i = (int32_t) data[0];
977 f = data[0];
978 }
979 else
980 {
981 b = false;
982 value_exists = false;
983 }
984
985 if (n) free(data);
986 }
987 else if (type==(VT_INFO|VT_FLT))
988 {
989 if (var_length==BCF_VL_FIXED)
990 {
991 if (number==1)
992 {
993 int32_t ns = 0;
994 int32_t *fs = NULL;
995 if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
996 {
997 i = (int32_t) fs[0];
998 f = fs[0];
999 }
1000 else
1001 {
1002 b = false;
1003 value_exists = false;
1004 }
1005
1006 if (ns) free(fs);
1007 }
1008 else if (number>1)
1009 {
1010 int32_t ns = 0;
1011 int32_t *fs = NULL;
1012 if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
1013 {
1014 ivec.resize(0);
1015 fvec.resize(0);
1016 for (int32_t i=0; i<ns; ++i)
1017 {
1018 ivec.push_back(fs[i]);
1019 fvec.push_back(fs[i]);
1020 }
1021 i = (int32_t) fs[index-1];
1022 f = fs[index-1];
1023 }
1024 else
1025 {
1026 b = false;
1027 value_exists = false;
1028 }
1029
1030 if (ns) free(fs);
1031 }
1032 }
1033 else if (var_length==BCF_VL_R || var_length==BCF_VL_A || var_length==BCF_VL_G)
1034 {
1035 int32_t no_alleles = bcf_get_n_allele(v);
1036
1037 if (var_length==BCF_VL_R)
1038 {
1039 number = no_alleles;
1040 }
1041 else if (var_length==BCF_VL_A)
1042 {
1043 number = no_alleles-1;
1044 }
1045 else if (var_length==BCF_VL_G)
1046 {
1047 //assume ploidy is too for the time being
1048 //usage is not determinable for info fields because
1049 //ploidy is individual dependent
1050 number = bcf_ap2g(no_alleles, 2);
1051 }
1052
1053 if (number==1)
1054 {
1055 int32_t ns = 0;
1056 int32_t *fs = NULL;
1057 if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
1058 {
1059 i = (int32_t)fs[0];
1060 f = fs[0];
1061 }
1062 else
1063 {
1064 b = false;
1065 value_exists = false;
1066 }
1067
1068 if (ns) free(fs);
1069 }
1070 else if (number>1)
1071 {
1072 int32_t ns = 0;
1073 int32_t *fs = NULL;
1074 if (bcf_get_info_float(h, v, tag.s, &fs, &ns)>0)
1075 {
1076 ivec.resize(0);
1077 fvec.resize(0);
1078 for (int32_t i=0; i<ns; ++i)
1079 {
1080 ivec.push_back(fs[i]);
1081 fvec.push_back(fs[i]);
1082 }
1083 i = (int32_t)fs[index-1];
1084 f = fs[index-1];
1085 }
1086 else
1087 {
1088 b = false;
1089 value_exists = false;
1090 }
1091
1092 if (ns) free(fs);
1093 }
1094 }
1095 else if (var_length==BCF_VL_VAR)
1096 {
1097 std::cerr << "\tvariable length, should we treat as a blob?\n";
1098 }
1099 }
1100 else if (type==(VT_INFO|VT_STR))
1101 {
1102 int32_t n = s.m;
1103
1104 if (bcf_get_info_string(h, v, tag.s, &s.s, &n)>0)
1105 {
1106 b = true;
1107 s.m = n;
1108 }
1109 else
1110 {
1111 b = false;
1112 value_exists = false;
1113 }
1114 }
1115 else if (type==(VT_INFO|VT_FLG))
1116 {
1117 if (bcf_get_info_flag(h, v, tag.s, 0, 0)>0)
1118 {
1119 b = true;
1120 }
1121 else
1122 {
1123 b = false;
1124 }
1125
1126 if (debug)
1127 std::cerr << "\tVT_INFO|VT_FLG " << i << " " << f << " " << b << " " << s.s << " \n";
1128 }
1129 else if (type==VT_VARIANT_TYPE)
1130 {
1131 if (debug)
1132 std::cerr << "\tVTYPE " << Variant::vtype2string(variant->type) << " \n";
1133 i = variant->type;
1134 b = i;
1135 }
1136 else if (type==VT_VARIANT_DLEN)
1137 {
1138 if (debug)
1139 std::cerr << "\tDLEN " << variant->alleles[0].dlen << " \n";
1140 i = variant->alleles[0].dlen;
1141 f = i;
1142 }
1143 else if (type==VT_VARIANT_LEN)
1144 {
1145 if (debug)
1146 std::cerr << "\tLEN " << abs(variant->alleles[0].dlen) << " \n";
1147 i = abs(variant->alleles[0].dlen);
1148 f = i;
1149 }
1150 else if (type==VT_VARIANT_CONTAINS_N)
1151 {
1152 if (debug)
1153 std::cerr << "\tVARIANT_CONTAINS_N " << variant->contains_N << " \n";
1154
1155 bcf_unpack(v, BCF_UN_FLT);
1156 b = variant->contains_N;
1157 }
1158 else if (type==VT_N_ALLELE)
1159 {
1160 if (debug)
1161 std::cerr << "\tN_ALLELE " << bcf_get_n_allele(v) << " \n";
1162 i = bcf_get_n_allele(v);
1163 f = i;
1164 }
1165 }
1166 else if (type&VT_MATH_OP)
1167 {
1168 if (!left->value_exists && !right->value_exists)
1169 {
1170 value_exists = false;
1171 return;
1172 }
1173
1174 if ((type&15)==11) //ADD
1175 {
1176 if ((left->type&VT_INT))
1177 {
1178 if ((right->type&VT_INT))
1179 {
1180 type |= VT_INT;
1181 i = (left->i+right->i);
1182 f = i;
1183 return;
1184 }
1185 else if ((right->type&VT_FLT))
1186 {
1187 type |= VT_FLT;
1188 f = (left->i+right->f);
1189 return;
1190 }
1191 }
1192 else if ((left->type&VT_FLT))
1193 {
1194 if ((right->type&VT_INT))
1195 {
1196 type |= VT_FLT;
1197 f = (left->f+right->i);
1198 return;
1199 }
1200 else if ((right->type&VT_FLT))
1201 {
1202 type |= VT_FLT;
1203 f = (left->f+right->f);
1204 return;
1205 }
1206 }
1207
1208 fprintf(stderr, "[%s:%d %s] evaluation not supported : +\n", __FILE__, __LINE__, __FUNCTION__);
1209 exit(1);
1210 }
1211 else if ((type&15)==12) //SUB
1212 {
1213 if ((left->type&VT_INT))
1214 {
1215 if ((right->type&VT_INT))
1216 {
1217 type |= VT_INT;
1218 i = (left->i-right->i);
1219 f = i;
1220 return;
1221 }
1222 else if ((right->type&VT_FLT))
1223 {
1224 type |= VT_FLT;
1225 f = (left->i-right->f);
1226 return;
1227 }
1228 }
1229 else if ((left->type&VT_FLT))
1230 {
1231 if ((right->type&VT_INT))
1232 {
1233 type |= VT_FLT;
1234 f = (left->f-right->i);
1235 return;
1236 }
1237 else if ((right->type&VT_FLT))
1238 {
1239 type |= VT_FLT;
1240 f = (left->f-right->f);
1241 return;
1242 }
1243 }
1244
1245 fprintf(stderr, "[%s:%d %s] evaluation not supported : -\n", __FILE__, __LINE__, __FUNCTION__);
1246 exit(1);
1247 }
1248 else if ((type&15)==13) //MUL
1249 {
1250 if ((left->type&VT_INT))
1251 {
1252 if ((right->type&VT_INT))
1253 {
1254 type |= VT_INT;
1255 i = (left->i*right->i);
1256 f = i;
1257 return;
1258 }
1259 else if ((right->type&VT_FLT))
1260 {
1261 type |= VT_FLT;
1262 f = (left->i*right->f);
1263 return;
1264 }
1265 }
1266 else if ((left->type&VT_FLT))
1267 {
1268 if ((right->type&VT_INT))
1269 {
1270 type |= VT_FLT;
1271 f = (left->f*right->i);
1272 return;
1273 }
1274 else if ((right->type&VT_FLT))
1275 {
1276 type |= VT_FLT;
1277 f = (left->f*right->f);
1278 return;
1279 }
1280 }
1281
1282 fprintf(stderr, "[%s:%d %s] evaluation not supported : *\n", __FILE__, __LINE__, __FUNCTION__);
1283 exit(1);
1284 }
1285 else if ((type&15)==14) //DIV
1286 {
1287 if (left->type&VT_INT)
1288 {
1289 if (right->type&VT_INT)
1290 {
1291 type |= VT_FLT;
1292 f = ((float)left->i/right->i);
1293 return;
1294 }
1295 else if (right->type&VT_FLT)
1296 {
1297 type |= VT_FLT;
1298 f = (left->i/right->f);
1299 return;
1300 }
1301 }
1302 else if (left->type&VT_FLT)
1303 {
1304 if (right->type&VT_INT)
1305 {
1306 type |= VT_FLT;
1307 f = (left->f/right->i);
1308 return;
1309 }
1310 else if (right->type&VT_FLT)
1311 {
1312 type |= VT_FLT;
1313 f = (left->f/right->f);
1314 return;
1315 }
1316 }
1317
1318 fprintf(stderr, "[%s:%d %s] evaluation not supported : /\n", __FILE__, __LINE__, __FUNCTION__);
1319 exit(1);
1320 }
1321 else if ((type&15)==15) //BIT AND
1322 {
1323 if ((left->type&VT_INT) && (right->type&VT_INT))
1324 {
1325 i = (left->i & right->i);
1326 b = i;
1327 return;
1328 }
1329
1330 fprintf(stderr, "[%s:%d %s] evaluation not supported for & : %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
1331 exit(1);
1332 }
1333 else if ((type&15)==16) //BIT OR
1334 {
1335 if ((left->type&VT_INT) && (right->type&VT_INT))
1336 {
1337 i = (left->i | right->i);
1338 b = i;
1339 return;
1340 }
1341
1342 fprintf(stderr, "[%s:%d %s] evaluation not supported for | : %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
1343 exit(1);
1344 }
1345 else
1346 {
1347 fprintf(stderr, "[%s:%d %s] math op not supported : %d\n", __FILE__, __LINE__, __FUNCTION__, (type&15));
1348 exit(1);
1349 }
1350 }
1351 }
1352
1353 /**
1354 * Converts type to string.
1355 */
type2string(int32_t type)1356 std::string Node::type2string(int32_t type)
1357 {
1358 std::string s = "";
1359 if (type&VT_BOOL)
1360 {
1361 s += (s==""? "" : "|");
1362 s += "BOOL";
1363 }
1364
1365 if (type&VT_INT)
1366 {
1367 s += (s==""? "" : "|");
1368 s += "INT";
1369 }
1370
1371 if (type&VT_FLT)
1372 {
1373 s += (s==""? "" : "|");
1374 s += "FLT";
1375 }
1376
1377 if (type&VT_STR)
1378 {
1379 s += (s==""? "" : "|");
1380 s += "STR";
1381 }
1382
1383 if (type&VT_FLG)
1384 {
1385 s += (s==""? "" : "|");
1386 s += "FLG";
1387 }
1388
1389 if (type&VT_LOGIC_OP)
1390 {
1391 s += (s==""? "" : "|");
1392 s += "LOGIC";
1393 }
1394
1395 if (type&VT_MATH_CMP)
1396 {
1397 s += (s==""? "" : "|");
1398 s += "MATH_CMP";
1399 }
1400
1401 if (type&VT_MATH_OP)
1402 {
1403 s += (s==""? "" : "|");
1404 s += "MATH_OP";
1405 }
1406
1407 if (type&VT_BCF_OP)
1408 {
1409 s += (s==""? "" : "|");
1410 s += "BCF_OP";
1411 }
1412
1413 if (type==VT_QUAL)
1414 {
1415 s += (s==""? "" : "|");
1416 s += "QUAL";
1417 }
1418
1419 if (type==VT_FILTER)
1420 {
1421 s += (s==""? "" : "|");
1422 s += "FILTER";
1423 }
1424
1425 if (type==VT_N_FILTER)
1426 {
1427 s += (s==""? "" : "|");
1428 s += "FILTER";
1429 }
1430
1431 if (type==VT_INFO)
1432 {
1433 s += (s==""? "" : "|");
1434 s += "INFO";
1435 }
1436
1437 if (type==VT_N_ALLELE)
1438 {
1439 s += (s==""? "" : "|");
1440 s += "N_ALLELE";
1441 }
1442
1443 if (type==VT_VARIANT_TYPE)
1444 {
1445 s += (s==""? "" : "|");
1446 s += "VARIANT_TYPE";
1447 }
1448
1449 if (type==VT_VARIANT_DLEN)
1450 {
1451 s += (s==""? "" : "|");
1452 s += "VARIANT_DLEN";
1453 }
1454
1455 if (type==VT_VARIANT_LEN)
1456 {
1457 s += (s==""? "" : "|");
1458 s += "VARIANT_LEN";
1459 }
1460
1461 return s;
1462 };
1463
1464 /**
1465 * Constructor.
1466 */
Filter()1467 Filter::Filter()
1468 {
1469 this->tree = NULL;
1470 };
1471
1472 /**
1473 * Constructor with expression initialization.
1474 */
Filter(std::string exp)1475 Filter::Filter(std::string exp)
1476 {
1477 this->tree = NULL;
1478 parse(exp.c_str(), false);
1479 };
1480
1481 /**
1482 * Parses filter expression.
1483 */
parse(const char * exp,bool debug)1484 void Filter::parse(const char* exp, bool debug)
1485 {
1486 if (strlen(exp)!=0)
1487 {
1488 //trim the white spaces out of the string
1489 std::string exp_no_space = "";
1490 for (uint32_t i=0; i<strlen(exp); ++i)
1491 {
1492 if (!isspace(exp[i]))
1493 {
1494 exp_no_space.append(1, exp[i]);
1495 }
1496 }
1497
1498 reset();
1499 tree = new Node();
1500 parse(exp_no_space.c_str(), exp_no_space.size(), tree, debug);
1501
1502 if (!(tree->type&VT_BOOL))
1503 {
1504 fprintf(stderr, "[%s:%d %s] filter expression not boolean %s\n", __FILE__, __LINE__, __FUNCTION__, exp);
1505 exit(1);
1506 }
1507 }
1508 else
1509 {
1510 tree = NULL;
1511 }
1512 }
1513
1514 /**
1515 * Applies filter to vcf record.
1516 */
apply(bcf_hdr_t * h,bcf1_t * v,Variant * variant,bool debug)1517 bool Filter::apply(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug) //recursive
1518 {
1519 if (tree==NULL)
1520 {
1521 return true;
1522 }
1523
1524 this->h = h;
1525 this->v = v;
1526 this->variant = variant;
1527
1528 if (debug) std::cerr << "==========\n";
1529 apply(tree, debug);
1530 if (debug) std::cerr << "==========\n";
1531
1532 if (tree->value_exists)
1533 {
1534 return tree->b;
1535 }
1536 else
1537 {
1538 return false;
1539 }
1540 }
1541
1542 /**
1543 * Attempts to simplify the expression tree by collapsing nodes that can be precomputed.
1544 */
simplify()1545 void Filter::simplify()
1546 {
1547 }
1548
1549 /**
1550 * Resets filter.
1551 */
reset()1552 void Filter::reset()
1553 {
1554 if (tree!=NULL)
1555 {
1556 delete tree;
1557 tree = NULL;
1558 }
1559 }
1560
1561 /**
1562 * Constructs the expression tree.
1563 */
parse(const char * exp,int32_t len,Node * node,bool debug)1564 void Filter::parse(const char* exp, int32_t len, Node *node, bool debug)
1565 {
1566 if (debug)
1567 {
1568 std::cerr << "parsing \"";
1569 for (int32_t i=0; i<len; ++i)
1570 std::cerr << exp[i] ;
1571 std::cerr << "\" " << len << "\n";
1572 }
1573
1574 //******************************
1575 //trim white spaces and brackets
1576 //******************************
1577 while (*exp==' ') ++exp;
1578 while (exp[len-1]==' ') --len;
1579 trim_brackets(exp, len, debug);
1580
1581 //this is a literal
1582 if (is_literal(exp, len, debug))
1583 {
1584 //will not recurse any further
1585 return parse_literal(exp, len, node, debug);
1586 }
1587 //unary operation
1588 else if (is_unary_op(exp, len, debug))
1589 {
1590 node->type = VT_NOT;
1591 if (debug) std::cerr << "\tis not_op\n";
1592
1593 node->left = new Node();
1594 parse(exp+1, len-1, node->left, debug);
1595 }
1596 //binary operation
1597 else
1598 {
1599 if (debug) std::cerr << "\tis binary op\n";
1600
1601 const char* p = exp; //points to end of first part
1602 const char* q = exp; //points to start of second part
1603 const char* r = exp; //for iteration
1604
1605 int32_t type = INT_MAX;
1606
1607 while(r-exp!=len)
1608 {
1609 //bypasses bracketed expressions
1610 fwd_to_closing_bracket(r, len);
1611
1612 int32_t oplen = -1;
1613 int32_t ctype = peek_op(r, len, oplen, debug);
1614
1615 if(ctype!=-1)
1616 {
1617 if (debug) std::cerr<< "\ttype : " << ctype << " \n";
1618 //this implements order of operations
1619 if (ctype<type)
1620 {
1621 if (debug) std::cerr<< "\tupdating type\n";
1622 type = ctype;
1623 p = r-1;
1624 q = r+oplen;
1625 }
1626
1627 r += oplen-1;
1628 }
1629
1630 ++r;
1631 }
1632
1633 //valid binary operator not found
1634 if (type==INT_MAX)
1635 {
1636 kstring_t s = {0,0,0};
1637 kputsn(exp, len, &s);
1638 fprintf(stderr, "[%s:%d %s] binary operator not found in \"%s\". Valid operators are ==,!=,=~,~~,&&,||,&,|,+,-,*,/\n", __FILE__, __LINE__, __FUNCTION__, s.s);
1639 if (s.m) free(s.s);
1640 exit(1);
1641 }
1642
1643 node->type = type;
1644
1645 node->left = new Node();
1646 parse(exp, p-exp+1, node->left, debug);
1647
1648 node->right = new Node();
1649 parse(q, len-(q-exp), node->right, debug);
1650 }
1651 }
1652
1653 /**
1654 * Checks if exp is a unary op.
1655 */
is_unary_op(const char * exp,int32_t len,bool debug)1656 bool Filter::is_unary_op(const char* exp, int32_t len, bool debug)
1657 {
1658 //check to make sure not a binary operator
1659
1660 //NOT operator
1661 if (exp[0]=='~')
1662 {
1663 if (is_literal(exp+1, len-1, debug)||is_bracketed_expression(exp+1, len-1, debug))
1664 {
1665 if (debug) std::cerr << "\tis unary op\n";
1666 return true;
1667 }
1668 }
1669
1670 if (debug) std::cerr << "\tis not unary op\n";
1671 return false;
1672 }
1673
1674 /**
1675 * Checks is expression is bracketed.
1676 */
is_bracketed_expression(const char * exp,int32_t len,bool debug)1677 bool Filter::is_bracketed_expression(const char* exp, int32_t len, bool debug)
1678 {
1679 if (*exp=='(' && exp[len-1]==')')
1680 {
1681 int32_t opened_brackets = 1;
1682 bool nested = true;
1683 int32_t j=1;
1684 while(j<len-1)
1685 {
1686 if(exp[j]=='(')
1687 {
1688 if (opened_brackets<0)
1689 {
1690 kstring_t s = {0,0,0};
1691 kputsn(exp, len, &s);
1692 fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
1693 if (s.m) free(s.s);
1694 exit(1);
1695 }
1696
1697 ++opened_brackets;
1698 }
1699 else if (exp[j]==')')
1700 {
1701 if (opened_brackets<=0)
1702 {
1703 kstring_t s = {0,0,0};
1704 kputsn(exp, len, &s);
1705 fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
1706 if (s.m) free(s.s);
1707 exit(1);
1708 }
1709
1710 --opened_brackets;
1711 }
1712
1713 ++j;
1714 }
1715
1716 return opened_brackets==1;
1717 }
1718
1719 return false;
1720 }
1721
1722 /**
1723 * Check if exp is a literal (no binary operations, no unary operation).
1724 */
is_literal(const char * exp,int32_t len,bool debug)1725 bool Filter::is_literal(const char* exp, int32_t len, bool debug)
1726 {
1727 //checks if string literal
1728 if (exp==strchr(exp,'\'') && exp+len-1==strchr(exp+1,'\''))
1729 {
1730 return true;
1731 }
1732
1733 const char* q = exp;
1734 while (q-exp<len)
1735 {
1736 if(*q=='=' ||
1737 *q=='&' ||
1738 *q=='|' ||
1739 *q=='>' ||
1740 *q=='<' ||
1741 *q=='!' ||
1742 *q=='+' ||
1743 *q=='*' ||
1744 *q=='/' ||
1745 *q=='~' ||
1746 (*q=='-' && exp!=q))
1747 {
1748 if (debug) std::cerr << "\tis not literal\n";
1749 return false;
1750 }
1751
1752 ++q;
1753 }
1754
1755 if (debug) std::cerr << "\tis literal\n";
1756 return true;
1757 }
1758
1759 /**
1760 * Detect index width.
1761 * e.g. AC[1] => 3
1762 * e.g. EVIDENCE[12] => 4
1763 *
1764 * Populated index with the index queried.
1765 */
get_index_width(const char * exp,int32_t n,int32_t * index)1766 int32_t Filter::get_index_width(const char *exp, int32_t n, int32_t *index)
1767 {
1768 if (n<=3) return 0;
1769
1770 int32_t width = 0;
1771 bool end_bracket = false;
1772 bool in_digits = false;
1773 bool beg_bracket = false;
1774 int32_t current_index = 0;
1775 int32_t index_weight = 1;
1776
1777 //check end bracket and digit next to square bracket
1778 if (exp[n-1]==']' && exp[n-2]>=48&&exp[n-2]<=57)
1779 {
1780 in_digits = true;
1781 width = 2;
1782 current_index += exp[n-2]-48;
1783 index_weight *= 10;
1784 }
1785 else
1786 {
1787 *index = 0;
1788 return 0;
1789 }
1790
1791 //check digits in square brackets
1792 for (int32_t i=n-3; i>=0; --i)
1793 {
1794 //check if digit
1795 if (exp[i]>=48&&exp[i]<=57)
1796 {
1797 //all is good, continue
1798 ++width;
1799 current_index += (exp[i]-48)*index_weight;
1800 index_weight *= 10;
1801 }
1802 else if (exp[i]=='[')
1803 {
1804 *index = current_index;
1805 return width+1;
1806 }
1807 }
1808
1809 *index = 0;
1810 return 0;
1811 }
1812
1813 /**
1814 * Parse literals.
1815 */
parse_literal(const char * exp,int32_t len,Node * node,bool debug)1816 void Filter::parse_literal(const char* exp, int32_t len, Node * node, bool debug)
1817 {
1818 node->type = VT_UNKNOWN;
1819
1820 if (strncmp(exp, "PASS", len)==0)
1821 {
1822 node->type = VT_FILTER;
1823 kputsn(exp, len, &node->tag);
1824 if (debug) std::cerr << "\tis filter_op\n";
1825 return;
1826 }
1827 else if (strncmp(exp, "REF", 3)==0)
1828 {
1829 node->type = VT_REF_COL;
1830 exp += 3;
1831 if (debug) std::cerr << "\tis ref_op\n";
1832 return;
1833 }
1834 else if (strncmp(exp, "ALT", 3)==0)
1835 {
1836 node->type = VT_ALT;
1837 exp += 3;
1838 if (debug) std::cerr << "\tis alt_op\n";
1839 return;
1840 }
1841 else if (strncmp(exp, "QUAL", 4)==0)
1842 {
1843 node->type = VT_QUAL;
1844 exp += 4;
1845 if (debug) std::cerr << "\tis qual_op\n";
1846 return;
1847 }
1848 else if (strncmp(exp, "FILTER.", 7)==0)
1849 {
1850 node->type = VT_FILTER;
1851 exp += 7;
1852 kputsn(exp, len-7, &node->tag);
1853 if (debug) std::cerr << "\tis filter_op\n";
1854 return;
1855 }
1856 else if (strncmp(exp, "N_FILTER", len)==0)
1857 {
1858 node->type = VT_N_FILTER;
1859 if (debug) std::cerr << "\tis n_filter_op\n";
1860 return;
1861 }
1862 else if (strncmp(exp, "INFO.", 5)==0)
1863 {
1864 node->type = VT_INFO;
1865 exp += 5;
1866 //detect index access e.g. AC[1]
1867 int32_t index_width = get_index_width(exp, len-5, &node->index);
1868 kputsn(exp, len-5-index_width, &node->tag);
1869 if (debug) std::cerr << "\tis info_op\n";
1870 return;
1871 }
1872 else if (strncmp(exp, "VTYPE", 5)==0)
1873 {
1874 node->type = VT_VARIANT_TYPE;
1875 if (debug) std::cerr << "\tis variant_op\n";
1876 return;
1877 }
1878 else if (strncmp(exp, "N_ALLELE", len)==0)
1879 {
1880 node->type = VT_N_ALLELE;
1881 if (debug) std::cerr << "\tis nallele_op\n";
1882 return;
1883 }
1884 else if (strncmp(exp, "INDEL", len)==0)
1885 {
1886 node->type = VT_INT;
1887 node->i = VT_INDEL;
1888 node->value_exists = true;
1889 if (debug) std::cerr << "\tis INDEL\n";
1890 return;
1891 }
1892 else if (strncmp(exp, "SNP", len)==0)
1893 {
1894 node->type = VT_INT;
1895 node->i = VT_SNP;
1896 node->value_exists = true;
1897 if (debug) std::cerr << "\tis SNP\n";
1898 return;
1899 }
1900 else if (strncmp(exp, "MNP", len)==0)
1901 {
1902 node->type = VT_INT;
1903 node->i = VT_MNP;
1904 node->value_exists = true;
1905 if (debug) std::cerr << "\tis MNP\n";
1906 return;
1907 }
1908 else if (strncmp(exp, "CLUMPED", len)==0)
1909 {
1910 node->type = VT_INT;
1911 node->i = VT_CLUMPED;
1912 node->value_exists = true;
1913 if (debug) std::cerr << "\tis CLUMPED\n";
1914 return;
1915 }
1916 else if (strncmp(exp, "VNTR", len)==0)
1917 {
1918 node->type = VT_INT;
1919 node->i = VT_VNTR;
1920 if (debug) std::cerr << "\tis VNTR\n";
1921 return;
1922 }
1923 else if (strncmp(exp, "SV", len)==0)
1924 {
1925 node->type = VT_INT;
1926 node->i = VT_SV;
1927 if (debug) std::cerr << "\tis SV\n";
1928 return;
1929 }
1930 else if (strncmp(exp, "REF", len)==0)
1931 {
1932 node->type = VT_INT;
1933 node->i = VT_REF;
1934 node->value_exists = true;
1935 if (debug) std::cerr << "\tis REF\n";
1936 return;
1937 }
1938 else if (strncmp(exp, "DLEN", len)==0)
1939 {
1940 node->type = VT_VARIANT_DLEN;
1941 node->value_exists = false;
1942 if (debug) std::cerr << "\tis dlen\n";
1943 return;
1944 }
1945 else if (strncmp(exp, "LEN", len)==0)
1946 {
1947 node->type = VT_VARIANT_LEN;
1948 node->value_exists = false;
1949 if (debug) std::cerr << "\tis len\n";
1950 return;
1951 }
1952 else if (strncmp(exp, "VARIANT_CONTAINS_N", len)==0)
1953 {
1954 node->type = VT_VARIANT_CONTAINS_N;
1955 node->value_exists = false;
1956 if (debug) std::cerr << "\tis variant contains N\n";
1957 return;
1958 }
1959 else
1960 {
1961 //integer type
1962 const char* start = exp;
1963 char *end = NULL;
1964 int32_t i = std::strtoul(exp, &end, 10);
1965 if (end==exp+len)
1966 {
1967 node->type = VT_INT;
1968 node->i = i;
1969 node->f = (float)i;
1970 node->value_exists = true;
1971 if (debug) std::cerr << "\tis int\n";
1972 return;
1973 }
1974
1975 //float type
1976 start = exp;
1977 end = NULL;
1978 float f = std::strtod(exp, &end);
1979 if (end==exp+len)
1980 {
1981 node->type = VT_FLT;
1982 node->f = f;
1983 node->value_exists = true;
1984 if (debug) std::cerr << "\tis float: " << f << "\n";
1985 return;
1986 }
1987
1988 //string type
1989 if (exp[0]=='\'' && exp[len-1]=='\'')
1990 {
1991 node->type = VT_STR;
1992 kputsn(exp+1, len-2, &node->tag);
1993 kputc(0, &node->tag);
1994 kputsn(exp+1, len-2, &node->s);
1995 kputc(0, &node->s);
1996 node->value_exists = true;
1997 if (debug) std::cerr << "\tis string\n";
1998 return;
1999 }
2000
2001 if (node->type==VT_UNKNOWN)
2002 {
2003 kstring_t s = {0,0,0};
2004 kputsn(exp, len, &s);
2005 fprintf(stderr, "[E:%s] %s is not recognized. If it is a string, you should place it in single inverted commas.\n", __FUNCTION__, s.s);
2006 print_filter_help();
2007 exit(1);
2008 }
2009 }
2010
2011 if (debug)
2012 {
2013 std::cerr << "\tvalue_exists " << node->value_exists << "\n";
2014 std::cerr << "\ttag " << node->tag.s << "\n";
2015 std::cerr << "\tb " << node->b << "\n";
2016 std::cerr << "\ti " << node->i << "\n";
2017 std::cerr << "\tf " << node->f << "\n";
2018 }
2019
2020 return;
2021 }
2022
2023 /**
2024 * Help message on filter expressions.
2025 */
print_filter_help()2026 void Filter::print_filter_help()
2027 {
2028 fprintf(stderr, "\n");
2029 fprintf(stderr, " Variant characteristics\n");
2030 fprintf(stderr, " VTYPE,N_ALLELE,DLEN,LEN\n");
2031 fprintf(stderr, "\n");
2032 fprintf(stderr, " Variant value types\n");
2033 fprintf(stderr, " SNP,MNP,INDEL,CLUMPED\n");
2034 fprintf(stderr, "\n");
2035 fprintf(stderr, " Biallelic SNPs only : VTYPE==SNP&&N_ALLELE==2\n");
2036 fprintf(stderr, " Biallelic Indels with embedded SNP : VTYPE==(SNP|INDEL)&&N_ALLELE==2\n");
2037 fprintf(stderr, " Biallelic variants involving insertions : VTYPE&INDEL&&DLEN>0&&N_ALLELE==2\n");
2038 fprintf(stderr, " Biallelic variants involving 1bp variants : LEN==1&&N_ALLELE==2\n");
2039 fprintf(stderr, "\n");
2040 fprintf(stderr, " QUAL field\n");
2041 fprintf(stderr, " QUAL\n");
2042 fprintf(stderr, "\n");
2043 fprintf(stderr, " FILTER fields\n");
2044 fprintf(stderr, " PASS, FILTER.<tag>\n");
2045 fprintf(stderr, "\n");
2046 fprintf(stderr, " INFO fields\n");
2047 fprintf(stderr, " INFO.<tag>\n");
2048 fprintf(stderr, "\n");
2049 fprintf(stderr, " Passed biallelic SNPs only : PASS&&VTYPE==SNP&&N_ALLELE==2\n");
2050 fprintf(stderr, " Passed Common biallelic SNPs only : PASS&&VTYPE==SNP&&N_ALLELE==2&&INFO.AF>0.005\n");
2051 fprintf(stderr, " Passed Common biallelic SNPs or rare indels : (PASS&&VTYPE==SNP&&N_ALLELE==2&&INFO.AF>0.005)||(VTYPE&INDEL&&INFO.AF<=0.005)\n");
2052 fprintf(stderr, "\n");
2053 fprintf(stderr, " Regular expressions for string fields using pcre2\n");
2054 fprintf(stderr, "\n");
2055 fprintf(stderr, " Passed variants in intergenic regions or UTR : PASS&&INFO.ANNO=~'Intergenic|UTR'\n");
2056 fprintf(stderr, " Passed variants in intergenic regions or UTR : PASS&&INFO.ANNO=~'(?i)Intergenic|UTR'\n");
2057 fprintf(stderr, " ignoring case\n");
2058 fprintf(stderr, "\n");
2059 fprintf(stderr, " Operations\n");
2060 fprintf(stderr, " ==,!=,=~,~~,~,&&,||,&,|,+,-,*,/\n");
2061 fprintf(stderr, "\n");
2062 fprintf(stderr, " Failed rare variants : ~PASS&&(INFO.AC/INFO.AN<0.005)\n");
2063 fprintf(stderr, "\n");
2064 }
2065
2066 /**
2067 * Trim brackets from an expression.
2068 */
trim_brackets(const char * & exp,int32_t & len,bool debug)2069 void Filter::trim_brackets(const char* &exp, int32_t &len, bool debug)
2070 {
2071 if (*exp=='(' && exp[len-1]==')')
2072 {
2073 int32_t opened_brackets = 1;
2074 bool nested = true;
2075 int32_t j=1;
2076 while(j<len-1)
2077 {
2078 if(exp[j]=='(')
2079 {
2080 if (opened_brackets<0)
2081 {
2082 kstring_t s = {0,0,0};
2083 kputsn(exp, len, &s);
2084 fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
2085 if (s.m) free(s.s);
2086 exit(1);
2087 }
2088
2089 ++opened_brackets;
2090 }
2091 else if (exp[j]==')')
2092 {
2093 if (opened_brackets<=0)
2094 {
2095 kstring_t s = {0,0,0};
2096 kputsn(exp, len, &s);
2097 fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
2098 if (s.m) free(s.s);
2099 exit(1);
2100 }
2101
2102 --opened_brackets;
2103 }
2104
2105 if (opened_brackets==0)
2106 {
2107 nested = false;
2108 break;
2109 }
2110 ++j;
2111
2112
2113 }
2114
2115 if (nested)
2116 {
2117 if (opened_brackets == 1)
2118 {
2119 ++exp;
2120 len -=2;
2121 if (debug) std::cerr << "\t\ttrimmed brackets\n";
2122 trim_brackets(exp, len, debug);
2123 }
2124 else
2125 {
2126 std::cerr << "Illegal expression: brackets not correct\n";
2127 }
2128 }
2129 }
2130 }
2131
2132 /**
2133 * Moves r to the closing bracket if this expression starts with an open bracket.
2134 * Returns -1 if end of r else 0.
2135 */
fwd_to_closing_bracket(const char * & r,int32_t & len)2136 int32_t Filter::fwd_to_closing_bracket(const char* &r, int32_t &len)
2137 {
2138 const char* s = r;
2139 if (*r=='(')
2140 {
2141 ++s;
2142 int32_t opened_brackets = 1;
2143 bool nested = true;
2144 while(s-r!=len)
2145 {
2146 if(*s=='(')
2147 {
2148 if (opened_brackets<0)
2149 {
2150 kstring_t s = {0,0,0};
2151 kputsn(r, len, &s);
2152 fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
2153 if (s.m) free(s.s);
2154 exit(1);
2155 }
2156
2157 ++opened_brackets;
2158 }
2159 else if (*s==')')
2160 {
2161 if (opened_brackets<=0)
2162 {
2163 kstring_t s = {0,0,0};
2164 kputsn(r, len, &s);
2165 fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
2166 if (s.m) free(s.s);
2167 exit(1);
2168 }
2169
2170 --opened_brackets;
2171 }
2172
2173 if (opened_brackets==0)
2174 {
2175 r = s;
2176
2177 if (s-r==len-1)
2178 {
2179 return -1;
2180 }
2181 else
2182 {
2183 return 0;
2184 }
2185 }
2186
2187 ++s;
2188 }
2189
2190 kstring_t s = {0,0,0};
2191 kputsn(r, len, &s);
2192 fprintf(stderr, "[%s:%d %s] brackets not correct %s\n", __FILE__, __LINE__, __FUNCTION__, s.s);
2193 if (s.m) free(s.s);
2194 exit(1);
2195 }
2196
2197 return 0;
2198 }
2199
2200 /**
2201 * Returns -1 if no operator found. Updates oplen to be the length of the operator observed.
2202 */
peek_op(const char * & r,int32_t len,int32_t & oplen,bool debug)2203 int32_t Filter::peek_op(const char* &r, int32_t len, int32_t &oplen, bool debug)
2204 {
2205 const char* s = r;
2206 oplen = -1;
2207 if (*s=='&' && (s+1-r<len) && *(s+1)=='&')
2208 {
2209 if (debug) std::cerr << "\tis && operator\n";
2210 oplen = 2;
2211 return VT_AND;
2212 }
2213 else if (*s=='|' && (s+1-r<len) && *(s+1)=='|')
2214 {
2215 if (debug) std::cerr << "\tis || operator\n";
2216 oplen = 2;
2217 return VT_OR;
2218 }
2219 else if (*s=='=' && (s+1-r<len) && *(s+1)=='=')
2220 {
2221 if (debug) std::cerr << "\tis == operator\n";
2222 oplen = 2;
2223 return VT_EQ;
2224 }
2225 else if (*s=='=' && (s+1-r<len) && *(s+1)=='~')
2226 {
2227 if (debug) std::cerr << "\tis =~ operator\n";
2228 oplen = 2;
2229 return VT_MATCH;
2230 }
2231 else if (*s=='~' && (s+1-r<len) && *(s+1)=='~')
2232 {
2233 if (debug) std::cerr << "\tis ~~ operator\n";
2234 oplen = 2;
2235 return VT_NO_MATCH;
2236 }
2237 else if (*s=='>' && (s+1-r<len) && *(s+1)=='=')
2238 {
2239 if (debug) std::cerr << "\tis >= operator\n";
2240 oplen = 2;
2241 return VT_GE;
2242 }
2243 else if (*s=='<' && (s+1-r<len) && *(s+1)=='=')
2244 {
2245 if (debug) std::cerr << "\tis <= operator\n";
2246 oplen = 2;
2247 return VT_LE;
2248 }
2249 else if (*s=='!' && (s+1-r<len) && *(s+1)=='=')
2250 {
2251 if (debug) std::cerr << "\tis != operator\n";
2252 oplen = 2;
2253 return VT_NE;
2254 }
2255 else if (*s=='+')
2256 {
2257 if (debug) std::cerr << "\tis + operator\n";
2258 oplen = 1;
2259 return VT_ADD;
2260 }
2261 else if (*s=='-')
2262 {
2263 if (debug) std::cerr << "\tis - operator\n";
2264 oplen = 1;
2265 return VT_SUB;
2266 }
2267 else if (*s=='*')
2268 {
2269 if (debug) std::cerr << "\tis * operator\n";
2270 oplen = 1;
2271 return VT_MUL;
2272 }
2273 else if (*s=='/')
2274 {
2275 if (debug) std::cerr << "\tis / operator\n";
2276 oplen = 1;
2277 return VT_DIV;
2278 }
2279 else if (*s=='&')
2280 {
2281 if (debug) std::cerr << "\tis & operator\n";
2282 oplen = 1;
2283 return VT_BIT_AND;
2284 }
2285 else if (*s=='|')
2286 {
2287 if (debug) std::cerr << "\tis | operator\n";
2288 oplen = 1;
2289 return VT_BIT_OR;
2290 }
2291 else if (*s=='>')
2292 {
2293 if (debug) std::cerr << "\tis > operator\n";
2294 oplen = 1;
2295 return VT_GT;
2296 }
2297 else if (*s=='<')
2298 {
2299 if (debug) std::cerr << "\tis < operator\n";
2300 oplen = 1;
2301 return VT_LT;
2302 }
2303
2304 return -1;
2305 }
2306
2307 /**
2308 * Recursive call for apply.
2309 */
apply(Node * node,bool debug)2310 void Filter::apply(Node* node, bool debug)
2311 {
2312 node->value_exists = false;
2313
2314 //evaluate downstream
2315 if (node->left!=NULL)
2316 {
2317 apply(node->left, debug);
2318 }
2319
2320 //can do some lazy evaluation here for && and || types.
2321 if (node->type&VT_LOGIC_OP)
2322 {
2323 if (node->type==VT_AND)
2324 {
2325 if (node->left->value_exists && !node->left->b)
2326 {
2327 node->b = false;
2328 node->value_exists = true;
2329 return;
2330 }
2331 else
2332 {
2333 apply(node->right, debug);
2334 }
2335 }
2336 else if (node->type==VT_OR)
2337 {
2338 if (node->left->value_exists && node->left->b)
2339 {
2340 node->b = true;
2341 node->value_exists = true;
2342 return;
2343 }
2344 else
2345 {
2346 apply(node->right, debug);
2347 }
2348 }
2349 }
2350 else
2351 {
2352 if (node->right!=NULL)
2353 {
2354 apply(node->right, debug);
2355 }
2356 }
2357
2358 node->evaluate(h, v, variant, debug);
2359 };