1 /*
2 * entry_filters.cpp
3 *
4 * Created on: Aug 9, 2013
5 * Author: amarcketta
6 */
7
8 #include "entry.h"
9
10 set<string> entry::local_snps_to_keep;
11 set<string> entry::snps_to_exclude;
12 vector< set<int > > entry::keep_positions;
13 vector< set<int > > entry::exclude_positions;
14 map<string,int> entry::chr_to_idx;
15 vector< deque<pair<int,int> > > entry::lims;
16 ifstream entry::mask;
17 string entry::mask_chr;
18 string entry::mask_line;
19 int entry::mask_pos;
20 string entry::thin_chrom;
21 int entry::thin_pos;
22
apply_filters(const parameters & params)23 int entry::apply_filters(const parameters ¶ms)
24 {
25 if (line.empty())
26 {
27 passed_filters = false;
28 return 0;
29 }
30
31 // Apply all filters in turn.
32 filter_sites_by_allele_type(params.keep_only_indels, params.remove_indels);
33 filter_sites(params.snps_to_keep, params.snps_to_keep_file, params.snps_to_exclude_file);
34 filter_sites_by_filter_status(params.site_filter_flags_to_exclude, params.site_filter_flags_to_keep, params.remove_all_filtered_sites);
35 string chr_to_keep = "";
36 if (params.chrs_to_keep.size() == 1)
37 chr_to_keep = *(params.chrs_to_keep.begin()); // Get first chromosome in list (there should only be one).
38 filter_sites_by_position(chr_to_keep, params.start_pos, params.end_pos);
39 filter_sites_by_positions(params.positions_file, params.exclude_positions_file);
40 filter_sites_by_overlap_positions(params.positions_overlap_file, params.exclude_positions_overlap_file);
41 filter_sites_by_chromosome(params.chrs_to_keep, params.chrs_to_exclude);
42 filter_sites_by_BED_file(params.BED_file, params.BED_exclude);
43 filter_sites_by_number_of_alleles(params.min_alleles, params.max_alleles);
44 filter_sites_by_INFO(params.site_INFO_flags_to_remove, params.site_INFO_flags_to_keep);
45 filter_sites_by_quality(params.min_quality);
46 filter_sites_by_mean_depth(params.min_mean_depth, params.max_mean_depth);
47 filter_sites_by_mask(params.mask_file, params.invert_mask, params.min_kept_mask_value);
48 if (params.phased_only == true)
49 filter_sites_by_phase();
50 filter_genotypes_by_quality_value(params.min_genotype_quality);
51 filter_genotypes_by_depth_range(params.min_genotype_depth, params.max_genotype_depth);
52 filter_genotypes_by_filter_flag(params.geno_filter_flags_to_exclude, params.remove_all_filtered_genotypes);
53 filter_sites_by_frequency_and_call_rate(params.min_maf, params.max_maf, params.min_non_ref_af, params.max_non_ref_af, params.min_non_ref_af_any, params.max_non_ref_af_any, params.min_site_call_rate);
54 filter_sites_by_allele_count(params.min_mac, params.max_mac, params.min_non_ref_ac, params.max_non_ref_ac, params.min_non_ref_ac_any, params.max_non_ref_ac_any, params.max_missing_call_count);
55 filter_sites_by_HWE_pvalue(params.min_HWE_pvalue);
56 filter_sites_by_thinning(params.min_interSNP_distance);
57
58 return 1;
59 }
60
filter_genotypes_by_quality_value(double min_genotype_quality)61 void entry::filter_genotypes_by_quality_value(double min_genotype_quality)
62 {
63 // Filter genotypes by quality
64 if (passed_filters == false)
65 return;
66
67 if (min_genotype_quality <= 0)
68 return;
69
70 parse_genotype_entries(false, true);
71
72 if (entry_header.has_genotypes == false)
73 LOG.error("Require Genotypes in variant file in order to filter genotypes by Quality.");
74
75 filter_genotypes_by_quality(min_genotype_quality);
76 }
77
filter_genotypes_by_depth_range(int min_depth,int max_depth)78 void entry::filter_genotypes_by_depth_range(int min_depth, int max_depth)
79 {
80 // Filter genotypes by depth
81 if (passed_filters == false)
82 return;
83
84 if ((min_depth <= 0) && (max_depth == numeric_limits<int>::max()))
85 return;
86 if (entry_header.has_genotypes == false)
87 LOG.error("Require Genotypes in variant file in order to filter genotypes by Depth.");
88
89 parse_genotype_entries(false, false, true);
90 filter_genotypes_by_depth(min_depth, max_depth);
91
92 }
93
filter_genotypes_by_filter_flag(const set<string> & filter_flags_to_remove,bool remove_all)94 void entry::filter_genotypes_by_filter_flag(const set<string> &filter_flags_to_remove, bool remove_all)
95 {
96 // Filter genotypes by Filter Flags
97 if (passed_filters == false)
98 return;
99
100 if ((remove_all == false) && (filter_flags_to_remove.empty()))
101 return;
102
103 parse_genotype_entries(false, false, false, true);
104 if (entry_header.has_genotypes == false)
105 LOG.error("Require Genotypes in variant file in order to filter genotypes by Filter Flag.");
106
107 filter_genotypes_by_filter_status(filter_flags_to_remove, remove_all);
108 }
109
filter_sites(const set<string> & snps_to_keep,const string & snps_to_keep_file,const string & snps_to_exclude_file,bool keep_then_exclude)110 void entry::filter_sites(const set<string> &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude)
111 {
112 // Filter sites by user provided lists
113 if (keep_then_exclude)
114 {
115 filter_sites_to_keep(snps_to_keep, snps_to_keep_file);
116 filter_sites_to_exclude(snps_to_exclude_file);
117 }
118 else
119 {
120 filter_sites_to_exclude(snps_to_exclude_file);
121 filter_sites_to_keep(snps_to_keep, snps_to_keep_file);
122 }
123 }
124
filter_sites_to_keep(const set<string> & snps_to_keep,const string & snps_to_keep_file)125 void entry::filter_sites_to_keep(const set<string> &snps_to_keep, const string &snps_to_keep_file)
126 {
127 // Filter sites by user provided list
128 if(passed_filters == false)
129 return;
130
131 if ((snps_to_keep.empty()) && (snps_to_keep_file == ""))
132 return;
133
134 if (snps_to_keep_file != "" && local_snps_to_keep.empty())
135 {
136 ifstream in(snps_to_keep_file.c_str());
137 string tmp;
138 local_snps_to_keep = snps_to_keep;
139
140 if (!in.is_open())
141 {
142 LOG.error("Could not open SNPs to Keep file" + snps_to_keep_file, 0);
143 }
144 while (!in.eof())
145 {
146 in >> tmp;
147 local_snps_to_keep.insert(tmp);
148 in.ignore(numeric_limits<streamsize>::max(), '\n');
149 }
150
151 in.close();
152 }
153 parse_basic_entry();
154
155 if ( (local_snps_to_keep.find(ID) == local_snps_to_keep.end()) && (snps_to_keep.find(ID) == snps_to_keep.end()) )
156 passed_filters = false;
157 }
158
filter_sites_to_exclude(const string & snps_to_exclude_file)159 void entry::filter_sites_to_exclude(const string &snps_to_exclude_file)
160 {
161 // Filter sites by user provided list
162 if(passed_filters == false)
163 return;
164
165 if (snps_to_exclude_file == "")
166 return;
167
168 if (snps_to_exclude_file != "" && snps_to_exclude.empty())
169 {
170 ifstream in(snps_to_exclude_file.c_str());
171 string tmp;
172 if (!in.is_open())
173 {
174 LOG.error("Could not open SNPs to Exclude file" + snps_to_exclude_file, 0);
175 }
176 while (!in.eof())
177 {
178 in >> tmp;
179 snps_to_exclude.insert(tmp);
180 in.ignore(numeric_limits<streamsize>::max(), '\n');
181 }
182 in.close();
183 }
184
185 parse_basic_entry();
186
187 if (snps_to_exclude.find(ID) != snps_to_exclude.end())
188 passed_filters = false;
189 }
190
filter_sites_by_quality(double min_quality)191 void entry::filter_sites_by_quality(double min_quality)
192 {
193 // Filter sites by quality
194 if (passed_filters == false)
195 return;
196
197 if (min_quality < 0)
198 return;
199
200 parse_basic_entry(true);
201
202 string alt_allele = get_ALT_allele(0);
203 // The QUAL field has different definitions depending on the state of the
204 // alternative allele. Here I treat them separately, although in this case
205 // it is unnecessary.
206 if ((alt_allele == ".") || (alt_allele == ""))
207 { // The case that the alternative allele is unknown
208 // QUAL is -10log_10 p(variant)
209 if (QUAL < min_quality)
210 passed_filters = false;
211 }
212 else
213 { // The normal case
214 // QUAL is -10log_10 p(no variant)
215 if (QUAL < min_quality)
216 passed_filters = false;
217 }
218 }
219
filter_sites_by_mean_depth(double min_mean_depth,double max_mean_depth)220 void entry::filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth)
221 {
222 // Filter sites by mean depth
223 if (passed_filters == false)
224 return;
225
226 if ((min_mean_depth <= 0) && (max_mean_depth == numeric_limits<double>::max()))
227 return;
228
229 int depth;
230
231 unsigned int N_indv_included = 0;
232 double depth_sum = 0.0;
233 for (unsigned int ui=0; ui<N_indv; ui++)
234 {
235 if (include_indv[ui] == false)
236 continue;
237
238 if (include_genotype[ui] == true)
239 {
240 parse_genotype_entry(ui, false, false, true);
241
242 if (entry_header.has_genotypes == false)
243 LOG.error("Require Genotypes in VCF file in order to filter sites by mean depth");
244
245 depth = get_indv_DEPTH(ui);
246 if (depth >= 0)
247 {
248 depth_sum += depth;
249 }
250 N_indv_included++;
251 }
252 }
253 double mean_depth = depth_sum / N_indv_included;
254
255 if ((mean_depth < min_mean_depth) || (mean_depth > max_mean_depth))
256 passed_filters = false;
257 }
258
filter_sites_by_position(const string & chr,int start_pos,int end_pos)259 void entry::filter_sites_by_position(const string &chr, int start_pos, int end_pos)
260 {
261 // Filter sites by user provided position range
262 if (passed_filters == false)
263 return;
264
265 if ((chr == "") || ((start_pos == -1) && (end_pos==numeric_limits<int>::max())))
266 return;
267
268 parse_basic_entry();
269
270 if (CHROM == chr)
271 {
272 if ((POS < start_pos) || (POS > end_pos))
273 passed_filters = false;
274 }
275 else
276 passed_filters = false;
277 }
278
filter_sites_by_positions(const string & positions_file,const string & exclude_positions_file)279 void entry::filter_sites_by_positions(const string &positions_file, const string &exclude_positions_file)
280 {
281 // Filter sites by a user defined file containing a list of positions
282 if (passed_filters == false)
283 return;
284
285 if ((positions_file == "") && (exclude_positions_file == ""))
286 return;
287
288 int idx;
289 if (keep_positions.empty() && positions_file != "")
290 {
291 string chr;
292 int pos1;
293 unsigned int N_chr=chr_to_idx.size();
294 stringstream ss;
295 string line;
296 unsigned int gzMAX_LINE_LEN = 1024*1024;
297 char *gz_readbuffer = new char[gzMAX_LINE_LEN];
298
299 gzFile gz_in = gzopen(positions_file.c_str(), "rb");
300 if (gz_in == NULL)
301 LOG.error("Could not open Positions file: " + positions_file);
302
303 keep_positions.resize(N_chr);
304 while (!gzeof(gz_in))
305 {
306 line = "";
307 bool again = true;
308 while (again == true)
309 {
310 gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN);
311 line.append(gz_readbuffer);
312 if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1)
313 again = false;
314 }
315 if (line[0] == '#')
316 continue;
317 line.erase( line.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!)
318
319 ss.clear();
320 ss.str(line);
321 ss >> chr >> pos1;
322
323 if (chr_to_idx.find(chr) == chr_to_idx.end())
324 {
325 N_chr++;
326 chr_to_idx[chr] = (N_chr-1);
327 keep_positions.resize(N_chr);
328 }
329
330 idx = chr_to_idx[chr];
331 keep_positions[idx].insert(pos1);
332 }
333 gzclose(gz_in);
334 delete [] gz_readbuffer;
335 }
336 if (exclude_positions.empty() && exclude_positions_file != "")
337 {
338 string chr;
339 int pos1;
340 unsigned int N_chr=chr_to_idx.size();
341 stringstream ss;
342 string line;
343 unsigned int gzMAX_LINE_LEN = 1024*1024;
344 char *gz_readbuffer = new char[gzMAX_LINE_LEN];
345
346 gzFile gz_in = gzopen(exclude_positions_file.c_str(), "rb");
347 if (gz_in == NULL)
348 LOG.error("Could not open Positions file: " + exclude_positions_file);
349
350 exclude_positions.resize(N_chr);
351 while (!gzeof(gz_in))
352 {
353 line = "";
354 bool again = true;
355 while (again == true)
356 {
357 gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN);
358 line.append(gz_readbuffer);
359 if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1)
360 again = false;
361 }
362 if (line[0] == '#')
363 continue;
364 line.erase( line.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!)
365
366 ss.clear();
367 ss.str(line);
368 ss >> chr >> pos1;
369
370 if (chr_to_idx.find(chr) == chr_to_idx.end())
371 {
372 N_chr++;
373 chr_to_idx[chr] = (N_chr-1);
374 exclude_positions.resize(N_chr);
375 }
376
377 idx = chr_to_idx[chr];
378 exclude_positions[idx].insert(pos1);
379 }
380 gzclose(gz_in);
381 delete [] gz_readbuffer;
382 }
383
384 parse_basic_entry();
385
386 if (!keep_positions.empty())
387 { // Check to see if position is in keep list
388 if (chr_to_idx.find(CHROM) == chr_to_idx.end())
389 passed_filters = false;
390 else
391 {
392 idx = chr_to_idx[CHROM];
393 if (keep_positions[idx].find(POS) == keep_positions[idx].end())
394 passed_filters = false;
395 }
396 }
397 if (!exclude_positions.empty())
398 { // Check to see if position is in exclude list
399 if (chr_to_idx.find(CHROM) != chr_to_idx.end())
400 {
401 idx = chr_to_idx[CHROM];
402 if (exclude_positions[idx].find(POS) != exclude_positions[idx].end())
403 passed_filters = false;
404 }
405 }
406 }
407
filter_sites_by_overlap_positions(const string & positions_overlap_file,const string & exclude_positions_overlap_file)408 void entry::filter_sites_by_overlap_positions(const string &positions_overlap_file, const string &exclude_positions_overlap_file)
409 {
410 // Filter sites by overlapping with a user defined file containing a list of positions
411 if (passed_filters == false)
412 return;
413
414 if ((positions_overlap_file == "") && (exclude_positions_overlap_file == ""))
415 return;
416
417 int idx;
418 if (keep_positions.empty() && positions_overlap_file != "")
419 {
420 string chr;
421 int pos1;
422 unsigned int N_chr=chr_to_idx.size();
423 stringstream ss;
424 string line;
425 unsigned int gzMAX_LINE_LEN = 1024*1024;
426 char *gz_readbuffer = new char[gzMAX_LINE_LEN];
427
428 gzFile gz_in = gzopen(positions_overlap_file.c_str(), "rb");
429 if (gz_in == NULL)
430 LOG.error("Could not open Positions file: " + positions_overlap_file);
431
432 while (!gzeof(gz_in))
433 {
434 line = "";
435 bool again = true;
436 while (again == true)
437 {
438 gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN);
439 line.append(gz_readbuffer);
440 if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1)
441 again = false;
442 }
443 if (line[0] == '#')
444 continue;
445 line.erase( line.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!)
446
447 ss.clear();
448 ss.str(line);
449 ss >> chr >> pos1;
450
451 if (chr_to_idx.find(chr) == chr_to_idx.end())
452 {
453 N_chr++;
454 chr_to_idx[chr] = (N_chr-1);
455 keep_positions.resize(N_chr);
456 }
457
458 idx = chr_to_idx[chr];
459 keep_positions[idx].insert(pos1);
460 }
461 gzclose(gz_in);
462 delete [] gz_readbuffer;
463 }
464 if (exclude_positions.empty() && exclude_positions_overlap_file != "")
465 {
466 string chr;
467 int pos1;
468 unsigned int N_chr=0;
469 stringstream ss;
470 string line;
471 unsigned int gzMAX_LINE_LEN = 1024*1024;
472 char *gz_readbuffer = new char[gzMAX_LINE_LEN];
473
474 gzFile gz_in = gzopen(exclude_positions_overlap_file.c_str(), "rb");
475 if (gz_in == NULL)
476 LOG.error("Could not open Positions file: " + exclude_positions_overlap_file);
477
478 while (!gzeof(gz_in))
479 {
480 line = "";
481 bool again = true;
482 while (again == true)
483 {
484 gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN);
485 line.append(gz_readbuffer);
486 if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1)
487 again = false;
488 }
489 if (line[0] == '#')
490 continue;
491 line.erase( line.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!)
492
493 ss.clear();
494 ss.str(line);
495 ss >> chr >> pos1;
496
497 if (chr_to_idx.find(chr) == chr_to_idx.end())
498 {
499 N_chr++;
500 chr_to_idx[chr] = (N_chr-1);
501 exclude_positions.resize(N_chr);
502 }
503
504 idx = chr_to_idx[chr];
505 exclude_positions[idx].insert(pos1);
506 }
507 gzclose(gz_in);
508 delete [] gz_readbuffer;
509 }
510
511 parse_basic_entry();
512
513 if (!keep_positions.empty())
514 { // Check to see if position is in keep list
515 if (chr_to_idx.find(CHROM) == chr_to_idx.end())
516 passed_filters = false;
517 else
518 {
519 idx = chr_to_idx[CHROM];
520 bool found=false;
521
522 for (unsigned int ui=POS; ui<(POS+REF.size()); ui++)
523 if (keep_positions[idx].find(ui) != keep_positions[idx].end())
524 {
525 found = true;
526 break;
527 }
528
529 if (found == false)
530 passed_filters = false;
531 }
532 }
533 if (!exclude_positions.empty())
534 { // Check to see if position is in exclude list
535 if (chr_to_idx.find(CHROM) != chr_to_idx.end())
536 {
537 idx = chr_to_idx[CHROM];
538 bool found=false;
539
540 for (unsigned int ui=POS; ui<(POS+REF.size()); ui++)
541 if (exclude_positions[idx].find(ui) != exclude_positions[idx].end())
542 found = true;
543
544 if (found == true)
545 passed_filters = false;
546 }
547 }
548 }
549
filter_sites_by_chromosome(const set<string> & chrs_to_keep,const set<string> & chrs_to_exclude)550 void entry::filter_sites_by_chromosome(const set<string> &chrs_to_keep, const set<string> &chrs_to_exclude)
551 {
552 if (passed_filters == false)
553 return;
554
555 if (chrs_to_keep.empty() && chrs_to_exclude.empty())
556 return;
557
558 parse_basic_entry();
559
560 if (!chrs_to_keep.empty())
561 {
562 if (chrs_to_keep.find(CHROM) == chrs_to_keep.end())
563 passed_filters = false;
564 }
565 else
566 {
567 if (chrs_to_exclude.find(CHROM) != chrs_to_exclude.end())
568 passed_filters = false;
569 }
570 }
571
filter_sites_by_BED_file(const string & bed_file,bool BED_exclude)572 void entry::filter_sites_by_BED_file(const string &bed_file, bool BED_exclude)
573 {
574 // Filter sites depending on positions in a BED file.
575 if (passed_filters == false)
576 return;
577
578 if (bed_file == "")
579 return;
580
581 int pos1, pos2, idx;
582 if (lims.empty())
583 {
584 ifstream BED(bed_file.c_str());
585 if (!BED.is_open())
586 LOG.error("Could not open BED file: " + bed_file);
587
588 string chr;
589 unsigned int N_chr=chr_to_idx.size();
590 BED.ignore(numeric_limits<streamsize>::max(), '\n'); // Ignore header
591 unsigned int N_BED_entries=0;
592 while (!BED.eof())
593 {
594 BED >> chr >> pos1 >> pos2;
595 BED.ignore(numeric_limits<streamsize>::max(), '\n');
596
597 if (chr_to_idx.find(chr) == chr_to_idx.end())
598 {
599 N_chr++;
600 chr_to_idx[chr] = (N_chr-1);
601 lims.resize(N_chr);
602 }
603
604 idx = chr_to_idx[chr];
605 lims[idx].push_back(make_pair(pos1,pos2));
606 N_BED_entries++;
607 }
608 BED.close();
609
610 LOG.printLOG("\tRead " + output_log::int2str(N_BED_entries) + " BED file entries.\n");
611
612 for (unsigned int ui=0; ui<lims.size(); ui++)
613 sort(lims[ui].begin(), lims[ui].end());
614 }
615 vector<unsigned int> min_ui(lims.size(), 0);
616 parse_basic_entry(true);
617
618 pos1 = POS;
619 pos2 = pos1;
620 unsigned int N_alleles = get_N_alleles();
621 for (int i=0; i<(int)N_alleles; i++)
622 pos2 = max(pos2, (int)(pos1 + get_allele(i).length() - 1));
623
624 if (BED_exclude == false)
625 { // Exclude sites not in BED file
626 if (chr_to_idx.find(CHROM) == chr_to_idx.end())
627 passed_filters = false;
628 else
629 {
630 idx = chr_to_idx[CHROM];
631 bool found=false;
632 unsigned int max_ui = lims[idx].size();
633 for (unsigned int ui=min_ui[idx]; ui<max_ui; ui++)
634 { // No need to start this loop at zero every time...
635 if (((pos1 > lims[idx][ui].first) && (pos1 <= lims[idx][ui].second)) || // Start pos inside bin
636 ((pos2 > lims[idx][ui].first) && (pos2 <= lims[idx][ui].second)) || // End pos inside bin
637 ((pos1 <= lims[idx][ui].first) && (pos2 >= lims[idx][ui].second))) // Variant spans bin
638 {
639 found=true;
640 break;
641 }
642 else if (pos1 > lims[idx][ui].second)
643 min_ui[idx] = ui+1;
644 }
645 if (found == false)
646 passed_filters = false;
647 }
648 }
649 else
650 { // Exclude sites in BED file
651 if (chr_to_idx.find(CHROM) != chr_to_idx.end())
652 {
653 idx = chr_to_idx[CHROM];
654 bool found=false;
655 unsigned int max_ui = lims[idx].size();
656 for (unsigned int ui=min_ui[idx]; ui<max_ui; ui++)
657 { // No need to start this loop at zero every time...
658 if (((pos1 > lims[idx][ui].first) && (pos1 <= lims[idx][ui].second)) || // Start pos inside bin
659 ((pos2 > lims[idx][ui].first) && (pos2 <= lims[idx][ui].second)) || // End pos inside bin
660 ((pos1 <= lims[idx][ui].first) && (pos2 >= lims[idx][ui].second))) // Variant spans bin
661 {
662 found=true;
663 break;
664 }
665 else if (pos1 > lims[idx][ui].second)
666 min_ui[idx] = ui+1;
667 }
668 if (found == true)
669 passed_filters = false;
670 }
671 }
672 }
673
filter_sites_by_mask(const string & mask_file,bool invert_mask,int min_kept_mask_value)674 void entry::filter_sites_by_mask(const string &mask_file, bool invert_mask, int min_kept_mask_value)
675 {
676 // Filter sites on the basis of a fasta-like mask file.
677 if (passed_filters == false || mask_file == "")
678 return;
679
680 if (!mask.is_open())
681 {
682 mask.open(mask_file.c_str());
683 mask_chr = "";
684 mask_line = "";
685 mask_pos = 1;
686
687 if (!mask.is_open())
688 LOG.error("Could not open mask file: " + mask_file);
689 }
690
691 string line;
692 string next_chr="";
693 unsigned int next_pos = 0;
694
695 parse_basic_entry();
696 next_chr = CHROM;
697
698 while (mask_chr != next_chr && !mask.eof())
699 {
700 getline(mask, line);
701 line.erase( line.find_last_not_of(" \t") + 1);
702
703 if (line[0] == '>')
704 {
705 mask_chr = line.substr(1, line.find_first_of(" \t")-1);
706 mask_pos = 1;
707 getline(mask, line);
708 mask_line = line;
709 }
710 }
711
712 if (next_chr == mask_chr)
713 next_pos = (unsigned)POS;
714 else
715 {
716 passed_filters = false;
717 return;
718 }
719
720 while (next_pos > (mask_pos + mask_line.size()) && !mask.eof())
721 {
722 getline(mask, line);
723 line.erase( line.find_last_not_of(" \t") + 1);
724
725 if (line[0] == '>')
726 {
727 mask_chr = line.substr(1, line.find_first_of(" \t")-1);
728 mask_pos = 1;
729 passed_filters = false;
730 return;
731 }
732 else
733 {
734 mask_pos += mask_line.size();
735 mask_line = line;
736 }
737 }
738
739 if (next_chr == mask_chr && next_pos <= (mask_pos+mask_line.size()))
740 {
741 char mask_base = mask_line[next_pos-mask_pos]-48;
742 bool keep = (mask_base <= min_kept_mask_value);
743
744 if (invert_mask == true)
745 keep = !keep;
746
747 if (keep == false)
748 passed_filters = false;
749 }
750 else
751 passed_filters = false;
752 }
753
filter_sites_by_number_of_alleles(int min_alleles,int max_alleles)754 void entry::filter_sites_by_number_of_alleles(int min_alleles, int max_alleles)
755 {
756 // Filter sites by the number of alleles (e.g. 2 for bi-allelic)
757 if (passed_filters == false)
758 return;
759
760 if ((min_alleles <= 0) && (max_alleles == numeric_limits<int>::max()))
761 return;
762
763 int N_alleles;
764 parse_basic_entry(true);
765 N_alleles = get_N_alleles();
766 if ((N_alleles < min_alleles) || (N_alleles > max_alleles))
767 passed_filters = false;
768 }
769
filter_sites_by_frequency_and_call_rate(double min_maf,double max_maf,double min_non_ref_af,double max_non_ref_af,double min_non_ref_af_any,double max_non_ref_af_any,double min_site_call_rate)770 void entry::filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf,
771 double min_non_ref_af, double max_non_ref_af,
772 double min_non_ref_af_any, double max_non_ref_af_any,
773 double min_site_call_rate)
774 {
775 // Filter sites so that all allele frequencies are between limits
776 if (passed_filters == false)
777 return;
778
779 if ((min_maf <= 0.0) && (max_maf >= 1.0) &&
780 (min_site_call_rate <= 0) &&
781 (min_non_ref_af <= 0.0) && (max_non_ref_af >= 1.0) &&
782 (min_non_ref_af_any <= 0.0) && (max_non_ref_af_any >= 1.0))
783 return;
784
785 unsigned int N_alleles;
786 unsigned int N_non_missing_chr;
787 parse_basic_entry(true);
788 parse_genotype_entries(true);
789
790 if (GT_idx == -1)
791 LOG.error("Require Genotypes in variant file to filter by frequency and/or call rate");
792
793 N_alleles = get_N_alleles();
794
795 vector<int> allele_counts;
796 get_allele_counts(allele_counts, N_non_missing_chr);
797
798 double freq, folded_freq;
799 double maf=numeric_limits<double>::max();
800 int N_failed = 0;
801 for (unsigned int ui=0; ui<N_alleles; ui++)
802 {
803 freq = allele_counts[ui] / (double)N_non_missing_chr;
804 folded_freq = min(freq, 1.0 - freq);
805
806 maf = min(maf, folded_freq);
807 if ((ui > 0) && ((freq < min_non_ref_af) || (freq > max_non_ref_af)))
808 passed_filters = false;
809
810 if ((ui > 0) && ((freq < min_non_ref_af_any) || (freq > max_non_ref_af_any)))
811 N_failed++;
812 }
813
814 if (((min_non_ref_af > 0.0) || (max_non_ref_af < 1.0)) && (N_failed == (N_alleles-1)))
815 passed_filters = false;
816
817 if ((maf < min_maf) || (maf > max_maf))
818 passed_filters = false;
819
820 double call_rate = N_non_missing_chr / double(get_N_chr());
821
822 if (call_rate < min_site_call_rate)
823 passed_filters = false;
824 }
825
filter_sites_by_allele_type(bool keep_only_indels,bool remove_indels)826 void entry::filter_sites_by_allele_type(bool keep_only_indels, bool remove_indels)
827 {
828 if (passed_filters == false)
829 return;
830
831 if ((keep_only_indels == false) && (remove_indels == false))
832 return;
833 if ((keep_only_indels == true) && (remove_indels == true))
834 LOG.error("Can't both keep and remove all indels!");
835
836 string allele;
837 unsigned int ref_len, N_alleles;
838 bool is_indel;
839
840 parse_basic_entry(true);
841 is_indel = false;
842 allele = REF;
843 ref_len = allele.size();
844 if (ref_len != 1)
845 is_indel = true;
846 N_alleles = get_N_alleles();
847 for (unsigned int ui=1; ui<N_alleles; ui++)
848 {
849 get_allele(ui, allele);
850 if (allele.size() != ref_len)
851 {
852 is_indel = true;
853 break;
854 }
855 }
856
857 if (keep_only_indels == true)
858 {
859 if (is_indel == false)
860 passed_filters = false;
861 }
862 else if (remove_indels == true)
863 {
864 if (is_indel == true)
865 passed_filters = false;
866 }
867 }
868
filter_sites_by_allele_count(double min_mac,double max_mac,double min_non_ref_ac,double max_non_ref_ac,double min_non_ref_ac_any,double max_non_ref_ac_any,double max_missing_call_count)869 void entry::filter_sites_by_allele_count(double min_mac, double max_mac,
870 double min_non_ref_ac, double max_non_ref_ac,
871 double min_non_ref_ac_any, double max_non_ref_ac_any,
872 double max_missing_call_count)
873 {
874 // Filter sites so that all allele counts are between limits
875 if (passed_filters == false)
876 return;
877
878 if ((min_mac <= 0) && (max_mac == numeric_limits<int>::max()) &&
879 (min_non_ref_ac <= 0) && (max_non_ref_ac == numeric_limits<int>::max()) &&
880 (min_non_ref_ac_any <= 0) && (max_non_ref_ac_any == numeric_limits<int>::max()) &&
881 (max_missing_call_count == numeric_limits<int>::max()))
882 return;
883
884 unsigned int N_alleles, N_chr, N_non_missing_chr;
885 parse_basic_entry(true);
886 parse_genotype_entries(true);
887
888 if (entry_header.has_genotypes == false)
889 LOG.error("Require Genotypes in variant file to filter by allele counts and/or missing data");
890
891 N_alleles = get_N_alleles();
892
893 if (N_alleles <= 1 && min_mac > 0)
894 passed_filters = false;
895
896 vector<int> allele_counts;
897 get_allele_counts(allele_counts, N_non_missing_chr);
898 N_chr = get_N_chr();
899
900 int mac = numeric_limits<int>::max();
901 int N_failed = 0;
902 for (unsigned int ui=0; ui<N_alleles; ui++)
903 {
904 mac = min(allele_counts[ui], mac);
905 if ((ui > 0) && ((allele_counts[ui] < min_non_ref_ac) || (allele_counts[ui] > max_non_ref_ac)))
906 passed_filters = false;
907
908 if ((ui > 0) && ((allele_counts[ui] < min_non_ref_ac_any) || (allele_counts[ui] > max_non_ref_ac_any)))
909 N_failed++;
910 }
911
912 if (((min_non_ref_ac_any > 0) || (max_non_ref_ac_any < numeric_limits<int>::max())) && (N_failed == (N_alleles-1)))
913 passed_filters = false;
914
915 if ((mac < min_mac) || (mac > max_mac))
916 passed_filters = false;
917
918 if ((N_chr-N_non_missing_chr) > max_missing_call_count)
919 passed_filters = false;
920 }
921
filter_sites_by_HWE_pvalue(double min_HWE_pvalue)922 void entry::filter_sites_by_HWE_pvalue(double min_HWE_pvalue)
923 {
924 // Filter sites by HWE p-value
925 // Note this assumes Biallelic SNPs.
926 if(passed_filters == false)
927 return;
928
929 if (min_HWE_pvalue <= 0)
930 return;
931
932 unsigned int b11, b12, b22;
933 double p_hwe, p_lo, p_hi;
934
935 parse_basic_entry(true);
936 parse_genotype_entries(true);
937
938 if (entry_header.has_genotypes == false)
939 LOG.error("Require Genotypes in variant file to filter sites by HWE.");
940
941 get_genotype_counts(b11, b12, b22);
942 entry::SNPHWE(b12, b11, b22, p_hwe, p_lo, p_hi);
943
944 if (p_hwe < min_HWE_pvalue)
945 passed_filters = false;
946 }
947
filter_sites_by_filter_status(const set<string> & filter_flags_to_remove,const set<string> & filter_flags_to_keep,bool remove_all)948 void entry::filter_sites_by_filter_status(const set<string> &filter_flags_to_remove, const set<string> &filter_flags_to_keep, bool remove_all)
949 {
950 // Filter sites by entries in the FILTER field.
951 if (passed_filters == false)
952 return;
953
954 if ((remove_all == false) && (filter_flags_to_remove.empty()) && (filter_flags_to_keep.empty()))
955 return;
956
957 vector<string> FILTERs;
958 unsigned int N_to_remove = filter_flags_to_remove.size();
959 unsigned int N_to_keep = filter_flags_to_keep.size();
960 parse_basic_entry(false, true);
961 get_FILTER_vector(FILTERs);
962
963 if (N_to_keep > 0)
964 {
965 bool keep = false;
966 for (unsigned int ui=0; ui<FILTERs.size(); ui++)
967 if (filter_flags_to_keep.find(FILTERs[ui]) != filter_flags_to_keep.end())
968 {
969 keep = true; break;
970 }
971
972 passed_filters = keep;
973 }
974 if (passed_filters==false)
975 return;
976
977 if ( (FILTERs.size() >= 1) && (FILTERs[0] == "PASS") )
978 return;
979 else if ((remove_all == true) && (!FILTERs.empty()))
980 passed_filters = false;
981 else if (N_to_remove > 0)
982 {
983 for (unsigned int ui=0; ui<FILTERs.size(); ui++)
984 if (filter_flags_to_remove.find(FILTERs[ui]) != filter_flags_to_remove.end())
985 passed_filters = false;
986 }
987 }
988
filter_sites_by_phase()989 void entry::filter_sites_by_phase()
990 {
991 // Filter out sites with unphased entries
992 // TODO: Alter this to allow for a max/min level of unphased-ness.
993 if (passed_filters == false)
994 return;
995
996 unsigned int count_unphased = 0;
997 for (unsigned int ui=0; ui<N_indv; ui++)
998 {
999 if (include_indv[ui] == false)
1000 continue;
1001
1002 parse_genotype_entry(ui, true);
1003
1004 if (get_indv_PHASE(ui) != '|')
1005 count_unphased++;
1006 }
1007
1008 if (count_unphased > 0)
1009 passed_filters = false;
1010 }
1011
filter_sites_by_thinning(int min_SNP_distance)1012 void entry::filter_sites_by_thinning(int min_SNP_distance)
1013 {
1014 // Filter sites so that no two SNPs are within some minimum distance
1015 if (passed_filters == false)
1016 return;
1017
1018 if (min_SNP_distance < 1)
1019 return;
1020
1021 parse_basic_entry();
1022 if (CHROM == thin_chrom)
1023 {
1024 int distance_from_last_SNP = POS - thin_pos;
1025 if (distance_from_last_SNP < min_SNP_distance)
1026 passed_filters = false;
1027 }
1028 if (passed_filters == true)
1029 thin_pos = POS;
1030 thin_chrom = CHROM;
1031 }
1032
filter_sites_by_INFO(const set<string> & flags_to_remove,const set<string> & flags_to_keep)1033 void entry::filter_sites_by_INFO(const set<string> &flags_to_remove, const set<string> &flags_to_keep)
1034 {
1035 // Filter sites by entries in the INFO field.
1036 if (passed_filters == false)
1037 return;
1038
1039 if ((flags_to_remove.empty()) && (flags_to_keep.empty()))
1040 return;
1041
1042 string value;
1043 unsigned int N_to_remove = flags_to_remove.size();
1044 unsigned int N_to_keep = flags_to_keep.size();
1045
1046 parse_basic_entry(false, false, true);
1047
1048 if (N_to_keep > 0)
1049 {
1050 bool keep = false;
1051 for (set<string>::iterator it=flags_to_keep.begin(); it != flags_to_keep.end(); ++it)
1052 {
1053 if (entry_header.INFO_map[ entry_header.INFO_reverse_map[*it] ].Type != Flag)
1054 LOG.error("Using INFO flag filtering on non flag type "+*it+" will not work correctly.");
1055 else
1056 {
1057 value = get_INFO_value(*it);
1058 if (value == "1")
1059 keep = true;
1060 }
1061 }
1062 passed_filters = keep;
1063 }
1064
1065 if (passed_filters==false)
1066 return;
1067
1068 if (N_to_remove > 0)
1069 {
1070 for (set<string>::iterator it=flags_to_remove.begin(); it != flags_to_remove.end(); ++it)
1071 {
1072 if (entry_header.INFO_map[ entry_header.INFO_reverse_map[*it] ].Type != Flag)
1073 LOG.error("Using INFO flag filtering on non flag type "+*it+" will not work correctly.");
1074 else
1075 {
1076 value = get_INFO_value(*it);
1077 if (value == "1")
1078 {
1079 passed_filters = false;
1080 continue;
1081 }
1082 }
1083 }
1084 }
1085 }
1086