1 /*
2  * entry_filters.cpp
3  *
4  *  Created on: Aug 9, 2013
5  *      Author: amarcketta
6  */
7 
8 #include "entry.h"
9 
10 set<string> entry::local_snps_to_keep;
11 set<string> entry::snps_to_exclude;
12 vector< set<int > > entry::keep_positions;
13 vector< set<int > > entry::exclude_positions;
14 map<string,int> entry::chr_to_idx;
15 vector< deque<pair<int,int> > > entry::lims;
16 ifstream entry::mask;
17 string entry::mask_chr;
18 string entry::mask_line;
19 int entry::mask_pos;
20 string entry::thin_chrom;
21 int entry::thin_pos;
22 
apply_filters(const parameters & params)23 int entry::apply_filters(const parameters &params)
24 {
25 	if (line.empty())
26 	{
27 		passed_filters = false;
28 		return 0;
29 	}
30 
31 // 	Apply all filters in turn.
32 	filter_sites_by_allele_type(params.keep_only_indels, params.remove_indels);
33 	filter_sites(params.snps_to_keep, params.snps_to_keep_file, params.snps_to_exclude_file);
34 	filter_sites_by_filter_status(params.site_filter_flags_to_exclude, params.site_filter_flags_to_keep, params.remove_all_filtered_sites);
35 	string chr_to_keep = "";
36 	if (params.chrs_to_keep.size() == 1)
37 		chr_to_keep = *(params.chrs_to_keep.begin()); // Get first chromosome in list (there should only be one).
38 	filter_sites_by_position(chr_to_keep, params.start_pos, params.end_pos);
39 	filter_sites_by_positions(params.positions_file, params.exclude_positions_file);
40 	filter_sites_by_overlap_positions(params.positions_overlap_file, params.exclude_positions_overlap_file);
41 	filter_sites_by_chromosome(params.chrs_to_keep, params.chrs_to_exclude);
42 	filter_sites_by_BED_file(params.BED_file, params.BED_exclude);
43 	filter_sites_by_number_of_alleles(params.min_alleles, params.max_alleles);
44 	filter_sites_by_INFO(params.site_INFO_flags_to_remove, params.site_INFO_flags_to_keep);
45 	filter_sites_by_quality(params.min_quality);
46 	filter_sites_by_mean_depth(params.min_mean_depth, params.max_mean_depth);
47 	filter_sites_by_mask(params.mask_file, params.invert_mask, params.min_kept_mask_value);
48 	if (params.phased_only == true)
49 		filter_sites_by_phase();
50 	filter_genotypes_by_quality_value(params.min_genotype_quality);
51 	filter_genotypes_by_depth_range(params.min_genotype_depth, params.max_genotype_depth);
52 	filter_genotypes_by_filter_flag(params.geno_filter_flags_to_exclude, params.remove_all_filtered_genotypes);
53 	filter_sites_by_frequency_and_call_rate(params.min_maf, params.max_maf, params.min_non_ref_af, params.max_non_ref_af, params.min_non_ref_af_any, params.max_non_ref_af_any, params.min_site_call_rate);
54 	filter_sites_by_allele_count(params.min_mac, params.max_mac, params.min_non_ref_ac, params.max_non_ref_ac, params.min_non_ref_ac_any, params.max_non_ref_ac_any, params.max_missing_call_count);
55 	filter_sites_by_HWE_pvalue(params.min_HWE_pvalue);
56 	filter_sites_by_thinning(params.min_interSNP_distance);
57 
58 	return 1;
59 }
60 
filter_genotypes_by_quality_value(double min_genotype_quality)61 void entry::filter_genotypes_by_quality_value(double min_genotype_quality)
62 {
63 	// Filter genotypes by quality
64 	if (passed_filters == false)
65 		return;
66 
67 	if (min_genotype_quality <= 0)
68 		return;
69 
70 	parse_genotype_entries(false, true);
71 
72 	if (entry_header.has_genotypes == false)
73 		LOG.error("Require Genotypes in variant file in order to filter genotypes by Quality.");
74 
75 	filter_genotypes_by_quality(min_genotype_quality);
76 }
77 
filter_genotypes_by_depth_range(int min_depth,int max_depth)78 void entry::filter_genotypes_by_depth_range(int min_depth, int max_depth)
79 {
80 	// Filter genotypes by depth
81 	if (passed_filters == false)
82 		return;
83 
84 	if ((min_depth <= 0) && (max_depth == numeric_limits<int>::max()))
85 		return;
86 	if (entry_header.has_genotypes == false)
87 		LOG.error("Require Genotypes in variant file in order to filter genotypes by Depth.");
88 
89 	parse_genotype_entries(false, false, true);
90 	filter_genotypes_by_depth(min_depth, max_depth);
91 
92 }
93 
filter_genotypes_by_filter_flag(const set<string> & filter_flags_to_remove,bool remove_all)94 void entry::filter_genotypes_by_filter_flag(const set<string> &filter_flags_to_remove, bool remove_all)
95 {
96 	// Filter genotypes by Filter Flags
97 	if (passed_filters == false)
98 		return;
99 
100 	if ((remove_all == false) && (filter_flags_to_remove.empty()))
101 		return;
102 
103 	parse_genotype_entries(false, false, false, true);
104 	if (entry_header.has_genotypes == false)
105 		LOG.error("Require Genotypes in variant file in order to filter genotypes by Filter Flag.");
106 
107 	filter_genotypes_by_filter_status(filter_flags_to_remove, remove_all);
108 }
109 
filter_sites(const set<string> & snps_to_keep,const string & snps_to_keep_file,const string & snps_to_exclude_file,bool keep_then_exclude)110 void entry::filter_sites(const set<string> &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude)
111 {
112 	// Filter sites by user provided lists
113 	if (keep_then_exclude)
114 	{
115 		filter_sites_to_keep(snps_to_keep, snps_to_keep_file);
116 		filter_sites_to_exclude(snps_to_exclude_file);
117 	}
118 	else
119 	{
120 		filter_sites_to_exclude(snps_to_exclude_file);
121 		filter_sites_to_keep(snps_to_keep, snps_to_keep_file);
122 	}
123 }
124 
filter_sites_to_keep(const set<string> & snps_to_keep,const string & snps_to_keep_file)125 void entry::filter_sites_to_keep(const set<string> &snps_to_keep, const string &snps_to_keep_file)
126 {
127 	// Filter sites by user provided list
128 	if(passed_filters == false)
129 		return;
130 
131 	if ((snps_to_keep.empty()) && (snps_to_keep_file == ""))
132 		return;
133 
134 	if (snps_to_keep_file != "" && local_snps_to_keep.empty())
135 	{
136 		ifstream in(snps_to_keep_file.c_str());
137 		string tmp;
138 		local_snps_to_keep = snps_to_keep;
139 
140 		if (!in.is_open())
141 		{
142 			LOG.error("Could not open SNPs to Keep file" + snps_to_keep_file, 0);
143 		}
144 		while (!in.eof())
145 		{
146 			in >> tmp;
147 			local_snps_to_keep.insert(tmp);
148 			in.ignore(numeric_limits<streamsize>::max(), '\n');
149 		}
150 
151 		in.close();
152 	}
153 	parse_basic_entry();
154 
155 	if ( (local_snps_to_keep.find(ID) == local_snps_to_keep.end()) && (snps_to_keep.find(ID) == snps_to_keep.end()) )
156 		passed_filters = false;
157 }
158 
filter_sites_to_exclude(const string & snps_to_exclude_file)159 void entry::filter_sites_to_exclude(const string &snps_to_exclude_file)
160 {
161 	// Filter sites by user provided list
162 	if(passed_filters == false)
163 		return;
164 
165 	if (snps_to_exclude_file == "")
166 		return;
167 
168 	if (snps_to_exclude_file != "" && snps_to_exclude.empty())
169 	{
170 		ifstream in(snps_to_exclude_file.c_str());
171 		string tmp;
172 		if (!in.is_open())
173 		{
174 			LOG.error("Could not open SNPs to Exclude file" + snps_to_exclude_file, 0);
175 		}
176 		while (!in.eof())
177 		{
178 			in >> tmp;
179 			snps_to_exclude.insert(tmp);
180 			in.ignore(numeric_limits<streamsize>::max(), '\n');
181 		}
182 		in.close();
183 	}
184 
185 	parse_basic_entry();
186 
187 	if (snps_to_exclude.find(ID) != snps_to_exclude.end())
188 		passed_filters = false;
189 }
190 
filter_sites_by_quality(double min_quality)191 void entry::filter_sites_by_quality(double min_quality)
192 {
193 	// Filter sites by quality
194 	if (passed_filters == false)
195 		return;
196 
197 	if (min_quality < 0)
198 		return;
199 
200 	parse_basic_entry(true);
201 
202 	string alt_allele = get_ALT_allele(0);
203 	// The QUAL field has different definitions depending on the state of the
204 	// alternative allele. Here I treat them separately, although in this case
205 	// it is unnecessary.
206 	if ((alt_allele == ".") || (alt_allele == ""))
207 	{	// The case that the alternative allele is unknown
208 		// QUAL is -10log_10 p(variant)
209 			if (QUAL < min_quality)
210 				passed_filters = false;
211 	}
212 	else
213 	{	// The normal case
214 		// QUAL is -10log_10 p(no variant)
215 		if (QUAL < min_quality)
216 			passed_filters = false;
217 	}
218 }
219 
filter_sites_by_mean_depth(double min_mean_depth,double max_mean_depth)220 void entry::filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth)
221 {
222 	// Filter sites by mean depth
223 	if (passed_filters == false)
224 		return;
225 
226 	if ((min_mean_depth <= 0) && (max_mean_depth == numeric_limits<double>::max()))
227 		return;
228 
229 	int depth;
230 
231 	unsigned int N_indv_included = 0;
232 	double depth_sum = 0.0;
233 	for (unsigned int ui=0; ui<N_indv; ui++)
234 	{
235 		if (include_indv[ui] == false)
236 			continue;
237 
238 		if (include_genotype[ui] == true)
239 		{
240 			parse_genotype_entry(ui, false, false, true);
241 
242 			if (entry_header.has_genotypes == false)
243 				LOG.error("Require Genotypes in VCF file in order to filter sites by mean depth");
244 
245 			depth = get_indv_DEPTH(ui);
246 			if (depth >= 0)
247 			{
248 				depth_sum += depth;
249 			}
250 			N_indv_included++;
251 		}
252 	}
253 	double mean_depth = depth_sum / N_indv_included;
254 
255 	if ((mean_depth < min_mean_depth) || (mean_depth > max_mean_depth))
256 		passed_filters = false;
257 }
258 
filter_sites_by_position(const string & chr,int start_pos,int end_pos)259 void entry::filter_sites_by_position(const string &chr, int start_pos, int end_pos)
260 {
261 	// Filter sites by user provided position range
262 	if (passed_filters == false)
263 		return;
264 
265 	if ((chr == "") || ((start_pos == -1) && (end_pos==numeric_limits<int>::max())))
266 		return;
267 
268 	parse_basic_entry();
269 
270 	if (CHROM == chr)
271 	{
272 		if ((POS < start_pos) || (POS > end_pos))
273 			passed_filters = false;
274 	}
275 	else
276 		passed_filters = false;
277 }
278 
filter_sites_by_positions(const string & positions_file,const string & exclude_positions_file)279 void entry::filter_sites_by_positions(const string &positions_file, const string &exclude_positions_file)
280 {
281 	// Filter sites by a user defined file containing a list of positions
282 	if (passed_filters == false)
283 		return;
284 
285 	if ((positions_file == "") && (exclude_positions_file == ""))
286 		return;
287 
288 	int idx;
289 	if (keep_positions.empty() && positions_file != "")
290 	{
291 		string chr;
292 		int pos1;
293 		unsigned int N_chr=chr_to_idx.size();
294 		stringstream ss;
295 		string line;
296 		unsigned int gzMAX_LINE_LEN = 1024*1024;
297 		char *gz_readbuffer = new char[gzMAX_LINE_LEN];
298 
299 		gzFile gz_in = gzopen(positions_file.c_str(), "rb");
300 		if (gz_in == NULL)
301 			LOG.error("Could not open Positions file: " + positions_file);
302 
303 		keep_positions.resize(N_chr);
304 		while (!gzeof(gz_in))
305 		{
306 			line = "";
307 			bool again = true;
308 			while (again == true)
309 			{
310 				gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN);
311 				line.append(gz_readbuffer);
312 				if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1)
313 					again = false;
314 			}
315 			if (line[0] == '#')
316 				continue;
317 			line.erase( line.find_last_not_of(" \t\n\r") + 1);	// Trim whitespace at end of line (required in gzipped case!)
318 
319 			ss.clear();
320 			ss.str(line);
321 			ss >> chr >> pos1;
322 
323 			if (chr_to_idx.find(chr) == chr_to_idx.end())
324 			{
325 				N_chr++;
326 				chr_to_idx[chr] = (N_chr-1);
327 				keep_positions.resize(N_chr);
328 			}
329 
330 			idx = chr_to_idx[chr];
331 			keep_positions[idx].insert(pos1);
332 		}
333 		gzclose(gz_in);
334 		delete [] gz_readbuffer;
335 	}
336 	if (exclude_positions.empty() && exclude_positions_file != "")
337 	{
338 		string chr;
339 		int pos1;
340 		unsigned int N_chr=chr_to_idx.size();
341 		stringstream ss;
342 		string line;
343 		unsigned int gzMAX_LINE_LEN = 1024*1024;
344 		char *gz_readbuffer = new char[gzMAX_LINE_LEN];
345 
346 		gzFile gz_in = gzopen(exclude_positions_file.c_str(), "rb");
347 		if (gz_in == NULL)
348 			LOG.error("Could not open Positions file: " + exclude_positions_file);
349 
350 		exclude_positions.resize(N_chr);
351 		while (!gzeof(gz_in))
352 		{
353 			line = "";
354 			bool again = true;
355 			while (again == true)
356 			{
357 				gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN);
358 				line.append(gz_readbuffer);
359 				if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1)
360 					again = false;
361 			}
362 			if (line[0] == '#')
363 				continue;
364 			line.erase( line.find_last_not_of(" \t\n\r") + 1);	// Trim whitespace at end of line (required in gzipped case!)
365 
366 			ss.clear();
367 			ss.str(line);
368 			ss >> chr >> pos1;
369 
370 			if (chr_to_idx.find(chr) == chr_to_idx.end())
371 			{
372 				N_chr++;
373 				chr_to_idx[chr] = (N_chr-1);
374 				exclude_positions.resize(N_chr);
375 			}
376 
377 			idx = chr_to_idx[chr];
378 			exclude_positions[idx].insert(pos1);
379 		}
380 		gzclose(gz_in);
381 		delete [] gz_readbuffer;
382 	}
383 
384 	parse_basic_entry();
385 
386 	if (!keep_positions.empty())
387 	{	// Check to see if position is in keep list
388 			if (chr_to_idx.find(CHROM) == chr_to_idx.end())
389 				passed_filters = false;
390 			else
391 			{
392 				idx = chr_to_idx[CHROM];
393 				if (keep_positions[idx].find(POS) == keep_positions[idx].end())
394 					passed_filters = false;
395 			}
396 	}
397 	if (!exclude_positions.empty())
398 	{	// Check to see if position is in exclude list
399 		if (chr_to_idx.find(CHROM) != chr_to_idx.end())
400 		{
401 			idx = chr_to_idx[CHROM];
402 			if (exclude_positions[idx].find(POS) != exclude_positions[idx].end())
403                 passed_filters = false;
404 		}
405 	}
406 }
407 
filter_sites_by_overlap_positions(const string & positions_overlap_file,const string & exclude_positions_overlap_file)408 void entry::filter_sites_by_overlap_positions(const string &positions_overlap_file, const string &exclude_positions_overlap_file)
409 {
410 	// Filter sites by overlapping with a user defined file containing a list of positions
411 	if (passed_filters == false)
412 		return;
413 
414 	if ((positions_overlap_file == "") && (exclude_positions_overlap_file == ""))
415 		return;
416 
417 	int idx;
418 	if (keep_positions.empty() && positions_overlap_file != "")
419 	{
420 		string chr;
421 		int pos1;
422 		unsigned int N_chr=chr_to_idx.size();
423 		stringstream ss;
424 		string line;
425 		unsigned int gzMAX_LINE_LEN = 1024*1024;
426 		char *gz_readbuffer = new char[gzMAX_LINE_LEN];
427 
428 		gzFile gz_in = gzopen(positions_overlap_file.c_str(), "rb");
429 		if (gz_in == NULL)
430 			LOG.error("Could not open Positions file: " + positions_overlap_file);
431 
432 		while (!gzeof(gz_in))
433 		{
434 			line = "";
435 			bool again = true;
436 			while (again == true)
437 			{
438 				gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN);
439 				line.append(gz_readbuffer);
440 				if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1)
441 					again = false;
442 			}
443 			if (line[0] == '#')
444 				continue;
445 			line.erase( line.find_last_not_of(" \t\n\r") + 1);	// Trim whitespace at end of line (required in gzipped case!)
446 
447 			ss.clear();
448 			ss.str(line);
449 			ss >> chr >> pos1;
450 
451 			if (chr_to_idx.find(chr) == chr_to_idx.end())
452 			{
453 				N_chr++;
454 				chr_to_idx[chr] = (N_chr-1);
455 				keep_positions.resize(N_chr);
456 			}
457 
458 			idx = chr_to_idx[chr];
459 			keep_positions[idx].insert(pos1);
460 		}
461 		gzclose(gz_in);
462 		delete [] gz_readbuffer;
463 	}
464 	if (exclude_positions.empty() && exclude_positions_overlap_file != "")
465 	{
466 		string chr;
467 		int pos1;
468 		unsigned int N_chr=0;
469 		stringstream ss;
470 		string line;
471 		unsigned int gzMAX_LINE_LEN = 1024*1024;
472 		char *gz_readbuffer = new char[gzMAX_LINE_LEN];
473 
474 		gzFile gz_in = gzopen(exclude_positions_overlap_file.c_str(), "rb");
475 		if (gz_in == NULL)
476 			LOG.error("Could not open Positions file: " + exclude_positions_overlap_file);
477 
478 		while (!gzeof(gz_in))
479 		{
480 			line = "";
481 			bool again = true;
482 			while (again == true)
483 			{
484 				gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN);
485 				line.append(gz_readbuffer);
486 				if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1)
487 					again = false;
488 			}
489 			if (line[0] == '#')
490 				continue;
491 			line.erase( line.find_last_not_of(" \t\n\r") + 1);	// Trim whitespace at end of line (required in gzipped case!)
492 
493 			ss.clear();
494 			ss.str(line);
495 			ss >> chr >> pos1;
496 
497 			if (chr_to_idx.find(chr) == chr_to_idx.end())
498 			{
499 				N_chr++;
500 				chr_to_idx[chr] = (N_chr-1);
501 				exclude_positions.resize(N_chr);
502 			}
503 
504 			idx = chr_to_idx[chr];
505 			exclude_positions[idx].insert(pos1);
506 		}
507 		gzclose(gz_in);
508 		delete [] gz_readbuffer;
509 	}
510 
511 	parse_basic_entry();
512 
513 	if (!keep_positions.empty())
514 	{	// Check to see if position is in keep list
515 			if (chr_to_idx.find(CHROM) == chr_to_idx.end())
516 				passed_filters = false;
517 			else
518 			{
519 				idx = chr_to_idx[CHROM];
520 				bool found=false;
521 
522 				for (unsigned int ui=POS; ui<(POS+REF.size()); ui++)
523 					if (keep_positions[idx].find(ui) != keep_positions[idx].end())
524 					{
525 						found = true;
526 						break;
527 					}
528 
529 				if (found == false)
530 					passed_filters = false;
531 			}
532 	}
533 	if (!exclude_positions.empty())
534 	{	// Check to see if position is in exclude list
535 		if (chr_to_idx.find(CHROM) != chr_to_idx.end())
536 		{
537 			idx = chr_to_idx[CHROM];
538 			bool found=false;
539 
540 			for (unsigned int ui=POS; ui<(POS+REF.size()); ui++)
541 				if (exclude_positions[idx].find(ui) != exclude_positions[idx].end())
542 					found = true;
543 
544 				if (found == true)
545 					passed_filters = false;
546 		}
547 	}
548 }
549 
filter_sites_by_chromosome(const set<string> & chrs_to_keep,const set<string> & chrs_to_exclude)550 void entry::filter_sites_by_chromosome(const set<string> &chrs_to_keep, const set<string> &chrs_to_exclude)
551 {
552 	if (passed_filters == false)
553 		return;
554 
555 	if (chrs_to_keep.empty() && chrs_to_exclude.empty())
556 		return;
557 
558 	parse_basic_entry();
559 
560 	if (!chrs_to_keep.empty())
561 	{
562 		if (chrs_to_keep.find(CHROM) == chrs_to_keep.end())
563 			passed_filters = false;
564 	}
565 	else
566 	{
567 		if (chrs_to_exclude.find(CHROM) != chrs_to_exclude.end())
568 			passed_filters = false;
569 	}
570 }
571 
filter_sites_by_BED_file(const string & bed_file,bool BED_exclude)572 void entry::filter_sites_by_BED_file(const string &bed_file, bool BED_exclude)
573 {
574 	// Filter sites depending on positions in a BED file.
575 	if (passed_filters == false)
576 		return;
577 
578 	if (bed_file == "")
579 		return;
580 
581 	int pos1, pos2, idx;
582 	if (lims.empty())
583 	{
584 		ifstream BED(bed_file.c_str());
585 		if (!BED.is_open())
586 			LOG.error("Could not open BED file: " + bed_file);
587 
588 		string chr;
589 		unsigned int N_chr=chr_to_idx.size();
590 		BED.ignore(numeric_limits<streamsize>::max(), '\n');	// Ignore header
591 		unsigned int N_BED_entries=0;
592 		while (!BED.eof())
593 		{
594 			BED >> chr >> pos1 >> pos2;
595 			BED.ignore(numeric_limits<streamsize>::max(), '\n');
596 
597 			if (chr_to_idx.find(chr) == chr_to_idx.end())
598 			{
599 				N_chr++;
600 				chr_to_idx[chr] = (N_chr-1);
601 				lims.resize(N_chr);
602 			}
603 
604 			idx = chr_to_idx[chr];
605 			lims[idx].push_back(make_pair(pos1,pos2));
606 			N_BED_entries++;
607 		}
608 		BED.close();
609 
610 		LOG.printLOG("\tRead " + output_log::int2str(N_BED_entries) + " BED file entries.\n");
611 
612 		for (unsigned int ui=0; ui<lims.size(); ui++)
613 			sort(lims[ui].begin(), lims[ui].end());
614 	}
615 	vector<unsigned int> min_ui(lims.size(), 0);
616 	parse_basic_entry(true);
617 
618 	pos1 = POS;
619 	pos2 = pos1;
620 	unsigned int N_alleles = get_N_alleles();
621 	for (int i=0; i<(int)N_alleles; i++)
622 		pos2 = max(pos2, (int)(pos1 + get_allele(i).length() - 1));
623 
624 	if (BED_exclude == false)
625 	{	// Exclude sites not in BED file
626 		if (chr_to_idx.find(CHROM) == chr_to_idx.end())
627 			passed_filters = false;
628 		else
629 		{
630 			idx = chr_to_idx[CHROM];
631 			bool found=false;
632 			unsigned int max_ui = lims[idx].size();
633 			for (unsigned int ui=min_ui[idx]; ui<max_ui; ui++)
634 			{	// No need to start this loop at zero every time...
635 				if (((pos1 > lims[idx][ui].first) && (pos1 <= lims[idx][ui].second)) ||	// Start pos inside bin
636 					((pos2 > lims[idx][ui].first) && (pos2 <= lims[idx][ui].second)) ||	// End pos inside bin
637 					((pos1 <= lims[idx][ui].first) && (pos2 >= lims[idx][ui].second)))	// Variant spans bin
638 				{
639 					found=true;
640 					break;
641 				}
642 				else if (pos1 > lims[idx][ui].second)
643 					min_ui[idx] = ui+1;
644 			}
645 			if (found == false)
646 				passed_filters = false;
647 		}
648 	}
649 	else
650 	{	// Exclude sites in BED file
651 		if (chr_to_idx.find(CHROM) != chr_to_idx.end())
652 		{
653 			idx = chr_to_idx[CHROM];
654 			bool found=false;
655 			unsigned int max_ui = lims[idx].size();
656 			for (unsigned int ui=min_ui[idx]; ui<max_ui; ui++)
657 			{	// No need to start this loop at zero every time...
658 				if (((pos1 > lims[idx][ui].first) && (pos1 <= lims[idx][ui].second)) ||	// Start pos inside bin
659 					((pos2 > lims[idx][ui].first) && (pos2 <= lims[idx][ui].second)) ||	// End pos inside bin
660 					((pos1 <= lims[idx][ui].first) && (pos2 >= lims[idx][ui].second)))	// Variant spans bin
661 				{
662 					found=true;
663 					break;
664 				}
665 				else if (pos1 > lims[idx][ui].second)
666 					min_ui[idx] = ui+1;
667 			}
668 			if (found == true)
669 				passed_filters = false;
670 		}
671 	}
672 }
673 
filter_sites_by_mask(const string & mask_file,bool invert_mask,int min_kept_mask_value)674 void entry::filter_sites_by_mask(const string &mask_file, bool invert_mask, int min_kept_mask_value)
675 {
676 	// Filter sites on the basis of a fasta-like mask file.
677 	if (passed_filters == false || mask_file == "")
678 		return;
679 
680 	if (!mask.is_open())
681 	{
682 		mask.open(mask_file.c_str());
683 		mask_chr = "";
684 		mask_line = "";
685 		mask_pos = 1;
686 
687 		if (!mask.is_open())
688 			LOG.error("Could not open mask file: " + mask_file);
689 	}
690 
691 	string line;
692 	string next_chr="";
693 	unsigned int next_pos = 0;
694 
695 	parse_basic_entry();
696 	next_chr = CHROM;
697 
698 	while (mask_chr != next_chr && !mask.eof())
699 	{
700 		getline(mask, line);
701 		line.erase( line.find_last_not_of(" \t") + 1);
702 
703 		if (line[0] == '>')
704 		{
705 			mask_chr = line.substr(1, line.find_first_of(" \t")-1);
706 			mask_pos = 1;
707 			getline(mask, line);
708 			mask_line = line;
709 		}
710 	}
711 
712 	if (next_chr == mask_chr)
713 		next_pos = (unsigned)POS;
714 	else
715 	{
716 		passed_filters = false;
717 		return;
718 	}
719 
720 	while (next_pos > (mask_pos + mask_line.size()) && !mask.eof())
721 	{
722 		getline(mask, line);
723 		line.erase( line.find_last_not_of(" \t") + 1);
724 
725 		if (line[0] == '>')
726 		{
727 			mask_chr = line.substr(1, line.find_first_of(" \t")-1);
728 			mask_pos = 1;
729 			passed_filters = false;
730 			return;
731 		}
732 		else
733 		{
734 			mask_pos += mask_line.size();
735 			mask_line = line;
736 		}
737 	}
738 
739 	if (next_chr == mask_chr && next_pos <= (mask_pos+mask_line.size()))
740 	{
741 		char mask_base = mask_line[next_pos-mask_pos]-48;
742 		bool keep = (mask_base <= min_kept_mask_value);
743 
744 		if (invert_mask == true)
745 			keep = !keep;
746 
747 		if (keep == false)
748 			passed_filters = false;
749 	}
750 	else
751 		passed_filters = false;
752 }
753 
filter_sites_by_number_of_alleles(int min_alleles,int max_alleles)754 void entry::filter_sites_by_number_of_alleles(int min_alleles, int max_alleles)
755 {
756 	// Filter sites by the number of alleles (e.g. 2 for bi-allelic)
757 	if (passed_filters == false)
758 		return;
759 
760 	if ((min_alleles <= 0) && (max_alleles == numeric_limits<int>::max()))
761 		return;
762 
763 	int N_alleles;
764 	parse_basic_entry(true);
765 	N_alleles = get_N_alleles();
766 	if ((N_alleles < min_alleles) || (N_alleles > max_alleles))
767 		passed_filters = false;
768 }
769 
filter_sites_by_frequency_and_call_rate(double min_maf,double max_maf,double min_non_ref_af,double max_non_ref_af,double min_non_ref_af_any,double max_non_ref_af_any,double min_site_call_rate)770 void entry::filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf,
771 		double min_non_ref_af, double max_non_ref_af,
772 		double min_non_ref_af_any, double max_non_ref_af_any,
773 		double min_site_call_rate)
774 {
775 	// Filter sites so that all allele frequencies are between limits
776 	if (passed_filters == false)
777 		return;
778 
779 	if ((min_maf <= 0.0) && (max_maf >= 1.0) &&
780 			(min_site_call_rate <= 0) &&
781 			(min_non_ref_af <= 0.0) && (max_non_ref_af >= 1.0) &&
782 			(min_non_ref_af_any <= 0.0) && (max_non_ref_af_any >= 1.0))
783 		return;
784 
785 	unsigned int N_alleles;
786 	unsigned int N_non_missing_chr;
787 	parse_basic_entry(true);
788 	parse_genotype_entries(true);
789 
790 	if (GT_idx == -1)
791 		LOG.error("Require Genotypes in variant file to filter by frequency and/or call rate");
792 
793 	N_alleles = get_N_alleles();
794 
795 	vector<int> allele_counts;
796 	get_allele_counts(allele_counts, N_non_missing_chr);
797 
798 	double freq, folded_freq;
799 	double maf=numeric_limits<double>::max();
800 	int N_failed = 0;
801 	for (unsigned int ui=0; ui<N_alleles; ui++)
802 	{
803 		freq = allele_counts[ui] / (double)N_non_missing_chr;
804 		folded_freq = min(freq, 1.0 - freq);
805 
806 		maf = min(maf, folded_freq);
807 		if ((ui > 0) && ((freq < min_non_ref_af) || (freq > max_non_ref_af)))
808 			passed_filters = false;
809 
810 		if ((ui > 0) && ((freq < min_non_ref_af_any) || (freq > max_non_ref_af_any)))
811 			N_failed++;
812 	}
813 
814 	if (((min_non_ref_af > 0.0) || (max_non_ref_af < 1.0)) && (N_failed == (N_alleles-1)))
815 		passed_filters = false;
816 
817 	if ((maf < min_maf) || (maf > max_maf))
818 		passed_filters = false;
819 
820 	double call_rate = N_non_missing_chr / double(get_N_chr());
821 
822 	if (call_rate < min_site_call_rate)
823 		passed_filters = false;
824 }
825 
filter_sites_by_allele_type(bool keep_only_indels,bool remove_indels)826 void entry::filter_sites_by_allele_type(bool keep_only_indels, bool remove_indels)
827 {
828 	if (passed_filters == false)
829 		return;
830 
831 	if ((keep_only_indels == false) && (remove_indels == false))
832 		return;
833 	if ((keep_only_indels == true) && (remove_indels == true))
834 		LOG.error("Can't both keep and remove all indels!");
835 
836 	string allele;
837 	unsigned int ref_len, N_alleles;
838 	bool is_indel;
839 
840 	parse_basic_entry(true);
841 	is_indel = false;
842 	allele = REF;
843 	ref_len = allele.size();
844 	if (ref_len != 1)
845 		is_indel = true;
846 	N_alleles = get_N_alleles();
847 	for (unsigned int ui=1; ui<N_alleles; ui++)
848 	{
849 		get_allele(ui, allele);
850 		if (allele.size() != ref_len)
851 		{
852 			is_indel = true;
853 			break;
854 		}
855 	}
856 
857 	if (keep_only_indels == true)
858 	{
859 		if (is_indel == false)
860 			passed_filters = false;
861 	}
862 	else if (remove_indels == true)
863 	{
864 		if (is_indel == true)
865 			passed_filters = false;
866 	}
867 }
868 
filter_sites_by_allele_count(double min_mac,double max_mac,double min_non_ref_ac,double max_non_ref_ac,double min_non_ref_ac_any,double max_non_ref_ac_any,double max_missing_call_count)869 void entry::filter_sites_by_allele_count(double min_mac, double max_mac,
870 		double min_non_ref_ac, double max_non_ref_ac,
871 		double min_non_ref_ac_any, double max_non_ref_ac_any,
872 		double max_missing_call_count)
873 {
874 	// Filter sites so that all allele counts are between limits
875 	if (passed_filters == false)
876 		return;
877 
878 	if ((min_mac <= 0) && (max_mac == numeric_limits<int>::max()) &&
879 			(min_non_ref_ac <= 0) && (max_non_ref_ac == numeric_limits<int>::max()) &&
880 			(min_non_ref_ac_any <= 0) && (max_non_ref_ac_any == numeric_limits<int>::max()) &&
881 			(max_missing_call_count == numeric_limits<int>::max()))
882 		return;
883 
884 	unsigned int N_alleles, N_chr, N_non_missing_chr;
885 	parse_basic_entry(true);
886 	parse_genotype_entries(true);
887 
888 	if (entry_header.has_genotypes == false)
889 		LOG.error("Require Genotypes in variant file to filter by allele counts and/or missing data");
890 
891 	N_alleles = get_N_alleles();
892 
893 	if (N_alleles <= 1 && min_mac > 0)
894 		passed_filters = false;
895 
896 	vector<int> allele_counts;
897 	get_allele_counts(allele_counts, N_non_missing_chr);
898 	N_chr = get_N_chr();
899 
900 	int mac = numeric_limits<int>::max();
901 	int N_failed = 0;
902 	for (unsigned int ui=0; ui<N_alleles; ui++)
903 	{
904 		mac = min(allele_counts[ui], mac);
905 		if ((ui > 0) && ((allele_counts[ui] < min_non_ref_ac) || (allele_counts[ui] > max_non_ref_ac)))
906 			passed_filters = false;
907 
908 		if ((ui > 0) && ((allele_counts[ui] < min_non_ref_ac_any) || (allele_counts[ui] > max_non_ref_ac_any)))
909 			N_failed++;
910 	}
911 
912 	if (((min_non_ref_ac_any > 0) || (max_non_ref_ac_any < numeric_limits<int>::max())) && (N_failed == (N_alleles-1)))
913 		passed_filters = false;
914 
915 	if ((mac < min_mac) || (mac > max_mac))
916 		passed_filters = false;
917 
918 	if ((N_chr-N_non_missing_chr) > max_missing_call_count)
919 		passed_filters = false;
920 }
921 
filter_sites_by_HWE_pvalue(double min_HWE_pvalue)922 void entry::filter_sites_by_HWE_pvalue(double min_HWE_pvalue)
923 {
924 	// Filter sites by HWE p-value
925 	// Note this assumes Biallelic SNPs.
926 	if(passed_filters == false)
927 		return;
928 
929 	if (min_HWE_pvalue <= 0)
930 		return;
931 
932 	unsigned int b11, b12, b22;
933 	double p_hwe, p_lo, p_hi;
934 
935 	parse_basic_entry(true);
936 	parse_genotype_entries(true);
937 
938 	if (entry_header.has_genotypes == false)
939 		LOG.error("Require Genotypes in variant file to filter sites by HWE.");
940 
941 	get_genotype_counts(b11, b12, b22);
942 	entry::SNPHWE(b12, b11, b22, p_hwe, p_lo, p_hi);
943 
944 	if (p_hwe < min_HWE_pvalue)
945 		passed_filters = false;
946 }
947 
filter_sites_by_filter_status(const set<string> & filter_flags_to_remove,const set<string> & filter_flags_to_keep,bool remove_all)948 void entry::filter_sites_by_filter_status(const set<string> &filter_flags_to_remove, const set<string> &filter_flags_to_keep, bool remove_all)
949 {
950 	// Filter sites by entries in the FILTER field.
951 	if (passed_filters == false)
952 		return;
953 
954 	if ((remove_all == false) && (filter_flags_to_remove.empty()) && (filter_flags_to_keep.empty()))
955 		return;
956 
957 	vector<string> FILTERs;
958 	unsigned int N_to_remove = filter_flags_to_remove.size();
959 	unsigned int N_to_keep = filter_flags_to_keep.size();
960 	parse_basic_entry(false, true);
961 	get_FILTER_vector(FILTERs);
962 
963 	if (N_to_keep > 0)
964 	{
965 		bool keep = false;
966 		for (unsigned int ui=0; ui<FILTERs.size(); ui++)
967 			if (filter_flags_to_keep.find(FILTERs[ui]) != filter_flags_to_keep.end())
968 			{
969 				keep = true; break;
970 			}
971 
972 		passed_filters = keep;
973 	}
974 	if (passed_filters==false)
975 		return;
976 
977 	if ( (FILTERs.size() >= 1) && (FILTERs[0] == "PASS") )
978 		return;
979 	else if ((remove_all == true) && (!FILTERs.empty()))
980 		passed_filters = false;
981 	else if (N_to_remove > 0)
982 	{
983 		for (unsigned int ui=0; ui<FILTERs.size(); ui++)
984 			if (filter_flags_to_remove.find(FILTERs[ui]) != filter_flags_to_remove.end())
985 				passed_filters = false;
986 	}
987 }
988 
filter_sites_by_phase()989 void entry::filter_sites_by_phase()
990 {
991 	// Filter out sites with unphased entries
992 	// TODO: Alter this to allow for a max/min level of unphased-ness.
993 	if (passed_filters == false)
994 		return;
995 
996 	unsigned int count_unphased = 0;
997 	for (unsigned int ui=0; ui<N_indv; ui++)
998 	{
999 		if (include_indv[ui] == false)
1000 			continue;
1001 
1002 		parse_genotype_entry(ui, true);
1003 
1004 		if (get_indv_PHASE(ui) != '|')
1005 			count_unphased++;
1006 	}
1007 
1008 	if (count_unphased > 0)
1009 		passed_filters = false;
1010 }
1011 
filter_sites_by_thinning(int min_SNP_distance)1012 void entry::filter_sites_by_thinning(int min_SNP_distance)
1013 {
1014 	// Filter sites so that no two SNPs are within some minimum distance
1015 	if (passed_filters == false)
1016 		return;
1017 
1018 	if (min_SNP_distance < 1)
1019 		return;
1020 
1021 	parse_basic_entry();
1022 	if (CHROM == thin_chrom)
1023 	{
1024 		int distance_from_last_SNP = POS - thin_pos;
1025 		if (distance_from_last_SNP < min_SNP_distance)
1026 			passed_filters = false;
1027 	}
1028 	if (passed_filters == true)
1029 		thin_pos = POS;
1030 	thin_chrom = CHROM;
1031 }
1032 
filter_sites_by_INFO(const set<string> & flags_to_remove,const set<string> & flags_to_keep)1033 void entry::filter_sites_by_INFO(const set<string> &flags_to_remove, const set<string> &flags_to_keep)
1034 {
1035 	// Filter sites by entries in the INFO field.
1036 	if (passed_filters == false)
1037 		return;
1038 
1039 	if ((flags_to_remove.empty()) && (flags_to_keep.empty()))
1040 		return;
1041 
1042 	string value;
1043 	unsigned int N_to_remove = flags_to_remove.size();
1044 	unsigned int N_to_keep = flags_to_keep.size();
1045 
1046 	parse_basic_entry(false, false, true);
1047 
1048 	if (N_to_keep > 0)
1049 	{
1050 		bool keep = false;
1051 		for (set<string>::iterator it=flags_to_keep.begin(); it != flags_to_keep.end(); ++it)
1052 		{
1053 			if (entry_header.INFO_map[ entry_header.INFO_reverse_map[*it] ].Type != Flag)
1054 				LOG.error("Using INFO flag filtering on non flag type "+*it+" will not work correctly.");
1055 			else
1056 			{
1057 				value = get_INFO_value(*it);
1058 				if (value == "1")
1059 					keep = true;
1060 			}
1061 		}
1062 		passed_filters = keep;
1063 	}
1064 
1065 	if (passed_filters==false)
1066 		return;
1067 
1068 	if (N_to_remove > 0)
1069 	{
1070 		for (set<string>::iterator it=flags_to_remove.begin(); it != flags_to_remove.end(); ++it)
1071 		{
1072 			if (entry_header.INFO_map[ entry_header.INFO_reverse_map[*it] ].Type != Flag)
1073 				LOG.error("Using INFO flag filtering on non flag type "+*it+" will not work correctly.");
1074 			else
1075 			{
1076 				value = get_INFO_value(*it);
1077 				if (value == "1")
1078 				{
1079 					passed_filters = false;
1080 					continue;
1081 				}
1082 			}
1083 		}
1084 	}
1085 }
1086