1 /*
2  * variant_file_filters.cpp
3  *
4  *      Author: amarcketta
5  */
6 
7 #include "variant_file.h"
8 
apply_filters(const parameters & params)9 void variant_file::apply_filters(const parameters &params)
10 {
11 	filter_individuals(params.indv_to_keep, params.indv_to_exclude, params.indv_keep_files, params.indv_exclude_files);
12 	filter_individuals_randomly(params.max_N_indv);
13 }
14 
filter_individuals(const set<string> & indv_to_keep,const set<string> & indv_to_exclude,const vector<string> & indv_to_keep_filenames,const vector<string> & indv_to_exclude_filenames,bool keep_then_exclude)15 void variant_file::filter_individuals(const set<string> &indv_to_keep, const set<string> &indv_to_exclude, const vector<string> &indv_to_keep_filenames, const vector<string> &indv_to_exclude_filenames, bool keep_then_exclude)
16 {
17 	// Filter individuals by user provided lists
18 	if (keep_then_exclude)
19 	{
20 		filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filenames);
21 		filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filenames);
22 	}
23 	else
24 	{
25 		filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filenames);
26 		filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filenames);
27 	}
28 }
29 
filter_individuals_by_keep_list(const set<string> & indv_to_keep,const vector<string> & indv_to_keep_filenames)30 void variant_file::filter_individuals_by_keep_list(const set<string> &indv_to_keep, const vector<string> &indv_to_keep_filenames)
31 {
32 	// Filter individuals by user provided list
33 	if ((indv_to_keep_filenames.size() == 0) && (indv_to_keep.size() == 0))
34 		return;
35 
36 	LOG.printLOG("Keeping individuals in 'keep' list\n");
37 	set<string> indv_to_keep_copy = indv_to_keep;
38 	if (indv_to_keep_filenames.size() != 0)
39 	{
40 		for (unsigned int ui=0; ui<indv_to_keep_filenames.size(); ui++)
41 		{
42 			ifstream infile(indv_to_keep_filenames[ui].c_str());
43 			if (!infile.is_open())
44 				LOG.error("Could not open Individual file:" + indv_to_keep_filenames[ui], 1);
45 			string line;
46 			string tmp_indv;
47 			stringstream ss;
48 			while (!infile.eof())
49 			{
50 				getline(infile, line);
51 				ss.str(line);
52 				ss >> tmp_indv;
53 				indv_to_keep_copy.insert(tmp_indv);
54 				ss.clear();
55 			}
56 			infile.close();
57 		}
58 	}
59 	for (unsigned int ui=0; ui<include_indv.size(); ui++)
60 	{
61 		if (include_indv[ui] == false)
62 			continue;
63 		if (indv_to_keep_copy.find(meta_data.indv[ui]) == indv_to_keep_copy.end())
64 			include_indv[ui] = false;
65 	}
66 }
67 
filter_individuals_by_exclude_list(const set<string> & indv_to_exclude,const vector<string> & indv_to_exclude_filenames)68 void variant_file::filter_individuals_by_exclude_list(const set<string> &indv_to_exclude, const vector<string> &indv_to_exclude_filenames)
69 {
70 	// Filter individuals by user provided list
71 	if ((indv_to_exclude_filenames.size() == 0) && (indv_to_exclude.size() == 0))
72 		return;
73 	LOG.printLOG("Excluding individuals in 'exclude' list\n");
74 	set<string> indv_to_exclude_copy = indv_to_exclude;
75 	if (indv_to_exclude_filenames.size() != 0)
76 	{
77 		for (unsigned int ui=0; ui<indv_to_exclude_filenames.size(); ui++)
78 		{
79 			ifstream infile(indv_to_exclude_filenames[ui].c_str());
80 			if (!infile.is_open())
81 				LOG.error("Could not open Individual file:" + indv_to_exclude_filenames[ui], 1);
82 			string line;
83 			string tmp_indv;
84 			stringstream ss;
85 			while (!infile.eof())
86 			{
87 				getline(infile, line);
88 				ss.str(line);
89 				ss >> tmp_indv;
90 				indv_to_exclude_copy.insert(tmp_indv);
91 				ss.clear();
92 			}
93 			infile.close();
94 		}
95 	}
96 	for (unsigned int ui=0; ui<include_indv.size(); ui++)
97 	{
98 		if (include_indv[ui] == false)
99 			continue;
100 		if (indv_to_exclude_copy.find(meta_data.indv[ui]) != indv_to_exclude_copy.end())
101 			include_indv[ui] = false;
102 	}
103 }
104 
filter_individuals_randomly(int max_N_indv)105 void variant_file::filter_individuals_randomly(int max_N_indv)
106 {
107 	// Filter individuals randomly until have a random subset
108 	if (max_N_indv < 0)
109 		return;
110 	LOG.printLOG("Filtering Individuals Randomly\n");
111 
112 	if (meta_data.has_genotypes == false)
113 		LOG.error("Require Genotypes in variant file filter individuals.");
114 
115 	unsigned int N_kept_indv = N_kept_individuals();
116 
117 	srand ( time(NULL) );
118 	vector<unsigned int> keep_index(N_kept_indv);
119 	int count = 0;
120 	for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
121 	{
122 		if (include_indv[ui] == true)
123 		{
124 			keep_index[count] = ui;
125 			count++;
126 		}
127 	}
128 
129 	random_shuffle(keep_index.begin(), keep_index.end());			// Get a random order
130 	keep_index.resize(min(max_N_indv, (signed)keep_index.size()));	// Only keep a subset
131 
132 	for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
133 	{
134 		if (include_indv[ui] == false)
135 			continue;
136 		bool found = false;
137 		for (unsigned int uj=0; uj<keep_index.size(); uj++)
138 		{
139 			if (keep_index[uj] == ui)
140 			{
141 				found = true;
142 			}
143 		}
144 		if (found == false)
145 			include_indv[ui] = false;
146 	}
147 }
148