1 /*
2 * Copyright (C) 2012 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include "VcfSubsetSamples.h"
19
reset()20 void VcfSubsetSamples::reset()
21 {
22 mySampleSubsetIndicator.clear();
23 mySampleNames.clear();
24 }
25
26
init(const VcfHeader & header,bool include)27 void VcfSubsetSamples::init(const VcfHeader& header, bool include)
28 {
29 // Get the number of samples from the header.
30 unsigned int origNumSamples = header.getNumSamples();
31
32 // Resize the sampleSubsetIndicator to nothing to clear it out.
33 mySampleSubsetIndicator.resize(0);
34
35 // Now resize sampleSubsetIndicator to indicate that all of the original
36 // samples are to be kept or not kept based on the include parameter.
37 // mySampleSubsetIndicator is sized to the original number of samples
38 // so it can be used when reading records to determine which ones should
39 // be removed/kept.
40 mySampleSubsetIndicator.resize(origNumSamples, include);
41
42 // Copy the vector of original sample names.
43 mySampleNames.clear();
44 mySampleNames.resize(origNumSamples);
45 for(unsigned int i = 0; i < origNumSamples; i++)
46 {
47 mySampleNames[i] = header.getSampleName(i);
48 }
49 }
50
51
addIncludeSample(const char * sampleName)52 bool VcfSubsetSamples::addIncludeSample(const char* sampleName)
53 {
54 // Look for the sample name.
55 for(unsigned int i = 0; i < mySampleNames.size(); i++)
56 {
57 if(mySampleNames[i] == sampleName)
58 {
59 // Found the sample index.
60 if(mySampleSubsetIndicator.size() <= i)
61 {
62 // SampleSubsetIndicator not setup properly.
63 return(false);
64 }
65 mySampleSubsetIndicator[i] = true;
66 return(true);
67 }
68 }
69 // Did not find the sample, so can't include it.
70 return(false);
71 }
72
73
addExcludeSample(const char * sampleName)74 bool VcfSubsetSamples::addExcludeSample(const char* sampleName)
75 {
76 // Look for the sample name.
77 for(unsigned int i = 0; i < mySampleNames.size(); i++)
78 {
79 if(mySampleNames[i] == sampleName)
80 {
81 // Found the sample index.
82 if(mySampleSubsetIndicator.size() <= i)
83 {
84 // SampleSubsetIndicator not setup properly.
85 return(false);
86 }
87 mySampleSubsetIndicator[i] = false;
88 return(true);
89 }
90 }
91 // Did not find the sample, so can't include it.
92 return(false);
93 }
94
95
init(VcfHeader & header,const char * includeFileName,const char * excludeSample,const char * excludeFileName,const char * delims)96 bool VcfSubsetSamples::init(VcfHeader& header,
97 const char* includeFileName,
98 const char* excludeSample,
99 const char* excludeFileName,
100 const char* delims)
101 {
102 // Setup the sample lists to include/exclude.
103 std::set<std::string> includeList;
104 std::set<std::string> excludeList;
105 if(includeFileName != NULL)
106 {
107 if(!readSamplesFromFile(includeFileName, includeList, delims))
108 {
109 // Failed, so return.
110 return(false);
111 }
112 }
113
114 if(excludeFileName != NULL)
115 {
116 if(!readSamplesFromFile(excludeFileName, excludeList, delims))
117 {
118 // Failed, so return.
119 return(false);
120 }
121 }
122 if(excludeSample != NULL)
123 {
124 excludeList.insert(excludeSample);
125 }
126
127 int origNumSamples = header.getNumSamples();
128
129 // Resize the sampleSubsetIndicator to nothing to clear it out.
130 mySampleSubsetIndicator.resize(0);
131
132 // Now resize sampleSubsetIndicator to indicate that all of the original
133 // samples are to be kept. The ones that are not to be kept will be
134 // modified to be unkept (false).
135 // mySampleSubsetIndicator is sized to the original number of samples
136 // so it can be used when reading records to determine which ones should
137 // be removed/kept.
138 mySampleSubsetIndicator.resize(origNumSamples, true);
139
140 // if no samples, return.
141 if(origNumSamples == 0)
142 {
143 return(true);
144 }
145
146 // Now that the sample lists to include/exclude are setup and the
147 // indicator vector is setup, subset the header removing samples that
148 // should not be kept (not in the include list if set or in the exclude
149 // list). Loop from the back of the samples to the beginning since
150 // removing samples changes the index of all following samples.
151 for(int i = (origNumSamples-1); i >= 0; i--)
152 {
153 // Check if the sample should be kept.
154 const char* sampleName = header.getSampleName(i);
155 // Remove the sample if the includeList was specified and the sample
156 // was not in it or if the excludeList was specified and the sample
157 // was in it.
158 if((!includeList.empty() &&
159 (includeList.count(sampleName) == 0)) ||
160 (!excludeList.empty() &&
161 (excludeList.count(sampleName) != 0)))
162 {
163 // This sample should be removed.
164 header.removeSample(i);
165 mySampleSubsetIndicator[i] = false;
166 }
167 }
168 return(true);
169 }
170
171
keep(unsigned int sampleIndex)172 bool VcfSubsetSamples::keep(unsigned int sampleIndex)
173 {
174 if(sampleIndex >= mySampleSubsetIndicator.size())
175 {
176 // index out of range.
177 return(false);
178 }
179 return(mySampleSubsetIndicator[sampleIndex]);
180 }
181
182
readSamplesFromFile(const char * fileName,std::set<std::string> & sampleList,const char * delims)183 bool VcfSubsetSamples::readSamplesFromFile(const char* fileName,
184 std::set<std::string>& sampleList,
185 const char* delims)
186 {
187 // Open the file.
188 IFILE sampleFile = ifopen(fileName, "r");
189
190 if(sampleFile == NULL)
191 {
192 // Failed to open.
193 return(false);
194 }
195
196 // read the file.
197 std::string tempString;
198
199 std::string delimString = delims;
200 delimString += '\n';
201
202 int readResult = 0;
203 while(readResult != -1)
204 {
205 readResult = sampleFile->readTilChar(delimString, tempString);
206
207 // Check to see if something was read (tempString is not empty).
208 if(!tempString.empty())
209 {
210 // sample name found, so add it to the container.
211 sampleList.insert(tempString);
212 }
213 // Clear the string being read into.
214 tempString.clear();
215 }
216 return(true);
217 }
218