1 /*
2  *  Copyright (C) 2012  Regents of the University of Michigan
3  *
4  *   This program is free software: you can redistribute it and/or modify
5  *   it under the terms of the GNU General Public License as published by
6  *   the Free Software Foundation, either version 3 of the License, or
7  *   (at your option) any later version.
8  *
9  *   This program is distributed in the hope that it will be useful,
10  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *   GNU General Public License for more details.
13  *
14  *   You should have received a copy of the GNU General Public License
15  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "VcfSubsetSamples.h"
19 
reset()20 void VcfSubsetSamples::reset()
21 {
22     mySampleSubsetIndicator.clear();
23     mySampleNames.clear();
24 }
25 
26 
init(const VcfHeader & header,bool include)27 void VcfSubsetSamples::init(const VcfHeader& header, bool include)
28 {
29     // Get the number of samples from the header.
30     unsigned int origNumSamples = header.getNumSamples();
31 
32     // Resize the sampleSubsetIndicator to nothing to clear it out.
33     mySampleSubsetIndicator.resize(0);
34 
35     // Now resize sampleSubsetIndicator to indicate that all of the original
36     // samples are to be kept or not kept based on the include parameter.
37     // mySampleSubsetIndicator is sized to the original number of samples
38     // so it can be used when reading records to determine which ones should
39     // be removed/kept.
40     mySampleSubsetIndicator.resize(origNumSamples, include);
41 
42     // Copy the vector of original sample names.
43     mySampleNames.clear();
44     mySampleNames.resize(origNumSamples);
45     for(unsigned int i = 0; i < origNumSamples; i++)
46     {
47         mySampleNames[i] = header.getSampleName(i);
48     }
49 }
50 
51 
addIncludeSample(const char * sampleName)52 bool VcfSubsetSamples::addIncludeSample(const char* sampleName)
53 {
54     // Look for the sample name.
55     for(unsigned int i = 0; i < mySampleNames.size(); i++)
56     {
57         if(mySampleNames[i] == sampleName)
58         {
59             // Found the sample index.
60             if(mySampleSubsetIndicator.size() <= i)
61             {
62                 // SampleSubsetIndicator not setup properly.
63                 return(false);
64             }
65             mySampleSubsetIndicator[i] = true;
66             return(true);
67         }
68     }
69     // Did not find the sample, so can't include it.
70     return(false);
71 }
72 
73 
addExcludeSample(const char * sampleName)74 bool VcfSubsetSamples::addExcludeSample(const char* sampleName)
75 {
76     // Look for the sample name.
77     for(unsigned int i = 0; i < mySampleNames.size(); i++)
78     {
79         if(mySampleNames[i] == sampleName)
80         {
81             // Found the sample index.
82             if(mySampleSubsetIndicator.size() <= i)
83             {
84                 // SampleSubsetIndicator not setup properly.
85                 return(false);
86             }
87             mySampleSubsetIndicator[i] = false;
88             return(true);
89         }
90     }
91     // Did not find the sample, so can't include it.
92     return(false);
93 }
94 
95 
init(VcfHeader & header,const char * includeFileName,const char * excludeSample,const char * excludeFileName,const char * delims)96 bool VcfSubsetSamples::init(VcfHeader& header,
97                             const char* includeFileName,
98                             const char* excludeSample,
99                             const char* excludeFileName,
100                             const char* delims)
101 {
102     // Setup the sample lists to include/exclude.
103     std::set<std::string> includeList;
104     std::set<std::string> excludeList;
105     if(includeFileName != NULL)
106     {
107         if(!readSamplesFromFile(includeFileName, includeList, delims))
108         {
109             // Failed, so return.
110             return(false);
111         }
112     }
113 
114     if(excludeFileName != NULL)
115     {
116         if(!readSamplesFromFile(excludeFileName, excludeList, delims))
117         {
118             // Failed, so return.
119             return(false);
120         }
121     }
122     if(excludeSample != NULL)
123     {
124         excludeList.insert(excludeSample);
125     }
126 
127     int origNumSamples = header.getNumSamples();
128 
129     // Resize the sampleSubsetIndicator to nothing to clear it out.
130     mySampleSubsetIndicator.resize(0);
131 
132     // Now resize sampleSubsetIndicator to indicate that all of the original
133     // samples are to be kept.  The ones that are not to be kept will be
134     // modified to be unkept (false).
135     // mySampleSubsetIndicator is sized to the original number of samples
136     // so it can be used when reading records to determine which ones should
137     // be removed/kept.
138     mySampleSubsetIndicator.resize(origNumSamples, true);
139 
140     // if no samples, return.
141     if(origNumSamples == 0)
142     {
143         return(true);
144     }
145 
146     // Now that the sample lists to include/exclude are setup and the
147     // indicator vector is setup, subset the header removing samples that
148     // should not be kept (not in the include list if set or in the exclude
149     // list). Loop from the back of the samples to the beginning since
150     // removing samples changes the index of all following samples.
151     for(int i = (origNumSamples-1); i >= 0; i--)
152     {
153         // Check if the sample should be kept.
154         const char* sampleName = header.getSampleName(i);
155         // Remove the sample if the includeList was specified and the sample
156         // was not in it or if the excludeList was specified and the sample
157         // was in it.
158         if((!includeList.empty() &&
159             (includeList.count(sampleName) == 0)) ||
160            (!excludeList.empty() &&
161             (excludeList.count(sampleName) != 0)))
162         {
163             // This sample should be removed.
164             header.removeSample(i);
165             mySampleSubsetIndicator[i] = false;
166         }
167     }
168     return(true);
169 }
170 
171 
keep(unsigned int sampleIndex)172 bool VcfSubsetSamples::keep(unsigned int sampleIndex)
173 {
174     if(sampleIndex >= mySampleSubsetIndicator.size())
175     {
176         // index out of range.
177         return(false);
178     }
179     return(mySampleSubsetIndicator[sampleIndex]);
180 }
181 
182 
readSamplesFromFile(const char * fileName,std::set<std::string> & sampleList,const char * delims)183 bool VcfSubsetSamples::readSamplesFromFile(const char* fileName,
184                                            std::set<std::string>& sampleList,
185                                            const char* delims)
186 {
187     // Open the file.
188     IFILE sampleFile = ifopen(fileName, "r");
189 
190     if(sampleFile == NULL)
191     {
192         // Failed to open.
193         return(false);
194     }
195 
196     // read the file.
197     std::string tempString;
198 
199     std::string delimString = delims;
200     delimString += '\n';
201 
202     int readResult = 0;
203     while(readResult != -1)
204     {
205         readResult = sampleFile->readTilChar(delimString, tempString);
206 
207         // Check to see if something was read (tempString is not empty).
208         if(!tempString.empty())
209         {
210             // sample name found, so add it to the container.
211             sampleList.insert(tempString);
212         }
213         // Clear the string being read into.
214         tempString.clear();
215     }
216     return(true);
217 }
218