1 /*
2 vcflib C++ library for parsing and manipulating VCF files
3
4 Copyright © 2010-2020 Erik Garrison
5 Copyright © 2020 Pjotr Prins
6
7 This software is published under the MIT License. See the LICENSE file.
8 */
9
10
11 /*
12
13 This program was created at: Tue Sep 8 21:05:23 2015
14 This program was created by: Zev N. Kronenberg
15
16
17 Contact: zev.kronenber@gmail.com
18
19 Organization: Unviersity of Utah
20 School of Medicine
21 Salt Lake City, Utah
22
23
24 The MIT License (MIT)
25
26 Copyright (c) <2015> <Zev N. Kronenberg>
27
28 Permission is hereby granted, free of charge, to any person obtaining a copy
29 of this software and associated documentation files (the "Software"), to deal
30 in the Software without restriction, including without limitation the rights
31 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
32 copies of the Software, and to permit persons to whom the Software is
33 furnished to do so, subject to the following conditions:
34
35 The above copyright notice and this permission notice shall be included in
36 all copies or substantial portions of the Software.
37
38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
44 THE SOFTWARE.
45
46
47 */
48
49 #include <string>
50 #include <iostream>
51 #include <fstream>
52 #include <math.h>
53 #include <cmath>
54 #include <stdlib.h>
55 #include <time.h>
56 #include <stdio.h>
57 #include <unistd.h>
58 #include <algorithm>
59 #include "split.h"
60 #include "gpatInfo.hpp"
61
62 struct options{
63 std::string file;
64 double cut;
65 }globalOpts;
66
67 using namespace std;
68
69 static const char *optString = "hf:s:";
70
printHelp(void)71 void printHelp(void){
72
73 cerr << endl << endl;
74 cerr << "INFO: help" << endl;
75 cerr << "INFO: description:" << endl;
76 cerr << "segmentFst creates genomic segments (bed file) for regions with high wcFst" << endl << endl ;
77 cerr << "segmentFst provides a way to find continious regions with high Fst values. It takes the output of wcFst and produces a BED file. These high Fst region can be permutated with 'permuteGPATwindow'" << endl ;
78
79 cerr << "Output : 8 columns : " << endl;
80 cerr << " 1. Seqid " << endl;
81 cerr << " 2. Start (zero based) " << endl;
82 cerr << " 3. End (zero based) " << endl;
83 cerr << " 4. Average Fst " << endl;
84 cerr << " 5. Average high Fst (Fst > -s) " << endl;
85 cerr << " 6. N Fst values in segment " << endl;
86 cerr << " 7. N high fst values in segment " << endl;
87 cerr << " 8. Segment length " << endl;
88
89 cerr << "INFO: usage: segmentFst -s 0.7 -f wcFst.output.txt " << endl;
90 cerr << endl;
91 cerr << "INFO: required: -f -- Output from wcFst " << endl;
92 cerr << "INFO: optional: -s -- High Fst cutoff [0.8] " << endl;
93 cerr << endl << "Type: statistics" << endl << endl;
94 cerr << endl;
95
96 printVersion();
97 }
98
99 //------------------------------- OPTIONS --------------------------------
parseOpts(int argc,char ** argv)100 int parseOpts(int argc, char** argv)
101 {
102 int opt = 0;
103 globalOpts.file = "NA";
104 opt = getopt(argc, argv, optString);
105 while(opt != -1){
106 switch(opt){
107 case 's':
108 {
109 string op = optarg;
110 globalOpts.cut = atof(op.c_str());
111 break;
112 }
113 case 'h':
114 {
115 printHelp();
116 exit(1);
117 break;
118 }
119
120 case 'f':
121 {
122 globalOpts.file = optarg;
123 break;
124 }
125 case '?':
126 {
127 break;
128 }
129 }
130 opt = getopt( argc, argv, optString );
131 }
132 return 1;
133 }
134 //------------------------------- SUBROUTINE --------------------------------
135 /*
136 Function input : The vector of values
137 Function does : Finds the bounds of the window
138 Function returns: bool
139 */
140
findWindow(vector<double> & values,int * begin,int * end)141 bool findWindow(vector<double> & values,
142 int * begin ,
143 int * end ){
144
145 *begin -= 1;
146 *end += 1;
147
148 if(*begin < 0){
149 return false;
150 }
151 if(*end >= values.size()){
152 return false;
153 }
154
155 int tmpHigh = 0;
156 int tmpLow = 0;
157
158 for(int index = *begin; index <= *end; index++){
159 if(values[index] > globalOpts.cut){
160 tmpHigh += 1;
161 }
162 else{
163 tmpLow += 1;
164 }
165 }
166 if((tmpHigh*2) < tmpLow){
167 return false;
168 }
169 return true;
170 }
171
172 //------------------------------- SUBROUTINE --------------------------------
173 /*
174 Function input : takes the sorted Fst scores, positions, and seqids
175 Function does : segments and prints bed
176 Function returns:
177
178 */
179
180
process(vector<int> & pos,vector<double> & value,vector<string> & seqid)181 void process(vector<int> & pos, vector<double> & value, vector<string> & seqid)
182 {
183
184 // i is the index of the outter loop/aka variant sites.
185 // always start the seed with 9 SNPs the window grows to 10 in "growWindow"
186 for(int i = 9; i < value.size()-9; i++){
187
188 int begin = i -9;
189 int end = i +9;
190
191 bool anyGroth = false;
192
193 while(findWindow(value, &begin, &end)){
194 anyGroth = true;
195 }
196 // the current window had an extention
197 if(anyGroth){
198 // reset the index beyond current window
199
200 i = end + 1;
201
202 if(begin < 0){
203 begin = 0;
204 }
205 if(end >= value.size()){
206 end = (value.size() - 1);
207 }
208
209 int nHigh = 0;
210 int nLow = 0;
211
212 double HighSum = 0;
213 double LowSum = 0;
214
215 for(long int lb = begin ; lb <= end; lb++){
216 if( value[lb] >= globalOpts.cut ){
217 nHigh += 1;
218 HighSum += value[lb];
219 }
220 else{
221 nLow +=1;
222 LowSum += value[lb];
223 }
224 }
225
226
227 double avgFstHigh = HighSum / double(nHigh);
228 double avgFst = (HighSum + LowSum) / (double(nHigh)+double(nLow));
229
230
231 cout << seqid[begin] << "\t"
232 << pos[begin] -1 << "\t"
233 << pos[end] -1 << "\t"
234 << avgFst << "\t"
235 << avgFstHigh << "\t"
236 << nHigh + nLow << "\t"
237 << nHigh << "\t"
238 << (pos[end] - pos[begin])
239 << endl;
240
241 }
242 }
243 }
244
245 //------------------------------- MAIN --------------------------------
246 /*
247 Comments:
248 */
249
main(int argc,char ** argv)250 int main( int argc, char** argv)
251 {
252 globalOpts.cut = 0.8;
253 int parse = parseOpts(argc, argv);
254
255 string last;
256 int lastPos = 0;
257
258 vector<string> seqid;
259 vector<int> pos;
260 vector<double> value;
261
262 if(globalOpts.file.empty()){
263 printHelp();
264 exit(1);
265 }
266
267 string line;
268 ifstream myfile (globalOpts.file);
269 if (myfile.is_open())
270 {
271 while ( getline (myfile,line) )
272 {
273 vector<string> lineDat = split(line, '\t');
274 if(lineDat.size() != 5){
275 cerr << "FATAL: not valid wcFst input" << endl;
276 exit(1);
277 }
278 if(last.compare(lineDat[0]) != 0){
279 last = lineDat[0];
280 process(pos, value, seqid);
281 pos.clear();
282 value.clear();
283 seqid.clear();
284 lastPos = 0;
285 }
286 else{
287 if(atoi(lineDat[1].c_str()) < lastPos){
288 cerr << "FATAL: wcFst input must be sorted by position." << endl;
289 exit(1);
290 }
291 lastPos = atoi(lineDat[1].c_str());
292 seqid.push_back(lineDat[0]);
293 pos.push_back(atoi(lineDat[1].c_str()));
294 value.push_back(atof(lineDat[4].c_str()));
295 }
296 }
297
298 std::cerr << "INFO: about to segment: " << pos.size() << " scores." << std::endl;
299 process(pos, value, seqid);
300 myfile.close();
301 }
302 else{
303 cerr << "FATAL: could not open file: " << globalOpts.file << endl;
304 exit(1);
305 }
306 return 0;
307 }
308