1
2 /*
3
4 This program was created at: Tue Sep 8 21:05:23 2015
5 This program was created by: Zev N. Kronenberg
6
7
8 Contact: zev.kronenber@gmail.com
9
10 Organization: Unviersity of Utah
11 School of Medicine
12 Salt Lake City, Utah
13
14
15 The MIT License (MIT)
16
17 Copyright (c) <2015> <Zev N. Kronenberg>
18
19 Permission is hereby granted, free of charge, to any person obtaining a copy
20 of this software and associated documentation files (the "Software"), to deal
21 in the Software without restriction, including without limitation the rights
22 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
23 copies of the Software, and to permit persons to whom the Software is
24 furnished to do so, subject to the following conditions:
25
26 The above copyright notice and this permission notice shall be included in
27 all copies or substantial portions of the Software.
28
29 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
34 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
35 THE SOFTWARE.
36
37
38 */
39
40 #include <string>
41 #include <iostream>
42 #include <fstream>
43 #include <math.h>
44 #include <cmath>
45 #include <stdlib.h>
46 #include <time.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <algorithm>
50 #include "split.h"
51 #include "gpatInfo.hpp"
52
53 struct options{
54 std::string file;
55 double cut;
56 }globalOpts;
57
58 using namespace std;
59
60 static const char *optString = "hf:s:";
61
printHelp(void)62 void printHelp(void){
63
64 cerr << endl << endl;
65 cerr << "INFO: help" << endl;
66 cerr << "INFO: description:" << endl;
67 cerr << " Creates genomic segments (bed file) for regions with high wcFst " << endl;
68
69 cerr << "Output : 8 columns : " << endl;
70 cerr << " 1. Seqid " << endl;
71 cerr << " 2. Start (zero based) " << endl;
72 cerr << " 3. End (zero based) " << endl;
73 cerr << " 4. Average Fst " << endl;
74 cerr << " 5. Average high Fst (Fst > -s) " << endl;
75 cerr << " 6. N Fst values in segment " << endl;
76 cerr << " 7. N high fst values in segment " << endl;
77 cerr << " 8. Segment length " << endl;
78
79 cerr << "INFO: usage: segmentFst -s 0.7 -f wcFst.output.txt " << endl;
80 cerr << endl;
81 cerr << "INFO: required: -f -- Output from wcFst " << endl;
82 cerr << "INFO: optional: -s -- High Fst cutoff [0.8] " << endl;
83
84 cerr << endl;
85
86 printVersion();
87 }
88
89 //------------------------------- OPTIONS --------------------------------
parseOpts(int argc,char ** argv)90 int parseOpts(int argc, char** argv)
91 {
92 int opt = 0;
93 globalOpts.file = "NA";
94 opt = getopt(argc, argv, optString);
95 while(opt != -1){
96 switch(opt){
97 case 's':
98 {
99 string op = optarg;
100 globalOpts.cut = atof(op.c_str());
101 break;
102 }
103 case 'h':
104 {
105 printHelp();
106 exit(1);
107 break;
108 }
109
110 case 'f':
111 {
112 globalOpts.file = optarg;
113 break;
114 }
115 case '?':
116 {
117 break;
118 }
119 }
120 opt = getopt( argc, argv, optString );
121 }
122 return 1;
123 }
124 //------------------------------- SUBROUTINE --------------------------------
125 /*
126 Function input : The vector of values
127 Function does : Finds the bounds of the window
128 Function returns: bool
129 */
130
findWindow(vector<double> & values,int * begin,int * end)131 bool findWindow(vector<double> & values,
132 int * begin ,
133 int * end ){
134
135 *begin -= 1;
136 *end += 1;
137
138 if(*begin < 0){
139 return false;
140 }
141 if(*end >= values.size()){
142 return false;
143 }
144
145 int tmpHigh = 0;
146 int tmpLow = 0;
147
148 for(int index = *begin; index <= *end; index++){
149 if(values[index] > globalOpts.cut){
150 tmpHigh += 1;
151 }
152 else{
153 tmpLow += 1;
154 }
155 }
156 if((tmpHigh*2) < tmpLow){
157 return false;
158 }
159 return true;
160 }
161
162 //------------------------------- SUBROUTINE --------------------------------
163 /*
164 Function input : takes the sorted Fst scores, positions, and seqids
165 Function does : segments and prints bed
166 Function returns:
167
168 */
169
170
process(vector<int> & pos,vector<double> & value,vector<string> & seqid)171 void process(vector<int> & pos, vector<double> & value, vector<string> & seqid)
172 {
173
174 // i is the index of the outter loop/aka variant sites.
175 // always start the seed with 9 SNPs the window grows to 10 in "growWindow"
176 for(int i = 9; i < value.size()-9; i++){
177
178 int begin = i -9;
179 int end = i +9;
180
181 bool anyGroth = false;
182
183 while(findWindow(value, &begin, &end)){
184 anyGroth = true;
185 }
186 // the current window had an extention
187 if(anyGroth){
188 // reset the index beyond current window
189
190 i = end + 1;
191
192 if(begin < 0){
193 begin = 0;
194 }
195 if(end >= value.size()){
196 end = (value.size() - 1);
197 }
198
199 int nHigh = 0;
200 int nLow = 0;
201
202 double HighSum = 0;
203 double LowSum = 0;
204
205 for(long int lb = begin ; lb <= end; lb++){
206 if( value[lb] >= globalOpts.cut ){
207 nHigh += 1;
208 HighSum += value[lb];
209 }
210 else{
211 nLow +=1;
212 LowSum += value[lb];
213 }
214 }
215
216
217 double avgFstHigh = HighSum / double(nHigh);
218 double avgFst = (HighSum + LowSum) / (double(nHigh)+double(nLow));
219
220
221 cout << seqid[begin] << "\t"
222 << pos[begin] -1 << "\t"
223 << pos[end] -1 << "\t"
224 << avgFst << "\t"
225 << avgFstHigh << "\t"
226 << nHigh + nLow << "\t"
227 << nHigh << "\t"
228 << (pos[end] - pos[begin])
229 << endl;
230
231 }
232 }
233 }
234
235 //------------------------------- MAIN --------------------------------
236 /*
237 Comments:
238 */
239
main(int argc,char ** argv)240 int main( int argc, char** argv)
241 {
242 globalOpts.cut = 0.8;
243 int parse = parseOpts(argc, argv);
244
245 string last;
246 int lastPos = 0;
247
248 vector<string> seqid;
249 vector<int> pos;
250 vector<double> value;
251
252 if(globalOpts.file.empty()){
253 printHelp();
254 exit(1);
255 }
256
257 string line;
258 ifstream myfile (globalOpts.file);
259 if (myfile.is_open())
260 {
261 while ( getline (myfile,line) )
262 {
263 vector<string> lineDat = split(line, '\t');
264 if(lineDat.size() != 5){
265 cerr << "FATAL: not valid wcFst input" << endl;
266 exit(1);
267 }
268 if(last.compare(lineDat[0]) != 0){
269 last = lineDat[0];
270 process(pos, value, seqid);
271 pos.clear();
272 value.clear();
273 seqid.clear();
274 lastPos = 0;
275 }
276 else{
277 if(atoi(lineDat[1].c_str()) < lastPos){
278 cerr << "FATAL: wcFst input must be sorted by position." << endl;
279 exit(1);
280 }
281 lastPos = atoi(lineDat[1].c_str());
282 seqid.push_back(lineDat[0]);
283 pos.push_back(atoi(lineDat[1].c_str()));
284 value.push_back(atof(lineDat[4].c_str()));
285 }
286 }
287
288 std::cerr << "INFO: about to segment: " << pos.size() << " scores." << std::endl;
289 process(pos, value, seqid);
290 myfile.close();
291 }
292 else{
293 cerr << "FATAL: could not open file: " << globalOpts.file << endl;
294 exit(1);
295 }
296 return 0;
297 }
298