1
2 /*
3
4 This program was created at: Tue Sep 8 21:05:23 2015
5 This program was created by: Zev N. Kronenberg
6
7
8 Contact: zev.kronenber@gmail.com
9
10 Organization: Unviersity of Utah
11 School of Medicine
12 Salt Lake City, Utah
13
14
15 The MIT License (MIT)
16
17 Copyright (c) <2015> <Zev N. Kronenberg>
18
19 Permission is hereby granted, free of charge, to any person obtaining a copy
20 of this software and associated documentation files (the "Software"), to deal
21 in the Software without restriction, including without limitation the rights
22 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
23 copies of the Software, and to permit persons to whom the Software is
24 furnished to do so, subject to the following conditions:
25
26 The above copyright notice and this permission notice shall be included in
27 all copies or substantial portions of the Software.
28
29 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
34 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
35 THE SOFTWARE.
36
37
38 */
39
40 #include <string>
41 #include <iostream>
42 #include <fstream>
43 #include <math.h>
44 #include <cmath>
45 #include <stdlib.h>
46 #include <time.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <algorithm>
50 #include "split.h"
51 #include "gpatInfo.hpp"
52
53 struct options{
54 std::string file;
55 double cut;
56 }globalOpts;
57
58 using namespace std;
59
60 static const char *optString = "hf:s:";
61
printHelp(void)62 void printHelp(void){
63
64 cerr << endl << endl;
65 cerr << "INFO: help" << endl;
66 cerr << "INFO: description:" << endl;
67 cerr << " Creates genomic segments (bed file) for regions with high wcFst " << endl;
68
69 cerr << "Output : 8 columns : " << endl;
70 cerr << " 1. Seqid " << endl;
71 cerr << " 2. Start (zero based) " << endl;
72 cerr << " 3. End (zero based) " << endl;
73 cerr << " 4. Average iHS " << endl;
74 cerr << " 5. Average high Fst (iHS > -s) " << endl;
75 cerr << " 6. N iHS values in segment " << endl;
76 cerr << " 7. N high iHS values in segment " << endl;
77 cerr << " 8. Segment length " << endl;
78
79 cerr << "INFO: usage: segmentFst -s 2 -f iHS.normalized.output.txt " << endl;
80 cerr << endl;
81 cerr << "INFO: required: -f -- Output from normalizeIHS " << endl;
82 cerr << "INFO: optional: -s -- High absolute iHS cutoff [2] " << endl;
83
84 cerr << endl;
85
86 printVersion();
87 }
88
89 //------------------------------- OPTIONS --------------------------------
parseOpts(int argc,char ** argv)90 int parseOpts(int argc, char** argv)
91 {
92 int opt = 0;
93 globalOpts.file = "NA";
94 opt = getopt(argc, argv, optString);
95 while(opt != -1){
96 switch(opt){
97 case 's':
98 {
99 string op = optarg;
100 globalOpts.cut = atof(op.c_str());
101 break;
102 }
103 case 'h':
104 {
105 printHelp();
106 exit(1);
107 break;
108 }
109
110 case 'f':
111 {
112 globalOpts.file = optarg;
113 break;
114 }
115 case '?':
116 {
117 break;
118 }
119 }
120 opt = getopt( argc, argv, optString );
121 }
122 return 1;
123 }
124 //------------------------------- SUBROUTINE --------------------------------
125 /*
126 Function input :
127
128 Function does :
129
130 Function returns:
131
132 */
133
growWindow(vector<double> & values,int * begin,int * end,int * nhigh,int * nlow,double * hSum,double * lSum)134 bool growWindow(vector<double> & values,
135 int * begin ,
136 int * end ,
137 int * nhigh ,
138 int * nlow ,
139 double * hSum ,
140 double * lSum ){
141
142 *begin -= 1;
143 *end += 1;
144
145 if(*begin < 0){
146 return false;
147 }
148 if(*end >= values.size()){
149 return false;
150 }
151
152 for(int index = *begin; index <= *end; index++){
153 if(values[index] > globalOpts.cut){
154 *nhigh += 1;
155 *hSum += values[index];
156 }
157 else{
158 *nlow += 1;
159 *lSum += values[index];
160 }
161 }
162 if((*nhigh)*2 < (*nlow)){
163 return false;
164 }
165 return true;
166 }
167
168 //------------------------------- SUBROUTINE --------------------------------
169 /*
170 Function input : takes the sorted Fst scores, positions, and seqids
171
172 Function does : segments and prints bed
173
174 Function returns:
175
176 */
177
178
process(vector<int> & pos,vector<double> & value,vector<string> & seqid)179 void process(vector<int> & pos, vector<double> & value, vector<string> & seqid)
180 {
181
182
183 // i is the index of the outter loop/aka variant sites.
184 // always start the seed with 9 SNPs the window grows to 10 in "growWindow"
185 for(int i = 9; i < value.size()-9; i++){
186
187 int begin = i -9;
188 int end = i +9;
189
190 int nHigh = 0;
191 int nLow = 0;
192
193 double HighSum = 0;
194 double LowSum = 0;
195
196 bool anyGroth = false;
197
198 while(growWindow(value, &begin, &end,
199 &nHigh, &nLow, &HighSum, &LowSum)){
200 anyGroth = true;
201 }
202 // the current window had an extention
203 if(anyGroth){
204 // reset the index beyond current window
205 i = end + 1;
206
207 if(begin < 0){
208 begin = 0;
209 }
210 if(end >= value.size()){
211 end = (value.size() - 1);
212 }
213
214 double avgFstHigh = HighSum / double(nHigh);
215 double avgFst = (HighSum + LowSum) / (double(nHigh)+double(nLow));
216
217
218
219 cout << seqid[begin] << "\t"
220 << pos[begin] -1 << "\t"
221 << pos[end] -1 << "\t"
222 << avgFst << "\t"
223 << avgFstHigh << "\t"
224 << nHigh + nLow << "\t"
225 << nHigh << "\t"
226 << (pos[end] - pos[begin])
227 << endl;
228
229 }
230 }
231 }
232
233 //------------------------------- MAIN --------------------------------
234 /*
235 Comments:
236 */
237
main(int argc,char ** argv)238 int main( int argc, char** argv)
239 {
240 globalOpts.cut = 2;
241 int parse = parseOpts(argc, argv);
242
243 string last;
244 int lastPos = 0;
245
246 vector<string> seqid;
247 vector<int> pos;
248 vector<double> value;
249
250 if(globalOpts.file.empty()){
251 printHelp();
252 exit(1);
253 }
254
255 string line;
256 ifstream myfile (globalOpts.file);
257 if (myfile.is_open())
258 {
259 while ( getline (myfile,line) )
260 {
261 vector<string> lineDat = split(line, '\t');
262 if(lineDat.size() != 9){
263 cerr << "FATAL: not valid normalized iHS file" << endl;
264 exit(1);
265 }
266 if(last.compare(lineDat[0]) != 0){
267 last = lineDat[0];
268 process(pos, value, seqid);
269 pos.clear();
270 value.clear();
271 seqid.clear();
272 lastPos = 0;
273 }
274 else{
275 if(atoi(lineDat[1].c_str()) < lastPos){
276 cerr << "FATAL: normalized iHS input must be sorted by position." << endl;
277 exit(1);
278 }
279 lastPos = atoi(lineDat[1].c_str());
280 seqid.push_back(lineDat[0]);
281 pos.push_back(atoi(lineDat[1].c_str()));
282 value.push_back(abs(atof(lineDat[6].c_str())));
283 }
284 }
285
286 std::cerr << "INFO: about to segment: " << pos.size() << " scores." << std::endl;
287 process(pos, value, seqid);
288 myfile.close();
289 }
290 else{
291 cerr << "FATAL: could not open file: " << globalOpts.file << endl;
292 exit(1);
293 }
294 return 0;
295 }
296