1 /*
2 vcflib C++ library for parsing and manipulating VCF files
3
4 Copyright © 2010-2020 Erik Garrison
5 Copyright © 2020 Pjotr Prins
6
7 This software is published under the MIT License. See the LICENSE file.
8 */
9
10
11 /*
12
13 This program was created at: Tue Sep 8 21:05:23 2015
14 This program was created by: Zev N. Kronenberg
15
16
17 Contact: zev.kronenber@gmail.com
18
19 Organization: Unviersity of Utah
20 School of Medicine
21 Salt Lake City, Utah
22
23
24 The MIT License (MIT)
25
26 Copyright (c) <2015> <Zev N. Kronenberg>
27
28 Permission is hereby granted, free of charge, to any person obtaining a copy
29 of this software and associated documentation files (the "Software"), to deal
30 in the Software without restriction, including without limitation the rights
31 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
32 copies of the Software, and to permit persons to whom the Software is
33 furnished to do so, subject to the following conditions:
34
35 The above copyright notice and this permission notice shall be included in
36 all copies or substantial portions of the Software.
37
38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
44 THE SOFTWARE.
45
46
47 */
48
49 #include <string>
50 #include <iostream>
51 #include <fstream>
52 #include <math.h>
53 #include <cmath>
54 #include <stdlib.h>
55 #include <time.h>
56 #include <stdio.h>
57 #include <unistd.h>
58 #include <algorithm>
59 #include "split.h"
60 #include "gpatInfo.hpp"
61
62 struct options{
63 std::string file;
64 double cut;
65 }globalOpts;
66
67 using namespace std;
68
69 static const char *optString = "hf:s:";
70
printHelp(void)71 void printHelp(void){
72
73 cerr << endl << endl;
74 cerr << "INFO: help" << endl;
75 cerr << "INFO: description:" << endl;
76 cerr << " Creates genomic segments (bed file) for regions with high wcFst " << endl;
77
78 cerr << "Output : 8 columns : " << endl;
79 cerr << " 1. Seqid " << endl;
80 cerr << " 2. Start (zero based) " << endl;
81 cerr << " 3. End (zero based) " << endl;
82 cerr << " 4. Average iHS " << endl;
83 cerr << " 5. Average high Fst (iHS > -s) " << endl;
84 cerr << " 6. N iHS values in segment " << endl;
85 cerr << " 7. N high iHS values in segment " << endl;
86 cerr << " 8. Segment length " << endl;
87
88 cerr << "INFO: usage: segmentFst -s 2 -f iHS.normalized.output.txt " << endl;
89 cerr << endl;
90 cerr << "INFO: required: -f -- Output from normalizeIHS " << endl;
91 cerr << "INFO: optional: -s -- High absolute iHS cutoff [2] " << endl;
92 cerr << endl << "Type: statistics" << endl << endl;
93 cerr << endl;
94
95 printVersion();
96 }
97
98 //------------------------------- OPTIONS --------------------------------
parseOpts(int argc,char ** argv)99 int parseOpts(int argc, char** argv)
100 {
101 int opt = 0;
102 globalOpts.file = "NA";
103 opt = getopt(argc, argv, optString);
104 while(opt != -1){
105 switch(opt){
106 case 's':
107 {
108 string op = optarg;
109 globalOpts.cut = atof(op.c_str());
110 break;
111 }
112 case 'h':
113 {
114 printHelp();
115 exit(1);
116 break;
117 }
118
119 case 'f':
120 {
121 globalOpts.file = optarg;
122 break;
123 }
124 case '?':
125 {
126 break;
127 }
128 }
129 opt = getopt( argc, argv, optString );
130 }
131 return 1;
132 }
133 //------------------------------- SUBROUTINE --------------------------------
134 /*
135 Function input :
136
137 Function does :
138
139 Function returns:
140
141 */
142
growWindow(vector<double> & values,int * begin,int * end,int * nhigh,int * nlow,double * hSum,double * lSum)143 bool growWindow(vector<double> & values,
144 int * begin ,
145 int * end ,
146 int * nhigh ,
147 int * nlow ,
148 double * hSum ,
149 double * lSum ){
150
151 *begin -= 1;
152 *end += 1;
153
154 if(*begin < 0){
155 return false;
156 }
157 if(*end >= values.size()){
158 return false;
159 }
160
161 for(int index = *begin; index <= *end; index++){
162 if(values[index] > globalOpts.cut){
163 *nhigh += 1;
164 *hSum += values[index];
165 }
166 else{
167 *nlow += 1;
168 *lSum += values[index];
169 }
170 }
171 if((*nhigh)*2 < (*nlow)){
172 return false;
173 }
174 return true;
175 }
176
177 //------------------------------- SUBROUTINE --------------------------------
178 /*
179 Function input : takes the sorted Fst scores, positions, and seqids
180
181 Function does : segments and prints bed
182
183 Function returns:
184
185 */
186
187
process(vector<int> & pos,vector<double> & value,vector<string> & seqid)188 void process(vector<int> & pos, vector<double> & value, vector<string> & seqid)
189 {
190
191
192 // i is the index of the outter loop/aka variant sites.
193 // always start the seed with 9 SNPs the window grows to 10 in "growWindow"
194 for(int i = 9; i < value.size()-9; i++){
195
196 int begin = i -9;
197 int end = i +9;
198
199 int nHigh = 0;
200 int nLow = 0;
201
202 double HighSum = 0;
203 double LowSum = 0;
204
205 bool anyGroth = false;
206
207 while(growWindow(value, &begin, &end,
208 &nHigh, &nLow, &HighSum, &LowSum)){
209 anyGroth = true;
210 }
211 // the current window had an extention
212 if(anyGroth){
213 // reset the index beyond current window
214 i = end + 1;
215
216 if(begin < 0){
217 begin = 0;
218 }
219 if(end >= value.size()){
220 end = (value.size() - 1);
221 }
222
223 double avgFstHigh = HighSum / double(nHigh);
224 double avgFst = (HighSum + LowSum) / (double(nHigh)+double(nLow));
225
226
227
228 cout << seqid[begin] << "\t"
229 << pos[begin] -1 << "\t"
230 << pos[end] -1 << "\t"
231 << avgFst << "\t"
232 << avgFstHigh << "\t"
233 << nHigh + nLow << "\t"
234 << nHigh << "\t"
235 << (pos[end] - pos[begin])
236 << endl;
237
238 }
239 }
240 }
241
242 //------------------------------- MAIN --------------------------------
243 /*
244 Comments:
245 */
246
main(int argc,char ** argv)247 int main( int argc, char** argv)
248 {
249 globalOpts.cut = 2;
250 int parse = parseOpts(argc, argv);
251
252 string last;
253 int lastPos = 0;
254
255 vector<string> seqid;
256 vector<int> pos;
257 vector<double> value;
258
259 if(globalOpts.file.empty()){
260 printHelp();
261 exit(1);
262 }
263
264 string line;
265 ifstream myfile (globalOpts.file);
266 if (myfile.is_open())
267 {
268 while ( getline (myfile,line) )
269 {
270 vector<string> lineDat = split(line, '\t');
271 if(lineDat.size() != 9){
272 cerr << "FATAL: not valid normalized iHS file" << endl;
273 exit(1);
274 }
275 if(last.compare(lineDat[0]) != 0){
276 last = lineDat[0];
277 process(pos, value, seqid);
278 pos.clear();
279 value.clear();
280 seqid.clear();
281 lastPos = 0;
282 }
283 else{
284 if(atoi(lineDat[1].c_str()) < lastPos){
285 cerr << "FATAL: normalized iHS input must be sorted by position." << endl;
286 exit(1);
287 }
288 lastPos = atoi(lineDat[1].c_str());
289 seqid.push_back(lineDat[0]);
290 pos.push_back(atoi(lineDat[1].c_str()));
291 value.push_back(abs(atof(lineDat[6].c_str())));
292 }
293 }
294
295 std::cerr << "INFO: about to segment: " << pos.size() << " scores." << std::endl;
296 process(pos, value, seqid);
297 myfile.close();
298 }
299 else{
300 cerr << "FATAL: could not open file: " << globalOpts.file << endl;
301 exit(1);
302 }
303 return 0;
304 }
305