1 /*
2     vcflib C++ library for parsing and manipulating VCF files
3 
4     Copyright © 2010-2020 Erik Garrison
5     Copyright © 2020      Pjotr Prins
6 
7     This software is published under the MIT License. See the LICENSE file.
8 */
9 
10 
11 /*
12 
13 This program was created at:  Tue Sep  8 21:05:23 2015
14 This program was created by:  Zev N. Kronenberg
15 
16 
17 Contact: zev.kronenber@gmail.com
18 
19 Organization: Unviersity of Utah
20     School of Medicine
21     Salt Lake City, Utah
22 
23 
24 The MIT License (MIT)
25 
26 Copyright (c) <2015> <Zev N. Kronenberg>
27 
28 Permission is hereby granted, free of charge, to any person obtaining a copy
29 of this software and associated documentation files (the "Software"), to deal
30 in the Software without restriction, including without limitation the rights
31 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
32 copies of the Software, and to permit persons to whom the Software is
33 furnished to do so, subject to the following conditions:
34 
35 The above copyright notice and this permission notice shall be included in
36 all copies or substantial portions of the Software.
37 
38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
44 THE SOFTWARE.
45 
46 
47 */
48 
49 #include <string>
50 #include <iostream>
51 #include <fstream>
52 #include <math.h>
53 #include <cmath>
54 #include <stdlib.h>
55 #include <time.h>
56 #include <stdio.h>
57 #include <unistd.h>
58 #include <algorithm>
59 #include "split.h"
60 #include "gpatInfo.hpp"
61 
62 struct options{
63   std::string file;
64   double cut;
65 }globalOpts;
66 
67 using namespace std;
68 
69 static const char *optString = "hf:s:";
70 
printHelp(void)71 void printHelp(void){
72 
73   cerr << endl << endl;
74   cerr << "INFO: help" << endl;
75   cerr << "INFO: description:" << endl;
76   cerr << "      Creates genomic segments (bed file) for regions with high wcFst  " << endl;
77 
78   cerr << "Output : 8 columns :                 "    << endl;
79   cerr << "     1. Seqid                        "    << endl;
80   cerr << "     2. Start (zero based)           "    << endl;
81   cerr << "     3. End   (zero based)           "    << endl;
82   cerr << "     4. Average iHS                  "    << endl;
83   cerr << "     5. Average high Fst (iHS > -s)  "    << endl;
84   cerr << "     6. N iHS values in segment      "    << endl;
85   cerr << "     7. N high iHS values in segment "    << endl;
86   cerr << "     8. Segment length               "    << endl;
87 
88   cerr << "INFO: usage:  segmentFst -s 2 -f iHS.normalized.output.txt " << endl;
89   cerr << endl;
90   cerr << "INFO: required: -f            -- Output from normalizeIHS     "   << endl;
91   cerr << "INFO: optional: -s            -- High absolute iHS cutoff [2] "    << endl;
92   cerr << endl << "Type: statistics" << endl << endl;
93   cerr << endl;
94 
95   printVersion();
96 }
97 
98 //-------------------------------   OPTIONS   --------------------------------
parseOpts(int argc,char ** argv)99 int parseOpts(int argc, char** argv)
100     {
101     int opt = 0;
102     globalOpts.file = "NA";
103     opt = getopt(argc, argv, optString);
104     while(opt != -1){
105       switch(opt){
106       case 's':
107 	{
108 	  string op = optarg;
109 	  globalOpts.cut = atof(op.c_str());
110 	  break;
111 	}
112       case 'h':
113 	{
114 	  printHelp();
115 	  exit(1);
116 	  break;
117 	}
118 
119       case 'f':
120 	{
121 	  globalOpts.file = optarg;
122 	  break;
123 	}
124       case '?':
125 	{
126 	  break;
127 	}
128       }
129 	opt = getopt( argc, argv, optString );
130    }
131 return 1;
132 }
133 //------------------------------- SUBROUTINE --------------------------------
134 /*
135  Function input  :
136 
137  Function does   :
138 
139  Function returns:
140 
141 */
142 
growWindow(vector<double> & values,int * begin,int * end,int * nhigh,int * nlow,double * hSum,double * lSum)143 bool growWindow(vector<double> & values,
144 		int * begin            ,
145 		int * end              ,
146 		int * nhigh            ,
147 		int * nlow             ,
148 		double * hSum          ,
149 		double * lSum           ){
150 
151   *begin -= 1;
152   *end   += 1;
153 
154   if(*begin < 0){
155     return false;
156   }
157   if(*end >= values.size()){
158     return false;
159   }
160 
161   for(int index = *begin; index <= *end; index++){
162     if(values[index] > globalOpts.cut){
163       *nhigh += 1;
164       *hSum += values[index];
165     }
166     else{
167       *nlow += 1;
168       *lSum += values[index];
169     }
170   }
171   if((*nhigh)*2 < (*nlow)){
172     return false;
173   }
174   return true;
175 }
176 
177 //------------------------------- SUBROUTINE --------------------------------
178 /*
179  Function input  : takes the sorted Fst scores, positions, and seqids
180 
181  Function does   : segments and prints bed
182 
183  Function returns:
184 
185 */
186 
187 
process(vector<int> & pos,vector<double> & value,vector<string> & seqid)188 void process(vector<int> & pos, vector<double> & value, vector<string> & seqid)
189 {
190 
191 
192   // i is the index of the outter loop/aka variant sites.
193   // always start the seed with 9 SNPs the window grows to 10 in "growWindow"
194   for(int i = 9; i < value.size()-9; i++){
195 
196     int begin = i -9;
197     int end   = i +9;
198 
199     int nHigh = 0;
200     int nLow  = 0;
201 
202     double HighSum = 0;
203     double LowSum  = 0;
204 
205     bool anyGroth = false;
206 
207     while(growWindow(value, &begin, &end,
208 		     &nHigh, &nLow, &HighSum, &LowSum)){
209       anyGroth = true;
210     }
211     // the current window had an extention
212     if(anyGroth){
213     // reset the index beyond current window
214       i = end + 1;
215 
216       if(begin < 0){
217 	begin = 0;
218       }
219       if(end >= value.size()){
220 	end = (value.size() - 1);
221       }
222 
223       double avgFstHigh = HighSum / double(nHigh);
224       double avgFst     = (HighSum + LowSum) / (double(nHigh)+double(nLow));
225 
226 
227 
228       cout << seqid[begin]   << "\t"
229            << pos[begin]  -1 << "\t"
230            << pos[end]    -1 << "\t"
231            << avgFst         << "\t"
232            << avgFstHigh     << "\t"
233            << nHigh + nLow   << "\t"
234            << nHigh          << "\t"
235            << (pos[end] - pos[begin])
236            << endl;
237 
238     }
239   }
240 }
241 
242 //-------------------------------    MAIN     --------------------------------
243 /*
244  Comments:
245 */
246 
main(int argc,char ** argv)247 int main( int argc, char** argv)
248 {
249   globalOpts.cut = 2;
250   int parse = parseOpts(argc, argv);
251 
252   string last;
253   int lastPos = 0;
254 
255   vector<string> seqid;
256   vector<int>      pos;
257   vector<double> value;
258 
259   if(globalOpts.file.empty()){
260     printHelp();
261     exit(1);
262   }
263 
264   string line;
265   ifstream myfile (globalOpts.file);
266   if (myfile.is_open())
267     {
268       while ( getline (myfile,line) )
269 	{
270 	  vector<string> lineDat = split(line, '\t');
271 	  if(lineDat.size() != 9){
272 	    cerr << "FATAL: not valid normalized iHS file" << endl;
273 	    exit(1);
274 	  }
275 	  if(last.compare(lineDat[0]) != 0){
276 	    last = lineDat[0];
277 	    process(pos, value, seqid);
278 	    pos.clear();
279 	    value.clear();
280 	    seqid.clear();
281 	    lastPos = 0;
282 	  }
283 	  else{
284 	    if(atoi(lineDat[1].c_str()) < lastPos){
285 	      cerr << "FATAL: normalized iHS input must be sorted by position." << endl;
286 	      exit(1);
287 	    }
288 	    lastPos = atoi(lineDat[1].c_str());
289 	    seqid.push_back(lineDat[0]);
290 	    pos.push_back(atoi(lineDat[1].c_str()));
291 	    value.push_back(abs(atof(lineDat[6].c_str())));
292 	  }
293 	}
294 
295       std::cerr << "INFO: about to segment: " << pos.size() << " scores." << std::endl;
296       process(pos, value, seqid);
297       myfile.close();
298     }
299   else{
300     cerr << "FATAL: could not open file: " << globalOpts.file << endl;
301     exit(1);
302   }
303   return 0;
304 }
305