1 
2 /*
3 
4 This program was created at:  Tue Sep  8 21:05:23 2015
5 This program was created by:  Zev N. Kronenberg
6 
7 
8 Contact: zev.kronenber@gmail.com
9 
10 Organization: Unviersity of Utah
11     School of Medicine
12     Salt Lake City, Utah
13 
14 
15 The MIT License (MIT)
16 
17 Copyright (c) <2015> <Zev N. Kronenberg>
18 
19 Permission is hereby granted, free of charge, to any person obtaining a copy
20 of this software and associated documentation files (the "Software"), to deal
21 in the Software without restriction, including without limitation the rights
22 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
23 copies of the Software, and to permit persons to whom the Software is
24 furnished to do so, subject to the following conditions:
25 
26 The above copyright notice and this permission notice shall be included in
27 all copies or substantial portions of the Software.
28 
29 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
34 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
35 THE SOFTWARE.
36 
37 
38 */
39 
40 #include <string>
41 #include <iostream>
42 #include <fstream>
43 #include <math.h>
44 #include <cmath>
45 #include <stdlib.h>
46 #include <time.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <algorithm>
50 #include "split.h"
51 #include "gpatInfo.hpp"
52 
53 struct options{
54   std::string file;
55   double cut;
56 }globalOpts;
57 
58 using namespace std;
59 
60 static const char *optString = "hf:s:";
61 
printHelp(void)62 void printHelp(void){
63 
64   cerr << endl << endl;
65   cerr << "INFO: help" << endl;
66   cerr << "INFO: description:" << endl;
67   cerr << "      Creates genomic segments (bed file) for regions with high wcFst  " << endl;
68 
69   cerr << "Output : 8 columns :                 "    << endl;
70   cerr << "     1. Seqid                        "    << endl;
71   cerr << "     2. Start (zero based)           "    << endl;
72   cerr << "     3. End   (zero based)           "    << endl;
73   cerr << "     4. Average iHS                  "    << endl;
74   cerr << "     5. Average high Fst (iHS > -s)  "    << endl;
75   cerr << "     6. N iHS values in segment      "    << endl;
76   cerr << "     7. N high iHS values in segment "    << endl;
77   cerr << "     8. Segment length               "    << endl;
78 
79   cerr << "INFO: usage:  segmentFst -s 2 -f iHS.normalized.output.txt " << endl;
80   cerr << endl;
81   cerr << "INFO: required: -f            -- Output from normalizeIHS     "   << endl;
82   cerr << "INFO: optional: -s            -- High absolute iHS cutoff [2] "    << endl;
83 
84   cerr << endl;
85 
86   printVersion();
87 }
88 
89 //-------------------------------   OPTIONS   --------------------------------
parseOpts(int argc,char ** argv)90 int parseOpts(int argc, char** argv)
91     {
92     int opt = 0;
93     globalOpts.file = "NA";
94     opt = getopt(argc, argv, optString);
95     while(opt != -1){
96       switch(opt){
97       case 's':
98 	{
99 	  string op = optarg;
100 	  globalOpts.cut = atof(op.c_str());
101 	  break;
102 	}
103       case 'h':
104 	{
105 	  printHelp();
106 	  exit(1);
107 	  break;
108 	}
109 
110       case 'f':
111 	{
112 	  globalOpts.file = optarg;
113 	  break;
114 	}
115       case '?':
116 	{
117 	  break;
118 	}
119       }
120 	opt = getopt( argc, argv, optString );
121    }
122 return 1;
123 }
124 //------------------------------- SUBROUTINE --------------------------------
125 /*
126  Function input  :
127 
128  Function does   :
129 
130  Function returns:
131 
132 */
133 
growWindow(vector<double> & values,int * begin,int * end,int * nhigh,int * nlow,double * hSum,double * lSum)134 bool growWindow(vector<double> & values,
135 		int * begin            ,
136 		int * end              ,
137 		int * nhigh            ,
138 		int * nlow             ,
139 		double * hSum          ,
140 		double * lSum           ){
141 
142   *begin -= 1;
143   *end   += 1;
144 
145   if(*begin < 0){
146     return false;
147   }
148   if(*end >= values.size()){
149     return false;
150   }
151 
152   for(int index = *begin; index <= *end; index++){
153     if(values[index] > globalOpts.cut){
154       *nhigh += 1;
155       *hSum += values[index];
156     }
157     else{
158       *nlow += 1;
159       *lSum += values[index];
160     }
161   }
162   if((*nhigh)*2 < (*nlow)){
163     return false;
164   }
165   return true;
166 }
167 
168 //------------------------------- SUBROUTINE --------------------------------
169 /*
170  Function input  : takes the sorted Fst scores, positions, and seqids
171 
172  Function does   : segments and prints bed
173 
174  Function returns:
175 
176 */
177 
178 
process(vector<int> & pos,vector<double> & value,vector<string> & seqid)179 void process(vector<int> & pos, vector<double> & value, vector<string> & seqid)
180 {
181 
182 
183   // i is the index of the outter loop/aka variant sites.
184   // always start the seed with 9 SNPs the window grows to 10 in "growWindow"
185   for(int i = 9; i < value.size()-9; i++){
186 
187     int begin = i -9;
188     int end   = i +9;
189 
190     int nHigh = 0;
191     int nLow  = 0;
192 
193     double HighSum = 0;
194     double LowSum  = 0;
195 
196     bool anyGroth = false;
197 
198     while(growWindow(value, &begin, &end,
199 		     &nHigh, &nLow, &HighSum, &LowSum)){
200       anyGroth = true;
201     }
202     // the current window had an extention
203     if(anyGroth){
204     // reset the index beyond current window
205       i = end + 1;
206 
207       if(begin < 0){
208 	begin = 0;
209       }
210       if(end >= value.size()){
211 	end = (value.size() - 1);
212       }
213 
214       double avgFstHigh = HighSum / double(nHigh);
215       double avgFst     = (HighSum + LowSum) / (double(nHigh)+double(nLow));
216 
217 
218 
219       cout << seqid[begin]   << "\t"
220            << pos[begin]  -1 << "\t"
221            << pos[end]    -1 << "\t"
222            << avgFst         << "\t"
223            << avgFstHigh     << "\t"
224            << nHigh + nLow   << "\t"
225            << nHigh          << "\t"
226            << (pos[end] - pos[begin])
227            << endl;
228 
229     }
230   }
231 }
232 
233 //-------------------------------    MAIN     --------------------------------
234 /*
235  Comments:
236 */
237 
main(int argc,char ** argv)238 int main( int argc, char** argv)
239 {
240   globalOpts.cut = 2;
241   int parse = parseOpts(argc, argv);
242 
243   string last;
244   int lastPos = 0;
245 
246   vector<string> seqid;
247   vector<int>      pos;
248   vector<double> value;
249 
250   if(globalOpts.file.empty()){
251     printHelp();
252     exit(1);
253   }
254 
255   string line;
256   ifstream myfile (globalOpts.file);
257   if (myfile.is_open())
258     {
259       while ( getline (myfile,line) )
260 	{
261 	  vector<string> lineDat = split(line, '\t');
262 	  if(lineDat.size() != 9){
263 	    cerr << "FATAL: not valid normalized iHS file" << endl;
264 	    exit(1);
265 	  }
266 	  if(last.compare(lineDat[0]) != 0){
267 	    last = lineDat[0];
268 	    process(pos, value, seqid);
269 	    pos.clear();
270 	    value.clear();
271 	    seqid.clear();
272 	    lastPos = 0;
273 	  }
274 	  else{
275 	    if(atoi(lineDat[1].c_str()) < lastPos){
276 	      cerr << "FATAL: normalized iHS input must be sorted by position." << endl;
277 	      exit(1);
278 	    }
279 	    lastPos = atoi(lineDat[1].c_str());
280 	    seqid.push_back(lineDat[0]);
281 	    pos.push_back(atoi(lineDat[1].c_str()));
282 	    value.push_back(abs(atof(lineDat[6].c_str())));
283 	  }
284 	}
285 
286       std::cerr << "INFO: about to segment: " << pos.size() << " scores." << std::endl;
287       process(pos, value, seqid);
288       myfile.close();
289     }
290   else{
291     cerr << "FATAL: could not open file: " << globalOpts.file << endl;
292     exit(1);
293   }
294   return 0;
295 }
296