1 
2 /*
3 
4 This program was created at:  Tue Sep  8 21:05:23 2015
5 This program was created by:  Zev N. Kronenberg
6 
7 
8 Contact: zev.kronenber@gmail.com
9 
10 Organization: Unviersity of Utah
11     School of Medicine
12     Salt Lake City, Utah
13 
14 
15 The MIT License (MIT)
16 
17 Copyright (c) <2015> <Zev N. Kronenberg>
18 
19 Permission is hereby granted, free of charge, to any person obtaining a copy
20 of this software and associated documentation files (the "Software"), to deal
21 in the Software without restriction, including without limitation the rights
22 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
23 copies of the Software, and to permit persons to whom the Software is
24 furnished to do so, subject to the following conditions:
25 
26 The above copyright notice and this permission notice shall be included in
27 all copies or substantial portions of the Software.
28 
29 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
34 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
35 THE SOFTWARE.
36 
37 
38 */
39 
40 #include <string>
41 #include <iostream>
42 #include <fstream>
43 #include <math.h>
44 #include <cmath>
45 #include <stdlib.h>
46 #include <time.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <algorithm>
50 #include "split.h"
51 #include "gpatInfo.hpp"
52 
53 struct options{
54   std::string file;
55   double cut;
56 }globalOpts;
57 
58 using namespace std;
59 
60 static const char *optString = "hf:s:";
61 
printHelp(void)62 void printHelp(void){
63 
64   cerr << endl << endl;
65   cerr << "INFO: help" << endl;
66   cerr << "INFO: description:" << endl;
67   cerr << "      Creates genomic segments (bed file) for regions with high wcFst  " << endl;
68 
69   cerr << "Output : 8 columns :                 "    << endl;
70   cerr << "     1. Seqid                        "    << endl;
71   cerr << "     2. Start (zero based)           "    << endl;
72   cerr << "     3. End   (zero based)           "    << endl;
73   cerr << "     4. Average Fst                  "    << endl;
74   cerr << "     5. Average high Fst (Fst > -s)  "    << endl;
75   cerr << "     6. N Fst values in segment      "    << endl;
76   cerr << "     7. N high fst values in segment "    << endl;
77   cerr << "     8. Segment length               "    << endl;
78 
79   cerr << "INFO: usage:  segmentFst -s 0.7 -f wcFst.output.txt " << endl;
80   cerr << endl;
81   cerr << "INFO: required: -f            -- Output from wcFst     "   << endl;
82   cerr << "INFO: optional: -s            -- High Fst cutoff [0.8] "    << endl;
83 
84   cerr << endl;
85 
86   printVersion();
87 }
88 
89 //-------------------------------   OPTIONS   --------------------------------
parseOpts(int argc,char ** argv)90 int parseOpts(int argc, char** argv)
91     {
92     int opt = 0;
93     globalOpts.file = "NA";
94     opt = getopt(argc, argv, optString);
95     while(opt != -1){
96       switch(opt){
97       case 's':
98 	{
99 	  string op = optarg;
100 	  globalOpts.cut = atof(op.c_str());
101 	  break;
102 	}
103       case 'h':
104 	{
105 	  printHelp();
106 	  exit(1);
107 	  break;
108 	}
109 
110       case 'f':
111 	{
112 	  globalOpts.file = optarg;
113 	  break;
114 	}
115       case '?':
116 	{
117 	  break;
118 	}
119       }
120 	opt = getopt( argc, argv, optString );
121    }
122 return 1;
123 }
124 //------------------------------- SUBROUTINE --------------------------------
125 /*
126  Function input  : The vector of values
127  Function does   : Finds the bounds of the window
128  Function returns: bool
129 */
130 
findWindow(vector<double> & values,int * begin,int * end)131 bool findWindow(vector<double> & values,
132                 int * begin            ,
133                 int * end              ){
134 
135     *begin -= 1;
136     *end   += 1;
137 
138     if(*begin < 0){
139         return false;
140     }
141     if(*end >= values.size()){
142         return false;
143     }
144 
145     int tmpHigh = 0;
146     int tmpLow  = 0;
147 
148     for(int index = *begin; index <= *end; index++){
149         if(values[index] > globalOpts.cut){
150             tmpHigh += 1;
151         }
152         else{
153             tmpLow += 1;
154         }
155     }
156     if((tmpHigh*2) < tmpLow){
157         return false;
158     }
159     return true;
160 }
161 
162 //------------------------------- SUBROUTINE --------------------------------
163 /*
164  Function input  : takes the sorted Fst scores, positions, and seqids
165  Function does   : segments and prints bed
166  Function returns:
167 
168 */
169 
170 
process(vector<int> & pos,vector<double> & value,vector<string> & seqid)171 void process(vector<int> & pos, vector<double> & value, vector<string> & seqid)
172 {
173 
174   // i is the index of the outter loop/aka variant sites.
175   // always start the seed with 9 SNPs the window grows to 10 in "growWindow"
176   for(int i = 9; i < value.size()-9; i++){
177 
178     int begin = i -9;
179     int end   = i +9;
180 
181     bool anyGroth = false;
182 
183     while(findWindow(value, &begin, &end)){
184         anyGroth = true;
185     }
186     // the current window had an extention
187     if(anyGroth){
188         // reset the index beyond current window
189 
190         i = end + 1;
191 
192         if(begin < 0){
193             begin = 0;
194         }
195         if(end >= value.size()){
196             end = (value.size() - 1);
197         }
198 
199         int nHigh = 0;
200         int nLow  = 0;
201 
202         double HighSum = 0;
203         double LowSum  = 0;
204 
205         for(long int lb = begin ; lb <= end; lb++){
206             if( value[lb] >= globalOpts.cut ){
207                 nHigh += 1;
208                 HighSum += value[lb];
209             }
210             else{
211                 nLow +=1;
212                 LowSum += value[lb];
213             }
214         }
215 
216 
217       double avgFstHigh = HighSum / double(nHigh);
218       double avgFst     = (HighSum + LowSum) / (double(nHigh)+double(nLow));
219 
220 
221       cout << seqid[begin]   << "\t"
222            << pos[begin]  -1 << "\t"
223            << pos[end]    -1 << "\t"
224            << avgFst         << "\t"
225            << avgFstHigh     << "\t"
226            << nHigh + nLow   << "\t"
227            << nHigh          << "\t"
228            << (pos[end] - pos[begin])
229            << endl;
230 
231     }
232   }
233 }
234 
235 //-------------------------------    MAIN     --------------------------------
236 /*
237  Comments:
238 */
239 
main(int argc,char ** argv)240 int main( int argc, char** argv)
241 {
242   globalOpts.cut = 0.8;
243   int parse = parseOpts(argc, argv);
244 
245   string last;
246   int lastPos = 0;
247 
248   vector<string> seqid;
249   vector<int>      pos;
250   vector<double> value;
251 
252   if(globalOpts.file.empty()){
253     printHelp();
254     exit(1);
255   }
256 
257   string line;
258   ifstream myfile (globalOpts.file);
259   if (myfile.is_open())
260     {
261       while ( getline (myfile,line) )
262 	{
263 	  vector<string> lineDat = split(line, '\t');
264 	  if(lineDat.size() != 5){
265 	    cerr << "FATAL: not valid wcFst input" << endl;
266 	    exit(1);
267 	  }
268 	  if(last.compare(lineDat[0]) != 0){
269 	    last = lineDat[0];
270 	    process(pos, value, seqid);
271 	    pos.clear();
272 	    value.clear();
273 	    seqid.clear();
274 	    lastPos = 0;
275 	  }
276 	  else{
277 	    if(atoi(lineDat[1].c_str()) < lastPos){
278 	      cerr << "FATAL: wcFst input must be sorted by position." << endl;
279 	      exit(1);
280 	    }
281 	    lastPos = atoi(lineDat[1].c_str());
282 	    seqid.push_back(lineDat[0]);
283 	    pos.push_back(atoi(lineDat[1].c_str()));
284 	    value.push_back(atof(lineDat[4].c_str()));
285 	  }
286 	}
287 
288       std::cerr << "INFO: about to segment: " << pos.size() << " scores." << std::endl;
289       process(pos, value, seqid);
290       myfile.close();
291     }
292   else{
293     cerr << "FATAL: could not open file: " << globalOpts.file << endl;
294     exit(1);
295   }
296   return 0;
297 }
298