1 /*
2     vcflib C++ library for parsing and manipulating VCF files
3 
4     Copyright © 2010-2020 Erik Garrison
5     Copyright © 2020      Pjotr Prins
6 
7     This software is published under the MIT License. See the LICENSE file.
8 */
9 
10 
11 /*
12 
13 This program was created at:  Tue Sep  8 21:05:23 2015
14 This program was created by:  Zev N. Kronenberg
15 
16 
17 Contact: zev.kronenber@gmail.com
18 
19 Organization: Unviersity of Utah
20     School of Medicine
21     Salt Lake City, Utah
22 
23 
24 The MIT License (MIT)
25 
26 Copyright (c) <2015> <Zev N. Kronenberg>
27 
28 Permission is hereby granted, free of charge, to any person obtaining a copy
29 of this software and associated documentation files (the "Software"), to deal
30 in the Software without restriction, including without limitation the rights
31 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
32 copies of the Software, and to permit persons to whom the Software is
33 furnished to do so, subject to the following conditions:
34 
35 The above copyright notice and this permission notice shall be included in
36 all copies or substantial portions of the Software.
37 
38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
44 THE SOFTWARE.
45 
46 
47 */
48 
49 #include <string>
50 #include <iostream>
51 #include <fstream>
52 #include <math.h>
53 #include <cmath>
54 #include <stdlib.h>
55 #include <time.h>
56 #include <stdio.h>
57 #include <unistd.h>
58 #include <algorithm>
59 #include "split.h"
60 #include "gpatInfo.hpp"
61 
62 struct options{
63   std::string file;
64   double cut;
65 }globalOpts;
66 
67 using namespace std;
68 
69 static const char *optString = "hf:s:";
70 
printHelp(void)71 void printHelp(void){
72 
73   cerr << endl << endl;
74   cerr << "INFO: help" << endl;
75   cerr << "INFO: description:" << endl;
76   cerr << "segmentFst creates genomic segments (bed file) for regions with high wcFst" << endl << endl ;
77   cerr << "segmentFst provides a way to find continious regions with high Fst values.  It takes the output of wcFst and produces a BED file.  These high Fst region can be permutated with 'permuteGPATwindow'" << endl ;
78 
79   cerr << "Output : 8 columns :                 "    << endl;
80   cerr << "     1. Seqid                        "    << endl;
81   cerr << "     2. Start (zero based)           "    << endl;
82   cerr << "     3. End   (zero based)           "    << endl;
83   cerr << "     4. Average Fst                  "    << endl;
84   cerr << "     5. Average high Fst (Fst > -s)  "    << endl;
85   cerr << "     6. N Fst values in segment      "    << endl;
86   cerr << "     7. N high fst values in segment "    << endl;
87   cerr << "     8. Segment length               "    << endl;
88 
89   cerr << "INFO: usage:  segmentFst -s 0.7 -f wcFst.output.txt " << endl;
90   cerr << endl;
91   cerr << "INFO: required: -f            -- Output from wcFst     "   << endl;
92   cerr << "INFO: optional: -s            -- High Fst cutoff [0.8] "    << endl;
93   cerr << endl << "Type: statistics" << endl << endl;
94   cerr << endl;
95 
96   printVersion();
97 }
98 
99 //-------------------------------   OPTIONS   --------------------------------
parseOpts(int argc,char ** argv)100 int parseOpts(int argc, char** argv)
101     {
102     int opt = 0;
103     globalOpts.file = "NA";
104     opt = getopt(argc, argv, optString);
105     while(opt != -1){
106       switch(opt){
107       case 's':
108 	{
109 	  string op = optarg;
110 	  globalOpts.cut = atof(op.c_str());
111 	  break;
112 	}
113       case 'h':
114 	{
115 	  printHelp();
116 	  exit(1);
117 	  break;
118 	}
119 
120       case 'f':
121 	{
122 	  globalOpts.file = optarg;
123 	  break;
124 	}
125       case '?':
126 	{
127 	  break;
128 	}
129       }
130 	opt = getopt( argc, argv, optString );
131    }
132 return 1;
133 }
134 //------------------------------- SUBROUTINE --------------------------------
135 /*
136  Function input  : The vector of values
137  Function does   : Finds the bounds of the window
138  Function returns: bool
139 */
140 
findWindow(vector<double> & values,int * begin,int * end)141 bool findWindow(vector<double> & values,
142                 int * begin            ,
143                 int * end              ){
144 
145     *begin -= 1;
146     *end   += 1;
147 
148     if(*begin < 0){
149         return false;
150     }
151     if(*end >= values.size()){
152         return false;
153     }
154 
155     int tmpHigh = 0;
156     int tmpLow  = 0;
157 
158     for(int index = *begin; index <= *end; index++){
159         if(values[index] > globalOpts.cut){
160             tmpHigh += 1;
161         }
162         else{
163             tmpLow += 1;
164         }
165     }
166     if((tmpHigh*2) < tmpLow){
167         return false;
168     }
169     return true;
170 }
171 
172 //------------------------------- SUBROUTINE --------------------------------
173 /*
174  Function input  : takes the sorted Fst scores, positions, and seqids
175  Function does   : segments and prints bed
176  Function returns:
177 
178 */
179 
180 
process(vector<int> & pos,vector<double> & value,vector<string> & seqid)181 void process(vector<int> & pos, vector<double> & value, vector<string> & seqid)
182 {
183 
184   // i is the index of the outter loop/aka variant sites.
185   // always start the seed with 9 SNPs the window grows to 10 in "growWindow"
186   for(int i = 9; i < value.size()-9; i++){
187 
188     int begin = i -9;
189     int end   = i +9;
190 
191     bool anyGroth = false;
192 
193     while(findWindow(value, &begin, &end)){
194         anyGroth = true;
195     }
196     // the current window had an extention
197     if(anyGroth){
198         // reset the index beyond current window
199 
200         i = end + 1;
201 
202         if(begin < 0){
203             begin = 0;
204         }
205         if(end >= value.size()){
206             end = (value.size() - 1);
207         }
208 
209         int nHigh = 0;
210         int nLow  = 0;
211 
212         double HighSum = 0;
213         double LowSum  = 0;
214 
215         for(long int lb = begin ; lb <= end; lb++){
216             if( value[lb] >= globalOpts.cut ){
217                 nHigh += 1;
218                 HighSum += value[lb];
219             }
220             else{
221                 nLow +=1;
222                 LowSum += value[lb];
223             }
224         }
225 
226 
227       double avgFstHigh = HighSum / double(nHigh);
228       double avgFst     = (HighSum + LowSum) / (double(nHigh)+double(nLow));
229 
230 
231       cout << seqid[begin]   << "\t"
232            << pos[begin]  -1 << "\t"
233            << pos[end]    -1 << "\t"
234            << avgFst         << "\t"
235            << avgFstHigh     << "\t"
236            << nHigh + nLow   << "\t"
237            << nHigh          << "\t"
238            << (pos[end] - pos[begin])
239            << endl;
240 
241     }
242   }
243 }
244 
245 //-------------------------------    MAIN     --------------------------------
246 /*
247  Comments:
248 */
249 
main(int argc,char ** argv)250 int main( int argc, char** argv)
251 {
252   globalOpts.cut = 0.8;
253   int parse = parseOpts(argc, argv);
254 
255   string last;
256   int lastPos = 0;
257 
258   vector<string> seqid;
259   vector<int>      pos;
260   vector<double> value;
261 
262   if(globalOpts.file.empty()){
263     printHelp();
264     exit(1);
265   }
266 
267   string line;
268   ifstream myfile (globalOpts.file);
269   if (myfile.is_open())
270     {
271       while ( getline (myfile,line) )
272 	{
273 	  vector<string> lineDat = split(line, '\t');
274 	  if(lineDat.size() != 5){
275 	    cerr << "FATAL: not valid wcFst input" << endl;
276 	    exit(1);
277 	  }
278 	  if(last.compare(lineDat[0]) != 0){
279 	    last = lineDat[0];
280 	    process(pos, value, seqid);
281 	    pos.clear();
282 	    value.clear();
283 	    seqid.clear();
284 	    lastPos = 0;
285 	  }
286 	  else{
287 	    if(atoi(lineDat[1].c_str()) < lastPos){
288 	      cerr << "FATAL: wcFst input must be sorted by position." << endl;
289 	      exit(1);
290 	    }
291 	    lastPos = atoi(lineDat[1].c_str());
292 	    seqid.push_back(lineDat[0]);
293 	    pos.push_back(atoi(lineDat[1].c_str()));
294 	    value.push_back(atof(lineDat[4].c_str()));
295 	  }
296 	}
297 
298       std::cerr << "INFO: about to segment: " << pos.size() << " scores." << std::endl;
299       process(pos, value, seqid);
300       myfile.close();
301     }
302   else{
303     cerr << "FATAL: could not open file: " << globalOpts.file << endl;
304     exit(1);
305   }
306   return 0;
307 }
308