1 /*
2     vcflib C++ library for parsing and manipulating VCF files
3 
4     Copyright © 2010-2020 Erik Garrison
5     Copyright © 2020      Pjotr Prins
6 
7     This software is published under the MIT License. See the LICENSE file.
8 */
9 
10 /*
11 
12 This program was created at:  Fri Apr 17 14:59:53 2015
13 This program was created by:  Zev N. Kronenberg
14 
15 
16 Contact: zev.kronenber@gmail.com
17 
18 Organization: Unviersity of Utah
19     School of Medicine
20     Salt Lake City, Utah
21 
22 
23 The MIT License (MIT)
24 
25 Copyright (c) <2015> <Zev N. Kronenberg>
26 
27 Permission is hereby granted, free of charge, to any person obtaining a copy
28 of this software and associated documentation files (the "Software"), to deal
29 in the Software without restriction, including without limitation the rights
30 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
31 copies of the Software, and to permit persons to whom the Software is
32 furnished to do so, subject to the following conditions:
33 
34 The above copyright notice and this permission notice shall be included in
35 all copies or substantial portions of the Software.
36 
37 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
38 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
39 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
40 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
42 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
43 THE SOFTWARE.
44 
45 
46 */
47 #include <fstream>
48 #include "split.h"
49 #include <vector>
50 #include <map>
51 #include <string>
52 #include <iostream>
53 #include <math.h>
54 #include <cmath>
55 #include <stdlib.h>
56 #include <time.h>
57 #include <stdio.h>
58 #include <unistd.h>
59 #include <stdlib.h>
60 #include "gpatInfo.hpp"
61 
62 #include <omp.h>
63 // print lock
64 omp_lock_t lock;
65 
66 struct options{
67   std::string file    ;
68   std::string smoothed;
69   std::string format  ;
70   double npermutation ;
71   double nsuc         ;
72   int threads         ;
73   int chrIndex        ;
74   int nIndex          ;
75   int valueIndex      ;
76 }globalOpts;
77 
78 struct score{
79   std::string seqid;
80   long int pos     ;
81   double score     ;
82 };
83 
84 struct smoothedInputData{
85   std::string line;
86   double score ;
87   double n     ;
88   double nPer  ;
89   double nSuc  ;
90   double ePv   ;
91 };
92 
93 static const char *optString = "x:y:u:f:n:s:";
94 
95 using namespace std;
96 
97 std::map<std::string, int> FORMATS;
98 
99 //------------------------------- SUBROUTINE --------------------------------
100 /*
101  Function input  : NA
102 
103  Function does   : prints help
104 
105  Function returns: NA
106 
107 */
printHelp()108 void printHelp()
109 {
110 
111   cerr << endl << endl;
112   cerr << "INFO: help" << endl;
113   cerr << "INFO: description:" << endl;
114   cerr << "     permuteSmoothFst is a method for adding empirical p-values  smoothed wcFst scores." << endl ;
115   cerr << endl;
116   cerr << "INFO: usage:  permuteSmoothFst -s wcFst.smooth.txt -f wcFst.txt -n 5 -s 1 "<< endl;
117   cerr << endl;
118   cerr << "Required:" << endl;
119   cerr << "      file:     f   -- argument: original wcFst data     "<< endl;
120   cerr << "      smoothed: s   -- argument: smoothed wcFst data     "<< endl;
121   cerr << "      format:   y   -- argument: [swcFst, segwcFst]      "<< endl;
122   cerr << "Optional:" << endl;
123   cerr << "      number:   n   -- argument: the number of permutations to run for each value [1000]" << endl;
124   cerr << "      success:  u   -- argument: stop permutations after \'s\' successes [1]"             << endl;
125   cerr << "      success:  x   -- argument: number of threads [1]"             << endl;
126   cerr << endl;
127   cerr << "OUTPUT: permuteSmoothFst will append three additional columns:" << endl;
128   cerr << "        1. The number of successes                            " << endl;
129   cerr << "        2. The number of trials                               " << endl;
130   cerr << "        3. The empirical p-value                              " << endl;
131   cerr << endl;
132   printVersion();
133 
134 }
135 
136 
137 //-------------------------------   OPTIONS   --------------------------------
parseOpts(int argc,char ** argv)138 int parseOpts(int argc, char** argv)
139 {
140   int opt = 0;
141   globalOpts.file = "NA";
142 
143   globalOpts.nsuc         = 1;
144   globalOpts.npermutation = 1000;
145 
146   opt = getopt(argc, argv, optString);
147   while(opt != -1){
148     switch(opt){
149     case 'x':
150       {
151 	globalOpts.threads = atoi(optarg);
152       }
153     case 'f':
154       {
155 	globalOpts.file =  optarg;
156 	break;
157       }
158     case 'y':
159       {
160 	globalOpts.format = optarg;
161 	if(FORMATS.find(globalOpts.format) == FORMATS.end()){
162 	  std::cerr << "FATAL: format not supported: " << globalOpts.format;
163 	  std::cerr << endl;
164 	  printHelp();
165 	  exit(1);
166 	}
167 	if(globalOpts.format == "swcFst"){
168 	  globalOpts.chrIndex   = 0;
169 	  globalOpts.nIndex     = 3;
170 	  globalOpts.valueIndex = 4;
171 	}
172 	if(globalOpts.format == "segwcFst"){
173 	  globalOpts.chrIndex   = 0;
174 	  globalOpts.valueIndex = 3;
175 	  globalOpts.nIndex     = 5;
176 	}
177 	break;
178       }
179     case 'n':
180       {
181 	globalOpts.npermutation = atof(((string)optarg).c_str());
182 	cerr << "INFO: permuteGPAT++ will do N permutations: " << globalOpts.npermutation << endl;
183 	break;
184       }
185     case 's':
186       {
187 	globalOpts.smoothed = optarg;
188 	cerr << "INFO: smoothed file: " << globalOpts.smoothed << endl;
189 	break;
190       }
191     case 'u':
192       {
193 	globalOpts.nsuc = atof(((string)optarg).c_str());
194 	cerr << "INFO: permuteGPAT++ will stop permutations after N successes: " << globalOpts.nsuc << endl;
195 	break;
196 	}
197     case '?':
198       {
199 	break;
200       }
201     }
202 
203     opt = getopt( argc, argv, optString );
204   }
205   return 1;
206 }
207 
208 
209 //------------------------------- SUBROUTINE --------------------------------
210 /*
211  Function input  : data, vector to load
212 
213  Function does   : returns a contguous window
214 
215  Function returns: bool
216 
217 */
218 
getContiguousWindow(vector<score * > & data,vector<double> & load,int n)219 bool getContiguousWindow(vector<score *> & data, vector<double> & load, int n){
220   int r = rand() % data.size();
221 
222   if(r+n >= data.size()){
223     return false;
224   }
225 
226   if(data[r]->seqid != data[r+n]->seqid){
227     return false;
228   }
229 
230   for(int i = r; i < r+n; i++){
231     load.push_back(data[i]->score);
232   }
233   return true;
234 }
235 
236 
237 //------------------------------- SUBROUTINE --------------------------------
238 
mean(vector<double> & data)239 double mean(vector<double> & data){
240 
241   double sum = 0;
242 
243   for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
244     sum += (*it);
245   }
246   return sum / data.size();
247 }
248 
249 
250 
251 //------------------------------- SUBROUTINE --------------------------------
252 /*
253  Function input  : score, n, data
254 
255  Function does   : permutes the score.  requires that the window is contiguous
256 
257  Function returns: NA
258 
259 */
260 
permute(double s,int n,vector<score * > & data,double * nRep,double * nSuc,double * ePv)261 void permute(double s, int n, vector<score *> & data,
262 	     double * nRep, double *nSuc, double * ePv){
263 
264 
265   *ePv = 1 / globalOpts.npermutation;
266 
267   while(*nSuc < globalOpts.nsuc && *nRep < globalOpts.npermutation ){
268     *nRep += 1;
269 
270 
271     std::vector<double> scores;
272 
273     bool getWindow = false;
274     while(!getWindow){
275       getWindow = getContiguousWindow(data, scores, n);
276     }
277 
278     double ns = mean(scores);
279 
280     if(ns > s){
281       *nSuc += 1;
282     }
283   }
284   if(*nSuc > 0){
285     *ePv = *nSuc / *nRep;
286   }
287 }
288 
289 //-------------------------------    MAIN     --------------------------------
290 /*
291  Comments:
292 */
293 
main(int argc,char ** argv)294 int main( int argc, char** argv)
295 {
296 srand (time(NULL));
297 
298  FORMATS["swcFst"]   = 1;
299  FORMATS["segwcFst"] = 1;
300 
301  globalOpts.threads = 1;
302 
303 int parse = parseOpts(argc, argv);
304 
305  omp_set_num_threads(globalOpts.threads);
306 
307 
308  if(globalOpts.file.compare("NA") == 0){
309    cerr << "FATAL: no file was provided" << endl;
310    printHelp();
311    exit(1);
312  }
313  if(globalOpts.format.empty()){
314    std::cerr << "FATAL: no format specified." << std::endl;
315    exit(1);
316  }
317 
318  vector< score *> data;
319 
320  ifstream wcDat (globalOpts.file.c_str());
321 
322  string line;
323 
324  if(wcDat.is_open()){
325 
326    while(getline(wcDat, line)){
327      vector<string> region = split(line, "\t");
328      // will change for other output
329      double value = atof(region[4].c_str());
330 
331      if(globalOpts.format == "swcFst" || globalOpts.format == "segwcFst"){
332 
333        if(region.size() != 5){
334 	 cerr << "FATAL: wrong number of columns in wcFst input" << endl;
335 	 exit(1);
336        }
337        if(value < 0 ){
338 	 value = 0;
339        }
340      }
341 
342      score * sp;
343      sp = new score;
344      sp->seqid = region[0]              ;
345      sp->pos   = atoi(region[1].c_str());
346      sp->score = value                  ;
347 
348      data.push_back(sp);
349    }
350  }
351  else{
352    cerr << "FATAL: could not open file: " << globalOpts.file << endl;
353    exit(1);
354  }
355 
356 
357  wcDat.close();
358  line.clear();
359 
360  cerr << "INFO: raw values to permute against: " << data.size() << endl;
361 
362  ifstream smoothedFile (globalOpts.smoothed.c_str());
363 
364  vector<smoothedInputData*> sData;
365 
366  if(smoothedFile.is_open()){
367    while(getline(smoothedFile, line)){
368 
369      vector<string> region = split(line, "\t");
370      smoothedInputData * sp = new smoothedInputData;
371 
372      sp->line = line;
373      sp->score = atof(region[globalOpts.valueIndex].c_str());
374      sp->n     = atoi(region[globalOpts.nIndex].c_str());
375      sp->nPer = 0;
376      sp->nSuc = 0;
377      sp->ePv  = 0;
378 
379      sData.push_back(sp);
380    }
381  }
382  smoothedFile.close();
383 
384  cerr << "INFO: Number of smoothed windows to permute : " << sData.size() << endl;
385 
386 #pragma omp parallel for schedule(dynamic, 20)
387  for(int i = 0; i < sData.size(); i++){
388    permute(sData[i]->score, sData[i]->n,
389 	   data, &sData[i]->nPer,
390 	   &sData[i]->nSuc, &sData[i]->ePv);
391 
392    omp_set_lock(&lock);
393    cout << sData[i]->line
394 	<< "\t" << sData[i]->nSuc
395 	<< "\t" << sData[i]->nPer
396 	<< "\t" << sData[i]->ePv << endl;
397    omp_unset_lock(&lock);
398  }
399 
400  for(vector<score*>::iterator itz = data.begin();
401      itz != data.end(); itz++){
402    delete *itz;
403  }
404  for(vector<smoothedInputData*>::iterator itz = sData.begin();
405      itz != sData.end(); itz++){
406    delete *itz;
407  }
408 
409 
410 return 0;
411 }
412