1 /*
2     vcflib C++ library for parsing and manipulating VCF files
3 
4     Copyright © 2010-2020 Erik Garrison
5     Copyright © 2015      Zev N. Kronenberg
6     Copyright © 2020      Pjotr Prins
7 
8     This software is published under the MIT License. See the LICENSE file.
9 */
10 
11 /*
12 
13 This program was created at:  Fri Apr 17 14:59:53 2015
14 This program was created by:  Zev N. Kronenberg
15 
16 
17 Contact: zev.kronenber@gmail.com
18 
19 Organization: Unviersity of Utah
20     School of Medicine
21     Salt Lake City, Utah
22 
23 
24 The MIT License (MIT)
25 
26 Copyright (c) <2015> <Zev N. Kronenberg>
27 
28 Permission is hereby granted, free of charge, to any person obtaining a copy
29 of this software and associated documentation files (the "Software"), to deal
30 in the Software without restriction, including without limitation the rights
31 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
32 copies of the Software, and to permit persons to whom the Software is
33 furnished to do so, subject to the following conditions:
34 
35 The above copyright notice and this permission notice shall be included in
36 all copies or substantial portions of the Software.
37 
38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
44 THE SOFTWARE.
45 
46 
47 */
48 #include <fstream>
49 #include "split.h"
50 #include <vector>
51 #include <map>
52 #include <string>
53 #include <iostream>
54 #include <math.h>
55 #include <cmath>
56 #include <stdlib.h>
57 #include <time.h>
58 #include <stdio.h>
59 #include <unistd.h>
60 #include <stdlib.h>
61 #include "gpatInfo.hpp"
62 
63 #if defined HAS_OPENMP
64 #include <omp.h>
65 // print lock
66 omp_lock_t lock;
67 #endif
68 
69 struct options{
70   std::string file    ;
71   std::string smoothed;
72   std::string format  ;
73   double npermutation ;
74   double nsuc         ;
75   int threads         ;
76   int chrIndex        ;
77   int nIndex          ;
78   int valueIndex      ;
79 }globalOpts;
80 
81 struct score{
82   std::string seqid;
83   long int pos     ;
84   double score     ;
85 };
86 
87 struct smoothedInputData{
88   std::string line;
89   double score ;
90   double n     ;
91   double nPer  ;
92   double nSuc  ;
93   double ePv   ;
94 };
95 
96 static const char *optString = "x:y:u:f:n:s:";
97 
98 using namespace std;
99 
100 std::map<std::string, int> FORMATS;
101 
102 //------------------------------- SUBROUTINE --------------------------------
103 /*
104  Function input  : NA
105 
106  Function does   : prints help
107 
108  Function returns: NA
109 
110 */
printHelp()111 void printHelp()
112 {
113 
114   cerr << endl << endl;
115   cerr << "INFO: help" << endl;
116   cerr << "INFO: description:" << endl;
117   cerr << "     permuteSmooth is a method for adding empirical p-values  smoothed wcFst scores." << endl ;
118   cerr << endl;
119   cerr << "INFO: usage:  permuteSmooth -s wcFst.smooth.txt -f wcFst.txt -n 5 -s 1 "<< endl;
120   cerr << endl;
121   cerr << "Required:" << endl;
122   cerr << "      file:     f   -- argument: original wcFst data     "<< endl;
123   cerr << "      smoothed: s   -- argument: smoothed wcFst data     "<< endl;
124   cerr << "      format:   y   -- argument: [swcFst, segwcFst]      "<< endl;
125   cerr << "Optional:" << endl;
126   cerr << "      number:   n   -- argument: the number of permutations to run for each value [1000]" << endl;
127   cerr << "      success:  u   -- argument: stop permutations after \'s\' successes [1]"             << endl;
128   cerr << "      success:  x   -- argument: number of threads [1]"             << endl;
129   cerr << endl;
130   cerr << "OUTPUT: permuteSmooth will append three additional columns:" << endl;
131   cerr << "        1. The number of successes                            " << endl;
132   cerr << "        2. The number of trials                               " << endl;
133   cerr << "        3. The empirical p-value                              " << endl;
134   cerr << endl;
135   cerr << endl << "Type: statistics" << endl << endl;
136   printVersion();
137 
138 }
139 
140 
141 //-------------------------------   OPTIONS   --------------------------------
parseOpts(int argc,char ** argv)142 int parseOpts(int argc, char** argv)
143 {
144   int opt = 0;
145   globalOpts.file = "NA";
146 
147   globalOpts.nsuc         = 1;
148   globalOpts.npermutation = 1000;
149 
150   if (argc == 2) {
151     string h_flag = argv[1];
152 
153     if (argc == 2 && (h_flag == "-h" || h_flag == "--help")) {
154       printHelp();
155       exit(1);
156     }
157   }
158 
159   opt = getopt(argc, argv, optString);
160   while(opt != -1){
161     switch(opt){
162     case 'x':
163       {
164           globalOpts.threads = atoi(optarg);
165           break;
166       }
167     case 'f':
168       {
169 	globalOpts.file =  optarg;
170 	break;
171       }
172     case 'y':
173       {
174 	globalOpts.format = optarg;
175 	if(FORMATS.find(globalOpts.format) == FORMATS.end()){
176 	  std::cerr << "FATAL: format not supported: " << globalOpts.format;
177 	  std::cerr << endl;
178 	  printHelp();
179 	  exit(1);
180 	}
181 	if(globalOpts.format == "swcFst"){
182 	  globalOpts.chrIndex   = 0;
183 	  globalOpts.nIndex     = 3;
184 	  globalOpts.valueIndex = 4;
185 	}
186 	if(globalOpts.format == "segwcFst"){
187 	  globalOpts.chrIndex   = 0;
188 	  globalOpts.valueIndex = 3;
189 	  globalOpts.nIndex     = 5;
190 	}
191 	break;
192       }
193     case 'n':
194       {
195 	globalOpts.npermutation = atof(((string)optarg).c_str());
196 	cerr << "INFO: permuteGPAT++ will do N permutations: " << globalOpts.npermutation << endl;
197 	break;
198       }
199     case 's':
200       {
201 	globalOpts.smoothed = optarg;
202 	cerr << "INFO: smoothed file: " << globalOpts.smoothed << endl;
203 	break;
204       }
205     case 'u':
206       {
207 	globalOpts.nsuc = atof(((string)optarg).c_str());
208 	cerr << "INFO: permuteGPAT++ will stop permutations after N successes: " << globalOpts.nsuc << endl;
209 	break;
210 	}
211     case '?':
212       {
213 	break;
214       }
215     }
216 
217     opt = getopt( argc, argv, optString );
218   }
219   return 1;
220 }
221 
222 
223 //------------------------------- SUBROUTINE --------------------------------
224 /*
225  Function input  : data, vector to load
226 
227  Function does   : returns a contguous window
228 
229  Function returns: bool
230 
231 */
232 
getContiguousWindow(vector<score * > & data,vector<double> & load,int n,int * nfail)233 bool getContiguousWindow(vector<score *> & data,
234 			 vector<double> & load,
235 			 int n, int * nfail){
236   int r = rand() % data.size();
237 
238   if(r+n >= data.size()){
239     *nfail+=1;
240     return false;
241   }
242 
243   if(data[r]->seqid != data[r+n]->seqid){
244     *nfail+=1;
245     return false;
246   }
247 
248   for(int i = r; i < r+n; i++){
249     load.push_back(data[i]->score);
250   }
251   return true;
252 }
253 
254 
255 //------------------------------- SUBROUTINE --------------------------------
256 
mean(vector<double> & data)257 double mean(vector<double> & data){
258 
259   double sum = 0;
260 
261   for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
262     sum += (*it);
263   }
264   return sum / data.size();
265 }
266 
267 
268 
269 //------------------------------- SUBROUTINE --------------------------------
270 /*
271  Function input  : score, n, data
272 
273  Function does   : permutes the score.  requires that the window is contiguous
274 
275  Function returns: NA
276 
277 */
278 
permute(double s,int n,vector<score * > & data,double * nRep,double * nSuc,double * ePv)279 bool permute(double s, int n, vector<score *> & data,
280 	     double * nRep, double *nSuc, double * ePv){
281 
282 
283   *ePv = 1 / globalOpts.npermutation;
284 
285   while(*nSuc < globalOpts.nsuc && *nRep < globalOpts.npermutation ){
286     *nRep += 1;
287 
288 
289     std::vector<double> scores;
290 
291     bool getWindow = false;
292 
293     int nfail = 0;
294 
295     while(!getWindow){
296       if(nfail > globalOpts.npermutation){
297 	return false;
298       }
299       getWindow = getContiguousWindow(data, scores, n, &nfail);
300     }
301 
302     double ns = mean(scores);
303 
304     if(ns > s){
305       *nSuc += 1;
306     }
307   }
308   if(*nSuc > 0){
309     *ePv = *nSuc / *nRep;
310   }
311 
312   return true;
313 }
314 
315 //-------------------------------    MAIN     --------------------------------
316 /*
317  Comments:
318 */
319 
main(int argc,char ** argv)320 int main( int argc, char** argv)
321 {
322 srand (time(NULL));
323 
324  FORMATS["swcFst"]   = 1;
325  FORMATS["segwcFst"] = 1;
326 
327  globalOpts.threads = 1;
328 
329 int parse = parseOpts(argc, argv);
330 
331  #if defined HAS_OPENMP
332  omp_set_num_threads(globalOpts.threads);
333 #endif
334 
335  if(globalOpts.file.compare("NA") == 0){
336    cerr << "FATAL: no file was provided" << endl;
337    printHelp();
338    exit(1);
339  }
340  if(globalOpts.format.empty()){
341    std::cerr << "FATAL: no format specified." << std::endl;
342    exit(1);
343  }
344 
345  vector< score *> data;
346 
347  ifstream wcDat (globalOpts.file.c_str());
348 
349  string line;
350 
351  if(wcDat.is_open()){
352 
353    while(getline(wcDat, line)){
354      vector<string> region = split(line, "\t");
355      // will change for other output
356      double value = atof(region[4].c_str());
357 
358      if(globalOpts.format == "swcFst" || globalOpts.format == "segwcFst"){
359 
360        if(region.size() != 5){
361 	 cerr << "FATAL: wrong number of columns in wcFst input" << endl;
362 	 exit(1);
363        }
364        if(value < 0 ){
365 	 value = 0;
366        }
367      }
368 
369      score * sp;
370      sp = new score;
371      sp->seqid = region[0]              ;
372      sp->pos   = atoi(region[1].c_str());
373      sp->score = value                  ;
374 
375      data.push_back(sp);
376    }
377  }
378  else{
379    cerr << "FATAL: could not open file: " << globalOpts.file << endl;
380    exit(1);
381  }
382 
383 
384  wcDat.close();
385  line.clear();
386 
387  cerr << "INFO: raw values to permute against: " << data.size() << endl;
388 
389  ifstream smoothedFile (globalOpts.smoothed.c_str());
390 
391  vector<smoothedInputData*> sData;
392 
393  if(smoothedFile.is_open()){
394    while(getline(smoothedFile, line)){
395 
396      vector<string> region = split(line, "\t");
397      smoothedInputData * sp = new smoothedInputData;
398 
399      sp->line = line;
400      sp->score = atof(region[globalOpts.valueIndex].c_str());
401      sp->n     = atoi(region[globalOpts.nIndex].c_str());
402      sp->nPer = 0;
403      sp->nSuc = 0;
404      sp->ePv  = 0;
405 
406      sData.push_back(sp);
407    }
408  }
409  smoothedFile.close();
410 
411  cerr << "INFO: Number of smoothed windows to permute : " << sData.size() << endl;
412 
413 #if defined HAS_OPENMP
414 #pragma omp parallel for schedule(dynamic, 20)
415 #endif
416 
417  for(int i = 0; i < sData.size(); i++){
418    bool per = permute(sData[i]->score, sData[i]->n,
419 	   data, &sData[i]->nPer,
420 	   &sData[i]->nSuc, &sData[i]->ePv);
421 
422 
423    if(per){
424 #if defined HAS_OPENMP
425      omp_set_lock(&lock);
426 #endif
427    cout << sData[i]->line
428 	<< "\t" << sData[i]->nSuc
429 	<< "\t" << sData[i]->nPer
430 	<< "\t" << sData[i]->ePv << endl;
431    #if defined HAS_OPENMP
432    omp_unset_lock(&lock);
433    #endif
434    }
435    else{
436      #if defined HAS_OPENMP
437      omp_set_lock(&lock);
438      #endif
439      cout << sData[i]->line
440 	  << "\t" << "NA"
441 	  << "\t" << "NA"
442 	  << "\t" << "NA" << endl;
443      #if defined HAS_OPENMP
444      omp_unset_lock(&lock);
445      #endif
446    }
447  }
448 
449  for(vector<score*>::iterator itz = data.begin();
450      itz != data.end(); itz++){
451    delete *itz;
452  }
453  for(vector<smoothedInputData*>::iterator itz = sData.begin();
454      itz != sData.end(); itz++){
455    delete *itz;
456  }
457 
458 
459 return 0;
460 }
461