1 /*
2 
3 This program was created at:  Fri Apr 17 14:59:53 2015
4 This program was created by:  Zev N. Kronenberg
5 
6 
7 Contact: zev.kronenber@gmail.com
8 
9 Organization: Unviersity of Utah
10     School of Medicine
11     Salt Lake City, Utah
12 
13 
14 The MIT License (MIT)
15 
16 Copyright (c) <2015> <Zev N. Kronenberg>
17 
18 Permission is hereby granted, free of charge, to any person obtaining a copy
19 of this software and associated documentation files (the "Software"), to deal
20 in the Software without restriction, including without limitation the rights
21 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
22 copies of the Software, and to permit persons to whom the Software is
23 furnished to do so, subject to the following conditions:
24 
25 The above copyright notice and this permission notice shall be included in
26 all copies or substantial portions of the Software.
27 
28 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
33 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
34 THE SOFTWARE.
35 
36 
37 */
38 #include <fstream>
39 #include "split.h"
40 #include <vector>
41 #include <map>
42 #include <string>
43 #include <iostream>
44 #include <math.h>
45 #include <cmath>
46 #include <stdlib.h>
47 #include <time.h>
48 #include <stdio.h>
49 #include <unistd.h>
50 #include <stdlib.h>
51 #include "gpatInfo.hpp"
52 
53 #if defined HAS_OPENMP
54 #include <omp.h>
55 // print lock
56 omp_lock_t lock;
57 #endif
58 
59 struct options{
60   std::string file    ;
61   std::string smoothed;
62   std::string format  ;
63   double npermutation ;
64   double nsuc         ;
65   int threads         ;
66   int chrIndex        ;
67   int nIndex          ;
68   int valueIndex      ;
69 }globalOpts;
70 
71 struct score{
72   std::string seqid;
73   long int pos     ;
74   double score     ;
75 };
76 
77 struct smoothedInputData{
78   std::string line;
79   double score ;
80   double n     ;
81   double nPer  ;
82   double nSuc  ;
83   double ePv   ;
84 };
85 
86 static const char *optString = "x:y:u:f:n:s:";
87 
88 using namespace std;
89 
90 std::map<std::string, int> FORMATS;
91 
92 //------------------------------- SUBROUTINE --------------------------------
93 /*
94  Function input  : NA
95 
96  Function does   : prints help
97 
98  Function returns: NA
99 
100 */
printHelp()101 void printHelp()
102 {
103 
104   cerr << endl << endl;
105   cerr << "INFO: help" << endl;
106   cerr << "INFO: description:" << endl;
107   cerr << "     permuteSmooth is a method for adding empirical p-values  smoothed wcFst scores." << endl ;
108   cerr << endl;
109   cerr << "INFO: usage:  permuteSmooth -s wcFst.smooth.txt -f wcFst.txt -n 5 -s 1 "<< endl;
110   cerr << endl;
111   cerr << "Required:" << endl;
112   cerr << "      file:     f   -- argument: original wcFst data     "<< endl;
113   cerr << "      smoothed: s   -- argument: smoothed wcFst data     "<< endl;
114   cerr << "      format:   y   -- argument: [swcFst, segwcFst]      "<< endl;
115   cerr << "Optional:" << endl;
116   cerr << "      number:   n   -- argument: the number of permutations to run for each value [1000]" << endl;
117   cerr << "      success:  u   -- argument: stop permutations after \'s\' successes [1]"             << endl;
118   cerr << "      success:  x   -- argument: number of threads [1]"             << endl;
119   cerr << endl;
120   cerr << "OUTPUT: permuteSmooth will append three additional columns:" << endl;
121   cerr << "        1. The number of successes                            " << endl;
122   cerr << "        2. The number of trials                               " << endl;
123   cerr << "        3. The empirical p-value                              " << endl;
124   cerr << endl;
125   printVersion();
126 
127 }
128 
129 
130 //-------------------------------   OPTIONS   --------------------------------
parseOpts(int argc,char ** argv)131 int parseOpts(int argc, char** argv)
132 {
133   int opt = 0;
134   globalOpts.file = "NA";
135 
136   globalOpts.nsuc         = 1;
137   globalOpts.npermutation = 1000;
138 
139   opt = getopt(argc, argv, optString);
140   while(opt != -1){
141     switch(opt){
142     case 'x':
143       {
144           globalOpts.threads = atoi(optarg);
145           break;
146       }
147     case 'f':
148       {
149 	globalOpts.file =  optarg;
150 	break;
151       }
152     case 'y':
153       {
154 	globalOpts.format = optarg;
155 	if(FORMATS.find(globalOpts.format) == FORMATS.end()){
156 	  std::cerr << "FATAL: format not supported: " << globalOpts.format;
157 	  std::cerr << endl;
158 	  printHelp();
159 	  exit(1);
160 	}
161 	if(globalOpts.format == "swcFst"){
162 	  globalOpts.chrIndex   = 0;
163 	  globalOpts.nIndex     = 3;
164 	  globalOpts.valueIndex = 4;
165 	}
166 	if(globalOpts.format == "segwcFst"){
167 	  globalOpts.chrIndex   = 0;
168 	  globalOpts.valueIndex = 3;
169 	  globalOpts.nIndex     = 5;
170 	}
171 	break;
172       }
173     case 'n':
174       {
175 	globalOpts.npermutation = atof(((string)optarg).c_str());
176 	cerr << "INFO: permuteGPAT++ will do N permutations: " << globalOpts.npermutation << endl;
177 	break;
178       }
179     case 's':
180       {
181 	globalOpts.smoothed = optarg;
182 	cerr << "INFO: smoothed file: " << globalOpts.smoothed << endl;
183 	break;
184       }
185     case 'u':
186       {
187 	globalOpts.nsuc = atof(((string)optarg).c_str());
188 	cerr << "INFO: permuteGPAT++ will stop permutations after N successes: " << globalOpts.nsuc << endl;
189 	break;
190 	}
191     case '?':
192       {
193 	break;
194       }
195     }
196 
197     opt = getopt( argc, argv, optString );
198   }
199   return 1;
200 }
201 
202 
203 //------------------------------- SUBROUTINE --------------------------------
204 /*
205  Function input  : data, vector to load
206 
207  Function does   : returns a contguous window
208 
209  Function returns: bool
210 
211 */
212 
getContiguousWindow(vector<score * > & data,vector<double> & load,int n,int * nfail)213 bool getContiguousWindow(vector<score *> & data,
214 			 vector<double> & load,
215 			 int n, int * nfail){
216   int r = rand() % data.size();
217 
218   if(r+n >= data.size()){
219     *nfail+=1;
220     return false;
221   }
222 
223   if(data[r]->seqid != data[r+n]->seqid){
224     *nfail+=1;
225     return false;
226   }
227 
228   for(int i = r; i < r+n; i++){
229     load.push_back(data[i]->score);
230   }
231   return true;
232 }
233 
234 
235 //------------------------------- SUBROUTINE --------------------------------
236 
mean(vector<double> & data)237 double mean(vector<double> & data){
238 
239   double sum = 0;
240 
241   for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
242     sum += (*it);
243   }
244   return sum / data.size();
245 }
246 
247 
248 
249 //------------------------------- SUBROUTINE --------------------------------
250 /*
251  Function input  : score, n, data
252 
253  Function does   : permutes the score.  requires that the window is contiguous
254 
255  Function returns: NA
256 
257 */
258 
permute(double s,int n,vector<score * > & data,double * nRep,double * nSuc,double * ePv)259 bool permute(double s, int n, vector<score *> & data,
260 	     double * nRep, double *nSuc, double * ePv){
261 
262 
263   *ePv = 1 / globalOpts.npermutation;
264 
265   while(*nSuc < globalOpts.nsuc && *nRep < globalOpts.npermutation ){
266     *nRep += 1;
267 
268 
269     std::vector<double> scores;
270 
271     bool getWindow = false;
272 
273     int nfail = 0;
274 
275     while(!getWindow){
276       if(nfail > globalOpts.npermutation){
277 	return false;
278       }
279       getWindow = getContiguousWindow(data, scores, n, &nfail);
280     }
281 
282     double ns = mean(scores);
283 
284     if(ns > s){
285       *nSuc += 1;
286     }
287   }
288   if(*nSuc > 0){
289     *ePv = *nSuc / *nRep;
290   }
291 
292   return true;
293 }
294 
295 //-------------------------------    MAIN     --------------------------------
296 /*
297  Comments:
298 */
299 
main(int argc,char ** argv)300 int main( int argc, char** argv)
301 {
302 srand (time(NULL));
303 
304  FORMATS["swcFst"]   = 1;
305  FORMATS["segwcFst"] = 1;
306 
307  globalOpts.threads = 1;
308 
309 int parse = parseOpts(argc, argv);
310 
311  #if defined HAS_OPENMP
312  omp_set_num_threads(globalOpts.threads);
313 #endif
314 
315  if(globalOpts.file.compare("NA") == 0){
316    cerr << "FATAL: no file was provided" << endl;
317    printHelp();
318    exit(1);
319  }
320  if(globalOpts.format.empty()){
321    std::cerr << "FATAL: no format specified." << std::endl;
322    exit(1);
323  }
324 
325  vector< score *> data;
326 
327  ifstream wcDat (globalOpts.file.c_str());
328 
329  string line;
330 
331  if(wcDat.is_open()){
332 
333    while(getline(wcDat, line)){
334      vector<string> region = split(line, "\t");
335      // will change for other output
336      double value = atof(region[4].c_str());
337 
338      if(globalOpts.format == "swcFst" || globalOpts.format == "segwcFst"){
339 
340        if(region.size() != 5){
341 	 cerr << "FATAL: wrong number of columns in wcFst input" << endl;
342 	 exit(1);
343        }
344        if(value < 0 ){
345 	 value = 0;
346        }
347      }
348 
349      score * sp;
350      sp = new score;
351      sp->seqid = region[0]              ;
352      sp->pos   = atoi(region[1].c_str());
353      sp->score = value                  ;
354 
355      data.push_back(sp);
356    }
357  }
358  else{
359    cerr << "FATAL: could not open file: " << globalOpts.file << endl;
360    exit(1);
361  }
362 
363 
364  wcDat.close();
365  line.clear();
366 
367  cerr << "INFO: raw values to permute against: " << data.size() << endl;
368 
369  ifstream smoothedFile (globalOpts.smoothed.c_str());
370 
371  vector<smoothedInputData*> sData;
372 
373  if(smoothedFile.is_open()){
374    while(getline(smoothedFile, line)){
375 
376      vector<string> region = split(line, "\t");
377      smoothedInputData * sp = new smoothedInputData;
378 
379      sp->line = line;
380      sp->score = atof(region[globalOpts.valueIndex].c_str());
381      sp->n     = atoi(region[globalOpts.nIndex].c_str());
382      sp->nPer = 0;
383      sp->nSuc = 0;
384      sp->ePv  = 0;
385 
386      sData.push_back(sp);
387    }
388  }
389  smoothedFile.close();
390 
391  cerr << "INFO: Number of smoothed windows to permute : " << sData.size() << endl;
392 
393 #if defined HAS_OPENMP
394 #pragma omp parallel for schedule(dynamic, 20)
395 #endif
396 
397  for(int i = 0; i < sData.size(); i++){
398    bool per = permute(sData[i]->score, sData[i]->n,
399 	   data, &sData[i]->nPer,
400 	   &sData[i]->nSuc, &sData[i]->ePv);
401 
402 
403    if(per){
404 #if defined HAS_OPENMP
405      omp_set_lock(&lock);
406 #endif
407    cout << sData[i]->line
408 	<< "\t" << sData[i]->nSuc
409 	<< "\t" << sData[i]->nPer
410 	<< "\t" << sData[i]->ePv << endl;
411    #if defined HAS_OPENMP
412    omp_unset_lock(&lock);
413    #endif
414    }
415    else{
416      #if defined HAS_OPENMP
417      omp_set_lock(&lock);
418      #endif
419      cout << sData[i]->line
420 	  << "\t" << "NA"
421 	  << "\t" << "NA"
422 	  << "\t" << "NA" << endl;
423      #if defined HAS_OPENMP
424      omp_unset_lock(&lock);
425      #endif
426    }
427  }
428 
429  for(vector<score*>::iterator itz = data.begin();
430      itz != data.end(); itz++){
431    delete *itz;
432  }
433  for(vector<smoothedInputData*>::iterator itz = sData.begin();
434      itz != sData.end(); itz++){
435    delete *itz;
436  }
437 
438 
439 return 0;
440 }
441