1 /*
2 
3 This program was created at:  Fri Apr 17 14:59:53 2015
4 This program was created by:  Zev N. Kronenberg
5 
6 
7 Contact: zev.kronenber@gmail.com
8 
9 Organization: Unviersity of Utah
10     School of Medicine
11     Salt Lake City, Utah
12 
13 
14 The MIT License (MIT)
15 
16 Copyright (c) <2015> <Zev N. Kronenberg>
17 
18 Permission is hereby granted, free of charge, to any person obtaining a copy
19 of this software and associated documentation files (the "Software"), to deal
20 in the Software without restriction, including without limitation the rights
21 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
22 copies of the Software, and to permit persons to whom the Software is
23 furnished to do so, subject to the following conditions:
24 
25 The above copyright notice and this permission notice shall be included in
26 all copies or substantial portions of the Software.
27 
28 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
33 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
34 THE SOFTWARE.
35 
36 
37 */
38 #include <fstream>
39 #include "split.h"
40 #include <vector>
41 #include <map>
42 #include <string>
43 #include <iostream>
44 #include <math.h>
45 #include <cmath>
46 #include <stdlib.h>
47 #include <time.h>
48 #include <stdio.h>
49 #include <unistd.h>
50 #include <stdlib.h>
51 #include "gpatInfo.hpp"
52 
53 #include <omp.h>
54 // print lock
55 omp_lock_t lock;
56 
57 struct options{
58   std::string file    ;
59   std::string smoothed;
60   std::string format  ;
61   double npermutation ;
62   double nsuc         ;
63   int threads         ;
64   int chrIndex        ;
65   int nIndex          ;
66   int valueIndex      ;
67 }globalOpts;
68 
69 struct score{
70   std::string seqid;
71   long int pos     ;
72   double score     ;
73 };
74 
75 struct smoothedInputData{
76   std::string line;
77   double score ;
78   double n     ;
79   double nPer  ;
80   double nSuc  ;
81   double ePv   ;
82 };
83 
84 static const char *optString = "x:y:u:f:n:s:";
85 
86 using namespace std;
87 
88 std::map<std::string, int> FORMATS;
89 
90 //------------------------------- SUBROUTINE --------------------------------
91 /*
92  Function input  : NA
93 
94  Function does   : prints help
95 
96  Function returns: NA
97 
98 */
printHelp()99 void printHelp()
100 {
101 
102   cerr << endl << endl;
103   cerr << "INFO: help" << endl;
104   cerr << "INFO: description:" << endl;
105   cerr << "     permuteSmoothFst is a method for adding empirical p-values  smoothed wcFst scores." << endl ;
106   cerr << endl;
107   cerr << "INFO: usage:  permuteSmoothFst -s wcFst.smooth.txt -f wcFst.txt -n 5 -s 1 "<< endl;
108   cerr << endl;
109   cerr << "Required:" << endl;
110   cerr << "      file:     f   -- argument: original wcFst data     "<< endl;
111   cerr << "      smoothed: s   -- argument: smoothed wcFst data     "<< endl;
112   cerr << "      format:   y   -- argument: [swcFst, segwcFst]      "<< endl;
113   cerr << "Optional:" << endl;
114   cerr << "      number:   n   -- argument: the number of permutations to run for each value [1000]" << endl;
115   cerr << "      success:  u   -- argument: stop permutations after \'s\' successes [1]"             << endl;
116   cerr << "      success:  x   -- argument: number of threads [1]"             << endl;
117   cerr << endl;
118   cerr << "OUTPUT: permuteSmoothFst will append three additional columns:" << endl;
119   cerr << "        1. The number of successes                            " << endl;
120   cerr << "        2. The number of trials                               " << endl;
121   cerr << "        3. The empirical p-value                              " << endl;
122   cerr << endl;
123   printVersion();
124 
125 }
126 
127 
128 //-------------------------------   OPTIONS   --------------------------------
parseOpts(int argc,char ** argv)129 int parseOpts(int argc, char** argv)
130 {
131   int opt = 0;
132   globalOpts.file = "NA";
133 
134   globalOpts.nsuc         = 1;
135   globalOpts.npermutation = 1000;
136 
137   opt = getopt(argc, argv, optString);
138   while(opt != -1){
139     switch(opt){
140     case 'x':
141       {
142 	globalOpts.threads = atoi(optarg);
143       }
144     case 'f':
145       {
146 	globalOpts.file =  optarg;
147 	break;
148       }
149     case 'y':
150       {
151 	globalOpts.format = optarg;
152 	if(FORMATS.find(globalOpts.format) == FORMATS.end()){
153 	  std::cerr << "FATAL: format not supported: " << globalOpts.format;
154 	  std::cerr << endl;
155 	  printHelp();
156 	  exit(1);
157 	}
158 	if(globalOpts.format == "swcFst"){
159 	  globalOpts.chrIndex   = 0;
160 	  globalOpts.nIndex     = 3;
161 	  globalOpts.valueIndex = 4;
162 	}
163 	if(globalOpts.format == "segwcFst"){
164 	  globalOpts.chrIndex   = 0;
165 	  globalOpts.valueIndex = 3;
166 	  globalOpts.nIndex     = 5;
167 	}
168 	break;
169       }
170     case 'n':
171       {
172 	globalOpts.npermutation = atof(((string)optarg).c_str());
173 	cerr << "INFO: permuteGPAT++ will do N permutations: " << globalOpts.npermutation << endl;
174 	break;
175       }
176     case 's':
177       {
178 	globalOpts.smoothed = optarg;
179 	cerr << "INFO: smoothed file: " << globalOpts.smoothed << endl;
180 	break;
181       }
182     case 'u':
183       {
184 	globalOpts.nsuc = atof(((string)optarg).c_str());
185 	cerr << "INFO: permuteGPAT++ will stop permutations after N successes: " << globalOpts.nsuc << endl;
186 	break;
187 	}
188     case '?':
189       {
190 	break;
191       }
192     }
193 
194     opt = getopt( argc, argv, optString );
195   }
196   return 1;
197 }
198 
199 
200 //------------------------------- SUBROUTINE --------------------------------
201 /*
202  Function input  : data, vector to load
203 
204  Function does   : returns a contguous window
205 
206  Function returns: bool
207 
208 */
209 
getContiguousWindow(vector<score * > & data,vector<double> & load,int n)210 bool getContiguousWindow(vector<score *> & data, vector<double> & load, int n){
211   int r = rand() % data.size();
212 
213   if(r+n >= data.size()){
214     return false;
215   }
216 
217   if(data[r]->seqid != data[r+n]->seqid){
218     return false;
219   }
220 
221   for(int i = r; i < r+n; i++){
222     load.push_back(data[i]->score);
223   }
224   return true;
225 }
226 
227 
228 //------------------------------- SUBROUTINE --------------------------------
229 
mean(vector<double> & data)230 double mean(vector<double> & data){
231 
232   double sum = 0;
233 
234   for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
235     sum += (*it);
236   }
237   return sum / data.size();
238 }
239 
240 
241 
242 //------------------------------- SUBROUTINE --------------------------------
243 /*
244  Function input  : score, n, data
245 
246  Function does   : permutes the score.  requires that the window is contiguous
247 
248  Function returns: NA
249 
250 */
251 
permute(double s,int n,vector<score * > & data,double * nRep,double * nSuc,double * ePv)252 void permute(double s, int n, vector<score *> & data,
253 	     double * nRep, double *nSuc, double * ePv){
254 
255 
256   *ePv = 1 / globalOpts.npermutation;
257 
258   while(*nSuc < globalOpts.nsuc && *nRep < globalOpts.npermutation ){
259     *nRep += 1;
260 
261 
262     std::vector<double> scores;
263 
264     bool getWindow = false;
265     while(!getWindow){
266       getWindow = getContiguousWindow(data, scores, n);
267     }
268 
269     double ns = mean(scores);
270 
271     if(ns > s){
272       *nSuc += 1;
273     }
274   }
275   if(*nSuc > 0){
276     *ePv = *nSuc / *nRep;
277   }
278 }
279 
280 //-------------------------------    MAIN     --------------------------------
281 /*
282  Comments:
283 */
284 
main(int argc,char ** argv)285 int main( int argc, char** argv)
286 {
287 srand (time(NULL));
288 
289  FORMATS["swcFst"]   = 1;
290  FORMATS["segwcFst"] = 1;
291 
292  globalOpts.threads = 1;
293 
294 int parse = parseOpts(argc, argv);
295 
296  omp_set_num_threads(globalOpts.threads);
297 
298 
299  if(globalOpts.file.compare("NA") == 0){
300    cerr << "FATAL: no file was provided" << endl;
301    printHelp();
302    exit(1);
303  }
304  if(globalOpts.format.empty()){
305    std::cerr << "FATAL: no format specified." << std::endl;
306    exit(1);
307  }
308 
309  vector< score *> data;
310 
311  ifstream wcDat (globalOpts.file.c_str());
312 
313  string line;
314 
315  if(wcDat.is_open()){
316 
317    while(getline(wcDat, line)){
318      vector<string> region = split(line, "\t");
319      // will change for other output
320      double value = atof(region[4].c_str());
321 
322      if(globalOpts.format == "swcFst" || globalOpts.format == "segwcFst"){
323 
324        if(region.size() != 5){
325 	 cerr << "FATAL: wrong number of columns in wcFst input" << endl;
326 	 exit(1);
327        }
328        if(value < 0 ){
329 	 value = 0;
330        }
331      }
332 
333      score * sp;
334      sp = new score;
335      sp->seqid = region[0]              ;
336      sp->pos   = atoi(region[1].c_str());
337      sp->score = value                  ;
338 
339      data.push_back(sp);
340    }
341  }
342  else{
343    cerr << "FATAL: could not open file: " << globalOpts.file << endl;
344    exit(1);
345  }
346 
347 
348  wcDat.close();
349  line.clear();
350 
351  cerr << "INFO: raw values to permute against: " << data.size() << endl;
352 
353  ifstream smoothedFile (globalOpts.smoothed.c_str());
354 
355  vector<smoothedInputData*> sData;
356 
357  if(smoothedFile.is_open()){
358    while(getline(smoothedFile, line)){
359 
360      vector<string> region = split(line, "\t");
361      smoothedInputData * sp = new smoothedInputData;
362 
363      sp->line = line;
364      sp->score = atof(region[globalOpts.valueIndex].c_str());
365      sp->n     = atoi(region[globalOpts.nIndex].c_str());
366      sp->nPer = 0;
367      sp->nSuc = 0;
368      sp->ePv  = 0;
369 
370      sData.push_back(sp);
371    }
372  }
373  smoothedFile.close();
374 
375  cerr << "INFO: Number of smoothed windows to permute : " << sData.size() << endl;
376 
377 #pragma omp parallel for schedule(dynamic, 20)
378  for(int i = 0; i < sData.size(); i++){
379    permute(sData[i]->score, sData[i]->n,
380 	   data, &sData[i]->nPer,
381 	   &sData[i]->nSuc, &sData[i]->ePv);
382 
383    omp_set_lock(&lock);
384    cout << sData[i]->line
385 	<< "\t" << sData[i]->nSuc
386 	<< "\t" << sData[i]->nPer
387 	<< "\t" << sData[i]->ePv << endl;
388    omp_unset_lock(&lock);
389  }
390 
391  for(vector<score*>::iterator itz = data.begin();
392      itz != data.end(); itz++){
393    delete *itz;
394  }
395  for(vector<smoothedInputData*>::iterator itz = sData.begin();
396      itz != sData.end(); itz++){
397    delete *itz;
398  }
399 
400 
401 return 0;
402 }
403