1 /*
2
3 This program was created at: Fri Apr 17 14:59:53 2015
4 This program was created by: Zev N. Kronenberg
5
6
7 Contact: zev.kronenber@gmail.com
8
9 Organization: Unviersity of Utah
10 School of Medicine
11 Salt Lake City, Utah
12
13
14 The MIT License (MIT)
15
16 Copyright (c) <2015> <Zev N. Kronenberg>
17
18 Permission is hereby granted, free of charge, to any person obtaining a copy
19 of this software and associated documentation files (the "Software"), to deal
20 in the Software without restriction, including without limitation the rights
21 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
22 copies of the Software, and to permit persons to whom the Software is
23 furnished to do so, subject to the following conditions:
24
25 The above copyright notice and this permission notice shall be included in
26 all copies or substantial portions of the Software.
27
28 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
33 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
34 THE SOFTWARE.
35
36
37 */
38 #include <fstream>
39 #include "split.h"
40 #include <vector>
41 #include <map>
42 #include <string>
43 #include <iostream>
44 #include <math.h>
45 #include <cmath>
46 #include <stdlib.h>
47 #include <time.h>
48 #include <stdio.h>
49 #include <unistd.h>
50 #include <stdlib.h>
51 #include "gpatInfo.hpp"
52
53 #if defined HAS_OPENMP
54 #include <omp.h>
55 // print lock
56 omp_lock_t lock;
57 #endif
58
59 struct options{
60 std::string file ;
61 std::string smoothed;
62 std::string format ;
63 double npermutation ;
64 double nsuc ;
65 int threads ;
66 int chrIndex ;
67 int nIndex ;
68 int valueIndex ;
69 }globalOpts;
70
71 struct score{
72 std::string seqid;
73 long int pos ;
74 double score ;
75 };
76
77 struct smoothedInputData{
78 std::string line;
79 double score ;
80 double n ;
81 double nPer ;
82 double nSuc ;
83 double ePv ;
84 };
85
86 static const char *optString = "x:y:u:f:n:s:";
87
88 using namespace std;
89
90 std::map<std::string, int> FORMATS;
91
92 //------------------------------- SUBROUTINE --------------------------------
93 /*
94 Function input : NA
95
96 Function does : prints help
97
98 Function returns: NA
99
100 */
printHelp()101 void printHelp()
102 {
103
104 cerr << endl << endl;
105 cerr << "INFO: help" << endl;
106 cerr << "INFO: description:" << endl;
107 cerr << " permuteSmooth is a method for adding empirical p-values smoothed wcFst scores." << endl ;
108 cerr << endl;
109 cerr << "INFO: usage: permuteSmooth -s wcFst.smooth.txt -f wcFst.txt -n 5 -s 1 "<< endl;
110 cerr << endl;
111 cerr << "Required:" << endl;
112 cerr << " file: f -- argument: original wcFst data "<< endl;
113 cerr << " smoothed: s -- argument: smoothed wcFst data "<< endl;
114 cerr << " format: y -- argument: [swcFst, segwcFst] "<< endl;
115 cerr << "Optional:" << endl;
116 cerr << " number: n -- argument: the number of permutations to run for each value [1000]" << endl;
117 cerr << " success: u -- argument: stop permutations after \'s\' successes [1]" << endl;
118 cerr << " success: x -- argument: number of threads [1]" << endl;
119 cerr << endl;
120 cerr << "OUTPUT: permuteSmooth will append three additional columns:" << endl;
121 cerr << " 1. The number of successes " << endl;
122 cerr << " 2. The number of trials " << endl;
123 cerr << " 3. The empirical p-value " << endl;
124 cerr << endl;
125 printVersion();
126
127 }
128
129
130 //------------------------------- OPTIONS --------------------------------
parseOpts(int argc,char ** argv)131 int parseOpts(int argc, char** argv)
132 {
133 int opt = 0;
134 globalOpts.file = "NA";
135
136 globalOpts.nsuc = 1;
137 globalOpts.npermutation = 1000;
138
139 opt = getopt(argc, argv, optString);
140 while(opt != -1){
141 switch(opt){
142 case 'x':
143 {
144 globalOpts.threads = atoi(optarg);
145 break;
146 }
147 case 'f':
148 {
149 globalOpts.file = optarg;
150 break;
151 }
152 case 'y':
153 {
154 globalOpts.format = optarg;
155 if(FORMATS.find(globalOpts.format) == FORMATS.end()){
156 std::cerr << "FATAL: format not supported: " << globalOpts.format;
157 std::cerr << endl;
158 printHelp();
159 exit(1);
160 }
161 if(globalOpts.format == "swcFst"){
162 globalOpts.chrIndex = 0;
163 globalOpts.nIndex = 3;
164 globalOpts.valueIndex = 4;
165 }
166 if(globalOpts.format == "segwcFst"){
167 globalOpts.chrIndex = 0;
168 globalOpts.valueIndex = 3;
169 globalOpts.nIndex = 5;
170 }
171 break;
172 }
173 case 'n':
174 {
175 globalOpts.npermutation = atof(((string)optarg).c_str());
176 cerr << "INFO: permuteGPAT++ will do N permutations: " << globalOpts.npermutation << endl;
177 break;
178 }
179 case 's':
180 {
181 globalOpts.smoothed = optarg;
182 cerr << "INFO: smoothed file: " << globalOpts.smoothed << endl;
183 break;
184 }
185 case 'u':
186 {
187 globalOpts.nsuc = atof(((string)optarg).c_str());
188 cerr << "INFO: permuteGPAT++ will stop permutations after N successes: " << globalOpts.nsuc << endl;
189 break;
190 }
191 case '?':
192 {
193 break;
194 }
195 }
196
197 opt = getopt( argc, argv, optString );
198 }
199 return 1;
200 }
201
202
203 //------------------------------- SUBROUTINE --------------------------------
204 /*
205 Function input : data, vector to load
206
207 Function does : returns a contguous window
208
209 Function returns: bool
210
211 */
212
getContiguousWindow(vector<score * > & data,vector<double> & load,int n,int * nfail)213 bool getContiguousWindow(vector<score *> & data,
214 vector<double> & load,
215 int n, int * nfail){
216 int r = rand() % data.size();
217
218 if(r+n >= data.size()){
219 *nfail+=1;
220 return false;
221 }
222
223 if(data[r]->seqid != data[r+n]->seqid){
224 *nfail+=1;
225 return false;
226 }
227
228 for(int i = r; i < r+n; i++){
229 load.push_back(data[i]->score);
230 }
231 return true;
232 }
233
234
235 //------------------------------- SUBROUTINE --------------------------------
236
mean(vector<double> & data)237 double mean(vector<double> & data){
238
239 double sum = 0;
240
241 for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
242 sum += (*it);
243 }
244 return sum / data.size();
245 }
246
247
248
249 //------------------------------- SUBROUTINE --------------------------------
250 /*
251 Function input : score, n, data
252
253 Function does : permutes the score. requires that the window is contiguous
254
255 Function returns: NA
256
257 */
258
permute(double s,int n,vector<score * > & data,double * nRep,double * nSuc,double * ePv)259 bool permute(double s, int n, vector<score *> & data,
260 double * nRep, double *nSuc, double * ePv){
261
262
263 *ePv = 1 / globalOpts.npermutation;
264
265 while(*nSuc < globalOpts.nsuc && *nRep < globalOpts.npermutation ){
266 *nRep += 1;
267
268
269 std::vector<double> scores;
270
271 bool getWindow = false;
272
273 int nfail = 0;
274
275 while(!getWindow){
276 if(nfail > globalOpts.npermutation){
277 return false;
278 }
279 getWindow = getContiguousWindow(data, scores, n, &nfail);
280 }
281
282 double ns = mean(scores);
283
284 if(ns > s){
285 *nSuc += 1;
286 }
287 }
288 if(*nSuc > 0){
289 *ePv = *nSuc / *nRep;
290 }
291
292 return true;
293 }
294
295 //------------------------------- MAIN --------------------------------
296 /*
297 Comments:
298 */
299
main(int argc,char ** argv)300 int main( int argc, char** argv)
301 {
302 srand (time(NULL));
303
304 FORMATS["swcFst"] = 1;
305 FORMATS["segwcFst"] = 1;
306
307 globalOpts.threads = 1;
308
309 int parse = parseOpts(argc, argv);
310
311 #if defined HAS_OPENMP
312 omp_set_num_threads(globalOpts.threads);
313 #endif
314
315 if(globalOpts.file.compare("NA") == 0){
316 cerr << "FATAL: no file was provided" << endl;
317 printHelp();
318 exit(1);
319 }
320 if(globalOpts.format.empty()){
321 std::cerr << "FATAL: no format specified." << std::endl;
322 exit(1);
323 }
324
325 vector< score *> data;
326
327 ifstream wcDat (globalOpts.file.c_str());
328
329 string line;
330
331 if(wcDat.is_open()){
332
333 while(getline(wcDat, line)){
334 vector<string> region = split(line, "\t");
335 // will change for other output
336 double value = atof(region[4].c_str());
337
338 if(globalOpts.format == "swcFst" || globalOpts.format == "segwcFst"){
339
340 if(region.size() != 5){
341 cerr << "FATAL: wrong number of columns in wcFst input" << endl;
342 exit(1);
343 }
344 if(value < 0 ){
345 value = 0;
346 }
347 }
348
349 score * sp;
350 sp = new score;
351 sp->seqid = region[0] ;
352 sp->pos = atoi(region[1].c_str());
353 sp->score = value ;
354
355 data.push_back(sp);
356 }
357 }
358 else{
359 cerr << "FATAL: could not open file: " << globalOpts.file << endl;
360 exit(1);
361 }
362
363
364 wcDat.close();
365 line.clear();
366
367 cerr << "INFO: raw values to permute against: " << data.size() << endl;
368
369 ifstream smoothedFile (globalOpts.smoothed.c_str());
370
371 vector<smoothedInputData*> sData;
372
373 if(smoothedFile.is_open()){
374 while(getline(smoothedFile, line)){
375
376 vector<string> region = split(line, "\t");
377 smoothedInputData * sp = new smoothedInputData;
378
379 sp->line = line;
380 sp->score = atof(region[globalOpts.valueIndex].c_str());
381 sp->n = atoi(region[globalOpts.nIndex].c_str());
382 sp->nPer = 0;
383 sp->nSuc = 0;
384 sp->ePv = 0;
385
386 sData.push_back(sp);
387 }
388 }
389 smoothedFile.close();
390
391 cerr << "INFO: Number of smoothed windows to permute : " << sData.size() << endl;
392
393 #if defined HAS_OPENMP
394 #pragma omp parallel for schedule(dynamic, 20)
395 #endif
396
397 for(int i = 0; i < sData.size(); i++){
398 bool per = permute(sData[i]->score, sData[i]->n,
399 data, &sData[i]->nPer,
400 &sData[i]->nSuc, &sData[i]->ePv);
401
402
403 if(per){
404 #if defined HAS_OPENMP
405 omp_set_lock(&lock);
406 #endif
407 cout << sData[i]->line
408 << "\t" << sData[i]->nSuc
409 << "\t" << sData[i]->nPer
410 << "\t" << sData[i]->ePv << endl;
411 #if defined HAS_OPENMP
412 omp_unset_lock(&lock);
413 #endif
414 }
415 else{
416 #if defined HAS_OPENMP
417 omp_set_lock(&lock);
418 #endif
419 cout << sData[i]->line
420 << "\t" << "NA"
421 << "\t" << "NA"
422 << "\t" << "NA" << endl;
423 #if defined HAS_OPENMP
424 omp_unset_lock(&lock);
425 #endif
426 }
427 }
428
429 for(vector<score*>::iterator itz = data.begin();
430 itz != data.end(); itz++){
431 delete *itz;
432 }
433 for(vector<smoothedInputData*>::iterator itz = sData.begin();
434 itz != sData.end(); itz++){
435 delete *itz;
436 }
437
438
439 return 0;
440 }
441