1 /*
2
3 This program was created at: Fri Apr 17 14:59:53 2015
4 This program was created by: Zev N. Kronenberg
5
6
7 Contact: zev.kronenber@gmail.com
8
9 Organization: Unviersity of Utah
10 School of Medicine
11 Salt Lake City, Utah
12
13
14 The MIT License (MIT)
15
16 Copyright (c) <2015> <Zev N. Kronenberg>
17
18 Permission is hereby granted, free of charge, to any person obtaining a copy
19 of this software and associated documentation files (the "Software"), to deal
20 in the Software without restriction, including without limitation the rights
21 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
22 copies of the Software, and to permit persons to whom the Software is
23 furnished to do so, subject to the following conditions:
24
25 The above copyright notice and this permission notice shall be included in
26 all copies or substantial portions of the Software.
27
28 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
33 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
34 THE SOFTWARE.
35
36
37 */
38 #include <fstream>
39 #include "split.h"
40 #include <vector>
41 #include <map>
42 #include <string>
43 #include <iostream>
44 #include <math.h>
45 #include <cmath>
46 #include <stdlib.h>
47 #include <time.h>
48 #include <stdio.h>
49 #include <unistd.h>
50 #include <stdlib.h>
51 #include "gpatInfo.hpp"
52
53 #include <omp.h>
54 // print lock
55 omp_lock_t lock;
56
57 struct options{
58 std::string file ;
59 std::string smoothed;
60 std::string format ;
61 double npermutation ;
62 double nsuc ;
63 int threads ;
64 int chrIndex ;
65 int nIndex ;
66 int valueIndex ;
67 }globalOpts;
68
69 struct score{
70 std::string seqid;
71 long int pos ;
72 double score ;
73 };
74
75 struct smoothedInputData{
76 std::string line;
77 double score ;
78 double n ;
79 double nPer ;
80 double nSuc ;
81 double ePv ;
82 };
83
84 static const char *optString = "x:y:u:f:n:s:";
85
86 using namespace std;
87
88 std::map<std::string, int> FORMATS;
89
90 //------------------------------- SUBROUTINE --------------------------------
91 /*
92 Function input : NA
93
94 Function does : prints help
95
96 Function returns: NA
97
98 */
printHelp()99 void printHelp()
100 {
101
102 cerr << endl << endl;
103 cerr << "INFO: help" << endl;
104 cerr << "INFO: description:" << endl;
105 cerr << " permuteSmoothFst is a method for adding empirical p-values smoothed wcFst scores." << endl ;
106 cerr << endl;
107 cerr << "INFO: usage: permuteSmoothFst -s wcFst.smooth.txt -f wcFst.txt -n 5 -s 1 "<< endl;
108 cerr << endl;
109 cerr << "Required:" << endl;
110 cerr << " file: f -- argument: original wcFst data "<< endl;
111 cerr << " smoothed: s -- argument: smoothed wcFst data "<< endl;
112 cerr << " format: y -- argument: [swcFst, segwcFst] "<< endl;
113 cerr << "Optional:" << endl;
114 cerr << " number: n -- argument: the number of permutations to run for each value [1000]" << endl;
115 cerr << " success: u -- argument: stop permutations after \'s\' successes [1]" << endl;
116 cerr << " success: x -- argument: number of threads [1]" << endl;
117 cerr << endl;
118 cerr << "OUTPUT: permuteSmoothFst will append three additional columns:" << endl;
119 cerr << " 1. The number of successes " << endl;
120 cerr << " 2. The number of trials " << endl;
121 cerr << " 3. The empirical p-value " << endl;
122 cerr << endl;
123 printVersion();
124
125 }
126
127
128 //------------------------------- OPTIONS --------------------------------
parseOpts(int argc,char ** argv)129 int parseOpts(int argc, char** argv)
130 {
131 int opt = 0;
132 globalOpts.file = "NA";
133
134 globalOpts.nsuc = 1;
135 globalOpts.npermutation = 1000;
136
137 opt = getopt(argc, argv, optString);
138 while(opt != -1){
139 switch(opt){
140 case 'x':
141 {
142 globalOpts.threads = atoi(optarg);
143 }
144 case 'f':
145 {
146 globalOpts.file = optarg;
147 break;
148 }
149 case 'y':
150 {
151 globalOpts.format = optarg;
152 if(FORMATS.find(globalOpts.format) == FORMATS.end()){
153 std::cerr << "FATAL: format not supported: " << globalOpts.format;
154 std::cerr << endl;
155 printHelp();
156 exit(1);
157 }
158 if(globalOpts.format == "swcFst"){
159 globalOpts.chrIndex = 0;
160 globalOpts.nIndex = 3;
161 globalOpts.valueIndex = 4;
162 }
163 if(globalOpts.format == "segwcFst"){
164 globalOpts.chrIndex = 0;
165 globalOpts.valueIndex = 3;
166 globalOpts.nIndex = 5;
167 }
168 break;
169 }
170 case 'n':
171 {
172 globalOpts.npermutation = atof(((string)optarg).c_str());
173 cerr << "INFO: permuteGPAT++ will do N permutations: " << globalOpts.npermutation << endl;
174 break;
175 }
176 case 's':
177 {
178 globalOpts.smoothed = optarg;
179 cerr << "INFO: smoothed file: " << globalOpts.smoothed << endl;
180 break;
181 }
182 case 'u':
183 {
184 globalOpts.nsuc = atof(((string)optarg).c_str());
185 cerr << "INFO: permuteGPAT++ will stop permutations after N successes: " << globalOpts.nsuc << endl;
186 break;
187 }
188 case '?':
189 {
190 break;
191 }
192 }
193
194 opt = getopt( argc, argv, optString );
195 }
196 return 1;
197 }
198
199
200 //------------------------------- SUBROUTINE --------------------------------
201 /*
202 Function input : data, vector to load
203
204 Function does : returns a contguous window
205
206 Function returns: bool
207
208 */
209
getContiguousWindow(vector<score * > & data,vector<double> & load,int n)210 bool getContiguousWindow(vector<score *> & data, vector<double> & load, int n){
211 int r = rand() % data.size();
212
213 if(r+n >= data.size()){
214 return false;
215 }
216
217 if(data[r]->seqid != data[r+n]->seqid){
218 return false;
219 }
220
221 for(int i = r; i < r+n; i++){
222 load.push_back(data[i]->score);
223 }
224 return true;
225 }
226
227
228 //------------------------------- SUBROUTINE --------------------------------
229
mean(vector<double> & data)230 double mean(vector<double> & data){
231
232 double sum = 0;
233
234 for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
235 sum += (*it);
236 }
237 return sum / data.size();
238 }
239
240
241
242 //------------------------------- SUBROUTINE --------------------------------
243 /*
244 Function input : score, n, data
245
246 Function does : permutes the score. requires that the window is contiguous
247
248 Function returns: NA
249
250 */
251
permute(double s,int n,vector<score * > & data,double * nRep,double * nSuc,double * ePv)252 void permute(double s, int n, vector<score *> & data,
253 double * nRep, double *nSuc, double * ePv){
254
255
256 *ePv = 1 / globalOpts.npermutation;
257
258 while(*nSuc < globalOpts.nsuc && *nRep < globalOpts.npermutation ){
259 *nRep += 1;
260
261
262 std::vector<double> scores;
263
264 bool getWindow = false;
265 while(!getWindow){
266 getWindow = getContiguousWindow(data, scores, n);
267 }
268
269 double ns = mean(scores);
270
271 if(ns > s){
272 *nSuc += 1;
273 }
274 }
275 if(*nSuc > 0){
276 *ePv = *nSuc / *nRep;
277 }
278 }
279
280 //------------------------------- MAIN --------------------------------
281 /*
282 Comments:
283 */
284
main(int argc,char ** argv)285 int main( int argc, char** argv)
286 {
287 srand (time(NULL));
288
289 FORMATS["swcFst"] = 1;
290 FORMATS["segwcFst"] = 1;
291
292 globalOpts.threads = 1;
293
294 int parse = parseOpts(argc, argv);
295
296 omp_set_num_threads(globalOpts.threads);
297
298
299 if(globalOpts.file.compare("NA") == 0){
300 cerr << "FATAL: no file was provided" << endl;
301 printHelp();
302 exit(1);
303 }
304 if(globalOpts.format.empty()){
305 std::cerr << "FATAL: no format specified." << std::endl;
306 exit(1);
307 }
308
309 vector< score *> data;
310
311 ifstream wcDat (globalOpts.file.c_str());
312
313 string line;
314
315 if(wcDat.is_open()){
316
317 while(getline(wcDat, line)){
318 vector<string> region = split(line, "\t");
319 // will change for other output
320 double value = atof(region[4].c_str());
321
322 if(globalOpts.format == "swcFst" || globalOpts.format == "segwcFst"){
323
324 if(region.size() != 5){
325 cerr << "FATAL: wrong number of columns in wcFst input" << endl;
326 exit(1);
327 }
328 if(value < 0 ){
329 value = 0;
330 }
331 }
332
333 score * sp;
334 sp = new score;
335 sp->seqid = region[0] ;
336 sp->pos = atoi(region[1].c_str());
337 sp->score = value ;
338
339 data.push_back(sp);
340 }
341 }
342 else{
343 cerr << "FATAL: could not open file: " << globalOpts.file << endl;
344 exit(1);
345 }
346
347
348 wcDat.close();
349 line.clear();
350
351 cerr << "INFO: raw values to permute against: " << data.size() << endl;
352
353 ifstream smoothedFile (globalOpts.smoothed.c_str());
354
355 vector<smoothedInputData*> sData;
356
357 if(smoothedFile.is_open()){
358 while(getline(smoothedFile, line)){
359
360 vector<string> region = split(line, "\t");
361 smoothedInputData * sp = new smoothedInputData;
362
363 sp->line = line;
364 sp->score = atof(region[globalOpts.valueIndex].c_str());
365 sp->n = atoi(region[globalOpts.nIndex].c_str());
366 sp->nPer = 0;
367 sp->nSuc = 0;
368 sp->ePv = 0;
369
370 sData.push_back(sp);
371 }
372 }
373 smoothedFile.close();
374
375 cerr << "INFO: Number of smoothed windows to permute : " << sData.size() << endl;
376
377 #pragma omp parallel for schedule(dynamic, 20)
378 for(int i = 0; i < sData.size(); i++){
379 permute(sData[i]->score, sData[i]->n,
380 data, &sData[i]->nPer,
381 &sData[i]->nSuc, &sData[i]->ePv);
382
383 omp_set_lock(&lock);
384 cout << sData[i]->line
385 << "\t" << sData[i]->nSuc
386 << "\t" << sData[i]->nPer
387 << "\t" << sData[i]->ePv << endl;
388 omp_unset_lock(&lock);
389 }
390
391 for(vector<score*>::iterator itz = data.begin();
392 itz != data.end(); itz++){
393 delete *itz;
394 }
395 for(vector<smoothedInputData*>::iterator itz = sData.begin();
396 itz != sData.end(); itz++){
397 delete *itz;
398 }
399
400
401 return 0;
402 }
403