1 /*
2 vcflib C++ library for parsing and manipulating VCF files
3
4 Copyright © 2010-2020 Erik Garrison
5 Copyright © 2015 Zev N. Kronenberg
6 Copyright © 2020 Pjotr Prins
7
8 This software is published under the MIT License. See the LICENSE file.
9 */
10
11 /*
12
13 This program was created at: Fri Apr 17 14:59:53 2015
14 This program was created by: Zev N. Kronenberg
15
16
17 Contact: zev.kronenber@gmail.com
18
19 Organization: Unviersity of Utah
20 School of Medicine
21 Salt Lake City, Utah
22
23
24 The MIT License (MIT)
25
26 Copyright (c) <2015> <Zev N. Kronenberg>
27
28 Permission is hereby granted, free of charge, to any person obtaining a copy
29 of this software and associated documentation files (the "Software"), to deal
30 in the Software without restriction, including without limitation the rights
31 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
32 copies of the Software, and to permit persons to whom the Software is
33 furnished to do so, subject to the following conditions:
34
35 The above copyright notice and this permission notice shall be included in
36 all copies or substantial portions of the Software.
37
38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
44 THE SOFTWARE.
45
46
47 */
48 #include <fstream>
49 #include "split.h"
50 #include <vector>
51 #include <map>
52 #include <string>
53 #include <iostream>
54 #include <math.h>
55 #include <cmath>
56 #include <stdlib.h>
57 #include <time.h>
58 #include <stdio.h>
59 #include <unistd.h>
60 #include <stdlib.h>
61 #include "gpatInfo.hpp"
62
63 #if defined HAS_OPENMP
64 #include <omp.h>
65 // print lock
66 omp_lock_t lock;
67 #endif
68
69 struct options{
70 std::string file ;
71 std::string smoothed;
72 std::string format ;
73 double npermutation ;
74 double nsuc ;
75 int threads ;
76 int chrIndex ;
77 int nIndex ;
78 int valueIndex ;
79 }globalOpts;
80
81 struct score{
82 std::string seqid;
83 long int pos ;
84 double score ;
85 };
86
87 struct smoothedInputData{
88 std::string line;
89 double score ;
90 double n ;
91 double nPer ;
92 double nSuc ;
93 double ePv ;
94 };
95
96 static const char *optString = "x:y:u:f:n:s:";
97
98 using namespace std;
99
100 std::map<std::string, int> FORMATS;
101
102 //------------------------------- SUBROUTINE --------------------------------
103 /*
104 Function input : NA
105
106 Function does : prints help
107
108 Function returns: NA
109
110 */
printHelp()111 void printHelp()
112 {
113
114 cerr << endl << endl;
115 cerr << "INFO: help" << endl;
116 cerr << "INFO: description:" << endl;
117 cerr << " permuteSmooth is a method for adding empirical p-values smoothed wcFst scores." << endl ;
118 cerr << endl;
119 cerr << "INFO: usage: permuteSmooth -s wcFst.smooth.txt -f wcFst.txt -n 5 -s 1 "<< endl;
120 cerr << endl;
121 cerr << "Required:" << endl;
122 cerr << " file: f -- argument: original wcFst data "<< endl;
123 cerr << " smoothed: s -- argument: smoothed wcFst data "<< endl;
124 cerr << " format: y -- argument: [swcFst, segwcFst] "<< endl;
125 cerr << "Optional:" << endl;
126 cerr << " number: n -- argument: the number of permutations to run for each value [1000]" << endl;
127 cerr << " success: u -- argument: stop permutations after \'s\' successes [1]" << endl;
128 cerr << " success: x -- argument: number of threads [1]" << endl;
129 cerr << endl;
130 cerr << "OUTPUT: permuteSmooth will append three additional columns:" << endl;
131 cerr << " 1. The number of successes " << endl;
132 cerr << " 2. The number of trials " << endl;
133 cerr << " 3. The empirical p-value " << endl;
134 cerr << endl;
135 cerr << endl << "Type: statistics" << endl << endl;
136 printVersion();
137
138 }
139
140
141 //------------------------------- OPTIONS --------------------------------
parseOpts(int argc,char ** argv)142 int parseOpts(int argc, char** argv)
143 {
144 int opt = 0;
145 globalOpts.file = "NA";
146
147 globalOpts.nsuc = 1;
148 globalOpts.npermutation = 1000;
149
150 if (argc == 2) {
151 string h_flag = argv[1];
152
153 if (argc == 2 && (h_flag == "-h" || h_flag == "--help")) {
154 printHelp();
155 exit(1);
156 }
157 }
158
159 opt = getopt(argc, argv, optString);
160 while(opt != -1){
161 switch(opt){
162 case 'x':
163 {
164 globalOpts.threads = atoi(optarg);
165 break;
166 }
167 case 'f':
168 {
169 globalOpts.file = optarg;
170 break;
171 }
172 case 'y':
173 {
174 globalOpts.format = optarg;
175 if(FORMATS.find(globalOpts.format) == FORMATS.end()){
176 std::cerr << "FATAL: format not supported: " << globalOpts.format;
177 std::cerr << endl;
178 printHelp();
179 exit(1);
180 }
181 if(globalOpts.format == "swcFst"){
182 globalOpts.chrIndex = 0;
183 globalOpts.nIndex = 3;
184 globalOpts.valueIndex = 4;
185 }
186 if(globalOpts.format == "segwcFst"){
187 globalOpts.chrIndex = 0;
188 globalOpts.valueIndex = 3;
189 globalOpts.nIndex = 5;
190 }
191 break;
192 }
193 case 'n':
194 {
195 globalOpts.npermutation = atof(((string)optarg).c_str());
196 cerr << "INFO: permuteGPAT++ will do N permutations: " << globalOpts.npermutation << endl;
197 break;
198 }
199 case 's':
200 {
201 globalOpts.smoothed = optarg;
202 cerr << "INFO: smoothed file: " << globalOpts.smoothed << endl;
203 break;
204 }
205 case 'u':
206 {
207 globalOpts.nsuc = atof(((string)optarg).c_str());
208 cerr << "INFO: permuteGPAT++ will stop permutations after N successes: " << globalOpts.nsuc << endl;
209 break;
210 }
211 case '?':
212 {
213 break;
214 }
215 }
216
217 opt = getopt( argc, argv, optString );
218 }
219 return 1;
220 }
221
222
223 //------------------------------- SUBROUTINE --------------------------------
224 /*
225 Function input : data, vector to load
226
227 Function does : returns a contguous window
228
229 Function returns: bool
230
231 */
232
getContiguousWindow(vector<score * > & data,vector<double> & load,int n,int * nfail)233 bool getContiguousWindow(vector<score *> & data,
234 vector<double> & load,
235 int n, int * nfail){
236 int r = rand() % data.size();
237
238 if(r+n >= data.size()){
239 *nfail+=1;
240 return false;
241 }
242
243 if(data[r]->seqid != data[r+n]->seqid){
244 *nfail+=1;
245 return false;
246 }
247
248 for(int i = r; i < r+n; i++){
249 load.push_back(data[i]->score);
250 }
251 return true;
252 }
253
254
255 //------------------------------- SUBROUTINE --------------------------------
256
mean(vector<double> & data)257 double mean(vector<double> & data){
258
259 double sum = 0;
260
261 for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
262 sum += (*it);
263 }
264 return sum / data.size();
265 }
266
267
268
269 //------------------------------- SUBROUTINE --------------------------------
270 /*
271 Function input : score, n, data
272
273 Function does : permutes the score. requires that the window is contiguous
274
275 Function returns: NA
276
277 */
278
permute(double s,int n,vector<score * > & data,double * nRep,double * nSuc,double * ePv)279 bool permute(double s, int n, vector<score *> & data,
280 double * nRep, double *nSuc, double * ePv){
281
282
283 *ePv = 1 / globalOpts.npermutation;
284
285 while(*nSuc < globalOpts.nsuc && *nRep < globalOpts.npermutation ){
286 *nRep += 1;
287
288
289 std::vector<double> scores;
290
291 bool getWindow = false;
292
293 int nfail = 0;
294
295 while(!getWindow){
296 if(nfail > globalOpts.npermutation){
297 return false;
298 }
299 getWindow = getContiguousWindow(data, scores, n, &nfail);
300 }
301
302 double ns = mean(scores);
303
304 if(ns > s){
305 *nSuc += 1;
306 }
307 }
308 if(*nSuc > 0){
309 *ePv = *nSuc / *nRep;
310 }
311
312 return true;
313 }
314
315 //------------------------------- MAIN --------------------------------
316 /*
317 Comments:
318 */
319
main(int argc,char ** argv)320 int main( int argc, char** argv)
321 {
322 srand (time(NULL));
323
324 FORMATS["swcFst"] = 1;
325 FORMATS["segwcFst"] = 1;
326
327 globalOpts.threads = 1;
328
329 int parse = parseOpts(argc, argv);
330
331 #if defined HAS_OPENMP
332 omp_set_num_threads(globalOpts.threads);
333 #endif
334
335 if(globalOpts.file.compare("NA") == 0){
336 cerr << "FATAL: no file was provided" << endl;
337 printHelp();
338 exit(1);
339 }
340 if(globalOpts.format.empty()){
341 std::cerr << "FATAL: no format specified." << std::endl;
342 exit(1);
343 }
344
345 vector< score *> data;
346
347 ifstream wcDat (globalOpts.file.c_str());
348
349 string line;
350
351 if(wcDat.is_open()){
352
353 while(getline(wcDat, line)){
354 vector<string> region = split(line, "\t");
355 // will change for other output
356 double value = atof(region[4].c_str());
357
358 if(globalOpts.format == "swcFst" || globalOpts.format == "segwcFst"){
359
360 if(region.size() != 5){
361 cerr << "FATAL: wrong number of columns in wcFst input" << endl;
362 exit(1);
363 }
364 if(value < 0 ){
365 value = 0;
366 }
367 }
368
369 score * sp;
370 sp = new score;
371 sp->seqid = region[0] ;
372 sp->pos = atoi(region[1].c_str());
373 sp->score = value ;
374
375 data.push_back(sp);
376 }
377 }
378 else{
379 cerr << "FATAL: could not open file: " << globalOpts.file << endl;
380 exit(1);
381 }
382
383
384 wcDat.close();
385 line.clear();
386
387 cerr << "INFO: raw values to permute against: " << data.size() << endl;
388
389 ifstream smoothedFile (globalOpts.smoothed.c_str());
390
391 vector<smoothedInputData*> sData;
392
393 if(smoothedFile.is_open()){
394 while(getline(smoothedFile, line)){
395
396 vector<string> region = split(line, "\t");
397 smoothedInputData * sp = new smoothedInputData;
398
399 sp->line = line;
400 sp->score = atof(region[globalOpts.valueIndex].c_str());
401 sp->n = atoi(region[globalOpts.nIndex].c_str());
402 sp->nPer = 0;
403 sp->nSuc = 0;
404 sp->ePv = 0;
405
406 sData.push_back(sp);
407 }
408 }
409 smoothedFile.close();
410
411 cerr << "INFO: Number of smoothed windows to permute : " << sData.size() << endl;
412
413 #if defined HAS_OPENMP
414 #pragma omp parallel for schedule(dynamic, 20)
415 #endif
416
417 for(int i = 0; i < sData.size(); i++){
418 bool per = permute(sData[i]->score, sData[i]->n,
419 data, &sData[i]->nPer,
420 &sData[i]->nSuc, &sData[i]->ePv);
421
422
423 if(per){
424 #if defined HAS_OPENMP
425 omp_set_lock(&lock);
426 #endif
427 cout << sData[i]->line
428 << "\t" << sData[i]->nSuc
429 << "\t" << sData[i]->nPer
430 << "\t" << sData[i]->ePv << endl;
431 #if defined HAS_OPENMP
432 omp_unset_lock(&lock);
433 #endif
434 }
435 else{
436 #if defined HAS_OPENMP
437 omp_set_lock(&lock);
438 #endif
439 cout << sData[i]->line
440 << "\t" << "NA"
441 << "\t" << "NA"
442 << "\t" << "NA" << endl;
443 #if defined HAS_OPENMP
444 omp_unset_lock(&lock);
445 #endif
446 }
447 }
448
449 for(vector<score*>::iterator itz = data.begin();
450 itz != data.end(); itz++){
451 delete *itz;
452 }
453 for(vector<smoothedInputData*>::iterator itz = sData.begin();
454 itz != sData.end(); itz++){
455 delete *itz;
456 }
457
458
459 return 0;
460 }
461