1 /*
2 vcflib C++ library for parsing and manipulating VCF files
3
4 Copyright © 2010-2020 Erik Garrison
5 Copyright © 2020 Pjotr Prins
6
7 This software is published under the MIT License. See the LICENSE file.
8 */
9
10 /*
11
12 This program was created at: Fri Apr 17 14:59:53 2015
13 This program was created by: Zev N. Kronenberg
14
15
16 Contact: zev.kronenber@gmail.com
17
18 Organization: Unviersity of Utah
19 School of Medicine
20 Salt Lake City, Utah
21
22
23 The MIT License (MIT)
24
25 Copyright (c) <2015> <Zev N. Kronenberg>
26
27 Permission is hereby granted, free of charge, to any person obtaining a copy
28 of this software and associated documentation files (the "Software"), to deal
29 in the Software without restriction, including without limitation the rights
30 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
31 copies of the Software, and to permit persons to whom the Software is
32 furnished to do so, subject to the following conditions:
33
34 The above copyright notice and this permission notice shall be included in
35 all copies or substantial portions of the Software.
36
37 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
38 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
39 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
40 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
42 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
43 THE SOFTWARE.
44
45
46 */
47 #include <fstream>
48 #include "split.h"
49 #include <vector>
50 #include <map>
51 #include <string>
52 #include <iostream>
53 #include <math.h>
54 #include <cmath>
55 #include <stdlib.h>
56 #include <time.h>
57 #include <stdio.h>
58 #include <unistd.h>
59 #include <stdlib.h>
60 #include "gpatInfo.hpp"
61
62 #include <omp.h>
63 // print lock
64 omp_lock_t lock;
65
66 struct options{
67 std::string file ;
68 std::string smoothed;
69 std::string format ;
70 double npermutation ;
71 double nsuc ;
72 int threads ;
73 int chrIndex ;
74 int nIndex ;
75 int valueIndex ;
76 }globalOpts;
77
78 struct score{
79 std::string seqid;
80 long int pos ;
81 double score ;
82 };
83
84 struct smoothedInputData{
85 std::string line;
86 double score ;
87 double n ;
88 double nPer ;
89 double nSuc ;
90 double ePv ;
91 };
92
93 static const char *optString = "x:y:u:f:n:s:";
94
95 using namespace std;
96
97 std::map<std::string, int> FORMATS;
98
99 //------------------------------- SUBROUTINE --------------------------------
100 /*
101 Function input : NA
102
103 Function does : prints help
104
105 Function returns: NA
106
107 */
printHelp()108 void printHelp()
109 {
110
111 cerr << endl << endl;
112 cerr << "INFO: help" << endl;
113 cerr << "INFO: description:" << endl;
114 cerr << " permuteSmoothFst is a method for adding empirical p-values smoothed wcFst scores." << endl ;
115 cerr << endl;
116 cerr << "INFO: usage: permuteSmoothFst -s wcFst.smooth.txt -f wcFst.txt -n 5 -s 1 "<< endl;
117 cerr << endl;
118 cerr << "Required:" << endl;
119 cerr << " file: f -- argument: original wcFst data "<< endl;
120 cerr << " smoothed: s -- argument: smoothed wcFst data "<< endl;
121 cerr << " format: y -- argument: [swcFst, segwcFst] "<< endl;
122 cerr << "Optional:" << endl;
123 cerr << " number: n -- argument: the number of permutations to run for each value [1000]" << endl;
124 cerr << " success: u -- argument: stop permutations after \'s\' successes [1]" << endl;
125 cerr << " success: x -- argument: number of threads [1]" << endl;
126 cerr << endl;
127 cerr << "OUTPUT: permuteSmoothFst will append three additional columns:" << endl;
128 cerr << " 1. The number of successes " << endl;
129 cerr << " 2. The number of trials " << endl;
130 cerr << " 3. The empirical p-value " << endl;
131 cerr << endl;
132 printVersion();
133
134 }
135
136
137 //------------------------------- OPTIONS --------------------------------
parseOpts(int argc,char ** argv)138 int parseOpts(int argc, char** argv)
139 {
140 int opt = 0;
141 globalOpts.file = "NA";
142
143 globalOpts.nsuc = 1;
144 globalOpts.npermutation = 1000;
145
146 opt = getopt(argc, argv, optString);
147 while(opt != -1){
148 switch(opt){
149 case 'x':
150 {
151 globalOpts.threads = atoi(optarg);
152 }
153 case 'f':
154 {
155 globalOpts.file = optarg;
156 break;
157 }
158 case 'y':
159 {
160 globalOpts.format = optarg;
161 if(FORMATS.find(globalOpts.format) == FORMATS.end()){
162 std::cerr << "FATAL: format not supported: " << globalOpts.format;
163 std::cerr << endl;
164 printHelp();
165 exit(1);
166 }
167 if(globalOpts.format == "swcFst"){
168 globalOpts.chrIndex = 0;
169 globalOpts.nIndex = 3;
170 globalOpts.valueIndex = 4;
171 }
172 if(globalOpts.format == "segwcFst"){
173 globalOpts.chrIndex = 0;
174 globalOpts.valueIndex = 3;
175 globalOpts.nIndex = 5;
176 }
177 break;
178 }
179 case 'n':
180 {
181 globalOpts.npermutation = atof(((string)optarg).c_str());
182 cerr << "INFO: permuteGPAT++ will do N permutations: " << globalOpts.npermutation << endl;
183 break;
184 }
185 case 's':
186 {
187 globalOpts.smoothed = optarg;
188 cerr << "INFO: smoothed file: " << globalOpts.smoothed << endl;
189 break;
190 }
191 case 'u':
192 {
193 globalOpts.nsuc = atof(((string)optarg).c_str());
194 cerr << "INFO: permuteGPAT++ will stop permutations after N successes: " << globalOpts.nsuc << endl;
195 break;
196 }
197 case '?':
198 {
199 break;
200 }
201 }
202
203 opt = getopt( argc, argv, optString );
204 }
205 return 1;
206 }
207
208
209 //------------------------------- SUBROUTINE --------------------------------
210 /*
211 Function input : data, vector to load
212
213 Function does : returns a contguous window
214
215 Function returns: bool
216
217 */
218
getContiguousWindow(vector<score * > & data,vector<double> & load,int n)219 bool getContiguousWindow(vector<score *> & data, vector<double> & load, int n){
220 int r = rand() % data.size();
221
222 if(r+n >= data.size()){
223 return false;
224 }
225
226 if(data[r]->seqid != data[r+n]->seqid){
227 return false;
228 }
229
230 for(int i = r; i < r+n; i++){
231 load.push_back(data[i]->score);
232 }
233 return true;
234 }
235
236
237 //------------------------------- SUBROUTINE --------------------------------
238
mean(vector<double> & data)239 double mean(vector<double> & data){
240
241 double sum = 0;
242
243 for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
244 sum += (*it);
245 }
246 return sum / data.size();
247 }
248
249
250
251 //------------------------------- SUBROUTINE --------------------------------
252 /*
253 Function input : score, n, data
254
255 Function does : permutes the score. requires that the window is contiguous
256
257 Function returns: NA
258
259 */
260
permute(double s,int n,vector<score * > & data,double * nRep,double * nSuc,double * ePv)261 void permute(double s, int n, vector<score *> & data,
262 double * nRep, double *nSuc, double * ePv){
263
264
265 *ePv = 1 / globalOpts.npermutation;
266
267 while(*nSuc < globalOpts.nsuc && *nRep < globalOpts.npermutation ){
268 *nRep += 1;
269
270
271 std::vector<double> scores;
272
273 bool getWindow = false;
274 while(!getWindow){
275 getWindow = getContiguousWindow(data, scores, n);
276 }
277
278 double ns = mean(scores);
279
280 if(ns > s){
281 *nSuc += 1;
282 }
283 }
284 if(*nSuc > 0){
285 *ePv = *nSuc / *nRep;
286 }
287 }
288
289 //------------------------------- MAIN --------------------------------
290 /*
291 Comments:
292 */
293
main(int argc,char ** argv)294 int main( int argc, char** argv)
295 {
296 srand (time(NULL));
297
298 FORMATS["swcFst"] = 1;
299 FORMATS["segwcFst"] = 1;
300
301 globalOpts.threads = 1;
302
303 int parse = parseOpts(argc, argv);
304
305 omp_set_num_threads(globalOpts.threads);
306
307
308 if(globalOpts.file.compare("NA") == 0){
309 cerr << "FATAL: no file was provided" << endl;
310 printHelp();
311 exit(1);
312 }
313 if(globalOpts.format.empty()){
314 std::cerr << "FATAL: no format specified." << std::endl;
315 exit(1);
316 }
317
318 vector< score *> data;
319
320 ifstream wcDat (globalOpts.file.c_str());
321
322 string line;
323
324 if(wcDat.is_open()){
325
326 while(getline(wcDat, line)){
327 vector<string> region = split(line, "\t");
328 // will change for other output
329 double value = atof(region[4].c_str());
330
331 if(globalOpts.format == "swcFst" || globalOpts.format == "segwcFst"){
332
333 if(region.size() != 5){
334 cerr << "FATAL: wrong number of columns in wcFst input" << endl;
335 exit(1);
336 }
337 if(value < 0 ){
338 value = 0;
339 }
340 }
341
342 score * sp;
343 sp = new score;
344 sp->seqid = region[0] ;
345 sp->pos = atoi(region[1].c_str());
346 sp->score = value ;
347
348 data.push_back(sp);
349 }
350 }
351 else{
352 cerr << "FATAL: could not open file: " << globalOpts.file << endl;
353 exit(1);
354 }
355
356
357 wcDat.close();
358 line.clear();
359
360 cerr << "INFO: raw values to permute against: " << data.size() << endl;
361
362 ifstream smoothedFile (globalOpts.smoothed.c_str());
363
364 vector<smoothedInputData*> sData;
365
366 if(smoothedFile.is_open()){
367 while(getline(smoothedFile, line)){
368
369 vector<string> region = split(line, "\t");
370 smoothedInputData * sp = new smoothedInputData;
371
372 sp->line = line;
373 sp->score = atof(region[globalOpts.valueIndex].c_str());
374 sp->n = atoi(region[globalOpts.nIndex].c_str());
375 sp->nPer = 0;
376 sp->nSuc = 0;
377 sp->ePv = 0;
378
379 sData.push_back(sp);
380 }
381 }
382 smoothedFile.close();
383
384 cerr << "INFO: Number of smoothed windows to permute : " << sData.size() << endl;
385
386 #pragma omp parallel for schedule(dynamic, 20)
387 for(int i = 0; i < sData.size(); i++){
388 permute(sData[i]->score, sData[i]->n,
389 data, &sData[i]->nPer,
390 &sData[i]->nSuc, &sData[i]->ePv);
391
392 omp_set_lock(&lock);
393 cout << sData[i]->line
394 << "\t" << sData[i]->nSuc
395 << "\t" << sData[i]->nPer
396 << "\t" << sData[i]->ePv << endl;
397 omp_unset_lock(&lock);
398 }
399
400 for(vector<score*>::iterator itz = data.begin();
401 itz != data.end(); itz++){
402 delete *itz;
403 }
404 for(vector<smoothedInputData*>::iterator itz = sData.begin();
405 itz != sData.end(); itz++){
406 delete *itz;
407 }
408
409
410 return 0;
411 }
412