1 /*
2 vcflib C++ library for parsing and manipulating VCF files
3
4 Copyright © 2010-2020 Erik Garrison
5 Copyright © 2020 Pjotr Prins
6
7 This software is published under the MIT License. See the LICENSE file.
8 */
9
10 #include <iostream>
11 #include <fstream>
12 #include <getopt.h>
13 #include <map>
14 #include <list>
15 #include <vector>
16 #include <string>
17 #include "split.h"
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include "gpatInfo.hpp"
21 #include <math.h>
22
23 using namespace std;
24
25 struct opts{
26 bool truncate;
27 string format;
28 long int step;
29 long int size;
30 int seqid;
31 int pos ;
32 int value;
33 };
34
35 struct score{
36 long int position;
37 double score;
38 };
39
40
41
printHelp(void)42 void printHelp(void){
43 cerr << endl << endl;
44 cerr << "INFO: help" << endl;
45 cerr << "INFO: description:" << endl;
46 cerr << "smoothes is a method for window smoothing many of the GPAT++ formats." << endl << endl ;
47 cerr << " smoother averages a set of scores over a sliding genomic window. " << endl;
48 cerr << " smoother slides over genomic positions not the SNP indices. In other words " << endl;
49 cerr << " the number of scores within a window will not be constant. The last " << endl;
50 cerr << " window for each seqid can be smaller than the defined window size. " << endl;
51 cerr << " smoother automatically analyses different seqids separately. " << endl;
52
53 cerr << "Output : 4 columns : " << endl;
54 cerr << " 1. seqid " << endl;
55 cerr << " 2. window start " << endl;
56 cerr << " 2. window end " << endl;
57 cerr << " 3. averaged score " << endl << endl;
58
59 cerr << "INFO: usage: smoother --format pFst --file GPA.output.txt" << endl;
60 cerr << endl;
61 cerr << "INFO: required: f,file -- argument: a file created by GPAT++ " << endl;
62 cerr << "INFO: required: o,format -- argument: format of input file, case sensitive " << endl;
63 cerr << " available format options: " << endl;
64 cerr << " wcFst, pFst, bFst, iHS, xpEHH, abba-baba, col3 " << endl;
65 cerr << "INFO: optional: w,window -- argument: size of genomic window in base pairs (default 5000)" << endl;
66 cerr << "INFO: optional: s,step -- argument: window step size in base pairs (default 1000) " << endl;
67 cerr << "INFO: optional: t,truncate -- flag : end last window at last position (zero based) " << endl;
68 cerr << endl << "Type: transformation" << endl << endl;
69 printVersion();
70 cerr << endl << endl;
71 }
72
ngreater(list<score> & rangeData,double val)73 double ngreater(list<score> & rangeData, double val){
74
75 double n = 0;
76
77
78 for(list<score>::iterator it = rangeData.begin();
79 it != rangeData.end(); it++ ){
80 if(it->score >= val){
81 n += 1;
82 }
83 }
84 return n;
85 }
86
87
windowAvg(list<score> & rangeData)88 double windowAvg(list<score> & rangeData){
89
90 double n = 0;
91 double s = 0;
92
93 for(list<score>::iterator it = rangeData.begin(); it != rangeData.end(); it++){
94 s += it->score;
95 n += 1;
96 }
97 return (s/n);
98 }
99
100 //calculation of Patterson's D statistic
dStatistic(list<score> & rangeData)101 double dStatistic(list<score> & rangeData){
102
103 double abba = 0;
104 double baba = 0;
105 double dstat ;
106 for(list<score>::iterator it = rangeData.begin(); it != rangeData.end(); it++){
107 if(it->score == 0){ // means we have BABA locus
108 baba += 1;
109 }
110 else{ // count towards ABBA locus
111 abba += 1;
112 }
113 }
114 dstat = (abba - baba) / (abba + baba ); // d-statistic implementation
115 return (dstat);
116 }
117
processSeqid(ifstream & file,string seqid,streampos offset,opts & opt)118 void processSeqid(ifstream & file, string seqid, streampos offset, opts & opt){
119
120 string line ;
121
122 long int windowSize = opt.size;
123 long int start = 0;
124 long int end = windowSize;
125
126 list<score> windowDat;
127
128 file.clear();
129
130 file.seekg(offset);
131
132 vector<string> sline;
133
134 while(getline(file, line)){
135
136 sline = split(line, '\t');
137 score current ;
138 if(seqid != sline[opt.seqid]){
139 break;
140 }
141 current.position = atol( sline[opt.pos].c_str() );
142 current.score = atof( sline[opt.value].c_str() );
143
144 if(opt.format == "iHS"){
145 current.score = fabs(current.score);
146 }
147
148
149
150 // add in if abba-baba to process second score.
151
152
153 if(current.position > end){
154
155 double reportValue ;
156
157 if(opt.format == "abba-baba"){
158 reportValue = dStatistic(windowDat);
159 }
160 else{
161 reportValue = windowAvg(windowDat);
162 }
163 // nan
164 if( reportValue == reportValue){
165 cout << seqid << "\t" << start << "\t" << end << "\t" << windowDat.size() << "\t" << reportValue;
166 if(opt.format == "iHS"){
167 std::cout << "\t" << ngreater(windowDat, 2.5) ;
168 }
169 std::cout << std::endl;
170 }
171
172
173 }
174 while(end < current.position){
175 start += opt.step;
176 end += opt.step;
177 while(windowDat.front().position < start && !windowDat.empty()){
178 windowDat.pop_front();
179 }
180 }
181 windowDat.push_back(current);
182 }
183 // add function for D-stat if abba-baba
184 double finalMean = windowAvg(windowDat);
185
186 if(opt.truncate && (finalMean == finalMean) ){
187 cout << seqid << "\t" << start << "\t" << windowDat.back().position - 1 << "\t" << windowDat.size() << "\t" << finalMean;
188
189 if(opt.format == "iHS"){
190 std::cout << "\t" << ngreater(windowDat, 2.5) ;
191 }
192 std::cout << std::endl;
193
194 }
195 else if(finalMean == finalMean){
196 cout << seqid << "\t" << start << "\t" << end << "\t" << windowDat.size() << "\t" << finalMean;
197
198 if(opt.format == "iHS"){
199 std::cout << "\t" << ngreater(windowDat, 2.5) ;
200 }
201 std::cout << std::endl;
202
203 }
204 cerr << "INFO: smoother finished : " << seqid << endl;
205 }
206
main(int argc,char ** argv)207 int main(int argc, char** argv) {
208
209 map<string, int> acceptableFormats;
210 acceptableFormats["pFst"] = 1;
211 acceptableFormats["col3"] = 1;
212 acceptableFormats["bFst"] = 1;
213 acceptableFormats["wcFst"] = 1;
214 acceptableFormats["xpEHH"] = 1;
215 acceptableFormats["iHS"] = 1;
216 acceptableFormats["cqf"] = 1;
217 acceptableFormats["deltaAf"] = 1;
218 acceptableFormats["abba-baba"] = 1;
219
220 opts opt;
221 opt.size = 5000;
222 opt.step = 1000;
223 opt.format = "NA";
224 opt.truncate = false;
225
226 string filename = "NA";
227
228 static struct option longopts[] =
229 {
230 {"version" , 0, 0, 'v'},
231 {"help" , 0, 0, 'h'},
232 {"file" , 1, 0, 'f'},
233 {"window" , 1, 0, 'w'},
234 {"step" , 1, 0, 's'},
235 {"format" , 1, 0, 'o'},
236 {"truncate" , 0, 0, 't'},
237 {0,0,0,0}
238 };
239
240 int index;
241 int iarg=0;
242
243 while(iarg != -1){
244 iarg = getopt_long(argc, argv, "f:w:s:o:vht", longopts, &index);
245 switch(iarg){
246 case 't':
247 {
248 opt.truncate = true;
249 break;
250 }
251 case 'h':
252 {
253 printHelp();
254 return 0;
255 }
256 case 'v':
257 {
258 printVersion();
259 return 0;
260 }
261 case 'f':
262 {
263 filename = optarg;
264 cerr << "INFO: file : " << filename << endl;
265 break;
266 }
267 case 's':
268 {
269 opt.step = atol(optarg);
270 cerr << "INFO: step size : " << optarg << endl;
271 break;
272 }
273 case 'w':
274 {
275 opt.size = atol(optarg);
276 cerr << "INFO: step size : " << optarg << endl;
277 break;
278 }
279 case 'o':
280 {
281 opt.format = optarg;
282 cerr << "INFO: specified input format : " << optarg << endl;
283 break;
284 }
285 }
286 }
287 if(filename == "NA"){
288 cerr << "FATAL: file was not specified!" << endl << endl;
289 printHelp();
290 return 1;
291 }
292
293 if(acceptableFormats.find(opt.format) == acceptableFormats.end()){
294 cerr << "FATAL: unacceptable input file format, see --format " << endl << endl;
295 printHelp();
296 return 1;
297 }
298 if(opt.format == "deltaAf"){
299 opt.seqid = 0;
300 opt.pos = 1;
301 opt.value = 4;
302 }
303 else if(opt.format == "abba-baba"){
304 opt.seqid = 0;
305 opt.pos = 1;
306 opt.value = 2;
307 }
308 else if(opt.format == "col3"){
309 opt.seqid = 0;
310 opt.pos = 1;
311 opt.value = 2;
312 }
313 else if(opt.format == "pFst"){
314 opt.seqid = 0;
315 opt.pos = 1;
316 opt.value = 2;
317 }
318 else if (opt.format == "bFst"){
319 opt.seqid = 0;
320 opt.pos = 1;
321 opt.value = 8;
322 }
323 else if (opt.format == "wcFst"){
324 opt.seqid = 0;
325 opt.pos = 1;
326 opt.value = 4;
327 }
328 else if(opt.format == "cqf"){
329 opt.seqid = 0;
330 opt.pos = 1;
331 opt.value = 5;
332 }
333 else if(opt.format == "xpEHH"){
334 opt.seqid = 0;
335 opt.pos = 1;
336 opt.value = 5;
337 }
338 else if(opt.format == "iHS"){
339 opt.seqid = 0;
340 opt.pos = 1;
341 opt.value = 6;
342 }
343
344 else{
345 cerr << "FATAL: input format flag not specified correctly : " << opt.format << endl;
346 cerr << "INFO: please use smoother --help" << endl;
347 return 1;
348 }
349
350 ifstream ifs(filename.c_str());
351
352 string currentSeqid = "NA";
353
354 string line;
355
356 map<string, streampos > seqidIndex;
357
358 if(ifs){
359 while(getline(ifs, line)){
360 vector<string> sline = split(line, '\t');
361 if(sline[opt.seqid] != currentSeqid){
362
363 long int bline = ifs.tellg() ;
364 bline -= ( line.size() +1 );
365
366 // std::cerr << "INFO: seqid: " << sline[opt.seqid] << " tellg: " << bline << std::endl;
367
368 map<string, streampos>::iterator it;
369
370 if(seqidIndex.find(sline[opt.seqid]) != seqidIndex.end() ){
371 cerr << "FATAL: file is unsorted!" << endl;
372 return 1;
373 }
374 seqidIndex[sline[opt.seqid]] = bline;
375 currentSeqid = sline[opt.seqid];
376 }
377 }
378 }
379 else{
380 cerr << "FATAL: no lines -- or -- couldn't open file" << endl;
381 }
382
383 for( map< string, streampos>::iterator it = seqidIndex.begin(); it != seqidIndex.end(); it++){
384 cerr << "INFO: processing seqid : "<< (it->first) << endl;
385 processSeqid(ifs, (it->first),(it->second), opt);
386 }
387
388 ifs.close();
389 cerr << "INFO: smoother has successfully finished" << endl;
390
391 return 0;
392
393 }
394