1 /*
2     vcflib C++ library for parsing and manipulating VCF files
3 
4     Copyright © 2010-2020 Erik Garrison
5     Copyright © 2020      Pjotr Prins
6 
7     This software is published under the MIT License. See the LICENSE file.
8 */
9 
10 #include <iostream>
11 #include <fstream>
12 #include <getopt.h>
13 #include <map>
14 #include <list>
15 #include <vector>
16 #include <string>
17 #include "split.h"
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include "gpatInfo.hpp"
21 #include <math.h>
22 
23 using namespace std;
24 
25 struct opts{
26   bool     truncate;
27   string   format;
28   long int step;
29   long int size;
30   int      seqid;
31   int      pos  ;
32   int      value;
33 };
34 
35 struct score{
36   long int position;
37   double score;
38 };
39 
40 
41 
printHelp(void)42 void printHelp(void){
43   cerr << endl << endl;
44   cerr << "INFO: help" << endl;
45   cerr << "INFO: description:" << endl;
46   cerr << "smoothes is a method for window smoothing many of the GPAT++ formats." << endl << endl ;
47   cerr << "      smoother averages a set of scores over a sliding genomic window.            " << endl;
48   cerr << "      smoother slides over genomic positions not the SNP indices. In other words  " << endl;
49   cerr << "      the number of scores within a window will not be constant. The last         " << endl;
50   cerr << "      window for each seqid can be smaller than the defined window size.          " << endl;
51   cerr << "      smoother automatically analyses different seqids separately.                " << endl;
52 
53   cerr << "Output : 4 columns :     "    << endl;
54   cerr << "     1. seqid            "    << endl;
55   cerr << "     2. window start     "    << endl;
56   cerr << "     2. window end       "    << endl;
57   cerr << "     3. averaged score   "    << endl  << endl;
58 
59   cerr << "INFO: usage: smoother --format pFst --file GPA.output.txt" << endl;
60   cerr << endl;
61   cerr << "INFO: required: f,file     -- argument: a file created by GPAT++                           " << endl;
62   cerr << "INFO: required: o,format   -- argument: format of input file, case sensitive               " << endl;
63   cerr << "                              available format options:                                    " << endl;
64   cerr << "                                wcFst, pFst, bFst, iHS, xpEHH, abba-baba, col3             " << endl;
65   cerr << "INFO: optional: w,window   -- argument: size of genomic window in base pairs (default 5000)" << endl;
66   cerr << "INFO: optional: s,step     -- argument: window step size in base pairs (default 1000)      " << endl;
67   cerr << "INFO: optional: t,truncate -- flag    : end last window at last position (zero based)      " << endl;
68   cerr << endl << "Type: transformation" << endl << endl;
69   printVersion();
70   cerr << endl << endl;
71 }
72 
ngreater(list<score> & rangeData,double val)73 double ngreater(list<score> & rangeData, double val){
74 
75   double n = 0;
76 
77 
78   for(list<score>::iterator it = rangeData.begin();
79       it != rangeData.end(); it++ ){
80     if(it->score >= val){
81       n += 1;
82     }
83   }
84   return n;
85 }
86 
87 
windowAvg(list<score> & rangeData)88 double windowAvg(list<score> & rangeData){
89 
90   double n = 0;
91   double s = 0;
92 
93   for(list<score>::iterator it = rangeData.begin(); it != rangeData.end(); it++){
94     s += it->score;
95     n += 1;
96   }
97   return (s/n);
98 }
99 
100 //calculation of Patterson's D statistic
dStatistic(list<score> & rangeData)101 double dStatistic(list<score> & rangeData){
102 
103   double abba = 0;
104   double baba = 0;
105   double dstat ;
106   for(list<score>::iterator it = rangeData.begin(); it != rangeData.end(); it++){
107     if(it->score == 0){ // means we have BABA locus
108       baba += 1;
109     }
110     else{ // count towards ABBA locus
111       abba += 1;
112     }
113   }
114   dstat = (abba - baba) / (abba + baba ); // d-statistic implementation
115   return (dstat);
116 }
117 
processSeqid(ifstream & file,string seqid,streampos offset,opts & opt)118 void processSeqid(ifstream & file, string seqid, streampos offset, opts & opt){
119 
120   string line ;
121 
122   long int windowSize = opt.size;
123   long int start = 0;
124   long int end   = windowSize;
125 
126   list<score> windowDat;
127 
128   file.clear();
129 
130   file.seekg(offset);
131 
132   vector<string> sline;
133 
134   while(getline(file, line)){
135 
136     sline = split(line, '\t');
137     score current ;
138     if(seqid != sline[opt.seqid]){
139       break;
140     }
141     current.position = atol( sline[opt.pos].c_str() );
142     current.score    = atof( sline[opt.value].c_str() );
143 
144     if(opt.format == "iHS"){
145       current.score = fabs(current.score);
146     }
147 
148 
149 
150     // add in if abba-baba to process second score.
151 
152 
153     if(current.position > end){
154 
155       double reportValue ;
156 
157       if(opt.format == "abba-baba"){
158 	reportValue = dStatistic(windowDat);
159       }
160       else{
161 	reportValue = windowAvg(windowDat);
162       }
163       // nan
164       if( reportValue == reportValue){
165 	cout << seqid << "\t" << start << "\t" << end << "\t" << windowDat.size() << "\t" << reportValue;
166 	if(opt.format == "iHS"){
167 	  std::cout << "\t" << ngreater(windowDat, 2.5) ;
168 	}
169 	std::cout << std::endl;
170       }
171 
172 
173     }
174     while(end < current.position){
175       start += opt.step;
176       end   += opt.step;
177       while(windowDat.front().position < start && !windowDat.empty()){
178 	windowDat.pop_front();
179       }
180     }
181     windowDat.push_back(current);
182   }
183   // add function for D-stat if abba-baba
184   double finalMean = windowAvg(windowDat);
185 
186   if(opt.truncate && (finalMean == finalMean) ){
187     cout << seqid << "\t" << start << "\t" << windowDat.back().position - 1 << "\t" << windowDat.size()  << "\t" << finalMean;
188 
189     if(opt.format == "iHS"){
190       std::cout << "\t" << ngreater(windowDat, 2.5) ;
191     }
192     std::cout << std::endl;
193 
194   }
195   else if(finalMean == finalMean){
196     cout << seqid << "\t" << start << "\t" << end << "\t" << windowDat.size() << "\t" << finalMean;
197 
198     if(opt.format == "iHS"){
199       std::cout << "\t" << ngreater(windowDat, 2.5) ;
200     }
201     std::cout << std::endl;
202 
203   }
204   cerr << "INFO: smoother finished : " << seqid << endl;
205 }
206 
main(int argc,char ** argv)207 int main(int argc, char** argv) {
208 
209   map<string, int> acceptableFormats;
210   acceptableFormats["pFst"]  = 1;
211   acceptableFormats["col3"]  = 1;
212   acceptableFormats["bFst"]  = 1;
213   acceptableFormats["wcFst"] = 1;
214   acceptableFormats["xpEHH"] = 1;
215   acceptableFormats["iHS"]   = 1;
216   acceptableFormats["cqf"]   = 1;
217   acceptableFormats["deltaAf"]   = 1;
218   acceptableFormats["abba-baba"]   = 1;
219 
220   opts opt;
221   opt.size     = 5000;
222   opt.step     = 1000;
223   opt.format   = "NA";
224   opt.truncate = false;
225 
226   string filename = "NA";
227 
228   static struct option longopts[] =
229     {
230       {"version"   , 0, 0, 'v'},
231       {"help"      , 0, 0, 'h'},
232       {"file"      , 1, 0, 'f'},
233       {"window"    , 1, 0, 'w'},
234       {"step"      , 1, 0, 's'},
235       {"format"    , 1, 0, 'o'},
236       {"truncate"  , 0, 0, 't'},
237       {0,0,0,0}
238     };
239 
240   int index;
241   int iarg=0;
242 
243   while(iarg != -1){
244     iarg = getopt_long(argc, argv, "f:w:s:o:vht", longopts, &index);
245     switch(iarg){
246     case 't':
247       {
248 	opt.truncate = true;
249 	break;
250       }
251     case 'h':
252       {
253 	printHelp();
254 	return 0;
255       }
256     case 'v':
257       {
258 	printVersion();
259 	return 0;
260       }
261     case 'f':
262       {
263 	filename = optarg;
264 	cerr << "INFO: file : " << filename << endl;
265 	break;
266       }
267     case 's':
268       {
269 	opt.step = atol(optarg);
270 	cerr << "INFO: step size : " << optarg << endl;
271 	break;
272       }
273     case 'w':
274       {
275 	opt.size = atol(optarg);
276 	cerr << "INFO: step size : " << optarg << endl;
277 	break;
278       }
279     case 'o':
280       {
281 	opt.format = optarg;
282 	cerr << "INFO: specified input format : " << optarg << endl;
283 	break;
284       }
285     }
286   }
287   if(filename == "NA"){
288     cerr << "FATAL: file was not specified!" << endl << endl;
289     printHelp();
290     return 1;
291   }
292 
293   if(acceptableFormats.find(opt.format) == acceptableFormats.end()){
294     cerr << "FATAL: unacceptable input file format, see --format "  << endl << endl;
295     printHelp();
296     return 1;
297   }
298   if(opt.format == "deltaAf"){
299     opt.seqid = 0;
300     opt.pos   = 1;
301     opt.value = 4;
302   }
303   else if(opt.format == "abba-baba"){
304     opt.seqid = 0;
305     opt.pos   = 1;
306     opt.value = 2;
307   }
308   else if(opt.format == "col3"){
309     opt.seqid = 0;
310     opt.pos   = 1;
311     opt.value = 2;
312   }
313   else if(opt.format == "pFst"){
314     opt.seqid = 0;
315     opt.pos   = 1;
316     opt.value = 2;
317   }
318   else if (opt.format == "bFst"){
319     opt.seqid = 0;
320     opt.pos   = 1;
321     opt.value = 8;
322   }
323   else if (opt.format == "wcFst"){
324     opt.seqid = 0;
325     opt.pos   = 1;
326     opt.value = 4;
327   }
328   else if(opt.format == "cqf"){
329     opt.seqid = 0;
330     opt.pos   = 1;
331     opt.value = 5;
332   }
333   else if(opt.format == "xpEHH"){
334     opt.seqid = 0;
335     opt.pos   = 1;
336     opt.value = 5;
337   }
338   else if(opt.format == "iHS"){
339     opt.seqid = 0;
340     opt.pos   = 1;
341     opt.value = 6;
342   }
343 
344   else{
345     cerr << "FATAL: input format flag not specified correctly : " << opt.format << endl;
346     cerr << "INFO:  please use smoother --help" << endl;
347     return 1;
348   }
349 
350   ifstream ifs(filename.c_str());
351 
352   string currentSeqid = "NA";
353 
354   string line;
355 
356   map<string, streampos > seqidIndex;
357 
358   if(ifs){
359     while(getline(ifs, line)){
360       vector<string> sline = split(line, '\t');
361       if(sline[opt.seqid] != currentSeqid){
362 
363 	long int bline = ifs.tellg() ;
364 	bline -=  ( line.size() +1 );
365 
366 	//	std::cerr << "INFO: seqid: " << sline[opt.seqid] << " tellg: " << bline << std::endl;
367 
368 	map<string, streampos>::iterator it;
369 
370 	if(seqidIndex.find(sline[opt.seqid]) != seqidIndex.end() ){
371 	  cerr << "FATAL: file is unsorted!" << endl;
372 	  return 1;
373 	}
374 	seqidIndex[sline[opt.seqid]] = bline;
375 	currentSeqid = sline[opt.seqid];
376       }
377     }
378   }
379   else{
380     cerr << "FATAL: no lines -- or -- couldn't open file" << endl;
381   }
382 
383   for( map< string, streampos>::iterator it = seqidIndex.begin(); it != seqidIndex.end(); it++){
384     cerr << "INFO: processing seqid : "<< (it->first) << endl;
385     processSeqid(ifs, (it->first),(it->second), opt);
386   }
387 
388   ifs.close();
389   cerr << "INFO: smoother has successfully finished" << endl;
390 
391   return 0;
392 
393 }
394