1 /*
2     vcflib C++ library for parsing and manipulating VCF files
3 
4     Copyright © 2010-2020 Erik Garrison
5     Copyright © 2020      Pjotr Prins
6 
7     This software is published under the MIT License. See the LICENSE file.
8 */
9 
10 
11 /*
12 
13 This program was created at:  Tue Sep  8 21:05:23 2015
14 This program was created by:  Zev N. Kronenberg
15 
16 
17 Contact: zev.kronenber@gmail.com
18 
19 Organization: Unviersity of Utah
20     School of Medicine
21     Salt Lake City, Utah
22 
23 
24 The MIT License (MIT)
25 
26 Copyright (c) <2015> <Zev N. Kronenberg>
27 
28 Permission is hereby granted, free of charge, to any person obtaining a copy
29 of this software and associated documentation files (the "Software"), to deal
30 in the Software without restriction, including without limitation the rights
31 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
32 copies of the Software, and to permit persons to whom the Software is
33 furnished to do so, subject to the following conditions:
34 
35 The above copyright notice and this permission notice shall be included in
36 all copies or substantial portions of the Software.
37 
38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
44 THE SOFTWARE.
45 
46 
47 */
48 
49 #include <string>
50 #include <iostream>
51 #include <fstream>
52 #include <math.h>
53 #include <cmath>
54 #include <stdlib.h>
55 #include <time.h>
56 #include <stdio.h>
57 #include <unistd.h>
58 #include <algorithm>
59 #include "split.h"
60 #include "gpatInfo.hpp"
61 
62 struct options{
63   std::string file;
64   double afDiff;
65 }globalOpts;
66 
67 struct iHSdat{
68   std::string seqid;
69   std::string start;
70   std::string F1   ;
71   std::string F2   ;
72 
73   double af   ;
74   double ehhR ;
75   double ehhA ;
76   double iHS  ;
77   double niHS ;
78 };
79 
80 
81 using namespace std;
82 
83 static const char *optString = "hf:s:";
84 
printHelp(void)85 void printHelp(void){
86 
87   cerr << endl << endl;
88   cerr << "INFO: help" << endl;
89   cerr << "INFO: description:" << endl;
90   cerr << "      normalizes iHS or XP-EHH scores. " << endl << endl ;
91 
92   cerr << R"(
93 
94 A cross-population extended haplotype homozygosity (XP-EHH) score is
95 directional: a positive score suggests selection is likely to have
96 happened in population A, whereas a negative score suggests the same
97 about population B. See for example
98 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2687721/
99 
100 )" << endl ;
101 
102   cerr << "Output : normalize-iHS adds one additional column to input (normalized score)." << endl;
103 
104   cerr << "INFO: usage:  normalizeHS -s 0.01 -f input.txt " << endl;
105   cerr << endl;
106   cerr << "INFO: required: -f            -- Output from iHS or XPEHH "   << endl;
107   cerr << "INFO: optional: -s            -- Max AF diff for window [0.01]"   << endl;
108   cerr << endl << "Type: genotype" << endl << endl;
109 
110   cerr << endl;
111 
112   printVersion();
113 }
114 
115 //-------------------------------   OPTIONS   --------------------------------
parseOpts(int argc,char ** argv)116 int parseOpts(int argc, char** argv)
117     {
118     int opt = 0;
119     opt = getopt(argc, argv, optString);
120     while(opt != -1){
121       switch(opt){
122       case 's':
123 	{
124 	  string op = optarg;
125 	  globalOpts.afDiff = atof(op.c_str());
126 	  break;
127 	}
128       case 'h':
129 	{
130 	  printHelp();
131 	  exit(1);
132 	  break;
133 	}
134 
135       case 'f':
136 	{
137 	  globalOpts.file = optarg;
138 	  break;
139 	}
140       case '?':
141 	{
142 	  break;
143 	}
144       }
145 	opt = getopt( argc, argv, optString );
146    }
147 return 1;
148 }
149 
sortAF(iHSdat * L,iHSdat * R)150 bool sortAF(iHSdat * L, iHSdat * R){
151   if(L->af < R->af){
152     return true;
153   }
154   return false;
155 }
156 
157 
158 //------------------------------- SUBROUTINE --------------------------------
159 /*
160  Function input  : vector of doubles
161 
162  Function does   : calculates the var
163 
164  Function returns: double
165 
166 */
167 
var(vector<double> & data,double mu)168 double var(vector<double> & data, double mu){
169   double variance = 0;
170 
171   for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
172     variance += pow((*it) - mu,2);
173   }
174 
175   return variance / (data.size() - 1);
176 }
177 
178 
179 //------------------------------- SUBROUTINE --------------------------------
180 /*
181  Function input  : vector of doubles
182 
183  Function does   : computes the mean
184 
185  Function returns: the mean
186 
187 */
188 
189 
windowAvg(std::vector<double> & rangeData)190 double windowAvg(std::vector<double> & rangeData){
191 
192   long double n = 0;
193   long double s = 0;
194 
195   for(std::vector<double>::iterator it = rangeData.begin(); it != rangeData.end(); it++){
196     s += *it;
197     n += 1;
198   }
199 
200 
201   return (s/n);
202 }
203 
204 
205 
206 //------------------------------- SUBROUTINE --------------------------------
207 /*
208  Function input  : vector of iHS data
209 
210  Function does   : normalizes
211 
212  Function returns: nothing
213 
214 */
215 
normalize(std::vector<iHSdat * > & data,int * pos)216 void normalize(std::vector<iHSdat *> & data, int * pos){
217 
218   std::vector<double> windat;
219 
220   int start = *pos;
221   int end   = *pos;
222 
223   while((abs(data[start]->af - data[end]->af ) < globalOpts.afDiff)
224 	&& end < data.size() -1 ){
225     end += 1;
226   }
227 
228   for(int i = start; i <= end; i++){
229     windat.push_back(data[i]->iHS);
230   }
231 
232   double avg = windowAvg(windat);
233   double sd  = sqrt(var(windat, avg));
234 
235   std::cerr << "start: " << data[start]->af << " "
236 	    << "end: " << data[end]->af  << " "
237             << "n iHS scores: " << windat.size() << " "
238 	    << "mean: " << avg << " "
239 	    << "sd: " << sd << std::endl;
240 
241   for(int i = start; i <= end; i++){
242     data[i]->niHS = (data[i]->iHS - avg) / (sd);
243   }
244 
245   *pos = end;
246 
247 }
248 
249 //-------------------------------    MAIN     --------------------------------
250 /*
251  Comments:
252 */
253 
main(int argc,char ** argv)254 int main( int argc, char** argv)
255 {
256   globalOpts.afDiff = 0.01;
257   int parse = parseOpts(argc, argv);
258 
259   if(globalOpts.file.empty()){
260     std::cerr << "FATAL: no file" << std::endl;
261     exit(1);
262   }
263 
264   std::vector<iHSdat *> data;
265 
266   string line;
267   ifstream myfile (globalOpts.file);
268   if (myfile.is_open())
269     {
270       while ( getline (myfile,line) ){
271 	vector<string> lineDat = split(line, '\t');
272 
273 	iHSdat * tp = new iHSdat;
274 	tp->seqid = lineDat[0];
275 	tp->start = lineDat[1];
276 	tp->af    = atof(lineDat[2].c_str());
277 	tp->ehhR  = atof(lineDat[3].c_str());
278 	tp->ehhA  = atof(lineDat[4].c_str());
279 	tp->iHS   = atof(lineDat[5].c_str());
280 	tp->F1    = lineDat[6].c_str();
281 	tp->F2    = lineDat[7].c_str();
282 	tp->niHS  = 0;
283 
284 	data.push_back(tp);
285 
286       }
287 
288     myfile.close();
289     }
290   else{
291     cerr << "FATAL: could not open file: " << globalOpts.file << endl;
292     exit(1);
293   }
294 
295 
296   std::cerr << "INFO: sorting " << data.size() << " scores by AF" << std::endl;
297 
298   sort(data.begin(), data.end(), sortAF);
299 
300   std::cerr << "INFO: finished sorting" << std::endl;
301 
302   for(int i = 0; i < data.size() ; i++){
303     normalize(data, &i);
304   }
305 
306 
307   for(int i = 0; i < data.size(); i++){
308     std::cout << data[i]->seqid << "\t"
309 	      << data[i]->start << "\t"
310 	      << data[i]->af << "\t"
311 	      << data[i]->ehhR << "\t"
312 	      << data[i]->ehhA << "\t"
313 	      << data[i]->iHS << "\t"
314 	      << data[i]->niHS << "\t"
315 	      << data[i]->F1 << "\t"
316 	      << data[i]->F2 << std::endl;
317 
318   }
319 
320 
321   return 0;
322 }
323