1 /*
2 vcflib C++ library for parsing and manipulating VCF files
3
4 Copyright © 2010-2020 Erik Garrison
5 Copyright © 2020 Pjotr Prins
6
7 This software is published under the MIT License. See the LICENSE file.
8 */
9
10
11 /*
12
13 This program was created at: Tue Sep 8 21:05:23 2015
14 This program was created by: Zev N. Kronenberg
15
16
17 Contact: zev.kronenber@gmail.com
18
19 Organization: Unviersity of Utah
20 School of Medicine
21 Salt Lake City, Utah
22
23
24 The MIT License (MIT)
25
26 Copyright (c) <2015> <Zev N. Kronenberg>
27
28 Permission is hereby granted, free of charge, to any person obtaining a copy
29 of this software and associated documentation files (the "Software"), to deal
30 in the Software without restriction, including without limitation the rights
31 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
32 copies of the Software, and to permit persons to whom the Software is
33 furnished to do so, subject to the following conditions:
34
35 The above copyright notice and this permission notice shall be included in
36 all copies or substantial portions of the Software.
37
38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
44 THE SOFTWARE.
45
46
47 */
48
49 #include <string>
50 #include <iostream>
51 #include <fstream>
52 #include <math.h>
53 #include <cmath>
54 #include <stdlib.h>
55 #include <time.h>
56 #include <stdio.h>
57 #include <unistd.h>
58 #include <algorithm>
59 #include "split.h"
60 #include "gpatInfo.hpp"
61
62 struct options{
63 std::string file;
64 double afDiff;
65 }globalOpts;
66
67 struct iHSdat{
68 std::string seqid;
69 std::string start;
70 std::string F1 ;
71 std::string F2 ;
72
73 double af ;
74 double ehhR ;
75 double ehhA ;
76 double iHS ;
77 double niHS ;
78 };
79
80
81 using namespace std;
82
83 static const char *optString = "hf:s:";
84
printHelp(void)85 void printHelp(void){
86
87 cerr << endl << endl;
88 cerr << "INFO: help" << endl;
89 cerr << "INFO: description:" << endl;
90 cerr << " normalizes iHS or XP-EHH scores. " << endl << endl ;
91
92 cerr << R"(
93
94 A cross-population extended haplotype homozygosity (XP-EHH) score is
95 directional: a positive score suggests selection is likely to have
96 happened in population A, whereas a negative score suggests the same
97 about population B. See for example
98 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2687721/
99
100 )" << endl ;
101
102 cerr << "Output : normalize-iHS adds one additional column to input (normalized score)." << endl;
103
104 cerr << "INFO: usage: normalizeHS -s 0.01 -f input.txt " << endl;
105 cerr << endl;
106 cerr << "INFO: required: -f -- Output from iHS or XPEHH " << endl;
107 cerr << "INFO: optional: -s -- Max AF diff for window [0.01]" << endl;
108 cerr << endl << "Type: genotype" << endl << endl;
109
110 cerr << endl;
111
112 printVersion();
113 }
114
115 //------------------------------- OPTIONS --------------------------------
parseOpts(int argc,char ** argv)116 int parseOpts(int argc, char** argv)
117 {
118 int opt = 0;
119 opt = getopt(argc, argv, optString);
120 while(opt != -1){
121 switch(opt){
122 case 's':
123 {
124 string op = optarg;
125 globalOpts.afDiff = atof(op.c_str());
126 break;
127 }
128 case 'h':
129 {
130 printHelp();
131 exit(1);
132 break;
133 }
134
135 case 'f':
136 {
137 globalOpts.file = optarg;
138 break;
139 }
140 case '?':
141 {
142 break;
143 }
144 }
145 opt = getopt( argc, argv, optString );
146 }
147 return 1;
148 }
149
sortAF(iHSdat * L,iHSdat * R)150 bool sortAF(iHSdat * L, iHSdat * R){
151 if(L->af < R->af){
152 return true;
153 }
154 return false;
155 }
156
157
158 //------------------------------- SUBROUTINE --------------------------------
159 /*
160 Function input : vector of doubles
161
162 Function does : calculates the var
163
164 Function returns: double
165
166 */
167
var(vector<double> & data,double mu)168 double var(vector<double> & data, double mu){
169 double variance = 0;
170
171 for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
172 variance += pow((*it) - mu,2);
173 }
174
175 return variance / (data.size() - 1);
176 }
177
178
179 //------------------------------- SUBROUTINE --------------------------------
180 /*
181 Function input : vector of doubles
182
183 Function does : computes the mean
184
185 Function returns: the mean
186
187 */
188
189
windowAvg(std::vector<double> & rangeData)190 double windowAvg(std::vector<double> & rangeData){
191
192 long double n = 0;
193 long double s = 0;
194
195 for(std::vector<double>::iterator it = rangeData.begin(); it != rangeData.end(); it++){
196 s += *it;
197 n += 1;
198 }
199
200
201 return (s/n);
202 }
203
204
205
206 //------------------------------- SUBROUTINE --------------------------------
207 /*
208 Function input : vector of iHS data
209
210 Function does : normalizes
211
212 Function returns: nothing
213
214 */
215
normalize(std::vector<iHSdat * > & data,int * pos)216 void normalize(std::vector<iHSdat *> & data, int * pos){
217
218 std::vector<double> windat;
219
220 int start = *pos;
221 int end = *pos;
222
223 while((abs(data[start]->af - data[end]->af ) < globalOpts.afDiff)
224 && end < data.size() -1 ){
225 end += 1;
226 }
227
228 for(int i = start; i <= end; i++){
229 windat.push_back(data[i]->iHS);
230 }
231
232 double avg = windowAvg(windat);
233 double sd = sqrt(var(windat, avg));
234
235 std::cerr << "start: " << data[start]->af << " "
236 << "end: " << data[end]->af << " "
237 << "n iHS scores: " << windat.size() << " "
238 << "mean: " << avg << " "
239 << "sd: " << sd << std::endl;
240
241 for(int i = start; i <= end; i++){
242 data[i]->niHS = (data[i]->iHS - avg) / (sd);
243 }
244
245 *pos = end;
246
247 }
248
249 //------------------------------- MAIN --------------------------------
250 /*
251 Comments:
252 */
253
main(int argc,char ** argv)254 int main( int argc, char** argv)
255 {
256 globalOpts.afDiff = 0.01;
257 int parse = parseOpts(argc, argv);
258
259 if(globalOpts.file.empty()){
260 std::cerr << "FATAL: no file" << std::endl;
261 exit(1);
262 }
263
264 std::vector<iHSdat *> data;
265
266 string line;
267 ifstream myfile (globalOpts.file);
268 if (myfile.is_open())
269 {
270 while ( getline (myfile,line) ){
271 vector<string> lineDat = split(line, '\t');
272
273 iHSdat * tp = new iHSdat;
274 tp->seqid = lineDat[0];
275 tp->start = lineDat[1];
276 tp->af = atof(lineDat[2].c_str());
277 tp->ehhR = atof(lineDat[3].c_str());
278 tp->ehhA = atof(lineDat[4].c_str());
279 tp->iHS = atof(lineDat[5].c_str());
280 tp->F1 = lineDat[6].c_str();
281 tp->F2 = lineDat[7].c_str();
282 tp->niHS = 0;
283
284 data.push_back(tp);
285
286 }
287
288 myfile.close();
289 }
290 else{
291 cerr << "FATAL: could not open file: " << globalOpts.file << endl;
292 exit(1);
293 }
294
295
296 std::cerr << "INFO: sorting " << data.size() << " scores by AF" << std::endl;
297
298 sort(data.begin(), data.end(), sortAF);
299
300 std::cerr << "INFO: finished sorting" << std::endl;
301
302 for(int i = 0; i < data.size() ; i++){
303 normalize(data, &i);
304 }
305
306
307 for(int i = 0; i < data.size(); i++){
308 std::cout << data[i]->seqid << "\t"
309 << data[i]->start << "\t"
310 << data[i]->af << "\t"
311 << data[i]->ehhR << "\t"
312 << data[i]->ehhA << "\t"
313 << data[i]->iHS << "\t"
314 << data[i]->niHS << "\t"
315 << data[i]->F1 << "\t"
316 << data[i]->F2 << std::endl;
317
318 }
319
320
321 return 0;
322 }
323