1
2 /*
3
4 This program was created at: Tue Sep 8 21:05:23 2015
5 This program was created by: Zev N. Kronenberg
6
7
8 Contact: zev.kronenber@gmail.com
9
10 Organization: Unviersity of Utah
11 School of Medicine
12 Salt Lake City, Utah
13
14
15 The MIT License (MIT)
16
17 Copyright (c) <2015> <Zev N. Kronenberg>
18
19 Permission is hereby granted, free of charge, to any person obtaining a copy
20 of this software and associated documentation files (the "Software"), to deal
21 in the Software without restriction, including without limitation the rights
22 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
23 copies of the Software, and to permit persons to whom the Software is
24 furnished to do so, subject to the following conditions:
25
26 The above copyright notice and this permission notice shall be included in
27 all copies or substantial portions of the Software.
28
29 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
34 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
35 THE SOFTWARE.
36
37
38 */
39
40 #include <string>
41 #include <iostream>
42 #include <fstream>
43 #include <math.h>
44 #include <cmath>
45 #include <stdlib.h>
46 #include <time.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <algorithm>
50 #include "split.h"
51 #include "gpatInfo.hpp"
52
53 struct options{
54 std::string file;
55 double afDiff;
56 }globalOpts;
57
58 struct iHSdat{
59 std::string seqid;
60 std::string start;
61 std::string F1 ;
62 std::string F2 ;
63
64 double af ;
65 double ehhR ;
66 double ehhA ;
67 double iHS ;
68 double niHS ;
69 };
70
71
72 using namespace std;
73
74 static const char *optString = "hf:s:";
75
printHelp(void)76 void printHelp(void){
77
78 cerr << endl << endl;
79 cerr << "INFO: help" << endl;
80 cerr << "INFO: description:" << endl;
81 cerr << " normalizes iHS or XP-EHH scores " << endl;
82
83 cerr << "Output : normalize-iHS adds one additional column to input (normalized score)." << endl;
84
85 cerr << "INFO: usage: normalizeHS -s 0.01 -f input.txt " << endl;
86 cerr << endl;
87 cerr << "INFO: required: -f -- Output from iHS or XPEHH " << endl;
88 cerr << "INFO: optional: -s -- Max AF diff for window [0.01]" << endl;
89
90 cerr << endl;
91
92 printVersion();
93 }
94
95 //------------------------------- OPTIONS --------------------------------
parseOpts(int argc,char ** argv)96 int parseOpts(int argc, char** argv)
97 {
98 int opt = 0;
99 opt = getopt(argc, argv, optString);
100 while(opt != -1){
101 switch(opt){
102 case 's':
103 {
104 string op = optarg;
105 globalOpts.afDiff = atof(op.c_str());
106 break;
107 }
108 case 'h':
109 {
110 printHelp();
111 exit(1);
112 break;
113 }
114
115 case 'f':
116 {
117 globalOpts.file = optarg;
118 break;
119 }
120 case '?':
121 {
122 break;
123 }
124 }
125 opt = getopt( argc, argv, optString );
126 }
127 return 1;
128 }
129
sortAF(iHSdat * L,iHSdat * R)130 bool sortAF(iHSdat * L, iHSdat * R){
131 if(L->af < R->af){
132 return true;
133 }
134 return false;
135 }
136
137
138 //------------------------------- SUBROUTINE --------------------------------
139 /*
140 Function input : vector of doubles
141
142 Function does : calculates the var
143
144 Function returns: double
145
146 */
147
var(vector<double> & data,double mu)148 double var(vector<double> & data, double mu){
149 double variance = 0;
150
151 for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
152 variance += pow((*it) - mu,2);
153 }
154
155 return variance / (data.size() - 1);
156 }
157
158
159 //------------------------------- SUBROUTINE --------------------------------
160 /*
161 Function input : vector of doubles
162
163 Function does : computes the mean
164
165 Function returns: the mean
166
167 */
168
169
windowAvg(std::vector<double> & rangeData)170 double windowAvg(std::vector<double> & rangeData){
171
172 long double n = 0;
173 long double s = 0;
174
175 for(std::vector<double>::iterator it = rangeData.begin(); it != rangeData.end(); it++){
176 s += *it;
177 n += 1;
178 }
179
180
181 return (s/n);
182 }
183
184
185
186 //------------------------------- SUBROUTINE --------------------------------
187 /*
188 Function input : vector of iHS data
189
190 Function does : normalizes
191
192 Function returns: nothing
193
194 */
195
normalize(std::vector<iHSdat * > & data,int * pos)196 void normalize(std::vector<iHSdat *> & data, int * pos){
197
198 std::vector<double> windat;
199
200 int start = *pos;
201 int end = *pos;
202
203 while((abs(data[start]->af - data[end]->af ) < globalOpts.afDiff)
204 && end < data.size() -1 ){
205 end += 1;
206 }
207
208 for(int i = start; i <= end; i++){
209 windat.push_back(data[i]->iHS);
210 }
211
212 double avg = windowAvg(windat);
213 double sd = sqrt(var(windat, avg));
214
215 std::cerr << "start: " << data[start]->af << " "
216 << "end: " << data[end]->af << " "
217 << "n iHS scores: " << windat.size() << " "
218 << "mean: " << avg << " "
219 << "sd: " << sd << std::endl;
220
221 for(int i = start; i <= end; i++){
222 data[i]->niHS = (data[i]->iHS - avg) / (sd);
223 }
224
225 *pos = end;
226
227 }
228
229 //------------------------------- MAIN --------------------------------
230 /*
231 Comments:
232 */
233
main(int argc,char ** argv)234 int main( int argc, char** argv)
235 {
236 globalOpts.afDiff = 0.01;
237 int parse = parseOpts(argc, argv);
238
239 if(globalOpts.file.empty()){
240 std::cerr << "FATAL: no file" << std::endl;
241 exit(1);
242 }
243
244 std::vector<iHSdat *> data;
245
246 string line;
247 ifstream myfile (globalOpts.file);
248 if (myfile.is_open())
249 {
250 while ( getline (myfile,line) ){
251 vector<string> lineDat = split(line, '\t');
252
253 iHSdat * tp = new iHSdat;
254 tp->seqid = lineDat[0];
255 tp->start = lineDat[1];
256 tp->af = atof(lineDat[2].c_str());
257 tp->ehhR = atof(lineDat[3].c_str());
258 tp->ehhA = atof(lineDat[4].c_str());
259 tp->iHS = atof(lineDat[5].c_str());
260 tp->F1 = lineDat[6].c_str();
261 tp->F2 = lineDat[7].c_str();
262 tp->niHS = 0;
263
264 data.push_back(tp);
265
266 }
267
268 myfile.close();
269 }
270 else{
271 cerr << "FATAL: could not open file: " << globalOpts.file << endl;
272 exit(1);
273 }
274
275
276 std::cerr << "INFO: sorting " << data.size() << " scores by AF" << std::endl;
277
278 sort(data.begin(), data.end(), sortAF);
279
280 std::cerr << "INFO: finished sorting" << std::endl;
281
282 for(int i = 0; i < data.size() ; i++){
283 normalize(data, &i);
284 }
285
286
287 for(int i = 0; i < data.size(); i++){
288 std::cout << data[i]->seqid << "\t"
289 << data[i]->start << "\t"
290 << data[i]->af << "\t"
291 << data[i]->ehhR << "\t"
292 << data[i]->ehhA << "\t"
293 << data[i]->iHS << "\t"
294 << data[i]->niHS << "\t"
295 << data[i]->F1 << "\t"
296 << data[i]->F2 << std::endl;
297
298 }
299
300
301 return 0;
302 }
303