1 
2 /*
3 
4 This program was created at:  Tue Sep  8 21:05:23 2015
5 This program was created by:  Zev N. Kronenberg
6 
7 
8 Contact: zev.kronenber@gmail.com
9 
10 Organization: Unviersity of Utah
11     School of Medicine
12     Salt Lake City, Utah
13 
14 
15 The MIT License (MIT)
16 
17 Copyright (c) <2015> <Zev N. Kronenberg>
18 
19 Permission is hereby granted, free of charge, to any person obtaining a copy
20 of this software and associated documentation files (the "Software"), to deal
21 in the Software without restriction, including without limitation the rights
22 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
23 copies of the Software, and to permit persons to whom the Software is
24 furnished to do so, subject to the following conditions:
25 
26 The above copyright notice and this permission notice shall be included in
27 all copies or substantial portions of the Software.
28 
29 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
34 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
35 THE SOFTWARE.
36 
37 
38 */
39 
40 #include <string>
41 #include <iostream>
42 #include <fstream>
43 #include <math.h>
44 #include <cmath>
45 #include <stdlib.h>
46 #include <time.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <algorithm>
50 #include "split.h"
51 #include "gpatInfo.hpp"
52 
53 struct options{
54   std::string file;
55   double afDiff;
56 }globalOpts;
57 
58 struct iHSdat{
59   std::string seqid;
60   std::string start;
61   std::string F1   ;
62   std::string F2   ;
63 
64   double af   ;
65   double ehhR ;
66   double ehhA ;
67   double iHS  ;
68   double niHS ;
69 };
70 
71 
72 using namespace std;
73 
74 static const char *optString = "hf:s:";
75 
printHelp(void)76 void printHelp(void){
77 
78   cerr << endl << endl;
79   cerr << "INFO: help" << endl;
80   cerr << "INFO: description:" << endl;
81   cerr << "      normalizes iHS or XP-EHH scores  " << endl;
82 
83   cerr << "Output : normalize-iHS adds one additional column to input (normalized score)." << endl;
84 
85   cerr << "INFO: usage:  normalizeHS -s 0.01 -f input.txt " << endl;
86   cerr << endl;
87   cerr << "INFO: required: -f            -- Output from iHS or XPEHH "   << endl;
88   cerr << "INFO: optional: -s            -- Max AF diff for window [0.01]"   << endl;
89 
90   cerr << endl;
91 
92   printVersion();
93 }
94 
95 //-------------------------------   OPTIONS   --------------------------------
parseOpts(int argc,char ** argv)96 int parseOpts(int argc, char** argv)
97     {
98     int opt = 0;
99     opt = getopt(argc, argv, optString);
100     while(opt != -1){
101       switch(opt){
102       case 's':
103 	{
104 	  string op = optarg;
105 	  globalOpts.afDiff = atof(op.c_str());
106 	  break;
107 	}
108       case 'h':
109 	{
110 	  printHelp();
111 	  exit(1);
112 	  break;
113 	}
114 
115       case 'f':
116 	{
117 	  globalOpts.file = optarg;
118 	  break;
119 	}
120       case '?':
121 	{
122 	  break;
123 	}
124       }
125 	opt = getopt( argc, argv, optString );
126    }
127 return 1;
128 }
129 
sortAF(iHSdat * L,iHSdat * R)130 bool sortAF(iHSdat * L, iHSdat * R){
131   if(L->af < R->af){
132     return true;
133   }
134   return false;
135 }
136 
137 
138 //------------------------------- SUBROUTINE --------------------------------
139 /*
140  Function input  : vector of doubles
141 
142  Function does   : calculates the var
143 
144  Function returns: double
145 
146 */
147 
var(vector<double> & data,double mu)148 double var(vector<double> & data, double mu){
149   double variance = 0;
150 
151   for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
152     variance += pow((*it) - mu,2);
153   }
154 
155   return variance / (data.size() - 1);
156 }
157 
158 
159 //------------------------------- SUBROUTINE --------------------------------
160 /*
161  Function input  : vector of doubles
162 
163  Function does   : computes the mean
164 
165  Function returns: the mean
166 
167 */
168 
169 
windowAvg(std::vector<double> & rangeData)170 double windowAvg(std::vector<double> & rangeData){
171 
172   long double n = 0;
173   long double s = 0;
174 
175   for(std::vector<double>::iterator it = rangeData.begin(); it != rangeData.end(); it++){
176     s += *it;
177     n += 1;
178   }
179 
180 
181   return (s/n);
182 }
183 
184 
185 
186 //------------------------------- SUBROUTINE --------------------------------
187 /*
188  Function input  : vector of iHS data
189 
190  Function does   : normalizes
191 
192  Function returns: nothing
193 
194 */
195 
normalize(std::vector<iHSdat * > & data,int * pos)196 void normalize(std::vector<iHSdat *> & data, int * pos){
197 
198   std::vector<double> windat;
199 
200   int start = *pos;
201   int end   = *pos;
202 
203   while((abs(data[start]->af - data[end]->af ) < globalOpts.afDiff)
204 	&& end < data.size() -1 ){
205     end += 1;
206   }
207 
208   for(int i = start; i <= end; i++){
209     windat.push_back(data[i]->iHS);
210   }
211 
212   double avg = windowAvg(windat);
213   double sd  = sqrt(var(windat, avg));
214 
215   std::cerr << "start: " << data[start]->af << " "
216 	    << "end: " << data[end]->af  << " "
217             << "n iHS scores: " << windat.size() << " "
218 	    << "mean: " << avg << " "
219 	    << "sd: " << sd << std::endl;
220 
221   for(int i = start; i <= end; i++){
222     data[i]->niHS = (data[i]->iHS - avg) / (sd);
223   }
224 
225   *pos = end;
226 
227 }
228 
229 //-------------------------------    MAIN     --------------------------------
230 /*
231  Comments:
232 */
233 
main(int argc,char ** argv)234 int main( int argc, char** argv)
235 {
236   globalOpts.afDiff = 0.01;
237   int parse = parseOpts(argc, argv);
238 
239   if(globalOpts.file.empty()){
240     std::cerr << "FATAL: no file" << std::endl;
241     exit(1);
242   }
243 
244   std::vector<iHSdat *> data;
245 
246   string line;
247   ifstream myfile (globalOpts.file);
248   if (myfile.is_open())
249     {
250       while ( getline (myfile,line) ){
251 	vector<string> lineDat = split(line, '\t');
252 
253 	iHSdat * tp = new iHSdat;
254 	tp->seqid = lineDat[0];
255 	tp->start = lineDat[1];
256 	tp->af    = atof(lineDat[2].c_str());
257 	tp->ehhR  = atof(lineDat[3].c_str());
258 	tp->ehhA  = atof(lineDat[4].c_str());
259 	tp->iHS   = atof(lineDat[5].c_str());
260 	tp->F1    = lineDat[6].c_str();
261 	tp->F2    = lineDat[7].c_str();
262 	tp->niHS  = 0;
263 
264 	data.push_back(tp);
265 
266       }
267 
268     myfile.close();
269     }
270   else{
271     cerr << "FATAL: could not open file: " << globalOpts.file << endl;
272     exit(1);
273   }
274 
275 
276   std::cerr << "INFO: sorting " << data.size() << " scores by AF" << std::endl;
277 
278   sort(data.begin(), data.end(), sortAF);
279 
280   std::cerr << "INFO: finished sorting" << std::endl;
281 
282   for(int i = 0; i < data.size() ; i++){
283     normalize(data, &i);
284   }
285 
286 
287   for(int i = 0; i < data.size(); i++){
288     std::cout << data[i]->seqid << "\t"
289 	      << data[i]->start << "\t"
290 	      << data[i]->af << "\t"
291 	      << data[i]->ehhR << "\t"
292 	      << data[i]->ehhA << "\t"
293 	      << data[i]->iHS << "\t"
294 	      << data[i]->niHS << "\t"
295 	      << data[i]->F1 << "\t"
296 	      << data[i]->F2 << std::endl;
297 
298   }
299 
300 
301   return 0;
302 }
303