1 #include "Variant.h"
2 #include "split.h"
3 #include "cdflib.hpp"
4 #include "pdflib.hpp"
5 #include "var.hpp"
6 
7 #include <string>
8 #include <iostream>
9 #include <math.h>
10 #include <cmath>
11 #include <stdlib.h>
12 #include <time.h>
13 #include <stdio.h>
14 #include <getopt.h>
15 #include "gpatInfo.hpp"
16 
17 using namespace std;
18 using namespace vcf;
19 
20 struct indv{
21   int nhet  ;
22   int nhom  ;
23   int nalt  ;
24   int nocall;
25 };
26 
27 
28 
printHelp(void)29 void printHelp(void){
30 
31   cerr << endl << endl;
32   cerr << "INFO: help" << endl;
33   cerr << "INFO: description:" << endl;
34   cerr << "      Splits VCF into unique files.  This resolves the issue " << endl;
35   cerr << "      of same start. " << endl;
36 
37   cerr << "Output : N vcf files                 "    << endl;
38 
39   cerr << "INFO: usage:  splitUniqStarts --file my.vcf --prefix split " << endl;
40   cerr << endl;
41   cerr << "INFO: required: f,file    -- proper formatted VCF" << endl;
42   cerr << "INFO: required: p,prefix    -- vcf prefix" << endl;
43 
44 
45   printVersion();
46 }
47 
48 
main(int argc,char ** argv)49 int main(int argc, char** argv) {
50 
51   int maxNfiles = 1;
52 
53   string prefix;
54 
55   string filename;
56 
57   // set region to scaffold
58 
59   string region = "NA";
60 
61   // using vcflib; thanks to Erik Garrison
62 
63 
64   // zero based index for the target and background indivudals
65 
66 
67     const struct option longopts[] =
68       {
69 	{"version"   , 0, 0, 'v'},
70 	{"help"      , 0, 0, 'h'},
71         {"file"      , 1, 0, 'f'},
72 	{"prefix"    , 1, 0, 'p'},
73 	{0,0,0,0}
74       };
75 
76     int index;
77     int iarg=0;
78 
79     while(iarg != -1)
80       {
81 	iarg = getopt_long(argc, argv, "p:f:r:vh", longopts, &index);
82 
83 	switch (iarg)
84 	  {
85 	  case 'p':
86 	    {
87 	      prefix = optarg;
88 	      break;
89 	    }
90 	  case 'h':
91 	    {
92 	      printHelp();
93 	      return 0;
94 	    }
95 	  case 'v':
96 	    {
97 	      printVersion();
98 	      return 0;
99 	    }
100 	  case 'f':
101 	    {
102 	      cerr << "INFO: file: " << optarg  <<  endl;
103 	      filename = optarg;
104 	      break;
105 	    }
106 	  case 'r':
107 	    {
108 	      cerr << "INFO: set seqid region to : " << optarg << endl;
109 	      region = optarg;
110 	      break;
111 	    }
112 	  default:
113 	    break;
114 	  }
115 
116       }
117 
118     if(filename.empty()){
119       cerr << "FATAL: failed to specify a file" << endl;
120       printHelp();
121     }
122 
123     // scope since variant file has no close
124     {
125       VariantCallFile variantFile;
126 
127       if(!variantFile.open(filename)){
128 	cerr << "FATAL: could not open file for reading" << endl;
129 	printHelp();
130       }
131 
132       if(region != "NA"){
133 	if(! variantFile.setRegion(region)){
134 	  cerr <<"FATAL: unable to set region" << endl;
135 	  return 1;
136 	}
137       }
138 
139       if (!variantFile.is_open()) {
140 	cerr << "FATAL: could not open VCF for reading" << endl;
141 	printHelp();
142 	return 1;
143       }
144 
145       Variant var(variantFile);
146 
147       std::map<long int, int> seen;
148 
149       long int lastP    = -1;
150       std::string lastS = "NA";
151 
152       while (variantFile.getNextVariant(var)) {
153 
154 	if(var.position < lastP && var.sequenceName == lastS){
155 
156 	  std::cerr << "FATAL: " << "VCF must be sorted." << std::endl;
157 	  std::cerr << "       lastPos = " << lastP << " currentPos = " << var.position << std::endl;
158 	  exit(1);
159 	}
160 	if(lastS != var.sequenceName){
161 	  for(std::map<long int, int>::iterator it = seen.begin();
162 	      it != seen.end(); it++){
163 	    if(it->second > maxNfiles){
164 std::cerr << "INFO: " << lastS << "\t" << it->first << "\t" << it->second << std::endl;
165 	      maxNfiles = it->second;
166 	    }
167 	  }
168 	  seen.clear();
169 	  lastS = var.sequenceName;
170 	}
171 
172 	// counting non uniq starts
173 	if(var.position == lastP){
174 	  if(seen.find(var.position) != seen.end()){
175 	    seen[var.position] += 1;
176 	  }
177 	  else{
178 	    // start with 2 (current and last)
179 	    seen[var.position] = 2;
180 	  }
181 	}
182 	lastP = var.position;
183       }
184     }
185 
186     std::cerr << "INFO: " << maxNfiles << " required to uniq start." << std::endl;
187 
188     if(maxNfiles == 1){
189       std::cerr << "INFO: all start positions are unique, bye!";
190       return 0;
191 
192     }
193 
194     {
195       VariantCallFile variantFile;
196       if(!variantFile.open(filename)){
197 	std::cerr << "FATAL: could not open VCF" << endl;
198 	exit(1);
199       }
200 
201       std::map<int, ofstream *> outputFiles;
202       for(int i = 1; i <= maxNfiles; i++){
203 	ofstream * oz = new ofstream;
204 
205 	stringstream fname;
206         fname << prefix << "." << i << ".vcf";
207 	oz->open(fname.str().c_str());
208 
209 	*oz << variantFile.header << std::endl;
210 
211 	outputFiles[i] = oz;
212       }
213 
214       Variant var(variantFile);
215 
216       int index = 1;
217 
218       while (variantFile.getNextVariant(var)) {
219 	*(outputFiles[index]) <<  var << std::endl;
220 	index += 1;
221 	if(index > maxNfiles){
222 	  index = 1;
223 	}
224       }
225       for(int i = 1; i <= maxNfiles; i++){
226 	outputFiles[i]->close();
227       }
228     }
229     return 0;
230 }
231