1 #include "Variant.h"
2 #include "split.h"
3 #include "cdflib.hpp"
4 #include "pdflib.hpp"
5 #include "var.hpp"
6
7 #include <string>
8 #include <iostream>
9 #include <math.h>
10 #include <cmath>
11 #include <stdlib.h>
12 #include <time.h>
13 #include <stdio.h>
14 #include <getopt.h>
15 #include "gpatInfo.hpp"
16
17 using namespace std;
18 using namespace vcf;
19
20 struct indv{
21 int nhet ;
22 int nhom ;
23 int nalt ;
24 int nocall;
25 };
26
27
28
printHelp(void)29 void printHelp(void){
30
31 cerr << endl << endl;
32 cerr << "INFO: help" << endl;
33 cerr << "INFO: description:" << endl;
34 cerr << " Splits VCF into unique files. This resolves the issue " << endl;
35 cerr << " of same start. " << endl;
36
37 cerr << "Output : N vcf files " << endl;
38
39 cerr << "INFO: usage: splitUniqStarts --file my.vcf --prefix split " << endl;
40 cerr << endl;
41 cerr << "INFO: required: f,file -- proper formatted VCF" << endl;
42 cerr << "INFO: required: p,prefix -- vcf prefix" << endl;
43
44
45 printVersion();
46 }
47
48
main(int argc,char ** argv)49 int main(int argc, char** argv) {
50
51 int maxNfiles = 1;
52
53 string prefix;
54
55 string filename;
56
57 // set region to scaffold
58
59 string region = "NA";
60
61 // using vcflib; thanks to Erik Garrison
62
63
64 // zero based index for the target and background indivudals
65
66
67 const struct option longopts[] =
68 {
69 {"version" , 0, 0, 'v'},
70 {"help" , 0, 0, 'h'},
71 {"file" , 1, 0, 'f'},
72 {"prefix" , 1, 0, 'p'},
73 {0,0,0,0}
74 };
75
76 int index;
77 int iarg=0;
78
79 while(iarg != -1)
80 {
81 iarg = getopt_long(argc, argv, "p:f:r:vh", longopts, &index);
82
83 switch (iarg)
84 {
85 case 'p':
86 {
87 prefix = optarg;
88 break;
89 }
90 case 'h':
91 {
92 printHelp();
93 return 0;
94 }
95 case 'v':
96 {
97 printVersion();
98 return 0;
99 }
100 case 'f':
101 {
102 cerr << "INFO: file: " << optarg << endl;
103 filename = optarg;
104 break;
105 }
106 case 'r':
107 {
108 cerr << "INFO: set seqid region to : " << optarg << endl;
109 region = optarg;
110 break;
111 }
112 default:
113 break;
114 }
115
116 }
117
118 if(filename.empty()){
119 cerr << "FATAL: failed to specify a file" << endl;
120 printHelp();
121 }
122
123 // scope since variant file has no close
124 {
125 VariantCallFile variantFile;
126
127 if(!variantFile.open(filename)){
128 cerr << "FATAL: could not open file for reading" << endl;
129 printHelp();
130 }
131
132 if(region != "NA"){
133 if(! variantFile.setRegion(region)){
134 cerr <<"FATAL: unable to set region" << endl;
135 return 1;
136 }
137 }
138
139 if (!variantFile.is_open()) {
140 cerr << "FATAL: could not open VCF for reading" << endl;
141 printHelp();
142 return 1;
143 }
144
145 Variant var(variantFile);
146
147 std::map<long int, int> seen;
148
149 long int lastP = -1;
150 std::string lastS = "NA";
151
152 while (variantFile.getNextVariant(var)) {
153
154 if(var.position < lastP && var.sequenceName == lastS){
155
156 std::cerr << "FATAL: " << "VCF must be sorted." << std::endl;
157 std::cerr << " lastPos = " << lastP << " currentPos = " << var.position << std::endl;
158 exit(1);
159 }
160 if(lastS != var.sequenceName){
161 for(std::map<long int, int>::iterator it = seen.begin();
162 it != seen.end(); it++){
163 if(it->second > maxNfiles){
164 std::cerr << "INFO: " << lastS << "\t" << it->first << "\t" << it->second << std::endl;
165 maxNfiles = it->second;
166 }
167 }
168 seen.clear();
169 lastS = var.sequenceName;
170 }
171
172 // counting non uniq starts
173 if(var.position == lastP){
174 if(seen.find(var.position) != seen.end()){
175 seen[var.position] += 1;
176 }
177 else{
178 // start with 2 (current and last)
179 seen[var.position] = 2;
180 }
181 }
182 lastP = var.position;
183 }
184 }
185
186 std::cerr << "INFO: " << maxNfiles << " required to uniq start." << std::endl;
187
188 if(maxNfiles == 1){
189 std::cerr << "INFO: all start positions are unique, bye!";
190 return 0;
191
192 }
193
194 {
195 VariantCallFile variantFile;
196 if(!variantFile.open(filename)){
197 std::cerr << "FATAL: could not open VCF" << endl;
198 exit(1);
199 }
200
201 std::map<int, ofstream *> outputFiles;
202 for(int i = 1; i <= maxNfiles; i++){
203 ofstream * oz = new ofstream;
204
205 stringstream fname;
206 fname << prefix << "." << i << ".vcf";
207 oz->open(fname.str().c_str());
208
209 *oz << variantFile.header << std::endl;
210
211 outputFiles[i] = oz;
212 }
213
214 Variant var(variantFile);
215
216 int index = 1;
217
218 while (variantFile.getNextVariant(var)) {
219 *(outputFiles[index]) << var << std::endl;
220 index += 1;
221 if(index > maxNfiles){
222 index = 1;
223 }
224 }
225 for(int i = 1; i <= maxNfiles; i++){
226 outputFiles[i]->close();
227 }
228 }
229 return 0;
230 }
231