1 /*
2 vcflib C++ library for parsing and manipulating VCF files
3
4 Copyright © 2010-2020 Erik Garrison
5 Copyright © 2020 Pjotr Prins
6
7 This software is published under the MIT License. See the LICENSE file.
8 */
9
10 #include "Variant.h"
11 #include "split.h"
12 #include "cdflib.hpp"
13 #include "pdflib.hpp"
14 #include "var.hpp"
15
16 #include <string>
17 #include <iostream>
18 #include <math.h>
19 #include <cmath>
20 #include <stdlib.h>
21 #include <time.h>
22 #include <stdio.h>
23 #include <getopt.h>
24 #include "gpatInfo.hpp"
25
26 using namespace std;
27 using namespace vcf;
28
29 struct indv{
30 int nhet ;
31 int nhom ;
32 int nalt ;
33 int nocall;
34 };
35
36
37
printHelp(void)38 void printHelp(void){
39
40 cerr << endl << endl;
41 cerr << "INFO: help" << endl;
42 cerr << "INFO: description:" << endl;
43 cerr << " Splits VCF into unique files. This resolves the issue " << endl;
44 cerr << " of same start. " << endl;
45
46 cerr << "Output : N vcf files " << endl;
47
48 cerr << "INFO: usage: splitUniqStarts --file my.vcf --prefix split " << endl;
49 cerr << endl;
50 cerr << "INFO: required: f,file -- proper formatted VCF" << endl;
51 cerr << "INFO: required: p,prefix -- vcf prefix" << endl;
52
53
54 printVersion();
55 }
56
57
main(int argc,char ** argv)58 int main(int argc, char** argv) {
59
60 int maxNfiles = 1;
61
62 string prefix;
63
64 string filename;
65
66 // set region to scaffold
67
68 string region = "NA";
69
70 // using vcflib; thanks to Erik Garrison
71
72
73 // zero based index for the target and background indivudals
74
75
76 const struct option longopts[] =
77 {
78 {"version" , 0, 0, 'v'},
79 {"help" , 0, 0, 'h'},
80 {"file" , 1, 0, 'f'},
81 {"prefix" , 1, 0, 'p'},
82 {0,0,0,0}
83 };
84
85 int index;
86 int iarg=0;
87
88 while(iarg != -1)
89 {
90 iarg = getopt_long(argc, argv, "p:f:r:vh", longopts, &index);
91
92 switch (iarg)
93 {
94 case 'p':
95 {
96 prefix = optarg;
97 break;
98 }
99 case 'h':
100 {
101 printHelp();
102 return 0;
103 }
104 case 'v':
105 {
106 printVersion();
107 return 0;
108 }
109 case 'f':
110 {
111 cerr << "INFO: file: " << optarg << endl;
112 filename = optarg;
113 break;
114 }
115 case 'r':
116 {
117 cerr << "INFO: set seqid region to : " << optarg << endl;
118 region = optarg;
119 break;
120 }
121 default:
122 break;
123 }
124
125 }
126
127 if(filename.empty()){
128 cerr << "FATAL: failed to specify a file" << endl;
129 printHelp();
130 }
131
132 // scope since variant file has no close
133 {
134 VariantCallFile variantFile;
135
136 if(!variantFile.open(filename)){
137 cerr << "FATAL: could not open file for reading" << endl;
138 printHelp();
139 }
140
141 if(region != "NA"){
142 if(! variantFile.setRegion(region)){
143 cerr <<"FATAL: unable to set region" << endl;
144 return 1;
145 }
146 }
147
148 if (!variantFile.is_open()) {
149 cerr << "FATAL: could not open VCF for reading" << endl;
150 printHelp();
151 return 1;
152 }
153
154 Variant var(variantFile);
155
156 std::map<long int, int> seen;
157
158 long int lastP = -1;
159 std::string lastS = "NA";
160
161 while (variantFile.getNextVariant(var)) {
162
163 if(var.position < lastP && var.sequenceName == lastS){
164
165 std::cerr << "FATAL: " << "VCF must be sorted." << std::endl;
166 std::cerr << " lastPos = " << lastP << " currentPos = " << var.position << std::endl;
167 exit(1);
168 }
169 if(lastS != var.sequenceName){
170 for(std::map<long int, int>::iterator it = seen.begin();
171 it != seen.end(); it++){
172 if(it->second > maxNfiles){
173 std::cerr << "INFO: " << lastS << "\t" << it->first << "\t" << it->second << std::endl;
174 maxNfiles = it->second;
175 }
176 }
177 seen.clear();
178 lastS = var.sequenceName;
179 }
180
181 // counting non uniq starts
182 if(var.position == lastP){
183 if(seen.find(var.position) != seen.end()){
184 seen[var.position] += 1;
185 }
186 else{
187 // start with 2 (current and last)
188 seen[var.position] = 2;
189 }
190 }
191 lastP = var.position;
192 }
193 }
194
195 std::cerr << "INFO: " << maxNfiles << " required to uniq start." << std::endl;
196
197 if(maxNfiles == 1){
198 std::cerr << "INFO: all start positions are unique, bye!";
199 return 0;
200
201 }
202
203 {
204 VariantCallFile variantFile;
205 if(!variantFile.open(filename)){
206 std::cerr << "FATAL: could not open VCF" << endl;
207 exit(1);
208 }
209
210 std::map<int, ofstream *> outputFiles;
211 for(int i = 1; i <= maxNfiles; i++){
212 ofstream * oz = new ofstream;
213
214 stringstream fname;
215 fname << prefix << "." << i << ".vcf";
216 oz->open(fname.str().c_str());
217
218 *oz << variantFile.header << std::endl;
219
220 outputFiles[i] = oz;
221 }
222
223 Variant var(variantFile);
224
225 int index = 1;
226
227 while (variantFile.getNextVariant(var)) {
228 *(outputFiles[index]) << var << std::endl;
229 index += 1;
230 if(index > maxNfiles){
231 index = 1;
232 }
233 }
234 for(int i = 1; i <= maxNfiles; i++){
235 outputFiles[i]->close();
236 }
237 }
238 return 0;
239 }
240