1 /*
2     vcflib C++ library for parsing and manipulating VCF files
3 
4     Copyright © 2010-2020 Erik Garrison
5     Copyright © 2020      Pjotr Prins
6 
7     This software is published under the MIT License. See the LICENSE file.
8 */
9 
10 #include "Variant.h"
11 #include "split.h"
12 #include "cdflib.hpp"
13 #include "pdflib.hpp"
14 #include "var.hpp"
15 
16 #include <string>
17 #include <iostream>
18 #include <math.h>
19 #include <cmath>
20 #include <stdlib.h>
21 #include <time.h>
22 #include <stdio.h>
23 #include <getopt.h>
24 #include "gpatInfo.hpp"
25 
26 using namespace std;
27 using namespace vcf;
28 
29 struct indv{
30   int nhet  ;
31   int nhom  ;
32   int nalt  ;
33   int nocall;
34 };
35 
36 
37 
printHelp(void)38 void printHelp(void){
39 
40   cerr << endl << endl;
41   cerr << "INFO: help" << endl;
42   cerr << "INFO: description:" << endl;
43   cerr << "      Splits VCF into unique files.  This resolves the issue " << endl;
44   cerr << "      of same start. " << endl;
45 
46   cerr << "Output : N vcf files                 "    << endl;
47 
48   cerr << "INFO: usage:  splitUniqStarts --file my.vcf --prefix split " << endl;
49   cerr << endl;
50   cerr << "INFO: required: f,file    -- proper formatted VCF" << endl;
51   cerr << "INFO: required: p,prefix    -- vcf prefix" << endl;
52 
53 
54   printVersion();
55 }
56 
57 
main(int argc,char ** argv)58 int main(int argc, char** argv) {
59 
60   int maxNfiles = 1;
61 
62   string prefix;
63 
64   string filename;
65 
66   // set region to scaffold
67 
68   string region = "NA";
69 
70   // using vcflib; thanks to Erik Garrison
71 
72 
73   // zero based index for the target and background indivudals
74 
75 
76     const struct option longopts[] =
77       {
78 	{"version"   , 0, 0, 'v'},
79 	{"help"      , 0, 0, 'h'},
80         {"file"      , 1, 0, 'f'},
81 	{"prefix"    , 1, 0, 'p'},
82 	{0,0,0,0}
83       };
84 
85     int index;
86     int iarg=0;
87 
88     while(iarg != -1)
89       {
90 	iarg = getopt_long(argc, argv, "p:f:r:vh", longopts, &index);
91 
92 	switch (iarg)
93 	  {
94 	  case 'p':
95 	    {
96 	      prefix = optarg;
97 	      break;
98 	    }
99 	  case 'h':
100 	    {
101 	      printHelp();
102 	      return 0;
103 	    }
104 	  case 'v':
105 	    {
106 	      printVersion();
107 	      return 0;
108 	    }
109 	  case 'f':
110 	    {
111 	      cerr << "INFO: file: " << optarg  <<  endl;
112 	      filename = optarg;
113 	      break;
114 	    }
115 	  case 'r':
116 	    {
117 	      cerr << "INFO: set seqid region to : " << optarg << endl;
118 	      region = optarg;
119 	      break;
120 	    }
121 	  default:
122 	    break;
123 	  }
124 
125       }
126 
127     if(filename.empty()){
128       cerr << "FATAL: failed to specify a file" << endl;
129       printHelp();
130     }
131 
132     // scope since variant file has no close
133     {
134       VariantCallFile variantFile;
135 
136       if(!variantFile.open(filename)){
137 	cerr << "FATAL: could not open file for reading" << endl;
138 	printHelp();
139       }
140 
141       if(region != "NA"){
142 	if(! variantFile.setRegion(region)){
143 	  cerr <<"FATAL: unable to set region" << endl;
144 	  return 1;
145 	}
146       }
147 
148       if (!variantFile.is_open()) {
149 	cerr << "FATAL: could not open VCF for reading" << endl;
150 	printHelp();
151 	return 1;
152       }
153 
154       Variant var(variantFile);
155 
156       std::map<long int, int> seen;
157 
158       long int lastP    = -1;
159       std::string lastS = "NA";
160 
161       while (variantFile.getNextVariant(var)) {
162 
163 	if(var.position < lastP && var.sequenceName == lastS){
164 
165 	  std::cerr << "FATAL: " << "VCF must be sorted." << std::endl;
166 	  std::cerr << "       lastPos = " << lastP << " currentPos = " << var.position << std::endl;
167 	  exit(1);
168 	}
169 	if(lastS != var.sequenceName){
170 	  for(std::map<long int, int>::iterator it = seen.begin();
171 	      it != seen.end(); it++){
172 	    if(it->second > maxNfiles){
173 std::cerr << "INFO: " << lastS << "\t" << it->first << "\t" << it->second << std::endl;
174 	      maxNfiles = it->second;
175 	    }
176 	  }
177 	  seen.clear();
178 	  lastS = var.sequenceName;
179 	}
180 
181 	// counting non uniq starts
182 	if(var.position == lastP){
183 	  if(seen.find(var.position) != seen.end()){
184 	    seen[var.position] += 1;
185 	  }
186 	  else{
187 	    // start with 2 (current and last)
188 	    seen[var.position] = 2;
189 	  }
190 	}
191 	lastP = var.position;
192       }
193     }
194 
195     std::cerr << "INFO: " << maxNfiles << " required to uniq start." << std::endl;
196 
197     if(maxNfiles == 1){
198       std::cerr << "INFO: all start positions are unique, bye!";
199       return 0;
200 
201     }
202 
203     {
204       VariantCallFile variantFile;
205       if(!variantFile.open(filename)){
206 	std::cerr << "FATAL: could not open VCF" << endl;
207 	exit(1);
208       }
209 
210       std::map<int, ofstream *> outputFiles;
211       for(int i = 1; i <= maxNfiles; i++){
212 	ofstream * oz = new ofstream;
213 
214 	stringstream fname;
215         fname << prefix << "." << i << ".vcf";
216 	oz->open(fname.str().c_str());
217 
218 	*oz << variantFile.header << std::endl;
219 
220 	outputFiles[i] = oz;
221       }
222 
223       Variant var(variantFile);
224 
225       int index = 1;
226 
227       while (variantFile.getNextVariant(var)) {
228 	*(outputFiles[index]) <<  var << std::endl;
229 	index += 1;
230 	if(index > maxNfiles){
231 	  index = 1;
232 	}
233       }
234       for(int i = 1; i <= maxNfiles; i++){
235 	outputFiles[i]->close();
236       }
237     }
238     return 0;
239 }
240