1 #include "Variant.h"
2 #include <getopt.h>
3 #include "Fasta.h"
4 #include "gpatInfo.hpp"
5 #include <algorithm>
6 #include <list>
7 #include <set>
8
9 using namespace std;
10 using namespace vcflib;
11
12
printSummary(char ** argv)13 void printSummary(char** argv) {
14 cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl
15 << endl
16 << "options:" << endl
17 << " -h, --help Print this message" << endl
18 << " -v, --version Print version" << endl
19 << " -r, --reference FILE FASTA reference file" << endl
20 << " -w, --window-size N Merge variants at most this many bp apart (default 30)" << endl
21 << " -o, --only-variants Don't output the entire haplotype, just concatenate" << endl
22 << " REF/ALT strings (delimited by \":\")" << endl
23 << endl
24 << "Convert genotype-based phased alleles within --window-size into haplotype alleles." << endl
25 << "Will break haplotype construction when encountering non-phased genotypes on input." << endl
26 << endl;
27 exit(0);
28 }
29
isPhased(Variant & var)30 bool isPhased(Variant& var) {
31 for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
32 map<string, vector<string> >& sample = s->second;
33 map<string, vector<string> >::iterator g = sample.find("GT");
34 if (g != sample.end()) {
35 string gt = g->second.front();
36 if (gt.size() > 1 && gt.find("|") == string::npos) {
37 return false;
38 }
39 }
40 }
41 return true;
42 }
43
main(int argc,char ** argv)44 int main(int argc, char** argv) {
45
46 string vcfFileName;
47 string fastaFileName;
48 int windowsize = 30;
49 bool onlyVariants = false;
50
51 if (argc == 1)
52 printSummary(argv);
53
54 int c;
55 while (true) {
56 static struct option long_options[] =
57 {
58 /* These options set a flag. */
59 //{"verbose", no_argument, &verbose_flag, 1},
60 {"help", no_argument, 0, 'h'},
61 {"window-size", required_argument, 0, 'w'},
62 {"reference", required_argument, 0, 'r'},
63 {"version", no_argument, 0, 'v'},
64 {"only-variants", no_argument, 0, 'o'},
65 {0, 0, 0, 0}
66 };
67 /* getopt_long stores the option index here. */
68 int option_index = 0;
69
70 c = getopt_long (argc, argv, "hvow:r:",
71 long_options, &option_index);
72
73 if (c == -1){
74 break;
75 }
76 switch (c) {
77 case 'v':
78 {
79 printBasicVersion();
80 exit(0);
81 }
82 case 'o':
83 {
84 onlyVariants = true;
85 break;
86 }
87 case 'w':
88 {
89 windowsize = atoi(optarg);
90 break;
91 }
92 case 'r':
93 {
94 fastaFileName = string(optarg);
95 break;
96 }
97 case 'h':
98 {
99 printSummary(argv);
100 break;
101 }
102 case '?':
103 {
104 printSummary(argv);
105 exit(1);
106 break;
107 }
108 default:
109 abort ();
110 }
111 }
112
113 VariantCallFile variantFile;
114 string inputFilename;
115 if (optind == argc - 1) {
116 inputFilename = argv[optind];
117 variantFile.open(inputFilename);
118 } else {
119 variantFile.open(std::cin);
120 }
121
122 if (!variantFile.is_open()) {
123 cerr << "could not open VCF file" << endl;
124 exit(1);
125 }
126
127 FastaReference reference;
128 if (fastaFileName.empty()) {
129 cerr << "a reference is required for haplotype allele generation" << endl;
130 exit(1);
131 }
132 reference.open(fastaFileName);
133
134 // pattern
135 // when variants are within windowSize from each other, build up local haplotypes
136 // establish all the haplotypes which exist within the window using genotypes+allele#+position map
137 // generate a haplotype allele string for each unique haplotype
138 // for completeness retain phasing information in the genotypes
139 // write a new VCF record in which there are haplotype alleles and correctly described genotypes for each sample
140 // if the variants are outside of the windowSize, just write out the record
141
142 Variant var(variantFile);
143 Variant outputVar(variantFile);
144
145 cout << variantFile.header << endl;
146
147 // get the first distances
148 vector<Variant> cluster;
149
150 while (variantFile.getNextVariant(var) || !cluster.empty()) {
151
152 bool haplotypeCluster = false;
153
154 if (variantFile.done()) {
155 if (cluster.size() >= 1) {
156 haplotypeCluster = true;
157 } else {
158 cout << cluster.front() << endl;
159 cluster.clear();
160 }
161 } else if (isPhased(var)) {
162 if (cluster.empty()
163 || cluster.back().sequenceName == var.sequenceName
164 && var.position - cluster.back().position + cluster.back().ref.size() - 1 <= windowsize) {
165 cluster.push_back(var);
166 } else {
167 if (cluster.size() == 1) {
168 cout << cluster.front() << endl;
169 cluster.clear();
170 if (!variantFile.done()) {
171 cluster.push_back(var);
172 }
173 } else {
174 haplotypeCluster = true;
175 }
176 }
177 } else { // not phased
178 if (cluster.empty()) {
179 cout << var << endl;
180 } else if (cluster.size() == 1) {
181 cout << cluster.front() << endl;
182 cluster.clear();
183 cout << var << endl;
184 } else {
185 haplotypeCluster = true;
186 }
187 }
188
189 // we need to deal with the current cluster, as our next var is outside of bounds
190 // process the last cluster if it's more than 1 var
191 if (haplotypeCluster) {
192 /*
193 cerr << "cluster: ";
194 for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
195 cerr << " " << v->position;
196 }
197 cerr << endl;
198 */
199
200 // generate haplotype alleles and genotypes!
201 // get the reference sequence across the haplotype in question
202 string referenceHaplotype = reference.getSubSequence(cluster.front().sequenceName,
203 cluster.front().position - 1,
204 cluster.back().position
205 + cluster.back().ref.size() - cluster.front().position);
206
207 // establish what haplotypes there are by parsing the (phased) genotypes across the samples over these records
208 map<string, vector<vector<int> > > sampleHaplotypes;
209 for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
210 // build the haplotype using the genotype fields in the variant cluster
211 // only build haplotypes for samples with complete information
212 string& sampleName = *s;
213 vector<vector<int> >& haplotypes = sampleHaplotypes[sampleName];
214
215 bool completeCoverage = true;
216 // ensure complete genotype coverage over the haplotype cluster
217 for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
218 if (v->samples.find(sampleName) == v->samples.end()
219 || v->samples[sampleName].find("GT") == v->samples[sampleName].end()) {
220 completeCoverage = false;
221 break;
222 }
223 }
224 if (!completeCoverage) {
225 continue; // skip samples without complete coverage
226 }
227
228 // what's the ploidy?
229 {
230 string& gt = cluster.front().samples[sampleName]["GT"].front();
231 vector<string> gtspec = split(gt, "|");
232 for (vector<string>::iterator g = gtspec.begin(); g != gtspec.end(); ++g) {
233 vector<int> haplotype;
234 haplotypes.push_back(haplotype);
235 }
236 }
237
238 for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
239 string& gt = v->samples[sampleName]["GT"].front();
240 vector<string> gtspec = split(gt, "|");
241 vector<string>::iterator g = gtspec.begin();
242 for (vector<vector<int> >::iterator h = haplotypes.begin(); h != haplotypes.end(); ++h, ++g) {
243 int j;
244 convert(*g, j);
245 h->push_back(j);
246 }
247 }
248 }
249
250 map<const vector<int>*, vector<string> > hapToSamples;
251 set<vector<int> > uniqueHaplotypes;
252 for (map<string, vector<vector<int> > >::iterator hs = sampleHaplotypes.begin();
253 hs != sampleHaplotypes.end(); ++hs) {
254 vector<vector<int> >& haps = hs->second;
255 for (vector<vector<int> >::iterator h = haps.begin(); h != haps.end(); ++h) {
256 uniqueHaplotypes.insert(*h);
257 hapToSamples[&*uniqueHaplotypes.find(*h)].push_back(hs->first);
258 }
259 }
260
261 // write new haplotypes
262 map<vector<int>, string> haplotypeSeqs;
263 map<vector<int>, int> haplotypeIndexes;
264 map<int, string> alleles;
265
266 int impossibleHaplotypes = 0;
267
268 // always include the reference haplotype as 0
269 // when we come to it in the haplotypes, we'll ignore it
270 int alleleIndex = 1;
271 for (set<vector<int> >::iterator u = uniqueHaplotypes.begin(); u != uniqueHaplotypes.end(); ++u) {
272
273 /*
274 for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z) {
275 cerr << *z;
276 }
277 cerr << endl;
278 */
279
280 string haplotype;
281 if (!onlyVariants) {
282 haplotype = referenceHaplotype;
283 }
284 bool isreference = true;
285 bool impossibleHaplotype = false;
286 int referenceInsertOffset = 0;
287 int j = 0; // index into variant cluster
288 int lastpos = 0;
289 int lastrefend = 0;
290 for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z, ++j) {
291 int i = *z;
292 Variant& vartoInsert = cluster.at(j);
293 if (i == 0) {
294 if (onlyVariants) {
295 if (!haplotype.empty()) haplotype.append(":");
296 haplotype.append(vartoInsert.ref);
297 }
298 }
299 if (i != 0) {
300 isreference = false;
301 string& alternate = vartoInsert.alleles.at(i);
302 if (vartoInsert.position < lastrefend) {
303 cerr << "impossible haplotype, overlapping alleles at " << vartoInsert.sequenceName << ":" << vartoInsert.position << endl;
304 cerr << "+target " << vartoInsert.sequenceName << " " << vartoInsert.position-1 << " " << vartoInsert.position-1 + vartoInsert.ref.size() << endl;
305 cerr << "+variant " << vartoInsert.sequenceName << ":" << vartoInsert.position << ":" << alternate << endl;
306 impossibleHaplotype = true;
307 // find the impossible haplotype samples
308 cerr << "+samples ";
309 for (auto& sample : hapToSamples[&*u]) {
310 cerr << sample << " ";
311 } cerr << endl;
312 break;
313 } else {
314 //cerr << vartoInsert.position << " " << cluster.front().position + referenceInsertOffset << endl;
315 //cerr << "replacing " << vartoInsert.ref << " at " << vartoInsert.position - cluster.front().position + referenceInsertOffset << " with " << alternate << endl;
316 if (onlyVariants) {
317 if (!haplotype.empty()) haplotype.append(":");
318 haplotype.append(alternate);
319 } else {
320 haplotype.replace(vartoInsert.position - cluster.front().position + referenceInsertOffset,
321 vartoInsert.ref.size(), alternate);
322 if (alternate.size() != vartoInsert.ref.size()) {
323 referenceInsertOffset += alternate.size() - vartoInsert.ref.size();
324 }
325 lastpos = vartoInsert.position;
326 lastrefend = vartoInsert.position + vartoInsert.ref.size();
327 }
328 }
329 }
330 }
331
332 if (impossibleHaplotype) {
333 ++impossibleHaplotypes;
334 haplotypeIndexes[*u] = -1; // indicates impossible haplotype
335 impossibleHaplotype = false;
336 } else if (isreference) {
337 alleles[0] = haplotype;
338 haplotypeIndexes[*u] = 0;
339 } else {
340 alleles[alleleIndex] = haplotype;
341 haplotypeIndexes[*u] = alleleIndex;
342 ++alleleIndex;
343 }
344 haplotypeSeqs[*u] = haplotype;
345 // if there's not a reference allele, add it
346 if (alleles.find(0) == alleles.end()) {
347 alleles[0] = referenceHaplotype;
348 // nb, there is no reference haplotype among
349 // the samples, so we don't have to add it to
350 // the haplotypeIndexes
351 }
352 }
353
354 if (onlyVariants) {
355 string newRef;
356 for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
357 if (!newRef.empty()) newRef.append(":");
358 newRef.append(v->ref);
359 }
360 outputVar.ref = newRef;
361 } else {
362 outputVar.ref = alleles[0];
363 }
364 outputVar.alt.clear();
365 for (int i = 1; i < alleleIndex; ++i) {
366 outputVar.alt.push_back(alleles[i]);
367 }
368
369 outputVar.sequenceName = cluster.front().sequenceName;
370 outputVar.position = cluster.front().position;
371 outputVar.filter = ".";
372 outputVar.id = ".";
373 outputVar.info = cluster.front().info;
374 outputVar.samples.clear();
375 outputVar.format = cluster.front().format;
376
377 // now the genotypes
378 for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
379 string& sampleName = *s;
380 vector<string> gt;
381 vector<vector<int> > & hs = sampleHaplotypes[sampleName];
382 for (vector<vector<int> >::iterator h = hs.begin(); h != hs.end(); ++h) {
383 int hi = haplotypeIndexes[*h];
384 if (hi != -1) {
385 gt.push_back(convert(hi));
386 } else {
387 // nonexistent or impossible haplotype
388 gt.push_back(".");
389 }
390 }
391 if (gt.size() != 0) {
392 outputVar.samples[sampleName]["GT"].push_back(join(gt, "|"));
393 }
394 }
395 if (cluster.size() - impossibleHaplotypes < 2) {
396 for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
397 cout << *v << endl;
398 }
399 } else {
400 if (!outputVar.alt.empty()) {
401 cout << outputVar << endl;
402 } else {
403 cerr << "no alternate alleles remain at " << outputVar.sequenceName << ":" << outputVar.position << " after haplotype validation" << endl;
404 }
405 }
406 cluster.clear();
407 if (!variantFile.done()) cluster.push_back(var);
408 }
409 }
410
411 exit(0); // why?
412 return 0;
413
414 }
415
416