1 #include "Variant.h"
2 #include <getopt.h>
3 #include "Fasta.h"
4 #include "gpatInfo.hpp"
5 #include <algorithm>
6 #include <list>
7 #include <set>
8 
9 using namespace std;
10 using namespace vcflib;
11 
12 
printSummary(char ** argv)13 void printSummary(char** argv) {
14     cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl
15          << endl
16          << "options:" << endl
17          << "    -h, --help              Print this message" << endl
18          << "    -v, --version           Print version" << endl
19          << "    -r, --reference FILE    FASTA reference file" << endl
20          << "    -w, --window-size N     Merge variants at most this many bp apart (default 30)" << endl
21          << "    -o, --only-variants     Don't output the entire haplotype, just concatenate" << endl
22          << "                            REF/ALT strings (delimited by \":\")" << endl
23          << endl
24          << "Convert genotype-based phased alleles within --window-size into haplotype alleles." << endl
25          << "Will break haplotype construction when encountering non-phased genotypes on input." << endl
26          << endl;
27     exit(0);
28 }
29 
isPhased(Variant & var)30 bool isPhased(Variant& var) {
31     for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
32         map<string, vector<string> >& sample = s->second;
33         map<string, vector<string> >::iterator g = sample.find("GT");
34         if (g != sample.end()) {
35             string gt = g->second.front();
36             if (gt.size() > 1 && gt.find("|") == string::npos) {
37                 return false;
38             }
39         }
40     }
41     return true;
42 }
43 
main(int argc,char ** argv)44 int main(int argc, char** argv) {
45 
46     string vcfFileName;
47     string fastaFileName;
48     int windowsize = 30;
49     bool onlyVariants = false;
50 
51     if (argc == 1)
52         printSummary(argv);
53 
54     int c;
55     while (true) {
56         static struct option long_options[] =
57             {
58                 /* These options set a flag. */
59                 //{"verbose", no_argument,       &verbose_flag, 1},
60                 {"help", no_argument, 0, 'h'},
61                 {"window-size", required_argument, 0, 'w'},
62                 {"reference", required_argument, 0, 'r'},
63                 {"version", no_argument, 0, 'v'},
64                 {"only-variants", no_argument, 0, 'o'},
65                 {0, 0, 0, 0}
66             };
67         /* getopt_long stores the option index here. */
68         int option_index = 0;
69 
70         c = getopt_long (argc, argv, "hvow:r:",
71                          long_options, &option_index);
72 
73         if (c == -1){
74             break;
75         }
76         switch (c) {
77         case 'v':
78         {
79             printBasicVersion();
80             exit(0);
81         }
82         case 'o':
83         {
84             onlyVariants = true;
85             break;
86         }
87         case 'w':
88         {
89             windowsize = atoi(optarg);
90             break;
91         }
92         case 'r':
93         {
94             fastaFileName = string(optarg);
95             break;
96         }
97         case 'h':
98         {
99             printSummary(argv);
100             break;
101         }
102         case '?':
103         {
104             printSummary(argv);
105             exit(1);
106             break;
107         }
108         default:
109             abort ();
110         }
111     }
112 
113     VariantCallFile variantFile;
114     string inputFilename;
115     if (optind == argc - 1) {
116         inputFilename = argv[optind];
117         variantFile.open(inputFilename);
118     } else {
119         variantFile.open(std::cin);
120     }
121 
122     if (!variantFile.is_open()) {
123         cerr << "could not open VCF file" << endl;
124         exit(1);
125     }
126 
127     FastaReference reference;
128     if (fastaFileName.empty()) {
129         cerr << "a reference is required for haplotype allele generation" << endl;
130         exit(1);
131     }
132     reference.open(fastaFileName);
133 
134     // pattern
135     // when variants are within windowSize from each other, build up local haplotypes
136     // establish all the haplotypes which exist within the window using genotypes+allele#+position map
137     // generate a haplotype allele string for each unique haplotype
138     // for completeness retain phasing information in the genotypes
139     // write a new VCF record in which there are haplotype alleles and correctly described genotypes for each sample
140     // if the variants are outside of the windowSize, just write out the record
141 
142     Variant var(variantFile);
143     Variant outputVar(variantFile);
144 
145     cout << variantFile.header << endl;
146 
147     // get the first distances
148     vector<Variant> cluster;
149 
150     while (variantFile.getNextVariant(var) || !cluster.empty()) {
151 
152         bool haplotypeCluster = false;
153 
154         if (variantFile.done()) {
155             if (cluster.size() >= 1) {
156                 haplotypeCluster = true;
157             } else {
158                 cout << cluster.front() << endl;
159                 cluster.clear();
160             }
161         } else if (isPhased(var)) {
162             if (cluster.empty()
163                 || cluster.back().sequenceName == var.sequenceName
164                 && var.position - cluster.back().position + cluster.back().ref.size() - 1 <= windowsize) {
165                 cluster.push_back(var);
166             } else {
167                 if (cluster.size() == 1) {
168                     cout << cluster.front() << endl;
169                     cluster.clear();
170                     if (!variantFile.done()) {
171                         cluster.push_back(var);
172                     }
173                 } else {
174                     haplotypeCluster = true;
175                 }
176             }
177         } else { // not phased
178             if (cluster.empty()) {
179                 cout << var << endl;
180             } else if (cluster.size() == 1) {
181                 cout << cluster.front() << endl;
182                 cluster.clear();
183                 cout << var << endl;
184             } else {
185                 haplotypeCluster = true;
186             }
187         }
188 
189         // we need to deal with the current cluster, as our next var is outside of bounds
190         // process the last cluster if it's more than 1 var
191         if (haplotypeCluster) {
192             /*
193             cerr << "cluster: ";
194             for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
195                 cerr << " " << v->position;
196             }
197             cerr << endl;
198             */
199 
200             // generate haplotype alleles and genotypes!
201             // get the reference sequence across the haplotype in question
202             string referenceHaplotype = reference.getSubSequence(cluster.front().sequenceName,
203                                                                  cluster.front().position - 1,
204                                                                  cluster.back().position
205                                                                  + cluster.back().ref.size() - cluster.front().position);
206 
207             // establish what haplotypes there are by parsing the (phased) genotypes across the samples over these records
208             map<string, vector<vector<int> > > sampleHaplotypes;
209             for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
210                 // build the haplotype using the genotype fields in the variant cluster
211                 // only build haplotypes for samples with complete information
212                 string& sampleName = *s;
213                 vector<vector<int> >& haplotypes = sampleHaplotypes[sampleName];
214 
215                 bool completeCoverage = true;
216                 // ensure complete genotype coverage over the haplotype cluster
217                 for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
218                     if (v->samples.find(sampleName) == v->samples.end()
219                         || v->samples[sampleName].find("GT") == v->samples[sampleName].end()) {
220                         completeCoverage = false;
221                         break;
222                     }
223                 }
224                 if (!completeCoverage) {
225                     continue; // skip samples without complete coverage
226                 }
227 
228                 // what's the ploidy?
229                 {
230                     string& gt = cluster.front().samples[sampleName]["GT"].front();
231                     vector<string> gtspec = split(gt, "|");
232                     for (vector<string>::iterator g = gtspec.begin(); g != gtspec.end(); ++g) {
233                         vector<int> haplotype;
234                         haplotypes.push_back(haplotype);
235                     }
236                 }
237 
238                 for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
239                     string& gt = v->samples[sampleName]["GT"].front();
240                     vector<string> gtspec = split(gt, "|");
241                     vector<string>::iterator g = gtspec.begin();
242                     for (vector<vector<int> >::iterator h = haplotypes.begin(); h != haplotypes.end(); ++h, ++g) {
243                         int j;
244                         convert(*g, j);
245                         h->push_back(j);
246                     }
247                 }
248             }
249 
250             map<const vector<int>*, vector<string> > hapToSamples;
251             set<vector<int> > uniqueHaplotypes;
252             for (map<string, vector<vector<int> > >::iterator hs = sampleHaplotypes.begin();
253                  hs != sampleHaplotypes.end(); ++hs) {
254                 vector<vector<int> >& haps = hs->second;
255                 for (vector<vector<int> >::iterator h = haps.begin(); h != haps.end(); ++h) {
256                     uniqueHaplotypes.insert(*h);
257                     hapToSamples[&*uniqueHaplotypes.find(*h)].push_back(hs->first);
258                 }
259             }
260 
261             // write new haplotypes
262             map<vector<int>, string> haplotypeSeqs;
263             map<vector<int>, int> haplotypeIndexes;
264             map<int, string> alleles;
265 
266             int impossibleHaplotypes = 0;
267 
268             // always include the reference haplotype as 0
269             // when we come to it in the haplotypes, we'll ignore it
270             int alleleIndex = 1;
271             for (set<vector<int> >::iterator u = uniqueHaplotypes.begin(); u != uniqueHaplotypes.end(); ++u) {
272 
273                 /*
274                 for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z) {
275                     cerr << *z;
276                 }
277                 cerr << endl;
278                 */
279 
280                 string haplotype;
281                 if (!onlyVariants) {
282                     haplotype = referenceHaplotype;
283                 }
284                 bool isreference = true;
285                 bool impossibleHaplotype = false;
286                 int referenceInsertOffset = 0;
287                 int j = 0; // index into variant cluster
288                 int lastpos = 0;
289                 int lastrefend = 0;
290                 for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z, ++j) {
291                     int i = *z;
292                     Variant& vartoInsert = cluster.at(j);
293                     if (i == 0) {
294                         if (onlyVariants) {
295                             if (!haplotype.empty()) haplotype.append(":");
296                             haplotype.append(vartoInsert.ref);
297                         }
298                     }
299                     if (i != 0) {
300                         isreference = false;
301                         string& alternate = vartoInsert.alleles.at(i);
302                         if (vartoInsert.position < lastrefend) {
303                             cerr << "impossible haplotype, overlapping alleles at " << vartoInsert.sequenceName << ":" << vartoInsert.position << endl;
304                             cerr << "+target " << vartoInsert.sequenceName << " " << vartoInsert.position-1 << " " << vartoInsert.position-1 + vartoInsert.ref.size() << endl;
305                             cerr << "+variant " << vartoInsert.sequenceName << ":" << vartoInsert.position << ":" << alternate << endl;
306                             impossibleHaplotype = true;
307                             // find the impossible haplotype samples
308                             cerr << "+samples ";
309                             for (auto& sample : hapToSamples[&*u]) {
310                                 cerr << sample << " ";
311                             } cerr << endl;
312                             break;
313                         } else {
314                             //cerr << vartoInsert.position << " " << cluster.front().position + referenceInsertOffset << endl;
315                             //cerr << "replacing " << vartoInsert.ref << " at " << vartoInsert.position - cluster.front().position + referenceInsertOffset << " with " << alternate << endl;
316                             if (onlyVariants) {
317                                 if (!haplotype.empty()) haplotype.append(":");
318                                 haplotype.append(alternate);
319                             } else {
320                                 haplotype.replace(vartoInsert.position - cluster.front().position + referenceInsertOffset,
321                                                   vartoInsert.ref.size(), alternate);
322                                 if (alternate.size() != vartoInsert.ref.size()) {
323                                     referenceInsertOffset += alternate.size() - vartoInsert.ref.size();
324                                 }
325                                 lastpos = vartoInsert.position;
326                                 lastrefend = vartoInsert.position + vartoInsert.ref.size();
327                             }
328                         }
329                     }
330                 }
331 
332                 if (impossibleHaplotype) {
333                     ++impossibleHaplotypes;
334                     haplotypeIndexes[*u] = -1; // indicates impossible haplotype
335                     impossibleHaplotype = false;
336                 } else if (isreference) {
337                     alleles[0] = haplotype;
338                     haplotypeIndexes[*u] = 0;
339                 } else {
340                     alleles[alleleIndex] = haplotype;
341                     haplotypeIndexes[*u] = alleleIndex;
342                     ++alleleIndex;
343                 }
344                 haplotypeSeqs[*u] = haplotype;
345                 // if there's not a reference allele, add it
346                 if (alleles.find(0) == alleles.end()) {
347                     alleles[0] = referenceHaplotype;
348                     // nb, there is no reference haplotype among
349                     // the samples, so we don't have to add it to
350                     // the haplotypeIndexes
351                 }
352             }
353 
354             if (onlyVariants) {
355                 string newRef;
356                 for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
357                     if (!newRef.empty()) newRef.append(":");
358                     newRef.append(v->ref);
359                 }
360                 outputVar.ref = newRef;
361             } else {
362                 outputVar.ref = alleles[0];
363             }
364             outputVar.alt.clear();
365             for (int i = 1; i < alleleIndex; ++i) {
366                 outputVar.alt.push_back(alleles[i]);
367             }
368 
369             outputVar.sequenceName = cluster.front().sequenceName;
370             outputVar.position = cluster.front().position;
371             outputVar.filter = ".";
372             outputVar.id = ".";
373             outputVar.info = cluster.front().info;
374             outputVar.samples.clear();
375             outputVar.format = cluster.front().format;
376 
377             // now the genotypes
378             for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
379                 string& sampleName = *s;
380                 vector<string> gt;
381                 vector<vector<int> > & hs = sampleHaplotypes[sampleName];
382                 for (vector<vector<int> >::iterator h = hs.begin(); h != hs.end(); ++h) {
383                     int hi = haplotypeIndexes[*h];
384                     if (hi != -1) {
385                         gt.push_back(convert(hi));
386                     } else {
387                         // nonexistent or impossible haplotype
388                         gt.push_back(".");
389                     }
390                 }
391                 if (gt.size() != 0) {
392                     outputVar.samples[sampleName]["GT"].push_back(join(gt, "|"));
393                 }
394             }
395             if (cluster.size() - impossibleHaplotypes < 2) {
396                 for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
397                     cout << *v << endl;
398                 }
399             } else {
400                 if (!outputVar.alt.empty()) {
401                     cout << outputVar << endl;
402                 } else {
403                     cerr << "no alternate alleles remain at " << outputVar.sequenceName << ":" << outputVar.position << " after haplotype validation" << endl;
404                 }
405             }
406             cluster.clear();
407             if (!variantFile.done()) cluster.push_back(var);
408         }
409     }
410 
411     exit(0);  // why?
412     return 0;
413 
414 }
415 
416