1 /* The MIT License
2
3 Copyright (c) 2015 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "bcf_single_genotyping_buffered_reader.h"
25
26 /**
27 * Constructor.
28 */
BCFSingleGenotypingBufferedReader(std::string input_vcf_file,std::vector<GenomeInterval> & intervals,std::string output_vcf_file)29 BCFSingleGenotypingBufferedReader::BCFSingleGenotypingBufferedReader(std::string input_vcf_file, std::vector<GenomeInterval>& intervals, std::string output_vcf_file)
30 {
31 /////////////////////
32 //io initialization//
33 /////////////////////
34 odr = new BCFOrderedReader(input_vcf_file, intervals);
35
36 //////////////////////////
37 //options initialization//
38 //////////////////////////
39 output_annotations = false;
40
41 ////////////////////////
42 //stats initialization//
43 ////////////////////////
44 no_snps_genotyped = 0;
45 no_indels_genotyped = 0;
46 no_vntrs_genotyped = 0;
47
48 ////////////////////////
49 //tools initialization//
50 ////////////////////////
51 vm = new VariantManip();
52 // fai = fai_load(ref_fasta_file.c_str());
53 // if (fai==NULL)
54 // {
55 // fprintf(stderr, "[%s:%d %s] Cannot load genome index: %s\n", __FILE__, __LINE__, __FUNCTION__, ref_fasta_file.c_str());
56 // exit(1);
57 // }
58 }
59
60 /**
61 * Collects sufficient statistics from read for variants to be genotyped.
62 *
63 * The VCF records in the buffer must never occur before
64 */
process_read(bam_hdr_t * h,bam1_t * s)65 void BCFSingleGenotypingBufferedReader::process_read(bam_hdr_t *h, bam1_t *s)
66 {
67 //wrap bam1_t in AugmentBAMRecord
68 as.initialize(h, s);
69
70 uint32_t tid = bam_get_tid(s);
71 uint32_t beg1 = as.beg1;
72 uint32_t end1 = as.end1;
73
74 //collect statistics for variant records that are in the buffer and overlap with the read
75 GenotypingRecord* g;
76 for (std::list<GenotypingRecord*>::iterator i=buffer.begin(); i!=buffer.end(); ++i)
77 {
78 g = *i;
79
80 // std::cerr << g->pos1 << " " << g->beg1 << " " << g->end1 << " ";
81
82 //same chromosome
83 if (tid==g->rid)
84 {
85 if (end1 < g->beg1)
86 {
87 //can terminate
88 return;
89 }
90 else if (beg1 > g->end1)
91 {
92 //this should not occur if the buffer was flushed before invoking process read
93 continue;
94 }
95 //else if (beg1 <= g->beg1 && g->end1 <= end1)
96 else if (beg1 <= g->pos1 && g->pos1 <= end1)
97 {
98 // collect_sufficient_statistics(*i, as);
99 }
100 else
101 {
102 //other types of overlap, just ignore
103 }
104
105 // std::cerr << "\n";
106 }
107 //prior chromosome
108 else if (tid<g->rid)
109 {
110 //this should not occur if the buffer was flushed before invoking process read
111 return;
112 }
113 //latter chromosome
114 else if (tid>g->rid)
115 {
116 //in case if the buffer has a VCF record later in the list which matches it
117 continue;
118 }
119 }
120
121 //you will only reach here if a read occurs after or overlaps the last record in the buffer
122 //adding new VCF records and collecting statistics if necessary
123 bcf1_t *v = bcf_init();
124 while (odr->read(v))
125 {
126 int32_t vtype = vm->classify_variant(odr->hdr, v, variant);
127 g = create_genotyping_record(odr->hdr, v, 2, variant);
128 buffer.push_back(g);
129
130 if (tid==g->rid)
131 {
132 //if (end1>=g->beg1 && pos1<=g->end1)
133 if (beg1 <= g->pos1 && g->pos1 <= end1)
134 {
135 // collect_sufficient_statistics(g, as);
136 }
137 }
138
139 //VCF record occurs after the read
140 if (tid < g->rid || end1 < g->beg1)
141 {
142 return;
143 }
144 else
145 {
146 v = bcf_init();
147 }
148 }
149
150 //this means end of file
151 bcf_destroy(v);
152 }
153
154 /**
155 * Flush records.
156 */
flush(bam_hdr_t * h,bam1_t * s,bool flush_all)157 void BCFSingleGenotypingBufferedReader::flush(bam_hdr_t *h, bam1_t *s, bool flush_all)
158 {
159 if (flush_all)
160 {
161 //read all the remaining from the reference genotyping file
162 bcf1_t *v = bcf_init();
163 while (odr->read(v))
164 {
165 int32_t vtype = vm->classify_variant(odr->hdr, v, variant);
166 GenotypingRecord* g = create_genotyping_record(odr->hdr, v, 2, variant);
167 buffer.push_back(g);
168 v = bcf_init();
169 }
170 bcf_destroy(v);
171
172 GenotypingRecord* g;
173 while (!buffer.empty())
174 {
175 g = buffer.front();
176 // genotype_and_print(g);
177 delete g;
178 buffer.pop_front();
179 }
180 }
181 else
182 {
183 //std::cerr << "partial flush\n";
184
185 uint32_t tid = bam_get_tid(s);
186 GenotypingRecord* g;
187
188 while (!buffer.empty())
189 {
190 g = buffer.front();
191
192 if (tid==g->rid)
193 {
194 if (bam_get_pos1(s) > g->end1)
195 {
196 // genotype_and_print(g);
197 delete g;
198 buffer.pop_front();
199 }
200 else
201 {
202 return;
203 }
204 }
205 else if (tid>g->rid)
206 {
207 // genotype_and_print(g);
208 delete g;
209 buffer.pop_front();
210 }
211 else
212 {
213 return;
214 }
215 }
216 }
217 }
218
219 /**
220 * Create appropriate genotyping record.
221 */
create_genotyping_record(bcf_hdr_t * h,bcf1_t * v,uint32_t ploidy,Variant & variant)222 GenotypingRecord* BCFSingleGenotypingBufferedReader::create_genotyping_record(bcf_hdr_t* h, bcf1_t* v, uint32_t ploidy, Variant& variant)
223 {
224 GenotypingRecord* g = NULL;
225 if (variant.type==VT_SNP)
226 {
227 g = new SNPGenotypingRecord(h, v, 1, 2, NULL);
228 }
229
230 return g;
231
232 }
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262