1 /* The MIT License
2
3 Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "bcf_ordered_writer.h"
25
BCFOrderedWriter(std::string output_vcf_file_name,int32_t window,int32_t compression)26 BCFOrderedWriter::BCFOrderedWriter(std::string output_vcf_file_name, int32_t window, int32_t compression)
27 {
28 this->file_name = output_vcf_file_name;
29 this->window = window;
30 file = NULL;
31
32 kstring_t mode = {0,0,0};
33 kputc('w', &mode);
34
35 if (file_name=="+")
36 {
37 kputs("bu", &mode);
38 file_name = "-";
39 }
40 else if (file_name=="-")
41 {
42 //do nothing
43 }
44 else
45 {
46 if (str_ends_with(file_name, ".vcf"))
47 {
48 //do nothing
49 }
50 else if (str_ends_with(file_name, ".vcf.gz"))
51 {
52 kputc('z', &mode);
53 if (compression!=6 && compression!=-1)
54 {
55 kputw(compression, &mode);
56 }
57 else if (compression==-1)
58 {
59 kputw(0, &mode);
60 }
61 }
62 else if (str_ends_with(file_name, ".bcf"))
63 {
64 kputc('b', &mode);
65 if (compression!=6 && compression!=-1)
66 {
67 kputw(compression, &mode);
68 }
69 else if (compression==-1)
70 {
71 kputc('u', &mode);
72 }
73 }
74 else if (str_ends_with(file_name, ".ubcf"))
75 {
76 kputs("bu", &mode);
77 }
78 else
79 {
80 fprintf(stderr, "[%s:%d %s] Not a VCF/BCF file: %s\n", __FILE__,__LINE__,__FUNCTION__, file_name.c_str());
81 exit(1);
82 }
83 }
84
85 file = bcf_open(file_name.c_str(), mode.s);
86 if (file==NULL)
87 {
88 fprintf(stderr, "[%s:%d %s] Cannot open VCF/BCF file for writing: %s\n", __FILE__,__LINE__,__FUNCTION__, file_name.c_str());
89 exit(1);
90 }
91
92 hdr = bcf_hdr_init("w");
93 bcf_hdr_set_version(hdr, "VCFv4.2");
94 linked_hdr = false;
95 }
96
97 /**
98 * Duplicates a hdr and sets it.
99 */
set_hdr(bcf_hdr_t * hdr)100 void BCFOrderedWriter::set_hdr(bcf_hdr_t *hdr)
101 {
102 if (this->hdr)
103 {
104 bcf_hdr_destroy(this->hdr);
105 }
106 this->hdr = bcf_hdr_dup(hdr);
107 linked_hdr = false;
108 }
109
110 /**
111 * Links a header. This is useful when the VCF file being read has an incomplete header.
112 * As the VCF records are read, the incomplete header will be fixed with string type assumptions
113 * and the VCF records can be written out without any failure. The header in the VCF file being
114 * written will be incomplete nonetheless and the user should use an alt header when reading the
115 * file to bypass the problem.
116 */
link_hdr(bcf_hdr_t * hdr)117 void BCFOrderedWriter::link_hdr(bcf_hdr_t *hdr)
118 {
119 this->hdr = hdr;
120 linked_hdr = true;
121 }
122
123 /**
124 * Reads next record, hides the random access of different regions from the user.
125 */
write_hdr()126 void BCFOrderedWriter::write_hdr()
127 {
128 bcf_hdr_write(file, hdr);
129 }
130
131 /**
132 * Reads next record, hides the random access of different regions from the user.
133 */
write(bcf1_t * v)134 void BCFOrderedWriter::write(bcf1_t *v)
135 {
136 //place into appropriate position in the buffer
137 if (window)
138 {
139 if (!buffer.empty())
140 {
141 if (bcf_get_rid(v)==bcf_get_rid(buffer.back()))
142 {
143 std::list<bcf1_t*>::iterator i = buffer.begin();
144
145 for (i=buffer.begin(); i!=buffer.end(); ++i)
146 {
147 //equal sign ensures records are kept in original order
148 if (bcf_get_pos1(v)>=bcf_get_pos1(*i))
149 {
150 buffer.insert(i,v);
151 flush(false);
152 return;
153 }
154 }
155
156 if (i==buffer.end())
157 {
158 int32_t cutoff_pos1 = std::max(bcf_get_pos1(buffer.front())-window,(hts_pos_t)1);
159 if (bcf_get_pos1(v)<cutoff_pos1)
160 {
161 fprintf(stderr, "[%s:%d %s] Might not be sorted for window size %d at current record %s:%d < %d (%d [last record] - %d), please increase window size to at least %d.\n",
162 __FILE__,
163 __LINE__,
164 __FUNCTION__,
165 window,
166 bcf_get_chrom(hdr, v),
167 bcf_get_pos1(v),
168 cutoff_pos1,
169 bcf_get_pos1(buffer.front()),
170 window,
171 bcf_get_pos1(buffer.front())-bcf_get_pos1(v)+1);
172 }
173 }
174
175 buffer.insert(i,v);
176 flush(false);
177 }
178 else
179 {
180 flush(true);
181 buffer.push_front(v);
182 }
183 }
184 else
185 {
186 buffer.push_front(v);
187 }
188
189 v = NULL;
190 }
191 else
192 {
193 //todo: add a mechanism to populate header similar to vcf_parse in vcf_format which is called by bcf_write
194 bcf_write(file, hdr, v);
195 }
196 }
197
198 /**
199 * Flush writable records from buffer.
200 */
flush()201 void BCFOrderedWriter::flush()
202 {
203 flush(true);
204 }
205
206 /**
207 * Returns record to pool.
208 */
store_bcf1_into_pool(bcf1_t * v)209 void BCFOrderedWriter::store_bcf1_into_pool(bcf1_t* v)
210 {
211 bcf_clear(v);
212 pool.push_back(v);
213 v = NULL;
214 }
215
216 /**
217 * Gets record from pool, creates a new record if necessary
218 */
get_bcf1_from_pool()219 bcf1_t* BCFOrderedWriter::get_bcf1_from_pool()
220 {
221 if(!pool.empty())
222 {
223 bcf1_t* v = pool.front();
224 pool.pop_front();
225 return v;
226 }
227 else
228 {
229 bcf1_t* v = bcf_init();
230 bcf_clear(v);
231 return v;
232 }
233 };
234
235 /**
236 * Flush writable records from buffer.
237 */
flush(bool force)238 void BCFOrderedWriter::flush(bool force)
239 {
240 if (force)
241 {
242 while (!buffer.empty())
243 {
244 bcf_write(file, hdr, buffer.back());
245 bcf_destroy(buffer.back());
246 //store_bcf1_into_pool(buffer.back());
247 buffer.pop_back();
248 }
249 }
250 else
251 {
252 if (buffer.size()>1)
253 {
254 int32_t cutoff_pos1 = std::max(bcf_get_pos1(buffer.front())-window,(hts_pos_t)1);
255
256 while (buffer.size()>1)
257 {
258 if (bcf_get_pos1(buffer.back())<=cutoff_pos1)
259 {
260 bcf_write(file, hdr, buffer.back());
261 bcf_destroy(buffer.back());
262 //store_bcf1_into_pool(buffer.back());
263 buffer.pop_back();
264 }
265 else
266 {
267 return;
268 }
269 }
270 }
271 }
272 }
273
274 /**
275 * Closes the file.
276 */
close()277 void BCFOrderedWriter::close()
278 {
279 flush(true);
280 bcf_close(file);
281 if (!linked_hdr && hdr) bcf_hdr_destroy(hdr);
282 // while (buffer.size()!=0)
283 // {
284 // bcf_destroy(buffer.pop_back());
285 // buffer.pop_back();
286 // if (bcf_get_pos1(buffer.back())<=cutoff_pos1)
287 // {
288 // bcf_write(file, hdr, buffer.back());
289 // store_bcf1_into_pool(buffer.back());
290 // buffer.pop_back();
291 // }
292 // else
293 // {
294 // return;
295 // }
296 // }
297 }
298