1 /* The MIT License
2 
3    Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "bcf_ordered_writer.h"
25 
BCFOrderedWriter(std::string output_vcf_file_name,int32_t window,int32_t compression)26 BCFOrderedWriter::BCFOrderedWriter(std::string output_vcf_file_name, int32_t window, int32_t compression)
27 {
28     this->file_name = output_vcf_file_name;
29     this->window = window;
30     file = NULL;
31 
32     kstring_t mode = {0,0,0};
33     kputc('w', &mode);
34 
35     if (file_name=="+")
36     {
37         kputs("bu", &mode);
38         file_name = "-";
39     }
40     else if (file_name=="-")
41     {
42         //do nothing
43     }
44     else
45     {
46         if (str_ends_with(file_name, ".vcf"))
47         {
48             //do nothing
49         }
50         else if (str_ends_with(file_name, ".vcf.gz"))
51         {
52             kputc('z', &mode);
53             if (compression!=6 && compression!=-1)
54             {
55                 kputw(compression, &mode);
56             }
57             else if (compression==-1)
58             {
59                 kputw(0, &mode);
60             }
61         }
62         else if (str_ends_with(file_name, ".bcf"))
63         {
64             kputc('b', &mode);
65             if (compression!=6 && compression!=-1)
66             {
67                 kputw(compression, &mode);
68             }
69             else if (compression==-1)
70             {
71                 kputc('u', &mode);
72             }
73         }
74         else if (str_ends_with(file_name, ".ubcf"))
75         {
76             kputs("bu", &mode);
77         }
78         else
79         {
80             fprintf(stderr, "[%s:%d %s] Not a VCF/BCF file: %s\n", __FILE__,__LINE__,__FUNCTION__, file_name.c_str());
81             exit(1);
82         }
83     }
84 
85     file = bcf_open(file_name.c_str(), mode.s);
86     if (file==NULL)
87     {
88         fprintf(stderr, "[%s:%d %s] Cannot open VCF/BCF file for writing: %s\n", __FILE__,__LINE__,__FUNCTION__, file_name.c_str());
89         exit(1);
90     }
91 
92     hdr = bcf_hdr_init("w");
93     bcf_hdr_set_version(hdr, "VCFv4.2");
94     linked_hdr = false;
95 }
96 
97 /**
98  * Duplicates a hdr and sets it.
99  */
set_hdr(bcf_hdr_t * hdr)100 void BCFOrderedWriter::set_hdr(bcf_hdr_t *hdr)
101 {
102     if (this->hdr)
103     {
104         bcf_hdr_destroy(this->hdr);
105     }
106     this->hdr = bcf_hdr_dup(hdr);
107     linked_hdr = false;
108 }
109 
110 /**
111  * Links a header.  This is useful when the VCF file being read has an incomplete header.
112  * As the VCF records are read, the incomplete header will be fixed with string type assumptions
113  * and the VCF records can be written out without any failure.  The header in the VCF file being
114  * written will be incomplete nonetheless and the user should use an alt header when reading the
115  * file to bypass the problem.
116  */
link_hdr(bcf_hdr_t * hdr)117 void BCFOrderedWriter::link_hdr(bcf_hdr_t *hdr)
118 {
119     this->hdr = hdr;
120     linked_hdr = true;
121 }
122 
123 /**
124  * Reads next record, hides the random access of different regions from the user.
125  */
write_hdr()126 void BCFOrderedWriter::write_hdr()
127 {
128     bcf_hdr_write(file, hdr);
129 }
130 
131 /**
132  * Reads next record, hides the random access of different regions from the user.
133  */
write(bcf1_t * v)134 void BCFOrderedWriter::write(bcf1_t *v)
135 {
136     //place into appropriate position in the buffer
137     if (window)
138     {
139         if (!buffer.empty())
140         {
141             if (bcf_get_rid(v)==bcf_get_rid(buffer.back()))
142             {
143                 std::list<bcf1_t*>::iterator i = buffer.begin();
144 
145                 for (i=buffer.begin(); i!=buffer.end(); ++i)
146                 {
147                     //equal sign ensures records are kept in original order
148                     if (bcf_get_pos1(v)>=bcf_get_pos1(*i))
149                     {
150                         buffer.insert(i,v);
151                         flush(false);
152                         return;
153                     }
154                 }
155 
156                 if (i==buffer.end())
157                 {
158                     int32_t cutoff_pos1 =  std::max(bcf_get_pos1(buffer.front())-window,(hts_pos_t)1);
159                     if (bcf_get_pos1(v)<cutoff_pos1)
160                     {
161                         fprintf(stderr, "[%s:%d %s] Might not be sorted for window size %d at current record %s:%d < %d (%d [last record] - %d), please increase window size to at least %d.\n",
162                                           __FILE__,
163                                           __LINE__,
164                                           __FUNCTION__,
165                                           window,
166                                           bcf_get_chrom(hdr, v),
167                                           bcf_get_pos1(v),
168                                           cutoff_pos1,
169                                           bcf_get_pos1(buffer.front()),
170                                           window,
171                                           bcf_get_pos1(buffer.front())-bcf_get_pos1(v)+1);
172                     }
173                 }
174 
175                 buffer.insert(i,v);
176                 flush(false);
177             }
178             else
179             {
180                 flush(true);
181                 buffer.push_front(v);
182             }
183         }
184         else
185         {
186             buffer.push_front(v);
187         }
188 
189         v = NULL;
190     }
191     else
192     {
193         //todo:  add a mechanism to populate header similar to vcf_parse in vcf_format which is called by bcf_write
194         bcf_write(file, hdr, v);
195     }
196 }
197 
198 /**
199  * Flush writable records from buffer.
200  */
flush()201 void BCFOrderedWriter::flush()
202 {
203     flush(true);
204 }
205 
206 /**
207  * Returns record to pool.
208  */
store_bcf1_into_pool(bcf1_t * v)209 void BCFOrderedWriter::store_bcf1_into_pool(bcf1_t* v)
210 {
211     bcf_clear(v);
212     pool.push_back(v);
213     v = NULL;
214 }
215 
216 /**
217  * Gets record from pool, creates a new record if necessary
218  */
get_bcf1_from_pool()219 bcf1_t* BCFOrderedWriter::get_bcf1_from_pool()
220 {
221     if(!pool.empty())
222     {
223         bcf1_t* v = pool.front();
224         pool.pop_front();
225         return v;
226     }
227     else
228     {
229         bcf1_t* v = bcf_init();
230         bcf_clear(v);
231         return v;
232     }
233 };
234 
235 /**
236  * Flush writable records from buffer.
237  */
flush(bool force)238 void BCFOrderedWriter::flush(bool force)
239 {
240     if (force)
241     {
242         while (!buffer.empty())
243         {
244             bcf_write(file, hdr, buffer.back());
245             bcf_destroy(buffer.back());
246             //store_bcf1_into_pool(buffer.back());
247             buffer.pop_back();
248         }
249     }
250     else
251     {
252         if (buffer.size()>1)
253         {
254             int32_t cutoff_pos1 =  std::max(bcf_get_pos1(buffer.front())-window,(hts_pos_t)1);
255 
256             while (buffer.size()>1)
257             {
258                 if (bcf_get_pos1(buffer.back())<=cutoff_pos1)
259                 {
260                     bcf_write(file, hdr, buffer.back());
261                     bcf_destroy(buffer.back());
262                     //store_bcf1_into_pool(buffer.back());
263                     buffer.pop_back();
264                 }
265                 else
266                 {
267                     return;
268                 }
269             }
270         }
271     }
272 }
273 
274 /**
275  * Closes the file.
276  */
close()277 void BCFOrderedWriter::close()
278 {
279     flush(true);
280     bcf_close(file);
281     if (!linked_hdr && hdr) bcf_hdr_destroy(hdr);
282 //    while (buffer.size()!=0)
283 //    {
284 //        bcf_destroy(buffer.pop_back());
285 //        buffer.pop_back();
286 //        if (bcf_get_pos1(buffer.back())<=cutoff_pos1)
287 //        {
288 //            bcf_write(file, hdr, buffer.back());
289 //            store_bcf1_into_pool(buffer.back());
290 //            buffer.pop_back();
291 //        }
292 //        else
293 //        {
294 //            return;
295 //        }
296 //    }
297 }
298