1 /* The MIT License
2 
3    Copyright (c) 2015 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "bcf_single_genotyping_buffered_reader.h"
25 
26 /**
27  * Constructor.
28  */
BCFSingleGenotypingBufferedReader(std::string input_vcf_file,std::vector<GenomeInterval> & intervals,std::string output_vcf_file)29 BCFSingleGenotypingBufferedReader::BCFSingleGenotypingBufferedReader(std::string input_vcf_file, std::vector<GenomeInterval>& intervals, std::string output_vcf_file)
30 {
31     /////////////////////
32     //io initialization//
33     /////////////////////
34     odr = new BCFOrderedReader(input_vcf_file, intervals);
35 
36     //////////////////////////
37     //options initialization//
38     //////////////////////////
39     output_annotations = false;
40 
41     ////////////////////////
42     //stats initialization//
43     ////////////////////////
44     no_snps_genotyped = 0;
45     no_indels_genotyped = 0;
46     no_vntrs_genotyped = 0;
47 
48     ////////////////////////
49     //tools initialization//
50     ////////////////////////
51     vm = new VariantManip();
52 //    fai = fai_load(ref_fasta_file.c_str());
53 //    if (fai==NULL)
54 //    {
55 //        fprintf(stderr, "[%s:%d %s] Cannot load genome index: %s\n", __FILE__, __LINE__, __FUNCTION__, ref_fasta_file.c_str());
56 //        exit(1);
57 //    }
58 }
59 
60 /**
61  * Collects sufficient statistics from read for variants to be genotyped.
62  *
63  * The VCF records in the buffer must never occur before
64  */
process_read(bam_hdr_t * h,bam1_t * s)65 void BCFSingleGenotypingBufferedReader::process_read(bam_hdr_t *h, bam1_t *s)
66 {
67     //wrap bam1_t in AugmentBAMRecord
68     as.initialize(h, s);
69 
70     uint32_t tid = bam_get_tid(s);
71     uint32_t beg1 = as.beg1;
72     uint32_t end1 = as.end1;
73 
74     //collect statistics for variant records that are in the buffer and overlap with the read
75     GenotypingRecord* g;
76     for (std::list<GenotypingRecord*>::iterator i=buffer.begin(); i!=buffer.end(); ++i)
77     {
78         g = *i;
79 
80 //        std::cerr << g->pos1 << " " << g->beg1 << " " << g->end1 << " ";
81 
82         //same chromosome
83         if (tid==g->rid)
84         {
85             if (end1 < g->beg1)
86             {
87                 //can terminate
88                 return;
89             }
90             else if (beg1 > g->end1)
91             {
92                 //this should not occur if the buffer was flushed before invoking process read
93                 continue;
94             }
95             //else if (beg1 <= g->beg1 && g->end1 <= end1)
96             else if (beg1 <= g->pos1 && g->pos1 <= end1)
97             {
98 //                collect_sufficient_statistics(*i, as);
99             }
100             else
101             {
102                 //other types of overlap, just ignore
103             }
104 
105 //            std::cerr << "\n";
106         }
107         //prior chromosome
108         else if (tid<g->rid)
109         {
110             //this should not occur if the buffer was flushed before invoking process read
111             return;
112         }
113         //latter chromosome
114         else if (tid>g->rid)
115         {
116             //in case if the buffer has a VCF record later in the list which matches it
117             continue;
118         }
119     }
120 
121     //you will only reach here if a read occurs after or overlaps the last record in the buffer
122     //adding new VCF records and collecting statistics if necessary
123     bcf1_t *v = bcf_init();
124     while (odr->read(v))
125     {
126         int32_t vtype = vm->classify_variant(odr->hdr, v, variant);
127         g = create_genotyping_record(odr->hdr, v, 2, variant);
128         buffer.push_back(g);
129 
130         if (tid==g->rid)
131         {
132             //if (end1>=g->beg1 && pos1<=g->end1)
133             if (beg1 <= g->pos1 && g->pos1 <= end1)
134             {
135 //                collect_sufficient_statistics(g, as);
136             }
137         }
138 
139         //VCF record occurs after the read
140         if (tid < g->rid || end1 < g->beg1)
141         {
142             return;
143         }
144         else
145         {
146             v = bcf_init();
147         }
148     }
149 
150     //this means end of file
151     bcf_destroy(v);
152 }
153 
154 /**
155  * Flush records.
156  */
flush(bam_hdr_t * h,bam1_t * s,bool flush_all)157 void BCFSingleGenotypingBufferedReader::flush(bam_hdr_t *h, bam1_t *s, bool flush_all)
158 {
159     if (flush_all)
160     {
161         //read all the remaining from the reference genotyping file
162         bcf1_t *v = bcf_init();
163         while (odr->read(v))
164         {
165             int32_t vtype = vm->classify_variant(odr->hdr, v, variant);
166             GenotypingRecord* g = create_genotyping_record(odr->hdr, v, 2, variant);
167             buffer.push_back(g);
168             v = bcf_init();
169         }
170         bcf_destroy(v);
171 
172         GenotypingRecord* g;
173         while (!buffer.empty())
174         {
175             g = buffer.front();
176 //            genotype_and_print(g);
177             delete g;
178             buffer.pop_front();
179         }
180     }
181     else
182     {
183         //std::cerr << "partial flush\n";
184 
185         uint32_t tid = bam_get_tid(s);
186         GenotypingRecord* g;
187 
188         while (!buffer.empty())
189         {
190             g = buffer.front();
191 
192             if (tid==g->rid)
193             {
194                 if (bam_get_pos1(s) > g->end1)
195                 {
196 //                    genotype_and_print(g);
197                     delete g;
198                     buffer.pop_front();
199                 }
200                 else
201                 {
202                     return;
203                 }
204             }
205             else if (tid>g->rid)
206             {
207 //                genotype_and_print(g);
208                 delete g;
209                 buffer.pop_front();
210             }
211             else
212             {
213                 return;
214             }
215         }
216     }
217 }
218 
219 /**
220  * Create appropriate genotyping record.
221  */
create_genotyping_record(bcf_hdr_t * h,bcf1_t * v,uint32_t ploidy,Variant & variant)222 GenotypingRecord* BCFSingleGenotypingBufferedReader::create_genotyping_record(bcf_hdr_t* h, bcf1_t* v, uint32_t ploidy, Variant& variant)
223 {
224     GenotypingRecord* g = NULL;
225     if (variant.type==VT_SNP)
226     {
227         g = new SNPGenotypingRecord(h, v, 1, 2, NULL);
228     }
229 
230     return g;
231 
232 }
233 
234 
235 
236 
237 
238 
239 
240 
241 
242 
243 
244 
245 
246 
247 
248 
249 
250 
251 
252 
253 
254 
255 
256 
257 
258 
259 
260 
261 
262