1 /*  vcf_sweep.c -- forward/reverse sweep API.
2 
3     Copyright (C) 2013 Genome Research Ltd.
4 
5     Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE.  */
24 
25 #include <config.h>
26 
27 #include "htslib/vcf_sweep.h"
28 #include "htslib/bgzf.h"
29 
30 #define SW_FWD 0
31 #define SW_BWD 1
32 
33 struct _bcf_sweep_t
34 {
35     htsFile *file;
36     bcf_hdr_t *hdr;
37     BGZF *fp;
38 
39     int direction;          // to tell if the direction has changed
40     int block_size;         // the size of uncompressed data to hold in memory
41     bcf1_t *rec;            // bcf buffer
42     int nrec, mrec;         // number of used records; total size of the buffer
43     int lrid, lpos, lnals, lals_len, mlals;   // to check uniqueness of a record
44     char *lals;
45 
46     uint64_t *idx;          // uncompressed offsets of VCF/BCF records
47     int iidx, nidx, midx;   // i: current offset; n: used; m: allocated
48     int idx_done;           // the index is built during the first pass
49 };
50 
51 BGZF *hts_get_bgzfp(htsFile *fp);
52 int hts_useek(htsFile *file, long uoffset, int where);
53 long hts_utell(htsFile *file);
54 
sw_rec_equal(bcf_sweep_t * sw,bcf1_t * rec)55 static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec)
56 {
57     if ( sw->lrid!=rec->rid ) return 0;
58     if ( sw->lpos!=rec->pos ) return 0;
59     if ( sw->lnals!=rec->n_allele ) return 0;
60 
61     char *t = rec->d.allele[sw->lnals-1];
62     int len = t - rec->d.allele[0] + 1;
63     while ( *t ) { t++; len++; }
64     if ( sw->lals_len!=len ) return 0;
65     if ( memcmp(sw->lals,rec->d.allele[0],len) ) return 0;
66     return 1;
67 }
68 
sw_rec_save(bcf_sweep_t * sw,bcf1_t * rec)69 static void sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec)
70 {
71     sw->lrid  = rec->rid;
72     sw->lpos  = rec->pos;
73     sw->lnals = rec->n_allele;
74 
75     char *t = rec->d.allele[sw->lnals-1];
76     int len = t - rec->d.allele[0] + 1;
77     while ( *t ) { t++; len++; }
78     sw->lals_len = len;
79     hts_expand(char, len, sw->mlals, sw->lals);
80     memcpy(sw->lals, rec->d.allele[0], len);
81 }
82 
sw_fill_buffer(bcf_sweep_t * sw)83 static void sw_fill_buffer(bcf_sweep_t *sw)
84 {
85     if ( !sw->iidx ) return;
86     sw->iidx--;
87 
88     int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0);
89     assert( ret==0 );
90 
91     sw->nrec = 0;
92     bcf1_t *rec = &sw->rec[sw->nrec];
93     while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 )
94     {
95         bcf_unpack(rec, BCF_UN_STR);
96 
97         // if not in the last block, stop at the saved record
98         if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break;
99 
100         sw->nrec++;
101         hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec);
102         rec = &sw->rec[sw->nrec];
103     }
104     sw_rec_save(sw, &sw->rec[0]);
105 }
106 
bcf_sweep_init(const char * fname)107 bcf_sweep_t *bcf_sweep_init(const char *fname)
108 {
109     bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t));
110     sw->file = hts_open(fname, "r");
111     sw->fp   = hts_get_bgzfp(sw->file);
112     if (sw->fp) bgzf_index_build_init(sw->fp);
113     sw->hdr  = bcf_hdr_read(sw->file);
114     sw->mrec = 1;
115     sw->rec  = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t)));
116     sw->block_size = 1024*1024*3;
117     sw->direction = SW_FWD;
118     return sw;
119 }
120 
bcf_sweep_destroy(bcf_sweep_t * sw)121 void bcf_sweep_destroy(bcf_sweep_t *sw)
122 {
123     int i;
124     for (i=0; i<sw->mrec; i++) bcf_empty1(&sw->rec[i]);
125     free(sw->idx);
126     free(sw->rec);
127     free(sw->lals);
128     bcf_hdr_destroy(sw->hdr);
129     hts_close(sw->file);
130     free(sw);
131 }
132 
sw_seek(bcf_sweep_t * sw,int direction)133 static void sw_seek(bcf_sweep_t *sw, int direction)
134 {
135     sw->direction = direction;
136     if ( direction==SW_FWD )
137         hts_useek(sw->file, sw->idx[0], 0);
138     else
139     {
140         sw->iidx = sw->nidx;
141         sw->nrec = 0;
142     }
143 }
144 
bcf_sweep_fwd(bcf_sweep_t * sw)145 bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw)
146 {
147     if ( sw->direction==SW_BWD ) sw_seek(sw, SW_FWD);
148 
149     long pos = hts_utell(sw->file);
150 
151     bcf1_t *rec = &sw->rec[0];
152     int ret = bcf_read1(sw->file, sw->hdr, rec);
153 
154     if ( ret!=0 )   // last record, get ready for sweeping backwards
155     {
156         sw->idx_done = 1;
157         if (sw->fp) sw->fp->idx_build_otf = 0;
158         sw_seek(sw, SW_BWD);
159         return NULL;
160     }
161 
162     if ( !sw->idx_done )
163     {
164         if ( !sw->nidx || pos - sw->idx[sw->nidx-1] > sw->block_size )
165         {
166             sw->nidx++;
167             hts_expand(uint64_t, sw->nidx, sw->midx, sw->idx);
168             sw->idx[sw->nidx-1] = pos;
169         }
170     }
171     return rec;
172 }
173 
bcf_sweep_bwd(bcf_sweep_t * sw)174 bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw)
175 {
176     if ( sw->direction==SW_FWD ) sw_seek(sw, SW_BWD);
177     if ( !sw->nrec ) sw_fill_buffer(sw);
178     if ( !sw->nrec ) return NULL;
179     return &sw->rec[ --sw->nrec ];
180 }
181 
bcf_sweep_hdr(bcf_sweep_t * sw)182 bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw) { return sw->hdr; }
183 
184