1 /*  tbx.c -- tabix API functions.
2 
3     Copyright (C) 2009, 2010, 2012-2015, 2017 Genome Research Ltd.
4     Copyright (C) 2010-2012 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <stdlib.h>
29 #include <string.h>
30 #include <stdio.h>
31 #include <assert.h>
32 #include <errno.h>
33 #include "htslib/tbx.h"
34 #include "htslib/bgzf.h"
35 #include "htslib/hts_endian.h"
36 #include "hts_internal.h"
37 
38 #include "htslib/khash.h"
39 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
40 
41 const tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 };
42 const tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 };
43 const tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 };
44 const tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 };
45 const tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 };
46 
47 typedef struct {
48     int64_t beg, end;
49     char *ss, *se;
50     int tid;
51 } tbx_intv_t;
52 
get_tid(tbx_t * tbx,const char * ss,int is_add)53 static inline int get_tid(tbx_t *tbx, const char *ss, int is_add)
54 {
55     khint_t k;
56     khash_t(s2i) *d;
57     if (tbx->dict == 0) tbx->dict = kh_init(s2i);
58     d = (khash_t(s2i)*)tbx->dict;
59     if (is_add) {
60         int absent;
61         k = kh_put(s2i, d, ss, &absent);
62         if (absent < 0) {
63             return -1; // Out of memory
64         } else if (absent) {
65             char *ss_dup = strdup(ss);
66             if (ss_dup) {
67                 kh_key(d, k) = ss_dup;
68                 kh_val(d, k) = kh_size(d) - 1;
69             } else {
70                 kh_del(s2i, d, k);
71                 return -1; // Out of memory
72             }
73         }
74     } else k = kh_get(s2i, d, ss);
75     return k == kh_end(d)? -1 : kh_val(d, k);
76 }
77 
tbx_name2id(tbx_t * tbx,const char * ss)78 int tbx_name2id(tbx_t *tbx, const char *ss)
79 {
80     return get_tid(tbx, ss, 0);
81 }
82 
tbx_parse1(const tbx_conf_t * conf,int len,char * line,tbx_intv_t * intv)83 int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
84 {
85     int i, b = 0, id = 1, ncols = 0;
86     char *s;
87     intv->ss = intv->se = 0; intv->beg = intv->end = -1;
88     for (i = 0; i <= len; ++i) {
89         if (line[i] == '\t' || line[i] == 0) {
90             ++ncols;
91             if (id == conf->sc) {
92                 intv->ss = line + b; intv->se = line + i;
93             } else if (id == conf->bc) {
94                 // here ->beg is 0-based.
95                 intv->beg = intv->end = strtol(line + b, &s, 0);
96                 if ( s==line+b ) return -1; // expected int
97                 if (!(conf->preset&TBX_UCSC)) --intv->beg;
98                 else ++intv->end;
99                 if (intv->beg < 0) intv->beg = 0;
100                 if (intv->end < 1) intv->end = 1;
101             } else {
102                 if ((conf->preset&0xffff) == TBX_GENERIC) {
103                     if (id == conf->ec)
104                     {
105                         intv->end = strtol(line + b, &s, 0);
106                         if ( s==line+b ) return -1; // expected int
107                     }
108                 } else if ((conf->preset&0xffff) == TBX_SAM) {
109                     if (id == 6) { // CIGAR
110                         int l = 0;
111                         char *t;
112                         for (s = line + b; s < line + i;) {
113                             long x = strtol(s, &t, 10);
114                             char op = toupper_c(*t);
115                             if (op == 'M' || op == 'D' || op == 'N') l += x;
116                             s = t + 1;
117                         }
118                         if (l == 0) l = 1;
119                         intv->end = intv->beg + l;
120                     }
121                 } else if ((conf->preset&0xffff) == TBX_VCF) {
122                     if (id == 4) {
123                         if (b < i) intv->end = intv->beg + (i - b);
124                     } else if (id == 8) { // look for "END="
125                         int c = line[i];
126                         line[i] = 0;
127                         s = strstr(line + b, "END=");
128                         if (s == line + b) s += 4;
129                         else if (s) {
130                             s = strstr(line + b, ";END=");
131                             if (s) s += 5;
132                         }
133                         if (s) intv->end = strtol(s, &s, 0);
134                         line[i] = c;
135                     }
136                 }
137             }
138             b = i + 1;
139             ++id;
140         }
141     }
142     if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
143     return 0;
144 }
145 
get_intv(tbx_t * tbx,kstring_t * str,tbx_intv_t * intv,int is_add)146 static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add)
147 {
148     if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) {
149         int c = *intv->se;
150         *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c;
151         return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1;
152     } else {
153         char *type = NULL;
154         switch (tbx->conf.preset&0xffff)
155         {
156             case TBX_SAM: type = "TBX_SAM"; break;
157             case TBX_VCF: type = "TBX_VCF"; break;
158             case TBX_UCSC: type = "TBX_UCSC"; break;
159             default: type = "TBX_GENERIC"; break;
160         }
161         fprintf(stderr, "[E::%s] failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"\n", __func__, type, str->s);
162         return -1;
163     }
164 }
165 
tbx_readrec(BGZF * fp,void * tbxv,void * sv,int * tid,int * beg,int * end)166 int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end)
167 {
168     tbx_t *tbx = (tbx_t *) tbxv;
169     kstring_t *s = (kstring_t *) sv;
170     int ret;
171     if ((ret = bgzf_getline(fp, '\n', s)) >= 0) {
172         tbx_intv_t intv;
173         get_intv(tbx, s, &intv, 0);
174         *tid = intv.tid; *beg = intv.beg; *end = intv.end;
175     }
176     return ret;
177 }
178 
tbx_set_meta(tbx_t * tbx)179 void tbx_set_meta(tbx_t *tbx)
180 {
181     int i, l = 0, l_nm;
182     uint32_t x[7];
183     char **name;
184     uint8_t *meta;
185     khint_t k;
186     khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
187 
188     memcpy(x, &tbx->conf, 24);
189     name = (char**)malloc(sizeof(char*) * kh_size(d));
190     for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) {
191         if (!kh_exist(d, k)) continue;
192         name[kh_val(d, k)] = (char*)kh_key(d, k);
193         l += strlen(kh_key(d, k)) + 1; // +1 to include '\0'
194     }
195     l_nm = x[6] = l;
196     meta = (uint8_t*)malloc(l_nm + 28);
197     if (ed_is_big())
198         for (i = 0; i < 7; ++i)
199             x[i] = ed_swap_4(x[i]);
200     memcpy(meta, x, 28);
201     for (l = 28, i = 0; i < (int)kh_size(d); ++i) {
202         int x = strlen(name[i]) + 1;
203         memcpy(meta + l, name[i], x);
204         l += x;
205     }
206     free(name);
207     hts_idx_set_meta(tbx->idx, l, meta, 0);
208 }
209 
tbx_index(BGZF * fp,int min_shift,const tbx_conf_t * conf)210 tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
211 {
212     tbx_t *tbx;
213     kstring_t str;
214     int ret, first = 0, n_lvls, fmt;
215     int64_t lineno = 0;
216     uint64_t last_off = 0;
217     tbx_intv_t intv;
218 
219     str.s = 0; str.l = str.m = 0;
220     tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
221     tbx->conf = *conf;
222     if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI;
223     else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI;
224     while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
225         ++lineno;
226         if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) {
227             last_off = bgzf_tell(fp);
228             continue;
229         }
230         if (first == 0) {
231             tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
232             first = 1;
233         }
234         get_intv(tbx, &str, &intv, 1);
235         ret = hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1);
236         if (ret < 0)
237         {
238             free(str.s);
239             tbx_destroy(tbx);
240             return NULL;
241         }
242     }
243     if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);   // empty file
244     if ( !tbx->dict ) tbx->dict = kh_init(s2i);
245     hts_idx_finish(tbx->idx, bgzf_tell(fp));
246     tbx_set_meta(tbx);
247     free(str.s);
248     return tbx;
249 }
250 
tbx_destroy(tbx_t * tbx)251 void tbx_destroy(tbx_t *tbx)
252 {
253     khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
254     if (d != NULL)
255     {
256         khint_t k;
257         for (k = kh_begin(d); k != kh_end(d); ++k)
258             if (kh_exist(d, k)) free((char*)kh_key(d, k));
259     }
260     hts_idx_destroy(tbx->idx);
261     kh_destroy(s2i, d);
262     free(tbx);
263 }
264 
tbx_index_build3(const char * fn,const char * fnidx,int min_shift,int n_threads,const tbx_conf_t * conf)265 int tbx_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads, const tbx_conf_t *conf)
266 {
267     tbx_t *tbx;
268     BGZF *fp;
269     int ret;
270     if ((fp = bgzf_open(fn, "r")) == 0) return -1;
271     if ( n_threads ) bgzf_mt(fp, n_threads, 256);
272     if ( bgzf_compression(fp) != bgzf ) { bgzf_close(fp); return -1; }
273     tbx = tbx_index(fp, min_shift, conf);
274     bgzf_close(fp);
275     if ( !tbx ) return -1;
276     ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0? HTS_FMT_CSI : HTS_FMT_TBI);
277     tbx_destroy(tbx);
278     return ret;
279 }
280 
tbx_index_build2(const char * fn,const char * fnidx,int min_shift,const tbx_conf_t * conf)281 int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
282 {
283     return tbx_index_build3(fn, fnidx, min_shift, 0, conf);
284 }
285 
tbx_index_build(const char * fn,int min_shift,const tbx_conf_t * conf)286 int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf)
287 {
288     return tbx_index_build3(fn, NULL, min_shift, 0, conf);
289 }
290 
tbx_index_load2(const char * fn,const char * fnidx)291 tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
292 {
293     tbx_t *tbx;
294     uint8_t *meta;
295     char *nm, *p;
296     uint32_t l_meta, l_nm;
297     tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
298     tbx->idx = fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_TBI);
299     if ( !tbx->idx )
300     {
301         free(tbx);
302         return NULL;
303     }
304     meta = hts_idx_get_meta(tbx->idx, &l_meta);
305     if ( !meta || l_meta < 28) goto invalid;
306 
307     tbx->conf.preset = le_to_i32(&meta[0]);
308     tbx->conf.sc = le_to_i32(&meta[4]);
309     tbx->conf.bc = le_to_i32(&meta[8]);
310     tbx->conf.ec = le_to_i32(&meta[12]);
311     tbx->conf.meta_char = le_to_i32(&meta[16]);
312     tbx->conf.line_skip = le_to_i32(&meta[20]);
313     l_nm = le_to_u32(&meta[24]);
314     if (l_nm > l_meta - 28) goto invalid;
315 
316     p = nm = (char*)meta + 28;
317     // This assumes meta is NUL-terminated, so we can merrily strlen away.
318     // hts_idx_load_local() assures this for us by adding a NUL on the end
319     // of whatever it reads.
320     for (; p - nm < l_nm; p += strlen(p) + 1) {
321         if (get_tid(tbx, p, 1) < 0) {
322             fprintf(stderr, "[E::%s] %s\n", __func__, strerror(errno));
323             goto fail;
324         }
325     }
326     return tbx;
327 
328  invalid:
329     if (hts_verbose >= 1) {
330         fprintf(stderr, "[E::%s] Invalid index header for %s\n",
331                 __func__, fnidx ? fnidx : fn);
332     }
333  fail:
334     tbx_destroy(tbx);
335     return NULL;
336 }
337 
tbx_index_load(const char * fn)338 tbx_t *tbx_index_load(const char *fn)
339 {
340     return tbx_index_load2(fn, NULL);
341 }
342 
tbx_seqnames(tbx_t * tbx,int * n)343 const char **tbx_seqnames(tbx_t *tbx, int *n)
344 {
345     khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
346     if (d == NULL)
347     {
348         *n = 0;
349         return NULL;
350     }
351     int tid, m = kh_size(d);
352     const char **names = (const char**) calloc(m,sizeof(const char*));
353     khint_t k;
354     for (k=kh_begin(d); k<kh_end(d); k++)
355     {
356         if ( !kh_exist(d,k) ) continue;
357         tid = kh_val(d,k);
358         assert( tid<m );
359         names[tid] = kh_key(d,k);
360     }
361     // sanity check: there should be no gaps
362     for (tid=0; tid<m; tid++)
363         assert(names[tid]);
364     *n = m;
365     return names;
366 }
367 
368