1 /* tbx.c -- tabix API functions.
2
3 Copyright (C) 2009, 2010, 2012-2015, 2017 Genome Research Ltd.
4 Copyright (C) 2010-2012 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <stdlib.h>
29 #include <string.h>
30 #include <stdio.h>
31 #include <assert.h>
32 #include <errno.h>
33 #include "htslib/tbx.h"
34 #include "htslib/bgzf.h"
35 #include "htslib/hts_endian.h"
36 #include "hts_internal.h"
37
38 #include "htslib/khash.h"
39 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
40
41 const tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 };
42 const tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 };
43 const tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 };
44 const tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 };
45 const tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 };
46
47 typedef struct {
48 int64_t beg, end;
49 char *ss, *se;
50 int tid;
51 } tbx_intv_t;
52
get_tid(tbx_t * tbx,const char * ss,int is_add)53 static inline int get_tid(tbx_t *tbx, const char *ss, int is_add)
54 {
55 khint_t k;
56 khash_t(s2i) *d;
57 if (tbx->dict == 0) tbx->dict = kh_init(s2i);
58 d = (khash_t(s2i)*)tbx->dict;
59 if (is_add) {
60 int absent;
61 k = kh_put(s2i, d, ss, &absent);
62 if (absent < 0) {
63 return -1; // Out of memory
64 } else if (absent) {
65 char *ss_dup = strdup(ss);
66 if (ss_dup) {
67 kh_key(d, k) = ss_dup;
68 kh_val(d, k) = kh_size(d) - 1;
69 } else {
70 kh_del(s2i, d, k);
71 return -1; // Out of memory
72 }
73 }
74 } else k = kh_get(s2i, d, ss);
75 return k == kh_end(d)? -1 : kh_val(d, k);
76 }
77
tbx_name2id(tbx_t * tbx,const char * ss)78 int tbx_name2id(tbx_t *tbx, const char *ss)
79 {
80 return get_tid(tbx, ss, 0);
81 }
82
tbx_parse1(const tbx_conf_t * conf,int len,char * line,tbx_intv_t * intv)83 int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
84 {
85 int i, b = 0, id = 1, ncols = 0;
86 char *s;
87 intv->ss = intv->se = 0; intv->beg = intv->end = -1;
88 for (i = 0; i <= len; ++i) {
89 if (line[i] == '\t' || line[i] == 0) {
90 ++ncols;
91 if (id == conf->sc) {
92 intv->ss = line + b; intv->se = line + i;
93 } else if (id == conf->bc) {
94 // here ->beg is 0-based.
95 intv->beg = intv->end = strtol(line + b, &s, 0);
96 if ( s==line+b ) return -1; // expected int
97 if (!(conf->preset&TBX_UCSC)) --intv->beg;
98 else ++intv->end;
99 if (intv->beg < 0) intv->beg = 0;
100 if (intv->end < 1) intv->end = 1;
101 } else {
102 if ((conf->preset&0xffff) == TBX_GENERIC) {
103 if (id == conf->ec)
104 {
105 intv->end = strtol(line + b, &s, 0);
106 if ( s==line+b ) return -1; // expected int
107 }
108 } else if ((conf->preset&0xffff) == TBX_SAM) {
109 if (id == 6) { // CIGAR
110 int l = 0;
111 char *t;
112 for (s = line + b; s < line + i;) {
113 long x = strtol(s, &t, 10);
114 char op = toupper_c(*t);
115 if (op == 'M' || op == 'D' || op == 'N') l += x;
116 s = t + 1;
117 }
118 if (l == 0) l = 1;
119 intv->end = intv->beg + l;
120 }
121 } else if ((conf->preset&0xffff) == TBX_VCF) {
122 if (id == 4) {
123 if (b < i) intv->end = intv->beg + (i - b);
124 } else if (id == 8) { // look for "END="
125 int c = line[i];
126 line[i] = 0;
127 s = strstr(line + b, "END=");
128 if (s == line + b) s += 4;
129 else if (s) {
130 s = strstr(line + b, ";END=");
131 if (s) s += 5;
132 }
133 if (s) intv->end = strtol(s, &s, 0);
134 line[i] = c;
135 }
136 }
137 }
138 b = i + 1;
139 ++id;
140 }
141 }
142 if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
143 return 0;
144 }
145
get_intv(tbx_t * tbx,kstring_t * str,tbx_intv_t * intv,int is_add)146 static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add)
147 {
148 if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) {
149 int c = *intv->se;
150 *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c;
151 return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1;
152 } else {
153 char *type = NULL;
154 switch (tbx->conf.preset&0xffff)
155 {
156 case TBX_SAM: type = "TBX_SAM"; break;
157 case TBX_VCF: type = "TBX_VCF"; break;
158 case TBX_UCSC: type = "TBX_UCSC"; break;
159 default: type = "TBX_GENERIC"; break;
160 }
161 fprintf(stderr, "[E::%s] failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"\n", __func__, type, str->s);
162 return -1;
163 }
164 }
165
tbx_readrec(BGZF * fp,void * tbxv,void * sv,int * tid,int * beg,int * end)166 int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end)
167 {
168 tbx_t *tbx = (tbx_t *) tbxv;
169 kstring_t *s = (kstring_t *) sv;
170 int ret;
171 if ((ret = bgzf_getline(fp, '\n', s)) >= 0) {
172 tbx_intv_t intv;
173 get_intv(tbx, s, &intv, 0);
174 *tid = intv.tid; *beg = intv.beg; *end = intv.end;
175 }
176 return ret;
177 }
178
tbx_set_meta(tbx_t * tbx)179 void tbx_set_meta(tbx_t *tbx)
180 {
181 int i, l = 0, l_nm;
182 uint32_t x[7];
183 char **name;
184 uint8_t *meta;
185 khint_t k;
186 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
187
188 memcpy(x, &tbx->conf, 24);
189 name = (char**)malloc(sizeof(char*) * kh_size(d));
190 for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) {
191 if (!kh_exist(d, k)) continue;
192 name[kh_val(d, k)] = (char*)kh_key(d, k);
193 l += strlen(kh_key(d, k)) + 1; // +1 to include '\0'
194 }
195 l_nm = x[6] = l;
196 meta = (uint8_t*)malloc(l_nm + 28);
197 if (ed_is_big())
198 for (i = 0; i < 7; ++i)
199 x[i] = ed_swap_4(x[i]);
200 memcpy(meta, x, 28);
201 for (l = 28, i = 0; i < (int)kh_size(d); ++i) {
202 int x = strlen(name[i]) + 1;
203 memcpy(meta + l, name[i], x);
204 l += x;
205 }
206 free(name);
207 hts_idx_set_meta(tbx->idx, l, meta, 0);
208 }
209
tbx_index(BGZF * fp,int min_shift,const tbx_conf_t * conf)210 tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
211 {
212 tbx_t *tbx;
213 kstring_t str;
214 int ret, first = 0, n_lvls, fmt;
215 int64_t lineno = 0;
216 uint64_t last_off = 0;
217 tbx_intv_t intv;
218
219 str.s = 0; str.l = str.m = 0;
220 tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
221 tbx->conf = *conf;
222 if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI;
223 else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI;
224 while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
225 ++lineno;
226 if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) {
227 last_off = bgzf_tell(fp);
228 continue;
229 }
230 if (first == 0) {
231 tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
232 first = 1;
233 }
234 get_intv(tbx, &str, &intv, 1);
235 ret = hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1);
236 if (ret < 0)
237 {
238 free(str.s);
239 tbx_destroy(tbx);
240 return NULL;
241 }
242 }
243 if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); // empty file
244 if ( !tbx->dict ) tbx->dict = kh_init(s2i);
245 hts_idx_finish(tbx->idx, bgzf_tell(fp));
246 tbx_set_meta(tbx);
247 free(str.s);
248 return tbx;
249 }
250
tbx_destroy(tbx_t * tbx)251 void tbx_destroy(tbx_t *tbx)
252 {
253 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
254 if (d != NULL)
255 {
256 khint_t k;
257 for (k = kh_begin(d); k != kh_end(d); ++k)
258 if (kh_exist(d, k)) free((char*)kh_key(d, k));
259 }
260 hts_idx_destroy(tbx->idx);
261 kh_destroy(s2i, d);
262 free(tbx);
263 }
264
tbx_index_build3(const char * fn,const char * fnidx,int min_shift,int n_threads,const tbx_conf_t * conf)265 int tbx_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads, const tbx_conf_t *conf)
266 {
267 tbx_t *tbx;
268 BGZF *fp;
269 int ret;
270 if ((fp = bgzf_open(fn, "r")) == 0) return -1;
271 if ( n_threads ) bgzf_mt(fp, n_threads, 256);
272 if ( bgzf_compression(fp) != bgzf ) { bgzf_close(fp); return -1; }
273 tbx = tbx_index(fp, min_shift, conf);
274 bgzf_close(fp);
275 if ( !tbx ) return -1;
276 ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0? HTS_FMT_CSI : HTS_FMT_TBI);
277 tbx_destroy(tbx);
278 return ret;
279 }
280
tbx_index_build2(const char * fn,const char * fnidx,int min_shift,const tbx_conf_t * conf)281 int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
282 {
283 return tbx_index_build3(fn, fnidx, min_shift, 0, conf);
284 }
285
tbx_index_build(const char * fn,int min_shift,const tbx_conf_t * conf)286 int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf)
287 {
288 return tbx_index_build3(fn, NULL, min_shift, 0, conf);
289 }
290
tbx_index_load2(const char * fn,const char * fnidx)291 tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
292 {
293 tbx_t *tbx;
294 uint8_t *meta;
295 char *nm, *p;
296 uint32_t l_meta, l_nm;
297 tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
298 tbx->idx = fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_TBI);
299 if ( !tbx->idx )
300 {
301 free(tbx);
302 return NULL;
303 }
304 meta = hts_idx_get_meta(tbx->idx, &l_meta);
305 if ( !meta || l_meta < 28) goto invalid;
306
307 tbx->conf.preset = le_to_i32(&meta[0]);
308 tbx->conf.sc = le_to_i32(&meta[4]);
309 tbx->conf.bc = le_to_i32(&meta[8]);
310 tbx->conf.ec = le_to_i32(&meta[12]);
311 tbx->conf.meta_char = le_to_i32(&meta[16]);
312 tbx->conf.line_skip = le_to_i32(&meta[20]);
313 l_nm = le_to_u32(&meta[24]);
314 if (l_nm > l_meta - 28) goto invalid;
315
316 p = nm = (char*)meta + 28;
317 // This assumes meta is NUL-terminated, so we can merrily strlen away.
318 // hts_idx_load_local() assures this for us by adding a NUL on the end
319 // of whatever it reads.
320 for (; p - nm < l_nm; p += strlen(p) + 1) {
321 if (get_tid(tbx, p, 1) < 0) {
322 fprintf(stderr, "[E::%s] %s\n", __func__, strerror(errno));
323 goto fail;
324 }
325 }
326 return tbx;
327
328 invalid:
329 if (hts_verbose >= 1) {
330 fprintf(stderr, "[E::%s] Invalid index header for %s\n",
331 __func__, fnidx ? fnidx : fn);
332 }
333 fail:
334 tbx_destroy(tbx);
335 return NULL;
336 }
337
tbx_index_load(const char * fn)338 tbx_t *tbx_index_load(const char *fn)
339 {
340 return tbx_index_load2(fn, NULL);
341 }
342
tbx_seqnames(tbx_t * tbx,int * n)343 const char **tbx_seqnames(tbx_t *tbx, int *n)
344 {
345 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
346 if (d == NULL)
347 {
348 *n = 0;
349 return NULL;
350 }
351 int tid, m = kh_size(d);
352 const char **names = (const char**) calloc(m,sizeof(const char*));
353 khint_t k;
354 for (k=kh_begin(d); k<kh_end(d); k++)
355 {
356 if ( !kh_exist(d,k) ) continue;
357 tid = kh_val(d,k);
358 assert( tid<m );
359 names[tid] = kh_key(d,k);
360 }
361 // sanity check: there should be no gaps
362 for (tid=0; tid<m; tid++)
363 assert(names[tid]);
364 *n = m;
365 return names;
366 }
367
368