1 /* faidx.c -- FASTA random access.
2
3 Copyright (C) 2008, 2009, 2013-2017 Genome Research Ltd.
4 Portions copyright (C) 2011 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <ctype.h>
29 #include <string.h>
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <inttypes.h>
33 #include <errno.h>
34 #include <limits.h>
35 #include <unistd.h>
36 #include <assert.h>
37
38 #include "htslib/bgzf.h"
39 #include "htslib/faidx.h"
40 #include "htslib/hfile.h"
41 #include "htslib/khash.h"
42 #include "htslib/kstring.h"
43 #include "hts_internal.h"
44
45 typedef struct {
46 int32_t line_len, line_blen;
47 int64_t len;
48 uint64_t offset;
49 } faidx1_t;
50 KHASH_MAP_INIT_STR(s, faidx1_t)
51
52 struct __faidx_t {
53 BGZF *bgzf;
54 int n, m;
55 char **name;
56 khash_t(s) *hash;
57 };
58
59 #ifndef kroundup32
60 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
61 #endif
62
fai_insert_index(faidx_t * idx,const char * name,int64_t len,int line_len,int line_blen,uint64_t offset)63 static inline int fai_insert_index(faidx_t *idx, const char *name, int64_t len, int line_len, int line_blen, uint64_t offset)
64 {
65 if (!name) {
66 fprintf(stderr, "[fai_build_core] malformed line\n");
67 return -1;
68 }
69
70 char *name_key = strdup(name);
71 int absent;
72 khint_t k = kh_put(s, idx->hash, name_key, &absent);
73 faidx1_t *v = &kh_value(idx->hash, k);
74
75 if (! absent) {
76 fprintf(stderr, "[fai_build_core] ignoring duplicate sequence \"%s\" at byte offset %"PRIu64"\n", name, offset);
77 free(name_key);
78 return 0;
79 }
80
81 if (idx->n == idx->m) {
82 char **tmp;
83 idx->m = idx->m? idx->m<<1 : 16;
84 if (!(tmp = (char**)realloc(idx->name, sizeof(char*) * idx->m))) {
85 fprintf(stderr, "[fai_build_core] out of memory\n");
86 return -1;
87 }
88 idx->name = tmp;
89 }
90 idx->name[idx->n++] = name_key;
91 v->len = len;
92 v->line_len = line_len;
93 v->line_blen = line_blen;
94 v->offset = offset;
95
96 return 0;
97 }
98
fai_build_core(BGZF * bgzf)99 faidx_t *fai_build_core(BGZF *bgzf)
100 {
101 kstring_t name = { 0, 0, NULL };
102 int c;
103 int line_len, line_blen, state;
104 int l1, l2;
105 faidx_t *idx;
106 uint64_t offset;
107 int64_t len;
108
109 idx = (faidx_t*)calloc(1, sizeof(faidx_t));
110 idx->hash = kh_init(s);
111 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
112 while ( (c=bgzf_getc(bgzf))>=0 ) {
113 if (c == '\n') { // an empty line
114 if (state == 1) {
115 offset = bgzf_utell(bgzf);
116 continue;
117 } else if ((state == 0 && len < 0) || state == 2) continue;
118 else if (state == 0) { state = 2; continue; }
119 }
120 if (c == '>') { // fasta header
121 if (len >= 0) {
122 if (fai_insert_index(idx, name.s, len, line_len, line_blen, offset) != 0)
123 goto fail;
124 }
125
126 name.l = 0;
127 while ((c = bgzf_getc(bgzf)) >= 0)
128 if (! isspace(c)) kputc_(c, &name);
129 else if (name.l > 0 || c == '\n') break;
130 kputsn("", 0, &name);
131
132 if ( c<0 ) {
133 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
134 goto fail;
135 }
136 if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
137 state = 1; len = 0;
138 offset = bgzf_utell(bgzf);
139 } else {
140 if (state == 3) {
141 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name.s);
142 goto fail;
143 }
144 if (state == 2) state = 3;
145 l1 = l2 = 0;
146 do {
147 ++l1;
148 if (isgraph(c)) ++l2;
149 } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
150 if (state == 3 && l2) {
151 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name.s);
152 goto fail;
153 }
154 ++l1; len += l2;
155 if (state == 1) line_len = l1, line_blen = l2, state = 0;
156 else if (state == 0) {
157 if (l1 != line_len || l2 != line_blen) state = 2;
158 }
159 }
160 }
161
162 if (len >= 0) {
163 if (fai_insert_index(idx, name.s, len, line_len, line_blen, offset) != 0)
164 goto fail;
165 } else {
166 goto fail;
167 }
168
169 free(name.s);
170 return idx;
171
172 fail:
173 free(name.s);
174 fai_destroy(idx);
175 return NULL;
176 }
177
fai_save(const faidx_t * fai,hFILE * fp)178 static int fai_save(const faidx_t *fai, hFILE *fp) {
179 khint_t k;
180 int i;
181 char buf[96]; // Must be big enough for format below.
182
183 for (i = 0; i < fai->n; ++i) {
184 faidx1_t x;
185 k = kh_get(s, fai->hash, fai->name[i]);
186 assert(k < kh_end(fai->hash));
187 x = kh_value(fai->hash, k);
188 snprintf(buf, sizeof(buf),
189 "\t%"PRId64"\t%"PRIu64"\t%"PRId32"\t%"PRId32"\n",
190 x.len, x.offset, x.line_blen, x.line_len);
191 if (hputs(fai->name[i], fp) != 0) return -1;
192 if (hputs(buf, fp) != 0) return -1;
193 }
194 return 0;
195 }
196
fai_read(hFILE * fp,const char * fname)197 static faidx_t *fai_read(hFILE *fp, const char *fname)
198 {
199 faidx_t *fai;
200 char *buf = NULL, *p;
201 int line_len, line_blen, n;
202 int64_t len;
203 uint64_t offset;
204 ssize_t l, lnum = 1;
205
206 fai = (faidx_t*)calloc(1, sizeof(faidx_t));
207 if (!fai) return NULL;
208
209 fai->hash = kh_init(s);
210 if (!fai->hash) goto fail;
211
212 buf = (char*)calloc(0x10000, 1);
213 if (!buf) goto fail;
214
215 while ((l = hgetln(buf, 0x10000, fp)) > 0) {
216 for (p = buf; *p && !isspace_c(*p); ++p);
217 if (p - buf < l) {
218 *p = 0; ++p;
219 }
220 n = sscanf(p, "%"SCNd64"%"SCNu64"%d%d", &len, &offset, &line_blen, &line_len);
221 if (n != 4) {
222 if (hts_verbose > 1)
223 fprintf(stderr, "[fai_load] couldn't understand FAI %s line %zd\n",
224 fname, lnum);
225 goto fail;
226 }
227 if (fai_insert_index(fai, buf, len, line_len, line_blen, offset) != 0) {
228 goto fail;
229 }
230 if (buf[l - 1] == '\n') ++lnum;
231 }
232
233 if (l < 0) {
234 if (hts_verbose > 1)
235 fprintf(stderr, "[fai_load] error while reading \"%s\": %s\n",
236 fname, strerror(errno));
237 goto fail;
238 }
239 free(buf);
240 return fai;
241
242 fail:
243 free(buf);
244 fai_destroy(fai);
245 return NULL;
246 }
247
fai_destroy(faidx_t * fai)248 void fai_destroy(faidx_t *fai)
249 {
250 int i;
251 if (!fai) return;
252 for (i = 0; i < fai->n; ++i) free(fai->name[i]);
253 free(fai->name);
254 kh_destroy(s, fai->hash);
255 if (fai->bgzf) bgzf_close(fai->bgzf);
256 free(fai);
257 }
258
fai_build3(const char * fn,const char * fnfai,const char * fngzi)259 int fai_build3(const char *fn, const char *fnfai, const char *fngzi)
260 {
261 kstring_t fai_kstr = { 0, 0, NULL };
262 kstring_t gzi_kstr = { 0, 0, NULL };
263 BGZF *bgzf = NULL;
264 hFILE *fp = NULL;
265 faidx_t *fai = NULL;
266 int save_errno, res;
267
268 if (!fnfai) {
269 if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
270 fnfai = fai_kstr.s;
271 }
272 if (!fngzi) {
273 if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
274 fngzi = gzi_kstr.s;
275 }
276
277 bgzf = bgzf_open(fn, "r");
278 if ( !bgzf ) {
279 if (hts_verbose > 1)
280 fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
281 goto fail;
282 }
283 if ( bgzf->is_compressed ) {
284 if (bgzf_index_build_init(bgzf) != 0) {
285 if (hts_verbose > 1)
286 fprintf(stderr, "[fai_build] fail to allocate bgzf index");
287 goto fail;
288 }
289 }
290 fai = fai_build_core(bgzf);
291 if ( !fai )
292 {
293 if ( bgzf->is_compressed && bgzf->is_gzip && hts_verbose > 1)
294 fprintf(stderr, "Cannot index files compressed with gzip, please use bgzip\n");
295 goto fail;
296 }
297 if ( bgzf->is_compressed ) {
298 if (bgzf_index_dump(bgzf, fngzi, NULL) < 0) {
299 if (hts_verbose > 1)
300 fprintf(stderr, "[fai_build] fail to make bgzf index %s\n",
301 fngzi);
302 goto fail;
303 }
304 }
305 res = bgzf_close(bgzf);
306 bgzf = NULL;
307 if (res < 0) {
308 if (hts_verbose > 1)
309 fprintf(stderr, "[fai_build] Error on closing %s : %s\n",
310 fn, strerror(errno));
311 goto fail;
312 }
313 fp = hopen(fnfai, "wb");
314 if ( !fp ) {
315 if (hts_verbose > 1)
316 fprintf(stderr, "[fai_build] fail to open FASTA index %s : %s\n",
317 fnfai, strerror(errno));
318 goto fail;
319 }
320 if (fai_save(fai, fp) != 0) {
321 if (hts_verbose > 1)
322 fprintf(stderr, "[fai_build] fail to write FASTA index %s : %s\n",
323 fnfai, strerror(errno));
324 goto fail;
325 }
326 if (hclose(fp) != 0) {
327 if (hts_verbose > 1)
328 fprintf(stderr, "[fai_build] fail on closing FASTA index %s : %s\n",
329 fnfai, strerror(errno));
330 goto fail;
331 }
332
333 free(fai_kstr.s);
334 free(gzi_kstr.s);
335 fai_destroy(fai);
336 return 0;
337
338 fail:
339 save_errno = errno;
340 free(fai_kstr.s);
341 free(gzi_kstr.s);
342 bgzf_close(bgzf);
343 fai_destroy(fai);
344 errno = save_errno;
345 return -1;
346 }
347
fai_build(const char * fn)348 int fai_build(const char *fn) {
349 return fai_build3(fn, NULL, NULL);
350 }
351
fai_load3(const char * fn,const char * fnfai,const char * fngzi,int flags)352 faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi,
353 int flags)
354 {
355 kstring_t fai_kstr = { 0, 0, NULL };
356 kstring_t gzi_kstr = { 0, 0, NULL };
357 hFILE *fp = NULL;
358 faidx_t *fai = NULL;
359 int res;
360
361 if (fn == NULL)
362 return NULL;
363
364 if (fnfai == NULL) {
365 if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
366 fnfai = fai_kstr.s;
367 }
368 if (fngzi == NULL) {
369 if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
370 fngzi = gzi_kstr.s;
371 }
372
373 fp = hopen(fnfai, "rb");
374
375 if (fp == 0) {
376 if (!(flags & FAI_CREATE) || errno != ENOENT) {
377 if (hts_verbose >= 1) {
378 fprintf(stderr, "[fai_load] failed to open FASTA index %s: %s\n",
379 fnfai, strerror(errno));
380 }
381 goto fail;
382 }
383
384 if (hts_verbose >= 1) {
385 fprintf(stderr, "[fai_load] build FASTA index.\n");
386 }
387 if (fai_build3(fn, fnfai, fngzi) < 0) {
388 goto fail;
389 }
390
391 fp = hopen(fnfai, "rb");
392 if (fp == 0) {
393 if (hts_verbose >= 1) {
394 fprintf(stderr, "[fai_load] failed to open FASTA index %s: %s\n",
395 fnfai, strerror(errno));
396 }
397 goto fail;
398 }
399 }
400
401 fai = fai_read(fp, fnfai);
402 if (fai == NULL) {
403 if (hts_verbose >= 1)
404 fprintf(stderr, "[fai_load] failed to read FASTA index %s\n",
405 fnfai);
406 goto fail;
407 }
408
409 res = hclose(fp);
410 fp = NULL;
411 if (res < 0) {
412 if (hts_verbose >= 1)
413 fprintf(stderr, "[fai_load] fail on closing FASTA index %s : %s\n",
414 fnfai, strerror(errno));
415 goto fail;
416 }
417
418 fai->bgzf = bgzf_open(fn, "rb");
419 if (fai->bgzf == 0) {
420 if (hts_verbose >= 1)
421 fprintf(stderr, "[fai_load] fail to open FASTA file %s.\n", fn);
422 goto fail;
423 }
424 if ( fai->bgzf->is_compressed==1 )
425 {
426 if ( bgzf_index_load(fai->bgzf, fngzi, NULL) < 0 )
427 {
428 if (hts_verbose >= 1)
429 fprintf(stderr, "[fai_load] failed to load .gzi index: %s\n",
430 fngzi);
431 goto fail;
432 }
433 }
434 free(fai_kstr.s);
435 free(gzi_kstr.s);
436 return fai;
437
438 fail:
439 if (fai) fai_destroy(fai);
440 if (fp) hclose_abruptly(fp);
441 free(fai_kstr.s);
442 free(gzi_kstr.s);
443 return NULL;
444 }
445
fai_load(const char * fn)446 faidx_t *fai_load(const char *fn)
447 {
448 return fai_load3(fn, NULL, NULL, FAI_CREATE);
449 }
450
fai_retrieve(const faidx_t * fai,const faidx1_t * val,long beg,long end,int * len)451 static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
452 long beg, long end, int *len) {
453 char *s;
454 size_t l;
455 int c = 0;
456 int ret = bgzf_useek(fai->bgzf,
457 val->offset
458 + beg / val->line_blen * val->line_len
459 + beg % val->line_blen, SEEK_SET);
460
461 if (ret < 0) {
462 *len = -1;
463 if (hts_verbose >= 1)
464 fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
465 return NULL;
466 }
467
468 l = 0;
469 s = (char*)malloc((size_t) end - beg + 2);
470 if (!s) {
471 *len = -1;
472 return NULL;
473 }
474
475 while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 )
476 if (isgraph(c)) s[l++] = c;
477 if (c < 0) {
478 if (hts_verbose >= 1) {
479 fprintf(stderr, "[E::fai_fetch] fai_fetch failed : %s\n",
480 c == -1 ? "unexpected end of file" : "error reading file");
481 }
482 free(s);
483 *len = -1;
484 return NULL;
485 }
486
487 s[l] = '\0';
488 *len = l < INT_MAX ? l : INT_MAX;
489 return s;
490 }
491
fai_fetch(const faidx_t * fai,const char * str,int * len)492 char *fai_fetch(const faidx_t *fai, const char *str, int *len)
493 {
494 char *s, *ep;
495 size_t i, l, k, name_end;
496 khiter_t iter;
497 faidx1_t val;
498 khash_t(s) *h;
499 long beg, end;
500
501 beg = end = -1;
502 h = fai->hash;
503 name_end = l = strlen(str);
504 s = (char*)malloc(l+1);
505 if (!s) {
506 *len = -1;
507 return NULL;
508 }
509
510 // remove space
511 for (i = k = 0; i < l; ++i)
512 if (!isspace_c(str[i])) s[k++] = str[i];
513 s[k] = 0;
514 name_end = l = k;
515 // determine the sequence name
516 for (i = l; i > 0; --i) if (s[i - 1] == ':') break; // look for colon from the end
517 if (i > 0) name_end = i - 1;
518 if (name_end < l) { // check if this is really the end
519 int n_hyphen = 0;
520 for (i = name_end + 1; i < l; ++i) {
521 if (s[i] == '-') ++n_hyphen;
522 else if (!isdigit_c(s[i]) && s[i] != ',') break;
523 }
524 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
525 s[name_end] = 0;
526 iter = kh_get(s, h, s);
527 if (iter == kh_end(h)) { // cannot find the sequence name
528 iter = kh_get(s, h, str); // try str as the name
529 if (iter != kh_end(h)) {
530 s[name_end] = ':';
531 name_end = l;
532 }
533 }
534 } else iter = kh_get(s, h, str);
535 if(iter == kh_end(h)) {
536 fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
537 free(s);
538 *len = -2;
539 return 0;
540 }
541 val = kh_value(h, iter);
542 // parse the interval
543 if (name_end < l) {
544 int save_errno = errno;
545 errno = 0;
546 for (i = k = name_end + 1; i < l; ++i)
547 if (s[i] != ',') s[k++] = s[i];
548 s[k] = 0;
549 if (s[name_end + 1] == '-') {
550 beg = 0;
551 i = name_end + 2;
552 } else {
553 beg = strtol(s + name_end + 1, &ep, 10);
554 for (i = ep - s; i < k;) if (s[i++] == '-') break;
555 }
556 end = i < k? strtol(s + i, &ep, 10) : val.len;
557 if (beg > 0) --beg;
558 // Check for out of range numbers. Only going to be a problem on
559 // 32-bit platforms with >2Gb sequence length.
560 if (errno == ERANGE && (uint64_t) val.len > LONG_MAX) {
561 fprintf(stderr, "[fai_fetch] Positions in range %s are too large"
562 " for this platform.\n", s);
563 free(s);
564 *len = -2;
565 return NULL;
566 }
567 errno = save_errno;
568 } else beg = 0, end = val.len;
569 if (beg >= val.len) beg = val.len;
570 if (end >= val.len) end = val.len;
571 if (beg > end) beg = end;
572 free(s);
573
574 // now retrieve the sequence
575 return fai_retrieve(fai, &val, beg, end, len);
576 }
577
faidx_fetch_nseq(const faidx_t * fai)578 int faidx_fetch_nseq(const faidx_t *fai)
579 {
580 return fai->n;
581 }
582
faidx_nseq(const faidx_t * fai)583 int faidx_nseq(const faidx_t *fai)
584 {
585 return fai->n;
586 }
587
faidx_iseq(const faidx_t * fai,int i)588 const char *faidx_iseq(const faidx_t *fai, int i)
589 {
590 return fai->name[i];
591 }
592
faidx_seq_len(const faidx_t * fai,const char * seq)593 int faidx_seq_len(const faidx_t *fai, const char *seq)
594 {
595 khint_t k = kh_get(s, fai->hash, seq);
596 if ( k == kh_end(fai->hash) ) return -1;
597 return kh_val(fai->hash, k).len;
598 }
599
faidx_fetch_seq(const faidx_t * fai,const char * c_name,int p_beg_i,int p_end_i,int * len)600 char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
601 {
602 khiter_t iter;
603 faidx1_t val;
604
605 // Adjust position
606 iter = kh_get(s, fai->hash, c_name);
607 if (iter == kh_end(fai->hash))
608 {
609 *len = -2;
610 fprintf(stderr, "[fai_fetch_seq] The sequence \"%s\" not found\n", c_name);
611 return NULL;
612 }
613 val = kh_value(fai->hash, iter);
614 if(p_end_i < p_beg_i) p_beg_i = p_end_i;
615 if(p_beg_i < 0) p_beg_i = 0;
616 else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
617 if(p_end_i < 0) p_end_i = 0;
618 else if(val.len <= p_end_i) p_end_i = val.len - 1;
619
620 // Now retrieve the sequence
621 return fai_retrieve(fai, &val, p_beg_i, (long) p_end_i + 1, len);
622 }
623
faidx_has_seq(const faidx_t * fai,const char * seq)624 int faidx_has_seq(const faidx_t *fai, const char *seq)
625 {
626 khiter_t iter = kh_get(s, fai->hash, seq);
627 if (iter == kh_end(fai->hash)) return 0;
628 return 1;
629 }
630
631