1 /*  faidx.c -- FASTA random access.
2 
3     Copyright (C) 2008, 2009, 2013-2017 Genome Research Ltd.
4     Portions copyright (C) 2011 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <ctype.h>
29 #include <string.h>
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <inttypes.h>
33 #include <errno.h>
34 #include <limits.h>
35 #include <unistd.h>
36 #include <assert.h>
37 
38 #include "htslib/bgzf.h"
39 #include "htslib/faidx.h"
40 #include "htslib/hfile.h"
41 #include "htslib/khash.h"
42 #include "htslib/kstring.h"
43 #include "hts_internal.h"
44 
45 typedef struct {
46     int32_t line_len, line_blen;
47     int64_t len;
48     uint64_t offset;
49 } faidx1_t;
50 KHASH_MAP_INIT_STR(s, faidx1_t)
51 
52 struct __faidx_t {
53     BGZF *bgzf;
54     int n, m;
55     char **name;
56     khash_t(s) *hash;
57 };
58 
59 #ifndef kroundup32
60 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
61 #endif
62 
fai_insert_index(faidx_t * idx,const char * name,int64_t len,int line_len,int line_blen,uint64_t offset)63 static inline int fai_insert_index(faidx_t *idx, const char *name, int64_t len, int line_len, int line_blen, uint64_t offset)
64 {
65     if (!name) {
66         fprintf(stderr, "[fai_build_core] malformed line\n");
67         return -1;
68     }
69 
70     char *name_key = strdup(name);
71     int absent;
72     khint_t k = kh_put(s, idx->hash, name_key, &absent);
73     faidx1_t *v = &kh_value(idx->hash, k);
74 
75     if (! absent) {
76         fprintf(stderr, "[fai_build_core] ignoring duplicate sequence \"%s\" at byte offset %"PRIu64"\n", name, offset);
77         free(name_key);
78         return 0;
79     }
80 
81     if (idx->n == idx->m) {
82         char **tmp;
83         idx->m = idx->m? idx->m<<1 : 16;
84         if (!(tmp = (char**)realloc(idx->name, sizeof(char*) * idx->m))) {
85             fprintf(stderr, "[fai_build_core] out of memory\n");
86             return -1;
87         }
88         idx->name = tmp;
89     }
90     idx->name[idx->n++] = name_key;
91     v->len = len;
92     v->line_len = line_len;
93     v->line_blen = line_blen;
94     v->offset = offset;
95 
96     return 0;
97 }
98 
fai_build_core(BGZF * bgzf)99 faidx_t *fai_build_core(BGZF *bgzf)
100 {
101     kstring_t name = { 0, 0, NULL };
102     int c;
103     int line_len, line_blen, state;
104     int l1, l2;
105     faidx_t *idx;
106     uint64_t offset;
107     int64_t len;
108 
109     idx = (faidx_t*)calloc(1, sizeof(faidx_t));
110     idx->hash = kh_init(s);
111     len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
112     while ( (c=bgzf_getc(bgzf))>=0 ) {
113         if (c == '\n') { // an empty line
114             if (state == 1) {
115                 offset = bgzf_utell(bgzf);
116                 continue;
117             } else if ((state == 0 && len < 0) || state == 2) continue;
118             else if (state == 0) { state = 2; continue; }
119         }
120         if (c == '>') { // fasta header
121             if (len >= 0) {
122                 if (fai_insert_index(idx, name.s, len, line_len, line_blen, offset) != 0)
123                     goto fail;
124             }
125 
126             name.l = 0;
127             while ((c = bgzf_getc(bgzf)) >= 0)
128                 if (! isspace(c)) kputc_(c, &name);
129                 else if (name.l > 0 || c == '\n') break;
130             kputsn("", 0, &name);
131 
132             if ( c<0 ) {
133                 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
134                 goto fail;
135             }
136             if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
137             state = 1; len = 0;
138             offset = bgzf_utell(bgzf);
139         } else {
140             if (state == 3) {
141                 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name.s);
142                 goto fail;
143             }
144             if (state == 2) state = 3;
145             l1 = l2 = 0;
146             do {
147                 ++l1;
148                 if (isgraph(c)) ++l2;
149             } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
150             if (state == 3 && l2) {
151                 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name.s);
152                 goto fail;
153             }
154             ++l1; len += l2;
155             if (state == 1) line_len = l1, line_blen = l2, state = 0;
156             else if (state == 0) {
157                 if (l1 != line_len || l2 != line_blen) state = 2;
158             }
159         }
160     }
161 
162     if (len >= 0) {
163         if (fai_insert_index(idx, name.s, len, line_len, line_blen, offset) != 0)
164             goto fail;
165     } else {
166         goto fail;
167     }
168 
169     free(name.s);
170     return idx;
171 
172 fail:
173     free(name.s);
174     fai_destroy(idx);
175     return NULL;
176 }
177 
fai_save(const faidx_t * fai,hFILE * fp)178 static int fai_save(const faidx_t *fai, hFILE *fp) {
179     khint_t k;
180     int i;
181     char buf[96]; // Must be big enough for format below.
182 
183     for (i = 0; i < fai->n; ++i) {
184         faidx1_t x;
185         k = kh_get(s, fai->hash, fai->name[i]);
186         assert(k < kh_end(fai->hash));
187         x = kh_value(fai->hash, k);
188         snprintf(buf, sizeof(buf),
189                  "\t%"PRId64"\t%"PRIu64"\t%"PRId32"\t%"PRId32"\n",
190                  x.len, x.offset, x.line_blen, x.line_len);
191         if (hputs(fai->name[i], fp) != 0) return -1;
192         if (hputs(buf, fp) != 0) return -1;
193     }
194     return 0;
195 }
196 
fai_read(hFILE * fp,const char * fname)197 static faidx_t *fai_read(hFILE *fp, const char *fname)
198 {
199     faidx_t *fai;
200     char *buf = NULL, *p;
201     int line_len, line_blen, n;
202     int64_t len;
203     uint64_t offset;
204     ssize_t l, lnum = 1;
205 
206     fai = (faidx_t*)calloc(1, sizeof(faidx_t));
207     if (!fai) return NULL;
208 
209     fai->hash = kh_init(s);
210     if (!fai->hash) goto fail;
211 
212     buf = (char*)calloc(0x10000, 1);
213     if (!buf) goto fail;
214 
215     while ((l = hgetln(buf, 0x10000, fp)) > 0) {
216         for (p = buf; *p && !isspace_c(*p); ++p);
217         if (p - buf < l) {
218             *p = 0; ++p;
219         }
220         n = sscanf(p, "%"SCNd64"%"SCNu64"%d%d", &len, &offset, &line_blen, &line_len);
221         if (n != 4) {
222             if (hts_verbose > 1)
223                 fprintf(stderr, "[fai_load] couldn't understand FAI %s line %zd\n",
224                         fname, lnum);
225             goto fail;
226         }
227         if (fai_insert_index(fai, buf, len, line_len, line_blen, offset) != 0) {
228             goto fail;
229         }
230         if (buf[l - 1] == '\n') ++lnum;
231     }
232 
233     if (l < 0) {
234         if (hts_verbose > 1)
235             fprintf(stderr, "[fai_load] error while reading \"%s\": %s\n",
236                     fname, strerror(errno));
237         goto fail;
238     }
239     free(buf);
240     return fai;
241 
242  fail:
243     free(buf);
244     fai_destroy(fai);
245     return NULL;
246 }
247 
fai_destroy(faidx_t * fai)248 void fai_destroy(faidx_t *fai)
249 {
250     int i;
251     if (!fai) return;
252     for (i = 0; i < fai->n; ++i) free(fai->name[i]);
253     free(fai->name);
254     kh_destroy(s, fai->hash);
255     if (fai->bgzf) bgzf_close(fai->bgzf);
256     free(fai);
257 }
258 
fai_build3(const char * fn,const char * fnfai,const char * fngzi)259 int fai_build3(const char *fn, const char *fnfai, const char *fngzi)
260 {
261     kstring_t fai_kstr = { 0, 0, NULL };
262     kstring_t gzi_kstr = { 0, 0, NULL };
263     BGZF *bgzf = NULL;
264     hFILE *fp = NULL;
265     faidx_t *fai = NULL;
266     int save_errno, res;
267 
268     if (!fnfai) {
269         if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
270         fnfai = fai_kstr.s;
271     }
272     if (!fngzi) {
273         if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
274         fngzi = gzi_kstr.s;
275     }
276 
277     bgzf = bgzf_open(fn, "r");
278     if ( !bgzf ) {
279         if (hts_verbose > 1)
280             fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
281         goto fail;
282     }
283     if ( bgzf->is_compressed ) {
284         if (bgzf_index_build_init(bgzf) != 0) {
285             if (hts_verbose > 1)
286                 fprintf(stderr, "[fai_build] fail to allocate bgzf index");
287             goto fail;
288         }
289     }
290     fai = fai_build_core(bgzf);
291     if ( !fai )
292     {
293         if ( bgzf->is_compressed && bgzf->is_gzip && hts_verbose > 1)
294             fprintf(stderr, "Cannot index files compressed with gzip, please use bgzip\n");
295         goto fail;
296     }
297     if ( bgzf->is_compressed ) {
298         if (bgzf_index_dump(bgzf, fngzi, NULL) < 0) {
299             if (hts_verbose > 1)
300                 fprintf(stderr, "[fai_build] fail to make bgzf index %s\n",
301                         fngzi);
302             goto fail;
303         }
304     }
305     res = bgzf_close(bgzf);
306     bgzf = NULL;
307     if (res < 0) {
308         if (hts_verbose > 1)
309             fprintf(stderr, "[fai_build] Error on closing %s : %s\n",
310                     fn, strerror(errno));
311         goto fail;
312     }
313     fp = hopen(fnfai, "wb");
314     if ( !fp ) {
315         if (hts_verbose > 1)
316             fprintf(stderr, "[fai_build] fail to open FASTA index %s : %s\n",
317                     fnfai, strerror(errno));
318         goto fail;
319     }
320     if (fai_save(fai, fp) != 0) {
321         if (hts_verbose > 1)
322             fprintf(stderr, "[fai_build] fail to write FASTA index %s : %s\n",
323                     fnfai, strerror(errno));
324         goto fail;
325     }
326     if (hclose(fp) != 0) {
327         if (hts_verbose > 1)
328             fprintf(stderr, "[fai_build] fail on closing FASTA index %s : %s\n",
329                     fnfai, strerror(errno));
330         goto fail;
331     }
332 
333     free(fai_kstr.s);
334     free(gzi_kstr.s);
335     fai_destroy(fai);
336     return 0;
337 
338  fail:
339     save_errno = errno;
340     free(fai_kstr.s);
341     free(gzi_kstr.s);
342     bgzf_close(bgzf);
343     fai_destroy(fai);
344     errno = save_errno;
345     return -1;
346 }
347 
fai_build(const char * fn)348 int fai_build(const char *fn) {
349     return fai_build3(fn, NULL, NULL);
350 }
351 
fai_load3(const char * fn,const char * fnfai,const char * fngzi,int flags)352 faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi,
353                    int flags)
354 {
355     kstring_t fai_kstr = { 0, 0, NULL };
356     kstring_t gzi_kstr = { 0, 0, NULL };
357     hFILE *fp = NULL;
358     faidx_t *fai = NULL;
359     int res;
360 
361     if (fn == NULL)
362         return NULL;
363 
364     if (fnfai == NULL) {
365         if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
366         fnfai = fai_kstr.s;
367     }
368     if (fngzi == NULL) {
369         if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
370         fngzi = gzi_kstr.s;
371     }
372 
373     fp = hopen(fnfai, "rb");
374 
375     if (fp == 0) {
376         if (!(flags & FAI_CREATE) || errno != ENOENT) {
377             if (hts_verbose >= 1) {
378                 fprintf(stderr, "[fai_load] failed to open FASTA index %s: %s\n",
379                         fnfai, strerror(errno));
380             }
381             goto fail;
382         }
383 
384         if (hts_verbose >= 1) {
385             fprintf(stderr, "[fai_load] build FASTA index.\n");
386         }
387         if (fai_build3(fn, fnfai, fngzi) < 0) {
388             goto fail;
389         }
390 
391         fp = hopen(fnfai, "rb");
392         if (fp == 0) {
393             if (hts_verbose >= 1) {
394                 fprintf(stderr, "[fai_load] failed to open FASTA index %s: %s\n",
395                         fnfai, strerror(errno));
396             }
397             goto fail;
398         }
399     }
400 
401     fai = fai_read(fp, fnfai);
402     if (fai == NULL) {
403         if (hts_verbose >= 1)
404             fprintf(stderr, "[fai_load] failed to read FASTA index %s\n",
405                     fnfai);
406         goto fail;
407     }
408 
409     res = hclose(fp);
410     fp = NULL;
411     if (res < 0) {
412         if (hts_verbose >= 1)
413             fprintf(stderr, "[fai_load] fail on closing FASTA index %s : %s\n",
414                     fnfai, strerror(errno));
415         goto fail;
416     }
417 
418     fai->bgzf = bgzf_open(fn, "rb");
419     if (fai->bgzf == 0) {
420         if (hts_verbose >= 1)
421             fprintf(stderr, "[fai_load] fail to open FASTA file %s.\n", fn);
422         goto fail;
423     }
424     if ( fai->bgzf->is_compressed==1 )
425     {
426         if ( bgzf_index_load(fai->bgzf, fngzi, NULL) < 0 )
427         {
428             if (hts_verbose >= 1)
429                 fprintf(stderr, "[fai_load] failed to load .gzi index: %s\n",
430                         fngzi);
431             goto fail;
432         }
433     }
434     free(fai_kstr.s);
435     free(gzi_kstr.s);
436     return fai;
437 
438  fail:
439     if (fai) fai_destroy(fai);
440     if (fp) hclose_abruptly(fp);
441     free(fai_kstr.s);
442     free(gzi_kstr.s);
443     return NULL;
444 }
445 
fai_load(const char * fn)446 faidx_t *fai_load(const char *fn)
447 {
448     return fai_load3(fn, NULL, NULL, FAI_CREATE);
449 }
450 
fai_retrieve(const faidx_t * fai,const faidx1_t * val,long beg,long end,int * len)451 static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
452                           long beg, long end, int *len) {
453     char *s;
454     size_t l;
455     int c = 0;
456     int ret = bgzf_useek(fai->bgzf,
457                          val->offset
458                          + beg / val->line_blen * val->line_len
459                          + beg % val->line_blen, SEEK_SET);
460 
461     if (ret < 0) {
462         *len = -1;
463         if (hts_verbose >= 1)
464             fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
465         return NULL;
466     }
467 
468     l = 0;
469     s = (char*)malloc((size_t) end - beg + 2);
470     if (!s) {
471         *len = -1;
472         return NULL;
473     }
474 
475     while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 )
476         if (isgraph(c)) s[l++] = c;
477     if (c < 0) {
478         if (hts_verbose >= 1) {
479             fprintf(stderr, "[E::fai_fetch] fai_fetch failed : %s\n",
480                     c == -1 ? "unexpected end of file" : "error reading file");
481         }
482         free(s);
483         *len = -1;
484         return NULL;
485     }
486 
487     s[l] = '\0';
488     *len = l < INT_MAX ? l : INT_MAX;
489     return s;
490 }
491 
fai_fetch(const faidx_t * fai,const char * str,int * len)492 char *fai_fetch(const faidx_t *fai, const char *str, int *len)
493 {
494     char *s, *ep;
495     size_t i, l, k, name_end;
496     khiter_t iter;
497     faidx1_t val;
498     khash_t(s) *h;
499     long beg, end;
500 
501     beg = end = -1;
502     h = fai->hash;
503     name_end = l = strlen(str);
504     s = (char*)malloc(l+1);
505     if (!s) {
506         *len = -1;
507         return NULL;
508     }
509 
510     // remove space
511     for (i = k = 0; i < l; ++i)
512         if (!isspace_c(str[i])) s[k++] = str[i];
513     s[k] = 0;
514     name_end = l = k;
515     // determine the sequence name
516     for (i = l; i > 0; --i) if (s[i - 1] == ':') break; // look for colon from the end
517     if (i > 0) name_end = i - 1;
518     if (name_end < l) { // check if this is really the end
519         int n_hyphen = 0;
520         for (i = name_end + 1; i < l; ++i) {
521             if (s[i] == '-') ++n_hyphen;
522             else if (!isdigit_c(s[i]) && s[i] != ',') break;
523         }
524         if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
525         s[name_end] = 0;
526         iter = kh_get(s, h, s);
527         if (iter == kh_end(h)) { // cannot find the sequence name
528             iter = kh_get(s, h, str); // try str as the name
529             if (iter != kh_end(h)) {
530                 s[name_end] = ':';
531                 name_end = l;
532             }
533         }
534     } else iter = kh_get(s, h, str);
535     if(iter == kh_end(h)) {
536         fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
537         free(s);
538         *len = -2;
539         return 0;
540     }
541     val = kh_value(h, iter);
542     // parse the interval
543     if (name_end < l) {
544         int save_errno = errno;
545         errno = 0;
546         for (i = k = name_end + 1; i < l; ++i)
547             if (s[i] != ',') s[k++] = s[i];
548         s[k] = 0;
549         if (s[name_end + 1] == '-') {
550             beg = 0;
551             i = name_end + 2;
552         } else {
553             beg = strtol(s + name_end + 1, &ep, 10);
554             for (i = ep - s; i < k;) if (s[i++] == '-') break;
555         }
556         end = i < k? strtol(s + i, &ep, 10) : val.len;
557         if (beg > 0) --beg;
558         // Check for out of range numbers.  Only going to be a problem on
559         // 32-bit platforms with >2Gb sequence length.
560         if (errno == ERANGE && (uint64_t) val.len > LONG_MAX) {
561             fprintf(stderr, "[fai_fetch] Positions in range %s are too large"
562                     " for this platform.\n", s);
563             free(s);
564             *len = -2;
565             return NULL;
566         }
567         errno = save_errno;
568     } else beg = 0, end = val.len;
569     if (beg >= val.len) beg = val.len;
570     if (end >= val.len) end = val.len;
571     if (beg > end) beg = end;
572     free(s);
573 
574     // now retrieve the sequence
575     return fai_retrieve(fai, &val, beg, end, len);
576 }
577 
faidx_fetch_nseq(const faidx_t * fai)578 int faidx_fetch_nseq(const faidx_t *fai)
579 {
580     return fai->n;
581 }
582 
faidx_nseq(const faidx_t * fai)583 int faidx_nseq(const faidx_t *fai)
584 {
585     return fai->n;
586 }
587 
faidx_iseq(const faidx_t * fai,int i)588 const char *faidx_iseq(const faidx_t *fai, int i)
589 {
590     return fai->name[i];
591 }
592 
faidx_seq_len(const faidx_t * fai,const char * seq)593 int faidx_seq_len(const faidx_t *fai, const char *seq)
594 {
595     khint_t k = kh_get(s, fai->hash, seq);
596     if ( k == kh_end(fai->hash) ) return -1;
597     return kh_val(fai->hash, k).len;
598 }
599 
faidx_fetch_seq(const faidx_t * fai,const char * c_name,int p_beg_i,int p_end_i,int * len)600 char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
601 {
602     khiter_t iter;
603     faidx1_t val;
604 
605     // Adjust position
606     iter = kh_get(s, fai->hash, c_name);
607     if (iter == kh_end(fai->hash))
608     {
609         *len = -2;
610         fprintf(stderr, "[fai_fetch_seq] The sequence \"%s\" not found\n", c_name);
611         return NULL;
612     }
613     val = kh_value(fai->hash, iter);
614     if(p_end_i < p_beg_i) p_beg_i = p_end_i;
615     if(p_beg_i < 0) p_beg_i = 0;
616     else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
617     if(p_end_i < 0) p_end_i = 0;
618     else if(val.len <= p_end_i) p_end_i = val.len - 1;
619 
620     // Now retrieve the sequence
621     return fai_retrieve(fai, &val, p_beg_i, (long) p_end_i + 1, len);
622 }
623 
faidx_has_seq(const faidx_t * fai,const char * seq)624 int faidx_has_seq(const faidx_t *fai, const char *seq)
625 {
626     khiter_t iter = kh_get(s, fai->hash, seq);
627     if (iter == kh_end(fai->hash)) return 0;
628     return 1;
629 }
630 
631