1 #include <ctype.h>
2 #include <string.h>
3 #include <stdlib.h>
4 #include <stdio.h>
5 #include "faidx.h"
6 #include "khash.h"
7 
8 typedef struct {
9 	uint64_t len:32, line_len:16, line_blen:16;
10 	uint64_t offset;
11 } faidx1_t;
12 KHASH_MAP_INIT_STR(s, faidx1_t)
13 
14 #ifndef _NO_RAZF
15 #include "razf.h"
16 #else
17 #ifdef _WIN32
18 #define ftello(fp) ftell(fp)
19 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
20 #else
21 extern off_t ftello(FILE *stream);
22 extern int fseeko(FILE *stream, off_t offset, int whence);
23 #endif
24 #define RAZF FILE
25 #define razf_read(fp, buf, size) fread(buf, 1, size, fp)
26 #define razf_open(fn, mode) fopen(fn, mode)
27 #define razf_close(fp) fclose(fp)
28 #define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
29 #define razf_tell(fp) ftello(fp)
30 #endif
31 #ifdef _USE_KNETFILE
32 #include "knetfile.h"
33 #endif
34 
35 struct __faidx_t {
36 	RAZF *rz;
37 	int n, m;
38 	char **name;
39 	khash_t(s) *hash;
40 };
41 
42 #ifndef kroundup32
43 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
44 #endif
45 
fai_insert_index(faidx_t * idx,const char * name,int len,int line_len,int line_blen,uint64_t offset)46 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
47 {
48 	khint_t k;
49 	int ret;
50 	faidx1_t t;
51 	if (idx->n == idx->m) {
52 		idx->m = idx->m? idx->m<<1 : 16;
53 		idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
54 	}
55 	idx->name[idx->n] = strdup(name);
56 	k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
57 	t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
58 	kh_value(idx->hash, k) = t;
59 	++idx->n;
60 }
61 
fai_build_core(RAZF * rz)62 faidx_t *fai_build_core(RAZF *rz)
63 {
64 	char c, *name;
65 	int l_name, m_name, ret;
66 	int len, line_len, line_blen, state;
67 	int l1, l2;
68 	faidx_t *idx;
69 	uint64_t offset;
70 
71 	idx = (faidx_t*)calloc(1, sizeof(faidx_t));
72 	idx->hash = kh_init(s);
73 	name = 0; l_name = m_name = 0;
74 	len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
75 	while (razf_read(rz, &c, 1)) {
76 		if (c == '\n') { // an empty line
77 			if (state == 1) {
78 				offset = razf_tell(rz);
79 				continue;
80 			} else if ((state == 0 && len < 0) || state == 2) continue;
81 		}
82 		if (c == '>') { // fasta header
83 			if (len >= 0)
84 				fai_insert_index(idx, name, len, line_len, line_blen, offset);
85 			l_name = 0;
86 			while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) {
87 				if (m_name < l_name + 2) {
88 					m_name = l_name + 2;
89 					kroundup32(m_name);
90 					name = (char*)realloc(name, m_name);
91 				}
92 				name[l_name++] = c;
93 			}
94 			name[l_name] = '\0';
95 			if (ret == 0) {
96 				fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
97 				free(name); fai_destroy(idx);
98 				return 0;
99 			}
100 			if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
101 			state = 1; len = 0;
102 			offset = razf_tell(rz);
103 		} else {
104 			if (state == 3) {
105 				fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
106 				free(name); fai_destroy(idx);
107 				return 0;
108 			}
109 			if (state == 2) state = 3;
110 			l1 = l2 = 0;
111 			do {
112 				++l1;
113 				if (isgraph(c)) ++l2;
114 			} while ((ret = razf_read(rz, &c, 1)) && c != '\n');
115 			if (state == 3 && l2) {
116 				fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
117 				free(name); fai_destroy(idx);
118 				return 0;
119 			}
120 			++l1; len += l2;
121 			if (l2 >= 0x10000) {
122 				fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name);
123 				free(name); fai_destroy(idx);
124 				return 0;
125 			}
126 			if (state == 1) line_len = l1, line_blen = l2, state = 0;
127 			else if (state == 0) {
128 				if (l1 != line_len || l2 != line_blen) state = 2;
129 			}
130 		}
131 	}
132 	fai_insert_index(idx, name, len, line_len, line_blen, offset);
133 	free(name);
134 	return idx;
135 }
136 
fai_save(const faidx_t * fai,FILE * fp)137 void fai_save(const faidx_t *fai, FILE *fp)
138 {
139 	khint_t k;
140 	int i;
141 	for (i = 0; i < fai->n; ++i) {
142 		faidx1_t x;
143 		k = kh_get(s, fai->hash, fai->name[i]);
144 		x = kh_value(fai->hash, k);
145 #ifdef _WIN32
146 		fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
147 #else
148 		fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
149 #endif
150 	}
151 }
152 
fai_read(FILE * fp)153 faidx_t *fai_read(FILE *fp)
154 {
155 	faidx_t *fai;
156 	char *buf, *p;
157 	int len, line_len, line_blen;
158 #ifdef _WIN32
159 	long offset;
160 #else
161 	long long offset;
162 #endif
163 	fai = (faidx_t*)calloc(1, sizeof(faidx_t));
164 	fai->hash = kh_init(s);
165 	buf = (char*)calloc(0x10000, 1);
166 	while (!feof(fp) && fgets(buf, 0x10000, fp)) {
167 		for (p = buf; *p && isgraph(*p); ++p);
168 		*p = 0; ++p;
169 #ifdef _WIN32
170 		sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
171 #else
172 		sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
173 #endif
174 		fai_insert_index(fai, buf, len, line_len, line_blen, offset);
175 	}
176 	free(buf);
177 	return fai;
178 }
179 
fai_destroy(faidx_t * fai)180 void fai_destroy(faidx_t *fai)
181 {
182 	int i;
183 	for (i = 0; i < fai->n; ++i) free(fai->name[i]);
184 	free(fai->name);
185 	kh_destroy(s, fai->hash);
186 	if (fai->rz) razf_close(fai->rz);
187 	free(fai);
188 }
189 
fai_build(const char * fn)190 int fai_build(const char *fn)
191 {
192 	char *str;
193 	RAZF *rz;
194 	FILE *fp;
195 	faidx_t *fai;
196 	str = (char*)calloc(strlen(fn) + 5, 1);
197 	sprintf(str, "%s.fai", fn);
198 	rz = razf_open(fn, "r");
199 	if (rz == 0) {
200 		fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
201 		free(str);
202 		return -1;
203 	}
204 	fai = fai_build_core(rz);
205 	razf_close(rz);
206 	fp = fopen(str, "wb");
207 	if (fp == 0) {
208 		fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
209 		fai_destroy(fai); free(str);
210 		return -1;
211 	}
212 	fai_save(fai, fp);
213 	fclose(fp);
214 	free(str);
215 	fai_destroy(fai);
216 	return 0;
217 }
218 
219 #ifdef _USE_KNETFILE
download_and_open(const char * fn)220 FILE *download_and_open(const char *fn)
221 {
222     const int buf_size = 1 * 1024 * 1024;
223     uint8_t *buf;
224     FILE *fp;
225     knetFile *fp_remote;
226     const char *url = fn;
227     const char *p;
228     int l = strlen(fn);
229     for (p = fn + l - 1; p >= fn; --p)
230         if (*p == '/') break;
231     fn = p + 1;
232 
233     // First try to open a local copy
234     fp = fopen(fn, "r");
235     if (fp)
236         return fp;
237 
238     // If failed, download from remote and open
239     fp_remote = knet_open(url, "rb");
240     if (fp_remote == 0) {
241         fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
242         return NULL;
243     }
244     if ((fp = fopen(fn, "wb")) == 0) {
245         fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
246         knet_close(fp_remote);
247         return NULL;
248     }
249     buf = (uint8_t*)calloc(buf_size, 1);
250     while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
251         fwrite(buf, 1, l, fp);
252     free(buf);
253     fclose(fp);
254     knet_close(fp_remote);
255 
256     return fopen(fn, "r");
257 }
258 #endif
259 
fai_load(const char * fn)260 faidx_t *fai_load(const char *fn)
261 {
262 	char *str;
263 	FILE *fp;
264 	faidx_t *fai;
265 	str = (char*)calloc(strlen(fn) + 5, 1);
266 	sprintf(str, "%s.fai", fn);
267 
268 #ifdef _USE_KNETFILE
269     if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)
270     {
271         fp = download_and_open(str);
272         if ( !fp )
273         {
274             fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
275             free(str);
276             return 0;
277         }
278     }
279     else
280 #endif
281         fp = fopen(str, "rb");
282 	if (fp == 0) {
283 		fprintf(stderr, "[fai_load] build FASTA index.\n");
284 		fai_build(fn);
285 		fp = fopen(str, "rb");
286 		if (fp == 0) {
287 			fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
288 			free(str);
289 			return 0;
290 		}
291 	}
292 
293 	fai = fai_read(fp);
294 	fclose(fp);
295 
296 	fai->rz = razf_open(fn, "rb");
297 	free(str);
298 	if (fai->rz == 0) {
299 		fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
300 		return 0;
301 	}
302 	return fai;
303 }
304 
fai_fetch(const faidx_t * fai,const char * str,int * len)305 char *fai_fetch(const faidx_t *fai, const char *str, int *len)
306 {
307 	char *s, *p, c;
308 	int i, l, k;
309 	khiter_t iter;
310 	faidx1_t val;
311 	khash_t(s) *h;
312 	int beg, end;
313 
314 	beg = end = -1;
315 	h = fai->hash;
316 	l = strlen(str);
317 	p = s = (char*)malloc(l+1);
318 	/* squeeze out "," */
319 	for (i = k = 0; i != l; ++i)
320 		if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
321 	s[k] = 0;
322 	for (i = 0; i != k; ++i) if (s[i] == ':') break;
323 	s[i] = 0;
324 	iter = kh_get(s, h, s); /* get the ref_id */
325 	if (iter == kh_end(h)) {
326 		*len = 0;
327 		free(s); return 0;
328 	}
329 	val = kh_value(h, iter);
330 	if (i == k) { /* dump the whole sequence */
331 		beg = 0; end = val.len;
332 	} else {
333 		for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
334 		beg = atoi(p);
335 		if (i < k) {
336 			p = s + i + 1;
337 			end = atoi(p);
338 		} else end = val.len;
339 	}
340 	if (beg > 0) --beg;
341 	if (beg >= val.len) beg = val.len;
342 	if (end >= val.len) end = val.len;
343 	if (beg > end) beg = end;
344 	free(s);
345 
346 	// now retrieve the sequence
347 	l = 0;
348 	s = (char*)malloc(end - beg + 2);
349 	razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
350 	while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err)
351 		if (isgraph(c)) s[l++] = c;
352 	s[l] = '\0';
353 	*len = l;
354 	return s;
355 }
356 
faidx_main(int argc,char * argv[])357 int faidx_main(int argc, char *argv[])
358 {
359 	if (argc == 1) {
360 		fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
361 		return 1;
362 	} else {
363 		if (argc == 2) fai_build(argv[1]);
364 		else {
365 			int i, j, k, l;
366 			char *s;
367 			faidx_t *fai;
368 			fai = fai_load(argv[1]);
369 			if (fai == 0) return 1;
370 			for (i = 2; i != argc; ++i) {
371 				printf(">%s\n", argv[i]);
372 				s = fai_fetch(fai, argv[i], &l);
373 				for (j = 0; j < l; j += 60) {
374 					for (k = 0; k < 60 && k < l - j; ++k)
375 						putchar(s[j + k]);
376 					putchar('\n');
377 				}
378 				free(s);
379 			}
380 			fai_destroy(fai);
381 		}
382 	}
383 	return 0;
384 }
385 
faidx_fetch_nseq(const faidx_t * fai)386 int faidx_fetch_nseq(const faidx_t *fai)
387 {
388 	return fai->n;
389 }
390 
faidx_fetch_seq(const faidx_t * fai,char * c_name,int p_beg_i,int p_end_i,int * len)391 char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len)
392 {
393 	int l;
394 	char c;
395     khiter_t iter;
396     faidx1_t val;
397 	char *seq=NULL;
398 
399     // Adjust position
400     iter = kh_get(s, fai->hash, c_name);
401     if(iter == kh_end(fai->hash)) return 0;
402     val = kh_value(fai->hash, iter);
403 	if(p_end_i < p_beg_i) p_beg_i = p_end_i;
404     if(p_beg_i < 0) p_beg_i = 0;
405     else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
406     if(p_end_i < 0) p_end_i = 0;
407     else if(val.len <= p_end_i) p_end_i = val.len - 1;
408 
409     // Now retrieve the sequence
410 	l = 0;
411 	seq = (char*)malloc(p_end_i - p_beg_i + 2);
412 	razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
413 	while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1)
414 		if (isgraph(c)) seq[l++] = c;
415 	seq[l] = '\0';
416 	*len = l;
417 	return seq;
418 }
419 
420 #ifdef FAIDX_MAIN
main(int argc,char * argv[])421 int main(int argc, char *argv[]) { return faidx_main(argc, argv); }
422 #endif
423