1 /* The MIT License
2 
3    Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
4 
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12 
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15 
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25 
26 /* Last Modified: 05MAR2012 */
27 
28 #ifndef AC_KSEQ_H
29 #define AC_KSEQ_H
30 
#include <ctype.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
34 
/* klib_unused: attribute placed on definitions that a translation unit may
 * never reference. It expands to __attribute__((__unused__)) on clang >= 3
 * and GCC >= 3, and to nothing elsewhere. A definition supplied before this
 * header is included is left untouched. */
#ifndef klib_unused
#  if (defined(__clang__) && __clang_major__ >= 3) || (defined(__GNUC__) && __GNUC__ >= 3)
#    define klib_unused __attribute__ ((__unused__))
#  else
#    define klib_unused
#  endif
#endif /* klib_unused */
42 
/* Delimiter selectors accepted by ks_getuntil()/ks_getuntil2(). */
#define KS_SEP_SPACE 0 /* any character for which isspace() is true */
#define KS_SEP_TAB   1 /* any isspace() character except ' ' */
#define KS_SEP_LINE  2 /* line separator: "\n" (Unix) or "\r\n" (Windows) */
#define KS_SEP_MAX   2 /* largest selector; a delimiter above it is matched as a literal character */
47 
/* Declares kstream_t, a buffered reader over a handle of type `type_t`.
 *   begin    index in buf of the next unread byte
 *   end      value returned by the last reader call (bytes then held in buf)
 *   is_eof   set once the reader has returned 0
 *   bufsize  size of buf in bytes
 *   seek_pos running count of bytes consumed from the stream
 *   f        the underlying handle passed to the reader
 *   buf      the read buffer
 */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		int begin; \
		int end; \
		int is_eof:2; \
		int bufsize:30; \
		uint64_t seek_pos; \
		type_t f; \
		unsigned char *buf; \
	} kstream_t;
56 
/* Non-zero once the reader has signalled end of input and every buffered byte
 * has been consumed. Evaluates `ks` more than once. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* Forget all buffered data and the end-of-input flag, and restart the
 * consumed-byte counter at 0 so seek_pos again counts from the start of the
 * stream. The underlying handle is NOT repositioned; the caller must rewind
 * it separately. Evaluates `ks` more than once; the expression yields 0. */
#define ks_rewind(ks) ((ks)->seek_pos = 0, (ks)->is_eof = (ks)->begin = (ks)->end = 0)
59 
/* Emits the constructor and destructor of kstream_t.
 *
 * ks_init(f):    allocates a zeroed stream bound to handle `f`, together with
 *                a read buffer of __bufsize bytes. Returns NULL when either
 *                allocation fails (nothing is leaked in that case).
 * ks_destroy(ks): releases the buffer and the stream; NULL is accepted. The
 *                handle `f` itself is not closed.
 */
#define __KS_BASIC(SCOPE, type_t, __bufsize) \
	SCOPE kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == NULL) return NULL; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == NULL) { /* do not hand out a stream without a buffer */ \
			free(ks); \
			return NULL; \
		} \
		ks->f = f; ks->bufsize = __bufsize; \
		return ks; \
	} \
	SCOPE void ks_destroy(kstream_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->buf); \
		free(ks); \
	}
74 
/* Emits the small inline helpers of the stream layer.
 *
 * ks_getc(ks): returns the next byte of the stream as a non-negative int and
 *   advances seek_pos by one, or returns -1 when no more data is available.
 *   The buffer is refilled through __read(f, buf, bufsize) when it runs dry.
 *   A zero OR negative result from __read ends the stream: previously a
 *   negative (error) result fell through and returned a stale buffer byte.
 *
 * ks_getuntil(ks, delimiter, str, dret): ks_getuntil2() with append == 0,
 *   i.e. `str` is overwritten rather than extended.
 */
#define __KS_INLINED(__read) \
	static inline klib_unused int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
			if (ks->end <= 0) { /* end of input or read error */ \
				ks->end = 0; /* keep begin <= end so ks_eof() stays true */ \
				ks->is_eof = 1; \
				return -1; \
			} \
		} \
		ks->seek_pos++; \
		return (int)ks->buf[ks->begin++]; \
	} \
	static inline klib_unused int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
89 
/* Growable string shared with the rest of klib; the KSTRING_T guard keeps the
 * typedef from being emitted twice. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l;  /* number of characters currently stored in s */
	size_t m;  /* number of bytes allocated for s */
	char *s;   /* character buffer, NULL until first allocated */
} kstring_t;
#endif
97 
/* Rounds the lvalue `x` up, in place, to the next power of two (a value that
 * already is a power of two is unchanged). Only the low 32 bits are smeared,
 * and `x` is evaluated several times, so it must be free of side effects. */
#ifndef kroundup32
#define kroundup32(x) \
	(--(x), \
	 (x) |= (x) >> 1, \
	 (x) |= (x) >> 2, \
	 (x) |= (x) >> 4, \
	 (x) |= (x) >> 8, \
	 (x) |= (x) >> 16, \
	 ++(x))
#endif
101 
/* Emits ks_getuntil2(): read from the stream up to the next delimiter.
 *
 *   ks         stream to read from
 *   delimiter  KS_SEP_SPACE, KS_SEP_TAB or KS_SEP_LINE, or a character code
 *              greater than KS_SEP_MAX that is matched literally
 *   str        receives the bytes before the delimiter, NUL-terminated
 *   dret       if non-NULL, receives the delimiter byte that ended the read,
 *              or 0 when input ran out first
 *   append     non-zero: extend `str`; zero: overwrite it
 *
 * Returns the resulting length of `str`, or -1 if the stream was already
 * exhausted and nothing could be read. The delimiter is consumed but not
 * stored. For KS_SEP_LINE a trailing '\r' is dropped when the string is
 * longer than one character.
 *
 * ks->seek_pos is advanced by exactly the number of bytes taken from the
 * stream (delimiter included). The counter used to start at str->l, which in
 * append mode re-added the length of the existing string on every call.
 * A negative result from __read is treated like end of input; it used to be
 * ignored, which made this loop call __read again without end.
 */
#define __KS_GETUNTIL(SCOPE, __read) \
	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append)  \
	{ \
		int gotany = 0; \
		uint64_t consumed = 0; /* bytes taken from the stream by this call */ \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, ks->bufsize); \
				if (ks->end <= 0) { /* end of input or read error */ \
					ks->end = 0; \
					ks->is_eof = 1; \
					break; \
				} \
			} \
			/* find the delimiter, or stop at the end of the buffered data */ \
			if (delimiter == KS_SEP_LINE) {  \
				for (i = ks->begin; i < ks->end; ++i)  \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break;  \
			} else i = 0; /* never come to here! */ \
			/* make room for the new bytes plus the terminating NUL */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			consumed += i - ks->begin; \
			if (i < ks->end) consumed++; /* the delimiter itself is consumed too */ \
			gotany = 1; \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin);  \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (!gotany && ks_eof(ks)) return -1; \
		ks->seek_pos += consumed; \
		if (str->s == 0) { \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	}
155 
/* Instantiates the whole stream layer for handle type `type_t`:
 * the kstream_t type, ks_init()/ks_destroy(), ks_getuntil2() and the inline
 * helpers. SCOPE is the storage class given to the non-inline functions,
 * __read the reader called as __read(f, buf, bufsize), and __bufsize the
 * size of the read buffer in bytes. */
#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize)	\
	__KS_TYPE(type_t)									\
	__KS_BASIC(SCOPE, type_t, __bufsize)				\
	__KS_GETUNTIL(SCOPE, __read)						\
	__KS_INLINED(__read)

/* Same as KSTREAM_INIT2 with every generated function made `static`. */
#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)
163 
/* Declares the stream layer without defining it: emits the kstream_t type,
 * extern prototypes for ks_init(), ks_destroy() and ks_getuntil2(), and the
 * static inline helpers. */
#define KSTREAM_DECLARE(type_t, __read) \
	__KS_TYPE(type_t) \
	extern kstream_t *ks_init(type_t f); \
	extern void ks_destroy(kstream_t *ks); \
	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
	__KS_INLINED(__read)
170 
171 /******************
172  * FASTA/Q parser *
173  ******************/
174 
/* Reset a kseq_t so parsing can restart from the beginning of the input:
 * clears the remembered header character, the stream's buffer indices and
 * end-of-input flag, and restarts the stream's consumed-byte counter
 * (seek_pos) at 0. The underlying handle is NOT repositioned; the caller must
 * rewind it separately. Evaluates `ks` more than once; the expression yields 0. */
#define kseq_rewind(ks) ((ks)->f->seek_pos = 0, (ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
176 
/* Emits the constructor and destructor of kseq_t.
 *
 * kseq_init(fd):    allocates a zeroed record bound to a new stream over
 *                   `fd`. Returns NULL if the record or its stream cannot be
 *                   allocated (nothing is leaked in that case).
 * kseq_destroy(ks): frees the four string buffers, the stream and the record
 *                   itself; NULL is accepted. `fd` is not closed.
 */
#define __KSEQ_BASIC(SCOPE, type_t)										\
	SCOPE kseq_t *kseq_init(type_t fd)									\
	{																	\
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
		if (s == NULL) return NULL;										\
		s->f = ks_init(fd);												\
		if (s->f == NULL) { /* a record without a stream is unusable */	\
			free(s);													\
			return NULL;												\
		}																\
		return s;														\
	}																	\
	SCOPE void kseq_destroy(kseq_t *ks)									\
	{																	\
		if (!ks) return;												\
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
		ks_destroy(ks->f);												\
		free(ks);														\
	}
191 
/* kseq_read(): parse the next FASTA or FASTQ record into `seq`.
   Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int ch; \
		kstream_t *in = seq->f; \
		/* Step 1: locate the header marker unless the previous call already consumed it. */ \
		if (seq->last_char == 0) { \
			do { \
				ch = ks_getc(in); \
			} while (ch != -1 && ch != '>' && ch != '@'); \
			if (ch == -1) return -1; /* no further record */ \
			seq->last_char = ch; \
		} \
		/* Step 2: start from empty fields, then read the name and optional comment. */ \
		seq->comment.l = 0; \
		seq->seq.l = 0; \
		seq->qual.l = 0; \
		if (ks_getuntil(in, KS_SEP_SPACE, &seq->name, &ch) < 0) return -1; /* normal exit: EOF */ \
		if (ch != '\n') ks_getuntil(in, KS_SEP_LINE, &seq->comment, 0); /* rest of the header line */ \
		/* Pre-allocating here keeps the check out of the per-line loop below. */ \
		if (seq->seq.s == 0) { \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		/* Step 3: collect sequence lines until the next marker or the end of input. */ \
		for (;;) { \
			ch = ks_getc(in); \
			if (ch == -1 || ch == '>' || ch == '+' || ch == '@') break; \
			if (ch == '\n') continue; /* blank line */ \
			seq->seq.s[seq->seq.l++] = ch; /* one spare byte is always available here */ \
			ks_getuntil2(in, KS_SEP_LINE, &seq->seq, 0, 1); /* remainder of this line */ \
		} \
		if (ch == '>' || ch == '@') seq->last_char = ch; /* marker of the next record already consumed */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* guarantee room for the terminator written below */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* round up to a power of two */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0; /* NUL-terminate the sequence */ \
		if (ch != '+') return (int)seq->seq.l; /* FASTA record: done */ \
		/* Step 4: FASTQ only, read the quality string. */ \
		if (seq->qual.m < seq->seq.m) { /* give qual at least the capacity of seq */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		do { /* discard whatever follows '+' on its line */ \
			ch = ks_getc(in); \
		} while (ch != -1 && ch != '\n'); \
		if (ch == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(in, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l) \
			; /* keep appending quality lines until long enough or out of input */ \
		seq->last_char = 0; /* the next header marker has not been consumed yet */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return (int)seq->seq.l; \
	}
238 
/* Declares kseq_t, one parsed FASTA/FASTQ record plus its input stream.
 *   name, comment, seq, qual  the four text fields of the record
 *   last_char                 header marker consumed ahead of time by
 *                             kseq_read(), or 0 when none is pending
 *   f                         stream the records are read from
 * The type_t argument is not used by the expansion. */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name; \
		kstring_t comment; \
		kstring_t seq; \
		kstring_t qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;
245 
/* Instantiates the complete FASTA/FASTQ parser for handle type `type_t`:
 * a stream layer with a 16384-byte buffer (instantiated through
 * KSTREAM_INIT), the kseq_t type, kseq_init()/kseq_destroy() and
 * kseq_read(). SCOPE is applied to the kseq_* functions only. */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT(type_t, __read, 16384) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(SCOPE, type_t) \
	__KSEQ_READ(SCOPE)

/* Same as KSEQ_INIT2 with the kseq_* functions made `static`. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
253 
/* Declares the parser without defining it: emits the kstream_t and kseq_t
 * types and prototypes for kseq_init(), kseq_destroy() and kseq_read(). */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	extern void kseq_destroy(kseq_t *ks); \
	extern int kseq_read(kseq_t *seq);
260 
261 #endif
262