1 /* The MIT License
2 
3    Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
4 
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12 
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15 
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25 
26 /* Last Modified: 05MAR2012 */
27 
28 #ifndef AC_KSEQ_H
29 #define AC_KSEQ_H
30 
31 #include <ctype.h>
32 #include <string.h>
33 #include <stdlib.h>
34 
/* Delimiter selectors understood by ks_getuntil()/ks_getuntil2().
 * A delimiter value greater than KS_SEP_MAX is compared against each byte literally. */
#define KS_SEP_SPACE 0            /* any isspace() byte: ' ', \t, \n, \v, \f, \r */
#define KS_SEP_TAB   1            /* any isspace() byte except ' ' */
#define KS_SEP_LINE  2            /* end of line: "\n" (Unix) or "\r\n" (Windows) */
#define KS_SEP_MAX   KS_SEP_LINE  /* largest reserved selector value */
39 
/* klib_unused: attribute that silences "defined but not used" diagnostics on
 * clang >= 3 and GCC >= 3; expands to nothing on every other compiler.
 * A definition supplied before this header is included takes precedence. */
#ifndef klib_unused
#  if (defined(__clang__) && __clang_major__ >= 3) || (defined(__GNUC__) && __GNUC__ >= 3)
#    define klib_unused __attribute__((__unused__))
#  else
#    define klib_unused
#  endif
#endif /* klib_unused */
47 
/*
 * __KS_TYPE(type_t): declares kstream_t, the buffered-reader state wrapped
 * around a caller-supplied handle of type type_t.
 */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		int begin;          /* index in buf of the next unread byte */ \
		int end;            /* number of valid bytes in buf (result of the last read) */ \
		int is_eof:2;       /* non-zero once the reader has marked the input as exhausted */ \
		int bufsize:30;     /* size of buf in bytes */ \
		type_t f;           /* handle passed to the read callback */ \
		unsigned char *buf; /* heap buffer of bufsize bytes */ \
	} kstream_t;
55 
/* ks_eof(ks): 1 when the stream is flagged as EOF and every buffered byte has been consumed, else 0. */
#define ks_eof(ks) ((ks)->begin >= (ks)->end && (ks)->is_eof)
/* ks_rewind(ks): clears the buffer indices and the EOF flag; evaluates to 0.
 * Only these three fields are touched: the handle in f is not repositioned here. */
#define ks_rewind(ks) ((ks)->end = 0, (ks)->begin = 0, (ks)->is_eof = 0)
58 
/*
 * __KS_BASIC(SCOPE, type_t, __bufsize): defines the kstream_t constructor and
 * destructor with storage class SCOPE.
 *
 *   ks_init(f)     Allocates a zeroed kstream_t plus a __bufsize-byte buffer and
 *                  stores the handle f in it. Returns NULL if either allocation
 *                  fails (nothing is leaked in that case). The handle itself is
 *                  only stored, never opened or closed here.
 *   ks_destroy(ks) Frees the buffer and the kstream_t. Accepts NULL. The handle
 *                  stored in ks->f is not released.
 */
#define __KS_BASIC(SCOPE, type_t, __bufsize) \
	SCOPE kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == 0) return 0; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == 0) { /* do not hand out a stream without a buffer */ \
			free(ks); \
			return 0; \
		} \
		ks->f = f; \
		ks->bufsize = __bufsize; \
		return ks; \
	} \
	SCOPE void ks_destroy(kstream_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->buf); \
		free(ks); \
	}
73 
/*
 * __KS_INLINED(__read): defines the inline helpers ks_getc() and ks_getuntil().
 * __read(f, buf, n) must store up to n bytes in buf and return the number of
 * bytes stored, 0 at end of input, or a negative value on error.
 *
 *   ks_getc(ks)  Returns the next byte as a value in 0..255, or -1 when no more
 *                data is available. A read that returns fewer bytes than
 *                requested is NOT taken as end of input (pipes and sockets
 *                routinely return short reads); only a return value <= 0 is.
 *                A negative return from __read is treated like end of input so
 *                that stale buffer contents are never handed out as data.
 *   ks_getuntil(ks, delimiter, str, dret)
 *                Same as ks_getuntil2() with append = 0, i.e. str is
 *                overwritten rather than extended.
 */
#define __KS_INLINED(__read) \
	static inline klib_unused int ks_getc(kstream_t *ks) \
	{ \
		if (ks->begin >= ks->end) { /* buffer exhausted: refill or report EOF */ \
			if (ks->is_eof) return -1; \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
			if (ks->end <= 0) { /* 0: end of input; < 0: read error */ \
				ks->end = 0; /* keep begin/end consistent after an error */ \
				ks->is_eof = 1; \
				return -1; \
			} \
		} \
		return (int)ks->buf[ks->begin++]; \
	} \
	static inline klib_unused int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
88 
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/* Growable character buffer filled by the stream readers below.
 * Skipped when KSTRING_T is already defined by another header. */
typedef struct __kstring_t {
	size_t l;  /* number of bytes currently stored in s (terminator not counted) */
	size_t m;  /* number of bytes allocated for s */
	char *s;   /* heap buffer, or NULL before the first allocation */
} kstring_t;
#endif
96 
/* kroundup32(x): rounds the lvalue x up, in place, to the nearest power of two
 * (a value that already is a power of two is left unchanged) and evaluates to
 * the new value. The right shifts smear the highest set bit over the lower 31
 * bit positions. x is evaluated several times, so it must be free of side effects. */
#ifndef kroundup32
#define kroundup32(x) \
	(--(x), \
	 (x) |= (x) >> 1, \
	 (x) |= (x) >> 2, \
	 (x) |= (x) >> 4, \
	 (x) |= (x) >> 8, \
	 (x) |= (x) >> 16, \
	 ++(x))
#endif
100 
/*
 * __KS_GETUNTIL(SCOPE, __read): defines ks_getuntil2() with storage class SCOPE.
 * __read(f, buf, n) must store up to n bytes in buf and return the number of
 * bytes stored, 0 at end of input, or a negative value on error.
 *
 * int ks_getuntil2(ks, delimiter, str, dret, append)
 *   Copies bytes from the stream into str up to, but not including, the next
 *   delimiter, and consumes the delimiter.
 *
 *   delimiter  KS_SEP_SPACE, KS_SEP_TAB or KS_SEP_LINE; any other value is
 *              compared against each byte literally (a value that no byte can
 *              equal therefore reads up to the end of input).
 *   str        destination; grown with realloc() as needed and always
 *              NUL-terminated on a non-negative return.
 *   dret       if non-NULL, receives the delimiter byte that ended the field,
 *              or 0 when the field was ended by the end of input.
 *   append     non-zero to extend str, zero to overwrite it.
 *
 *   Returns the resulting length of str (>= 0), -1 if the end of input was
 *   reached before a single byte could be examined, or -3 if memory could not
 *   be allocated (str then keeps its previous, still valid buffer).
 *
 *   A read returning fewer bytes than requested is not treated as end of input;
 *   only a return value <= 0 from __read is. A negative return (read error) is
 *   handled like end of input.
 */
#define __KS_GETUNTIL(SCOPE, __read) \
	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		int gotany = 0; /* set once at least one buffer chunk has been processed */ \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		if (ks->begin >= ks->end && ks->is_eof) return -1; \
		for (;;) { \
			int i; \
			size_t need; \
			if (ks->begin >= ks->end) { /* buffer exhausted: refill */ \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, ks->bufsize); \
				if (ks->end <= 0) { /* 0: end of input; < 0: read error */ \
					ks->end = 0; \
					ks->is_eof = 1; \
					break; \
				} \
			} \
			/* locate the delimiter; i == ks->end means "not in this chunk" */ \
			if (delimiter == KS_SEP_LINE) { \
				unsigned char *hit = (unsigned char*)memchr(ks->buf + ks->begin, '\n', (size_t)(ks->end - ks->begin)); \
				i = hit? (int)(hit - ks->buf) : ks->end; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else { /* literal byte */ \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} \
			need = str->l + (size_t)(i - ks->begin) + 1; /* +1 for the terminator */ \
			if (str->m < need) { \
				size_t cap = need; \
				char *grown; \
				kroundup32(cap); \
				if (cap < need) cap = need; /* rounding wrapped around */ \
				grown = (char*)realloc(str->s, cap); \
				if (grown == 0) return -3; /* str->s is untouched and still owned by the caller */ \
				str->s = grown; \
				str->m = cap; \
			} \
			gotany = 1; \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; /* skip the delimiter, or mark the buffer as exhausted */ \
			if (i < ks->end) { \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		/* EOF discovered by the refill above before anything was read: report it now */ \
		/* instead of returning a length and writing a terminator that was never reserved */ \
		if (!gotany && ks_eof(ks)) return -1; \
		if (str->s == 0) { \
			str->s = (char*)calloc(1, 1); \
			if (str->s == 0) return -3; \
			str->m = 1; \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; /* drop the CR of a CRLF pair */ \
		/* NOTE(review): because of the l > 1 test a result consisting of a single CR keeps it; confirm this is intended */ \
		str->s[str->l] = '\0'; \
		return (int)str->l; \
	}
150 
/*
 * KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize): instantiates the complete
 * stream reader for handle type type_t: the kstream_t type, ks_init(),
 * ks_destroy(), ks_getuntil2() and the inline helpers ks_getc()/ks_getuntil().
 * ks_getuntil2() is emitted before the inline helpers because ks_getuntil()
 * calls it.
 */
#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
	__KS_TYPE(type_t)                    /* kstream_t */ \
	__KS_BASIC(SCOPE, type_t, __bufsize) /* ks_init, ks_destroy */ \
	__KS_GETUNTIL(SCOPE, __read)         /* ks_getuntil2 */ \
	__KS_INLINED(__read)                 /* ks_getc, ks_getuntil */

/* KSTREAM_INIT: KSTREAM_INIT2 with every generated function declared static. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	KSTREAM_INIT2(static, type_t, __read, __bufsize)
158 
/*
 * KSTREAM_DECLARE(type_t, __read): declares the kstream_t type and external
 * prototypes for ks_init(), ks_destroy() and ks_getuntil2(), and defines the
 * inline helpers ks_getc()/ks_getuntil(). The three external functions are
 * only declared here; their definitions must come from elsewhere.
 */
#define KSTREAM_DECLARE(type_t, __read) \
	__KS_TYPE(type_t) \
	extern kstream_t *ks_init(type_t f); \
	extern void ks_destroy(kstream_t *ks); \
	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
	__KS_INLINED(__read)
165 
166 /******************
167  * FASTA/Q parser *
168  ******************/
169 
/* kseq_rewind(ks): clears the parser's pending header character and the buffer
 * state of its underlying stream; evaluates to 0. Only these fields are reset:
 * the stream's handle is not repositioned here. */
#define kseq_rewind(ks) \
	((ks)->f->end = 0, (ks)->f->begin = 0, (ks)->f->is_eof = 0, (ks)->last_char = 0)
171 
/*
 * __KSEQ_BASIC(SCOPE, type_t): defines the kseq_t constructor and destructor
 * with storage class SCOPE.
 *
 *   kseq_init(fd)    Allocates a zeroed kseq_t and a stream reader over fd.
 *                    Returns NULL if the kseq_t cannot be allocated or if
 *                    ks_init() returns NULL; nothing is leaked in either case.
 *   kseq_destroy(ks) Frees the four string buffers, the stream reader and the
 *                    kseq_t itself. Accepts NULL. The handle given to
 *                    kseq_init() is not released.
 */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == 0) return 0; \
		s->f = ks_init(fd); \
		if (s->f == 0) { /* a parser without a stream would crash on first read */ \
			free(s); \
			return 0; \
		} \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); \
		free(ks->comment.s); \
		free(ks->seq.s); \
		free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
186 
/*
 * __KSEQ_READ(SCOPE): defines kseq_read(), which parses the next FASTA or
 * FASTQ record from seq->f into seq->name, seq->comment, seq->seq and
 * seq->qual (the string buffers are reused between calls).
 *
 * Return value:
 *   >=0  length of the sequence (normal)
 *   -1   end-of-file
 *   -2   truncated quality string
 *   -3   memory allocation failure
 */
#define __KSEQ_READ(SCOPE) \
	/* Ensures str can hold at least need bytes; returns 0 on success, -1 if realloc fails (str is left intact). */ \
	static inline klib_unused int kseq_reserve_(kstring_t *str, size_t need) \
	{ \
		size_t cap = need; \
		char *grown; \
		if (str->s != 0 && str->m >= need) return 0; \
		kroundup32(cap); /* rounded to the next closest 2^k */ \
		if (cap < need) cap = need; /* rounding wrapped around */ \
		grown = (char*)realloc(str->s, cap); \
		if (grown == 0) return -1; \
		str->s = grown; \
		str->m = cap; \
		return 0; \
	} \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c, r; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* no header char pending: skip ahead to the next '>' or '@' */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		r = ks_getuntil(ks, KS_SEP_SPACE, &seq->name, &c); \
		if (r < 0) return r < -1? -3 : -1; /* -1 is the normal exit: EOF */ \
		if (c != '\n' && ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0) < -1) return -3; /* read FASTA/Q comment */ \
		if (kseq_reserve_(&seq->seq, 256) < 0) return -3; /* initial sequence buffer */ \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			/* room for c plus a terminator: ks_getuntil2() may return at EOF without growing the buffer */ \
			if (kseq_reserve_(&seq->seq, seq->seq.l + 2) < 0) return -3; \
			seq->seq.s[seq->seq.l++] = c; \
			if (ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1) < -1) return -3; /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (kseq_reserve_(&seq->seq, seq->seq.l + 2) < 0) return -3; /* seq->seq.s[seq->seq.l] below must be in bounds */ \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return (int)seq->seq.l; /* FASTA */ \
		if (kseq_reserve_(&seq->qual, seq->seq.m) < 0) return -3; /* qual gets at least as much room as seq */ \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while ((r = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1)) >= 0 && seq->qual.l < seq->seq.l); \
		if (r < -1) return -3; \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return (int)seq->seq.l; \
	}
233 
/*
 * __KSEQ_TYPE(type_t): declares kseq_t, the state of one FASTA/FASTQ parser
 * together with the fields of the most recently parsed record. The type_t
 * argument is accepted for symmetry with the other macros and is not used.
 */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name;    /* record name: header text up to the first whitespace */ \
		kstring_t comment; /* remainder of the header line */ \
		kstring_t seq;     /* sequence with line breaks removed */ \
		kstring_t qual;    /* quality string (FASTQ records) */ \
		int last_char;     /* header char already consumed for the next record, or 0 */ \
		kstream_t *f;      /* underlying buffered stream */ \
	} kseq_t;
240 
/*
 * KSEQ_INIT2(SCOPE, type_t, __read): instantiates the stream reader with a
 * 16384-byte buffer followed by the FASTA/FASTQ parser built on top of it
 * (kseq_t, kseq_init(), kseq_destroy(), kseq_read()).
 */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT2(SCOPE, type_t, __read, 16384) /* kstream_t and ks_* functions */ \
	__KSEQ_TYPE(type_t)                         /* kseq_t */ \
	__KSEQ_BASIC(SCOPE, type_t)                 /* kseq_init, kseq_destroy */ \
	__KSEQ_READ(SCOPE)                          /* kseq_read */

/* KSEQ_INIT: KSEQ_INIT2 with every generated function declared static. */
#define KSEQ_INIT(type_t, __read) \
	KSEQ_INIT2(static, type_t, __read)
248 
/*
 * KSEQ_DECLARE(type_t): declares the kstream_t and kseq_t types and external
 * prototypes for kseq_init(), kseq_destroy() and kseq_read(). The functions
 * are only declared here; their definitions must come from elsewhere.
 */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	extern void kseq_destroy(kseq_t *ks); \
	extern int kseq_read(kseq_t *seq);
255 
256 #endif
257