1 /* The MIT License
2 
3    Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
4 
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12 
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15 
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25 
26 /* Last Modified: 05MAR2012 */
27 
28 #ifndef AC_KSEQ_H
29 #define AC_KSEQ_H
30 
#include <ctype.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
34 
/* Delimiter classes understood by ks_getuntil()/ks_getuntil2(). Any delimiter
   value greater than KS_SEP_MAX is compared against the input as a literal
   character instead. */
#define KS_SEP_SPACE 0 // any isspace() character: ' ', \t, \n, \v, \f, \r
#define KS_SEP_TAB   1 // isspace() && !' '
#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define KS_SEP_MAX   2 // largest reserved delimiter class code
39 
/* Declares kstream_t, the buffered reader state for a handle of type `type_t`.
   Member order and widths are part of the layout and must not be changed. */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		int begin; /* index in buf of the next unread byte */ \
		int end; /* result of the last read call: count of valid bytes in buf */ \
		int is_eof:2; /* set to 1 once the read call has returned 0 */ \
		int bufsize:30; /* size of buf in bytes */ \
		uint64_t seek_pos; /* running count of bytes consumed through ks_getc/ks_getuntil2 */ \
		type_t f; /* caller-supplied handle passed to the read call */ \
		unsigned char *buf; /* read buffer of bufsize bytes */ \
	} kstream_t;
48 
/* Non-zero once the EOF flag is set and no unread bytes remain in the buffer. */
#define ks_eof(ks) ((ks)->begin >= (ks)->end && (ks)->is_eof)
/* Discards buffered data and clears the EOF flag; evaluates to 0.
   seek_pos and the underlying handle are left untouched. */
#define ks_rewind(ks) ((ks)->end = 0, (ks)->begin = 0, (ks)->is_eof = 0)
51 
/*
 * Emits the constructor and destructor of a kstream_t.
 *
 *   SCOPE      storage-class prefix placed in front of both definitions
 *   type_t     type of the handle stored in the stream
 *   __bufsize  size in bytes of the read buffer to allocate
 *
 * ks_init(f)   returns a zero-initialised stream wrapping `f`, or NULL when
 *              either allocation fails (nothing is leaked in that case).
 * ks_destroy() frees the buffer and the stream; NULL is accepted. The handle
 *              itself is not closed here.
 */
#define __KS_BASIC(SCOPE, type_t, __bufsize) \
	SCOPE kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == NULL) return NULL; \
		ks->f = f; ks->bufsize = __bufsize; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == NULL) { /* never hand out a stream without a buffer */ \
			free(ks); \
			return NULL; \
		} \
		return ks; \
	} \
	SCOPE void ks_destroy(kstream_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->buf); \
		free(ks); \
	}
66 
/*
 * Emits the static inline helpers of a stream.
 *
 *   __read  callable used as __read(handle, buffer, size); its result is
 *           stored in an int.
 *
 * ks_getc()     returns the next byte as a non-negative int and advances
 *               seek_pos by one, or returns -1 when no more data is available.
 *               A read result of 0 (end of input) or below 0 (read error) marks
 *               the stream as EOF, so a failed read can never expose the
 *               contents of an unfilled buffer.
 * ks_getuntil() is ks_getuntil2() with append == 0 (str is overwritten).
 */
#define __KS_INLINED(__read) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
			if (ks->end <= 0) { /* 0: end of input; <0: read error */ \
				ks->end = 0; \
				ks->is_eof = 1; \
				return -1; \
			} \
		} \
		ks->seek_pos++; \
		return (int)ks->buf[ks->begin++]; \
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
81 
/* Growable character buffer filled by the readers in this header. The
   KSTRING_T guard skips this definition when the macro is already defined. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l; /* bytes currently in use, terminating NUL not counted */
	size_t m; /* bytes allocated for s */
	char *s;  /* heap buffer, or NULL before the first use */
} kstring_t;
#endif
89 
/* Rounds x up, in place, to the next power of two; a value that already is a
   power of two is left unchanged. x must be a modifiable integer lvalue and is
   evaluated several times. The bit smear stops at a shift of 16, so the result
   is only guaranteed to be a power of two for values that fit in 32 bits. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
93 
/*
 * Emits ks_getuntil2(), which reads bytes from the stream into `str` up to,
 * but not including, the next delimiter.
 *
 *   ks         stream to read from
 *   delimiter  KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or (when greater than
 *              KS_SEP_MAX) a literal character to stop at
 *   str        destination; grown with realloc() as needed and always
 *              NUL-terminated on success
 *   dret       if non-NULL, receives the delimiter byte that ended the read,
 *              or 0 when the input ended first
 *   append     non-zero keeps the current contents of str and appends to them
 *
 * Returns the resulting length of str, or -1 when nothing could be read
 * because the stream is at end of input. For KS_SEP_LINE one trailing '\r' is
 * dropped when the string is longer than one byte.
 *
 * ks->seek_pos advances by exactly the number of bytes this call consumed,
 * delimiter included. The counter deliberately starts at 0 rather than at
 * str->l: in append mode the bytes already held in str were accounted for by
 * the calls that read them.
 *
 * A read result of 0 (end of input) or below 0 (read error) marks the stream
 * as EOF.
 */
#define __KS_GETUNTIL(SCOPE, __read) \
	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		int gotany = 0; \
		uint64_t consumed = 0; /* bytes taken from the stream by this call only */ \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { /* buffer exhausted: refill or stop */ \
				if (!ks->is_eof) { \
					ks->begin = 0; \
					ks->end = __read(ks->f, ks->buf, ks->bufsize); \
					if (ks->end <= 0) { /* 0: end of input; <0: read error */ \
						ks->end = 0; \
						ks->is_eof = 1; \
						break; \
					} \
				} else break; \
			} \
			/* scan for the delimiter; i == ks->end means "not in this buffer" */ \
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			/* make room for the chunk plus the terminating NUL */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			consumed += i - ks->begin; \
			if (i < ks->end) consumed++; /* the delimiter byte is consumed too */ \
			gotany = 1; \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (!gotany && ks_eof(ks)) return -1; \
		ks->seek_pos += consumed; \
		if (str->s == 0) { \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	}
147 
/*
 * Instantiates the whole stream layer in the current translation unit: the
 * kstream_t type, ks_init()/ks_destroy(), ks_getuntil2(), and the static
 * inline ks_getc()/ks_getuntil().
 *
 *   SCOPE      prefix applied to the non-inline function definitions
 *   type_t     type of the handle stored in the stream
 *   __read     callable used as __read(handle, buffer, size)
 *   __bufsize  size of the read buffer in bytes
 *
 * __KS_GETUNTIL is expanded before __KS_INLINED because the inline
 * ks_getuntil() calls ks_getuntil2().
 */
#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(SCOPE, type_t, __bufsize) \
	__KS_GETUNTIL(SCOPE, __read) \
	__KS_INLINED(__read)

/* Same as KSTREAM_INIT2 with every generated function declared static. */
#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)
155 
/*
 * Declaration-only counterpart of KSTREAM_INIT2: defines the kstream_t type
 * and declares ks_getuntil2(), ks_init() and ks_destroy() as extern, without
 * defining them. The static inline ks_getc()/ks_getuntil() are still defined
 * here, so __read must be usable at the point of expansion.
 * NOTE(review): presumably paired with a KSTREAM_INIT2 expansion using a
 * non-static SCOPE in another translation unit -- confirm against callers.
 */
#define KSTREAM_DECLARE(type_t, __read) \
	__KS_TYPE(type_t) \
	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
	extern kstream_t *ks_init(type_t f); \
	extern void ks_destroy(kstream_t *ks); \
	__KS_INLINED(__read)
162 
163 /******************
164  * FASTA/Q parser *
165  ******************/
166 
/* Resets the parser state of a kseq_t: empties the stream buffer, clears the
   stream's EOF flag and forgets any pending header character. Evaluates to 0.
   The stream's seek_pos and the underlying handle are left untouched. */
#define kseq_rewind(ks) ((ks)->f->end = 0, (ks)->f->begin = 0, (ks)->f->is_eof = 0, (ks)->last_char = 0)
168 
/*
 * Emits the constructor and destructor of a kseq_t.
 *
 *   SCOPE   prefix applied to both definitions
 *   type_t  type of the handle the record reader wraps
 *
 * kseq_init(fd)  returns a zero-initialised reader whose stream wraps `fd`, or
 *                NULL when the reader or its stream cannot be allocated.
 * kseq_destroy() frees the four string buffers, the stream and the reader;
 *                NULL is accepted. The handle itself is not closed here.
 */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == NULL) return NULL; \
		s->f = ks_init(fd); \
		if (s->f == NULL) { /* a reader without a stream is unusable */ \
			free(s); \
			return NULL; \
		} \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
183 
/* Emits kseq_read(), which parses the next FASTA/FASTQ record from seq->f into
   seq->name, seq->comment, seq->seq and seq->qual. The string buffers are
   reused from call to call: only their lengths are reset.

   Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string (input ended on the '+' line, or the quality
        length differs from the sequence length)
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		/* delimiter 0 is KS_SEP_SPACE: the name ends at the first whitespace byte */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		/* sequence lines run until the next header char, a '+' line, or end of input */ \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		/* append quality lines until qual is at least as long as seq or input ends */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}
230 
/* Declares kseq_t, the state of one FASTA/FASTQ record reader. Member order is
   part of the layout and must not be changed. */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name; /* record name: header text up to the first whitespace */ \
		kstring_t comment; /* rest of the header line, if any */ \
		kstring_t seq; /* sequence with line breaks removed */ \
		kstring_t qual; /* quality string; length 0 for FASTA records */ \
		int last_char; /* header char already consumed by the previous read, or 0 */ \
		kstream_t *f; /* buffered stream the records are read from */ \
	} kseq_t;
237 
/*
 * Instantiates the FASTA/FASTQ reader in the current translation unit: the
 * stream layer, the kseq_t type, kseq_init()/kseq_destroy() and kseq_read().
 *
 *   SCOPE   prefix applied to the kseq_* function definitions only; the
 *           stream layer is always generated through KSTREAM_INIT, i.e. static
 *   type_t  type of the handle to read from
 *   __read  callable used as __read(handle, buffer, size)
 *
 * The stream buffer size is fixed at 16384 bytes.
 */
#define KSEQ_INIT2(SCOPE, type_t, __read)		\
	KSTREAM_INIT(type_t, __read, 16384)			\
	__KSEQ_TYPE(type_t)							\
	__KSEQ_BASIC(SCOPE, type_t)					\
	__KSEQ_READ(SCOPE)

/* Same as KSEQ_INIT2 with the kseq_* functions declared static as well. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
245 
/*
 * Declaration-only counterpart of KSEQ_INIT2: defines the kstream_t and kseq_t
 * types and declares kseq_init(), kseq_destroy() and kseq_read() without
 * defining them. No ks_* stream function is declared by this macro.
 * NOTE(review): presumably paired with a KSEQ_INIT2 expansion using a
 * non-static SCOPE in another translation unit -- confirm against callers.
 */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	void kseq_destroy(kseq_t *ks); \
	int kseq_read(kseq_t *seq);
252 
253 #endif
254