/* The MIT License

   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Last Modified: 05MAR2012 */

#ifndef AC_KSEQ_H
#define AC_KSEQ_H

#include <ctype.h>
#include <string.h>
#include <stdlib.h>

/*
 * Delimiter selectors accepted by ks_getuntil() and ks_getuntil2().
 * Any value greater than KS_SEP_MAX is compared against each input byte
 * as a literal delimiter character.
 */
#define KS_SEP_SPACE 0 /* isspace(): \t, \n, \v, \f, \r */
#define KS_SEP_TAB   1 /* isspace() && !' ' */
#define KS_SEP_LINE  2 /* line separator: "\n" (Unix) or "\r\n" (Windows) */
#define KS_SEP_MAX   2

#ifndef klib_unused
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
#define klib_unused __attribute__ ((__unused__))
#else
#define klib_unused
#endif
#endif /* klib_unused */

/*
 * Declares kstream_t, a buffered reader over a handle of type `type_t`.
 *   begin, end  window of unread bytes inside buf; end < 0 records a
 *               failed read
 *   is_eof      non-zero once the reader callback reported end of input
 *   bufsize     capacity of buf in bytes
 *   f           the handle passed to the reader callback
 *   buf         the read buffer
 */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		int begin, end; \
		int is_eof:2, bufsize:30; \
		type_t f; \
		unsigned char *buf; \
	} kstream_t;

/* True when end of input was reached and every buffered byte was consumed. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* Resets the buffer state; it does not reposition the underlying handle. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)

/*
 * Defines ks_init() and ks_destroy().
 * ks_init() returns NULL if either allocation fails.
 * ks_destroy() accepts NULL and does not close the underlying handle.
 */
#define __KS_BASIC(SCOPE, type_t, __bufsize) \
	SCOPE kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == NULL) return NULL; \
		ks->f = f; ks->bufsize = __bufsize; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == NULL) { free(ks); return NULL; } \
		return ks; \
	} \
	SCOPE void ks_destroy(kstream_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->buf); \
		free(ks); \
	}

/*
 * Defines the inline helpers.
 *
 * ks_getc() returns the next byte as a non-negative int, -1 at end of
 * input, or -3 when the reader callback returned a negative value. The
 * error is sticky until ks_rewind() is used. A read that returns fewer
 * bytes than requested is NOT end of input; only a zero-length read is.
 *
 * ks_getuntil() is ks_getuntil2() with append == 0.
 */
#define __KS_INLINED(__read) \
	static inline klib_unused int ks_getc(kstream_t *ks) \
	{ \
		if (ks->end < 0) return -3; /* an earlier read failed */ \
		if (ks->begin >= ks->end) { /* buffer exhausted: refill */ \
			if (ks->is_eof) return -1; \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
			if (ks->end == 0) { ks->is_eof = 1; return -1; } \
			if (ks->end < 0) { ks->is_eof = 1; return -3; } \
		} \
		return (int)ks->buf[ks->begin++]; \
	} \
	static inline klib_unused int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }

/*
 * Growable string: l is the current length, m the allocated capacity and
 * s the buffer. A zero-initialised kstring_t is a valid empty string.
 */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif

/* Rounds x up in place to the next power of two (x is modified). */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

/*
 * Defines ks_getuntil2(): reads bytes up to, and consuming, the next
 * delimiter and stores them NUL-terminated in *str.
 *
 *   delimiter  KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a literal
 *              character code greater than KS_SEP_MAX
 *   str        destination; grown with realloc() as needed
 *   dret       if non-NULL, receives the delimiter byte that ended the
 *              field, or 0 when the field was ended by end of input
 *   append     non-zero to append to the current contents of *str
 *
 * Returns the resulting length of *str, or
 *   -1  end of input with nothing left to read
 *   -3  the reader callback reported an error
 *   -4  out of memory (*str keeps its old buffer and can still be freed)
 *
 * With KS_SEP_LINE one trailing '\r' is dropped so "\r\n" line ends work.
 */
#define __KS_GETUNTIL(SCOPE, __read) \
	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		int gotany = 0; /* set once buffered input has been consumed */ \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		for (;;) { \
			int i; \
			if (ks->end < 0) return -3; /* an earlier read failed */ \
			if (ks->begin >= ks->end) { /* buffer exhausted: refill */ \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, ks->bufsize); \
				if (ks->end == 0) { ks->is_eof = 1; break; } /* a short read is not EOF */ \
				if (ks->end < 0) { ks->is_eof = 1; return -3; } \
			} \
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = ks->end; /* invalid selector: nothing counts as a delimiter */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				size_t ks_new_m = str->l + (i - ks->begin) + 1; \
				char *ks_new_s; \
				kroundup32(ks_new_m); \
				ks_new_s = (char*)realloc(str->s, ks_new_m); \
				if (ks_new_s == NULL) return -4; /* old buffer stays valid */ \
				str->s = ks_new_s; \
				str->m = ks_new_m; \
			} \
			gotany = 1; \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { /* stopped on a delimiter */ \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (!gotany && ks_eof(ks)) return -1; \
		if (str->s == NULL) { \
			str->s = (char*)calloc(1, 1); \
			if (str->s == NULL) { str->m = 0; return -4; } \
			str->m = 1; \
		} else if (delimiter == KS_SEP_LINE && str->l > 0 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	}

/*
 * Instantiates the stream type and all stream functions.
 * `__read(f, buf, n)` must return the number of bytes stored in buf,
 * 0 at end of input, or a negative value on error.
 */
#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(SCOPE, type_t, __bufsize) \
	__KS_GETUNTIL(SCOPE, __read) \
	__KS_INLINED(__read)

#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)

/* Declares the stream API for translation units that do not define it. */
#define KSTREAM_DECLARE(type_t, __read) \
	__KS_TYPE(type_t) \
	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
	extern kstream_t *ks_init(type_t f); \
	extern void ks_destroy(kstream_t *ks); \
	__KS_INLINED(__read)

/******************
 * FASTA/Q parser *
 ******************/

/* Resets parser and buffer state; it does not reposition the underlying handle. */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)

/*
 * Defines kseq_init() and kseq_destroy().
 * kseq_init() returns NULL if any allocation fails.
 * kseq_destroy() accepts NULL and does not close the underlying handle.
 */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == NULL) return NULL; \
		s->f = ks_init(fd); \
		if (s->f == NULL) { free(s); return NULL; } \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}

/* Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
   -3   error reading the stream
   -4   out of memory

   After -3 or -4 the record fields are unspecified and parsing should not
   be resumed on the same kseq_t. When a record has no comment or no
   quality string only the corresponding .l is reset to 0; callers should
   test .l rather than rely on the contents of .s.
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c, r; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \
			if (c < 0) return c; /* end of file or read error */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if ((r = ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF, or an error */ \
		if (c != '\n') { /* read FASTA/Q comment */ \
			r = ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); \
			if (r < -1) return r; /* EOF here just means an empty record */ \
		} \
		if (seq->seq.s == NULL) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.s = (char*)malloc(256); \
			if (seq->seq.s == NULL) { seq->seq.m = 0; return -4; } \
			seq->seq.m = 256; \
		} \
		while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			r = ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
			if (r < -1) return r; \
		} \
		if (c < -1) return c; /* read error */ \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			size_t ks_new_m = seq->seq.l + 2; \
			char *ks_new_s; \
			kroundup32(ks_new_m); /* rounded to the next closest 2^k */ \
			ks_new_s = (char*)realloc(seq->seq.s, ks_new_m); \
			if (ks_new_s == NULL) return -4; \
			seq->seq.s = ks_new_s; \
			seq->seq.m = ks_new_m; \
		} \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
			char *ks_new_q = (char*)realloc(seq->qual.s, seq->seq.m); \
			if (ks_new_q == NULL) return -4; \
			seq->qual.s = ks_new_q; \
			seq->qual.m = seq->seq.m; \
		} \
		while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		if (c < 0) return c; /* read error */ \
		while ((r = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1)) >= 0 && seq->qual.l < seq->seq.l); \
		if (r < -1) return r; \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}

/*
 * Declares kseq_t, one FASTA/FASTQ record plus parser state.
 *   name, comment, seq, qual  fields of the most recently read record
 *   last_char                 header character ('>' or '@') already
 *                             consumed for the next record, or 0
 *   f                         the underlying buffered stream
 */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name, comment, seq, qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;

/* Instantiates the stream (16384-byte buffer) and the FASTA/FASTQ parser. */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(SCOPE, type_t) \
	__KSEQ_READ(SCOPE)

#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)

/* Declares the parser API for translation units that do not define it. */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	extern void kseq_destroy(kseq_t *ks); \
	extern int kseq_read(kseq_t *seq);

#endif