/* The MIT License

   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Last Modified: 05MAR2012 */

#ifndef AC_KSEQ_H
#define AC_KSEQ_H

#include <ctype.h>
#include <stdint.h> /* uint64_t, used by kstream_t and ks_getuntil2() */
#include <stdlib.h>
#include <string.h>

/* Marks a static function as possibly unused so that including this header
 * without calling every generated function does not trigger warnings. */
#ifndef klib_unused
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
#define klib_unused __attribute__ ((__unused__))
#else
#define klib_unused
#endif
#endif /* klib_unused */

/* Special delimiter codes accepted by ks_getuntil()/ks_getuntil2().
 * Any delimiter value greater than KS_SEP_MAX is matched as a literal byte. */
#define KS_SEP_SPACE 0 /* isspace(): \t, \n, \v, \f, \r */
#define KS_SEP_TAB   1 /* isspace() && !' ' */
#define KS_SEP_LINE  2 /* line separator: "\n" (Unix) or "\r\n" (Windows) */
#define KS_SEP_MAX   2

/*
 * Declares kstream_t, a buffered reader on top of a handle of type `type_t`.
 *   begin, end : bounds of the not-yet-consumed bytes in buf
 *   is_eof     : set once the read callback reported that no more data is available
 *   bufsize    : number of bytes requested from the read callback per refill
 *   seek_pos   : count of bytes consumed via ks_getc()/ks_getuntil2()
 *   f          : handle forwarded to the read callback
 *   buf        : the read buffer (bufsize bytes)
 */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		int begin, end; \
		int is_eof:2, bufsize:30; \
		uint64_t seek_pos; \
		type_t f; \
		unsigned char *buf; \
	} kstream_t;

/* Non-zero when the stream hit end of input and the buffer is fully consumed. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* Discards buffered data and clears the EOF flag; evaluates to 0.
 * The underlying handle is not touched by this macro. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)

/*
 * Generates ks_init() and ks_destroy().
 *   ks_init(f)     : allocates a zeroed kstream_t plus a __bufsize-byte buffer;
 *                    returns NULL if either allocation fails.
 *   ks_destroy(ks) : frees the buffer and the stream; NULL is accepted.
 *                    The handle `f` is not closed here.
 */
#define __KS_BASIC(SCOPE, type_t, __bufsize) \
	SCOPE kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == NULL) return NULL; \
		ks->f = f; ks->bufsize = __bufsize; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == NULL) { free(ks); return NULL; } \
		return ks; \
	} \
	SCOPE void ks_destroy(kstream_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->buf); \
		free(ks); \
	}

/*
 * Generates the inline helpers.
 *   ks_getc(ks)    : returns the next byte (0..255) or -1 when no more data
 *                    can be read. A zero or negative result from the read
 *                    callback ends the stream; without this a negative
 *                    `end` would make the function hand out stale bytes.
 *   ks_getuntil()  : ks_getuntil2() with append == 0.
 */
#define __KS_INLINED(__read) \
	static inline klib_unused int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
			if (ks->end <= 0) { /* 0: end of input; <0: read error */ \
				ks->end = 0; ks->is_eof = 1; return -1; \
			} \
		} \
		ks->seek_pos++; \
		return (int)ks->buf[ks->begin++]; \
	} \
	static inline klib_unused int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }

/* Growable string: l = used length, m = allocated capacity, s = buffer. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif

/* Rounds x up, in place, to the next power of two (bit smearing over 32 bits). */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

/*
 * Generates ks_getuntil2(): reads bytes into `str` until `delimiter` is seen.
 *   delimiter : KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a literal byte value
 *               greater than KS_SEP_MAX
 *   str       : destination; cleared first unless `append` is non-zero
 *   dret      : if not NULL, receives the delimiter byte that ended the read,
 *               or 0 when the input ended first
 *   append    : non-zero to append to the existing contents of `str`
 * Returns the resulting length of `str`, or -1 if nothing could be read
 * because the stream is exhausted. The delimiter is consumed but not stored;
 * for KS_SEP_LINE a trailing '\r' is dropped as well.
 * ks->seek_pos is advanced by the number of bytes consumed from the stream.
 */
#define __KS_GETUNTIL(SCOPE, __read) \
	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		int gotany = 0; \
		uint64_t seek_pos = 0; /* bytes consumed by this call only */ \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { /* buffer exhausted: refill */ \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, ks->bufsize); \
				if (ks->end <= 0) { /* 0: end of input; <0: read error */ \
					ks->end = 0; ks->is_eof = 1; break; \
				} \
			} \
			/* locate the first delimiter in [begin, end); i == end if none */ \
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			/* make room for the chunk plus the terminating NUL */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			seek_pos += i - ks->begin; \
			if (i < ks->end) seek_pos++; /* the delimiter itself is consumed too */ \
			gotany = 1; \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { /* delimiter found: done */ \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (!gotany && ks_eof(ks)) return -1; \
		ks->seek_pos += seek_pos; \
		if (str->s == 0) { /* nothing was ever stored: hand back an empty string */ \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	}

/* Instantiates the stream type and all stream functions with linkage SCOPE.
 * `__read(f, buf, size)` must return the number of bytes stored in buf,
 * and a value <= 0 when no more data can be delivered. */
#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(SCOPE, type_t, __bufsize) \
	__KS_GETUNTIL(SCOPE, __read) \
	__KS_INLINED(__read)

/* Same as KSTREAM_INIT2 with static linkage. */
#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)

/* Declares the stream type and the externally defined stream functions;
 * the inline helpers are still defined locally. */
#define KSTREAM_DECLARE(type_t, __read) \
	__KS_TYPE(type_t) \
	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
	extern kstream_t *ks_init(type_t f); \
	extern void ks_destroy(kstream_t *ks); \
	__KS_INLINED(__read)

/******************
 * FASTA/Q parser *
 ******************/

/* Resets the parser state and the buffered stream state; evaluates to 0.
 * The underlying handle is not touched by this macro. */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)

/*
 * Generates kseq_init() and kseq_destroy().
 *   kseq_init(fd)    : allocates a zeroed kseq_t and its stream;
 *                      returns NULL if an allocation fails.
 *   kseq_destroy(ks) : frees the four strings, the stream and the record;
 *                      NULL is accepted. The handle is not closed here.
 */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == NULL) return NULL; \
		s->f = ks_init(fd); \
		if (s->f == NULL) { free(s); return NULL; } \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}

/* Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}

/*
 * Declares kseq_t, one FASTA/FASTQ record plus parser state.
 *   name, comment, seq, qual : fields of the most recently read record
 *   last_char                : header character ('>' or '@') already consumed
 *                              from the next record, or 0 if none
 *   f                        : the buffered input stream
 */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name, comment, seq, qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;

/* Instantiates the parser with linkage SCOPE on top of a static stream
 * implementation that uses a 16384-byte buffer. */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT(type_t, __read, 16384) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(SCOPE, type_t) \
	__KSEQ_READ(SCOPE)

/* Same as KSEQ_INIT2 with static linkage. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)

/* Declares the parser types and functions that are defined in another
 * translation unit. */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	void kseq_destroy(kseq_t *ks); \
	int kseq_read(kseq_t *seq);

#endif