/* The MIT License

   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Last Modified: 05MAR2012 */

#ifndef AC_KSEQ_H
#define AC_KSEQ_H

#include <ctype.h>
#include <stdint.h> /* uint64_t, used by kstream_t.seek_pos */
#include <stdlib.h>
#include <string.h>

/*
 * Delimiter selectors understood by ks_getuntil() / ks_getuntil2().
 * Any value greater than KS_SEP_MAX is compared against each input byte
 * as a literal delimiter character.
 */
#define KS_SEP_SPACE 0 /* isspace(): \t, \n, \v, \f, \r */
#define KS_SEP_TAB   1 /* isspace() && !' ' */
#define KS_SEP_LINE  2 /* line separator: "\n" (Unix) or "\r\n" (Windows) */
#define KS_SEP_MAX   2

/*
 * Declares kstream_t, a buffered reader layered on a handle of type `type_t`.
 *
 *   begin, end : the unread bytes are buf[begin .. end)
 *   is_eof     : non-zero once the read function reported no more data
 *   bufsize    : capacity of `buf` in bytes
 *   seek_pos   : running count of bytes consumed through ks_getc() and
 *                ks_getuntil2() (delimiters included)
 *   f          : the handle passed to the read function
 *   buf        : input buffer owned by the stream
 */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		int begin, end; \
		int is_eof:2, bufsize:30; \
		uint64_t seek_pos; \
		type_t f; \
		unsigned char *buf; \
	} kstream_t;

/* True when the input is exhausted AND the buffer has been fully consumed. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)

/*
 * Discards buffered data and clears the EOF flag. This only resets the
 * buffer state; it does not touch the underlying handle or seek_pos.
 */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)

/*
 * Defines ks_init() and ks_destroy() with storage class SCOPE.
 *
 *   ks_init(f)    : allocates a zeroed stream plus a buffer of __bufsize
 *                   bytes around handle `f`. Returns NULL if either
 *                   allocation fails.
 *   ks_destroy(ks): frees the buffer and the stream; NULL is accepted.
 *                   The handle `f` itself is not closed here.
 *
 * The allocation casts are kept so the header also compiles as C++.
 */
#define __KS_BASIC(SCOPE, type_t, __bufsize) \
	SCOPE kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == NULL) return NULL; \
		ks->f = f; \
		ks->bufsize = __bufsize; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == NULL) { /* do not hand out a stream without a buffer */ \
			free(ks); \
			return NULL; \
		} \
		return ks; \
	} \
	SCOPE void ks_destroy(kstream_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->buf); \
		free(ks); \
	}

/*
 * Defines the inline helpers ks_getc() and ks_getuntil().
 *
 *   ks_getc(ks): returns the next byte (0..255) and advances seek_pos by one,
 *                or -1 when no more input is available. __read is called as
 *                __read(f, buf, bufsize) to refill the buffer; a return value
 *                of zero or below (end of input or a read error) marks the
 *                stream as finished.
 *   ks_getuntil(ks, delimiter, str, dret): ks_getuntil2() with append == 0.
 */
#define __KS_INLINED(__read) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->begin >= ks->end) { \
			if (ks->is_eof) return -1; \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
			if (ks->end <= 0) { /* 0: end of input; negative: read error */ \
				ks->end = 0; /* never leave a negative length in the buffer state */ \
				ks->is_eof = 1; \
				return -1; \
			} \
		} \
		ks->seek_pos++; \
		return (int)ks->buf[ks->begin++]; \
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }

/*
 * Growable string: `l` is the current length (without the terminating NUL),
 * `m` the allocated capacity, `s` the heap buffer (may be NULL when m == 0).
 */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif

/* Rounds x (an lvalue, modified in place) up to the next power of two using 32-bit bit smearing. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

/*
 * Defines ks_getuntil2() with storage class SCOPE.
 *
 * Reads bytes from `ks` into `str` up to, but not including, the next
 * delimiter. The delimiter itself is consumed from the stream.
 *
 *   delimiter : KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a literal byte
 *               value greater than KS_SEP_MAX
 *   str       : destination; grown with realloc() as needed and always
 *               NUL-terminated on success
 *   dret      : if non-NULL, receives the delimiter byte that ended the
 *               read, or 0 if input ended first
 *   append    : non-zero keeps the current contents of `str` and appends;
 *               zero overwrites from the start
 *
 * Returns the resulting length of `str`, or -1 if nothing could be read
 * because the stream is exhausted. With KS_SEP_LINE a trailing '\r' is
 * dropped when the string is longer than one character.
 *
 * ks->seek_pos is advanced by exactly the number of bytes taken from the
 * stream by this call (data plus delimiter), regardless of `append`.
 *
 * NOTE: the realloc()/calloc() results below are not checked; the return
 * codes have no out-of-memory value.
 */
#define __KS_GETUNTIL(SCOPE, __read) \
	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		int gotany = 0; \
		uint64_t consumed = 0; /* bytes taken from the stream by this call only */ \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { /* buffer drained: refill or stop */ \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, ks->bufsize); \
				if (ks->end <= 0) { /* 0: end of input; negative: read error */ \
					ks->end = 0; /* a negative length would make this loop spin forever */ \
					ks->is_eof = 1; \
					break; \
				} \
			} \
			/* find the first delimiter in buf[begin..end); i == end if none */ \
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			/* make room for the chunk plus the terminating NUL */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			consumed += i - ks->begin; \
			if (i < ks->end) consumed++; /* the delimiter is consumed as well */ \
			gotany = 1; \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { /* stopped on a delimiter, not on the buffer end */ \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (!gotany && ks_eof(ks)) return -1; \
		ks->seek_pos += consumed; \
		if (str->s == 0) { \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	}

/*
 * Instantiates the whole stream API (type, ks_init, ks_destroy,
 * ks_getuntil2, ks_getc, ks_getuntil) for one handle type.
 *   SCOPE     : storage class for the non-inline functions
 *   type_t    : handle type
 *   __read    : function or macro called as __read(handle, buf, size)
 *   __bufsize : buffer size in bytes
 */
#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(SCOPE, type_t, __bufsize) \
	__KS_GETUNTIL(SCOPE, __read) \
	__KS_INLINED(__read)

/* KSTREAM_INIT2 with `static` linkage. */
#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)

/*
 * Declares the stream API for a translation unit in which the non-inline
 * functions are defined elsewhere; the inline helpers are still defined here.
 */
#define KSTREAM_DECLARE(type_t, __read) \
	__KS_TYPE(type_t) \
	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
	extern kstream_t *ks_init(type_t f); \
	extern void ks_destroy(kstream_t *ks); \
	__KS_INLINED(__read)

/******************
 * FASTA/Q parser *
 ******************/

/* Resets the parser state and the buffer state of its stream (see ks_rewind). */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)

/*
 * Defines kseq_init() and kseq_destroy() with storage class SCOPE.
 *
 *   kseq_init(fd)    : allocates a zeroed record plus its stream around
 *                      handle `fd`. Returns NULL if an allocation fails.
 *   kseq_destroy(ks) : frees the four string buffers, the stream and the
 *                      record; NULL is accepted. `fd` is not closed here.
 */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == NULL) return NULL; \
		s->f = ks_init(fd); \
		if (s->f == NULL) { /* stream allocation failed: undo and report */ \
			free(s); \
			return NULL; \
		} \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}

/*
 * Defines kseq_read() with storage class SCOPE. It reads the next FASTA or
 * FASTQ record into seq->name, seq->comment, seq->seq and seq->qual.
 *
 * Return value:
 *   >=0  length of the sequence (normal)
 *   -1   end-of-file
 *   -2   truncated quality string
 *
 * NOTE: the malloc()/realloc() results below are not checked; the return
 * codes have no out-of-memory value.
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, KS_SEP_SPACE, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}

/*
 * Declares kseq_t, one parsed record plus parser state.
 *   name, comment, seq, qual : fields of the most recently read record
 *   last_char                : header character ('>' or '@') already consumed
 *                              while reading the previous record, or 0
 *   f                        : the underlying buffered stream
 */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name, comment, seq, qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;

/*
 * Instantiates the stream API (always `static`, 16384-byte buffer) and the
 * FASTA/Q parser (with storage class SCOPE) for one handle type.
 */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT(type_t, __read, 16384) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(SCOPE, type_t) \
	__KSEQ_READ(SCOPE)

/* KSEQ_INIT2 with `static` linkage. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)

/* Declares the types and parser entry points defined in another translation unit. */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	void kseq_destroy(kseq_t *ks); \
	int kseq_read(kseq_t *seq);

#endif