1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef FILEBUF_H_
21 #define FILEBUF_H_
22
23 #include <iostream>
24 #include <fstream>
25 #include <string>
26 #include <stdio.h>
27 #include <string.h>
28 #include <stdint.h>
29 #include <stdexcept>
30 #include "assert_helpers.h"
31 #include <errno.h>
32 #include <stdlib.h>
33 #include <zlib.h>
34 #ifdef WITH_ZSTD
35 #include "zstd_decompress.h"
36 #endif
37
/**
 * Tell whether 'c' is one of the two line-terminator characters,
 * carriage return or line feed.  Any other value (including -1) yields
 * false.
 */
static inline bool isnewline(int c) {
    switch(c) {
        case '\n':
        case '\r':
            return true;
        default:
            return false;
    }
}
44
45 /**
46 * Simple, fast helper for determining if a character is a non-newline
47 * whitespace character.
48 */
isspace_notnl(int c)49 static inline bool isspace_notnl(int c) {
50 return isspace(c) && !isnewline(c);
51 }
52
53 /**
54 * Simple wrapper for a FILE*, istream or ifstream that reads it in chunks
55 * using fread and keeps those chunks in a buffer. It also services calls to
56 * get(), peek() and gets() from the buffer, reading in additional chunks when
57 * necessary.
58 *
59 * Helper functions do things like parse strings, numbers, and FASTA records.
60 *
61 *
62 */
63 class FileBuf {
64 public:
FileBuf()65 FileBuf() {
66 init();
67 }
68
FileBuf(FILE * in)69 FileBuf(FILE *in) {
70 init();
71 _in = in;
72 assert(_in != NULL);
73 }
74
FileBuf(gzFile in)75 FileBuf(gzFile in) {
76 init();
77 _zIn = in;
78 assert(_zIn != NULL);
79 }
80 #ifdef WITH_ZSTD
FileBuf(zstdStrm * zstdIn)81 FileBuf(zstdStrm *zstdIn) {
82 init();
83 _zstdIn = zstdIn;
84 assert(_zstdIn != NULL);
85 }
86 #endif
87
FileBuf(std::ifstream * inf)88 FileBuf(std::ifstream *inf) {
89 init();
90 _inf = inf;
91 assert(_inf != NULL);
92 }
93
FileBuf(std::istream * ins)94 FileBuf(std::istream *ins) {
95 init();
96 _ins = ins;
97 assert(_ins != NULL);
98 }
99
100
~FileBuf()101 ~FileBuf() {
102 close();
103 }
104
105 /**
106 * Return true iff there is a stream ready to read.
107 */
isOpen()108 bool isOpen() {
109 return _in != NULL || _inf != NULL || _ins != NULL;
110 }
111
112 /**
113 * Close the input stream (if that's possible)
114 */
close()115 void close() {
116 if(_in != NULL && _in != stdin) {
117 fclose(_in);
118 } else if(_inf != NULL) {
119 _inf->close();
120 } else if(_zIn != NULL) {
121 gzclose(_zIn);
122 #ifdef WITH_ZSTD
123 } else if(_zstdIn != NULL) {
124 zstdClose(_zstdIn);
125 #endif
126 } else {
127 // can't close _ins
128 }
129 }
130
131 /**
132 * Get the next character of input and advance.
133 */
get()134 int get() {
135 assert(_in != NULL || _zIn != NULL || _inf != NULL || _ins != NULL);
136 #ifdef WITH_ZSTD
137 assert(zstdIn != NULL)
138 #endif
139 int c = peek();
140 if(c != -1) {
141 _cur++;
142 if(_lastn_cur < LASTN_BUF_SZ) _lastn_buf[_lastn_cur++] = c;
143 }
144 return c;
145 }
146
147 /**
148 * Return true iff all input is exhausted.
149 */
eof()150 bool eof() {
151 return (_cur == _buf_sz) && _done;
152 }
153
154 /**
155 * Initialize the buffer with a new C-style file.
156 */
newFile(FILE * in)157 void newFile(FILE *in) {
158 _in = in;
159 _zIn = NULL;
160 _inf = NULL;
161 _ins = NULL;
162 #ifdef WITH_ZSTD
163 _zstdIn = NULL;
164 #endif
165 _cur = BUF_SZ;
166 _buf_sz = BUF_SZ;
167 _done = false;
168 }
169
170 /**
171 * Initialize the buffer with a new gz file.
172 */
newFile(gzFile in)173 void newFile(gzFile in) {
174 _in = NULL;
175 _zIn = in;
176 _inf = NULL;
177 _ins = NULL;
178 #ifdef WITH_ZSTD
179 _zstdIn = NULL;
180 #endif
181 _cur = BUF_SZ;
182 _buf_sz = BUF_SZ;
183 _done = false;
184 }
185
186 #ifdef WITH_ZSTD
187 /**
188 * Initialize the buffer with a new ZSTD file.
189 */
newFile(zstdStrm * s)190 void newFile(zstdStrm *s) {
191 _in = NULL;
192 _zIn = NULL;
193 _inf = NULL;
194 _ins = NULL;
195 _zstdIn = s;
196 _cur = BUF_SZ;
197 _buf_sz = BUF_SZ;
198 _done = false;
199 }
200 #endif
201 /**
202 * Initialize the buffer with a new ifstream.
203 */
newFile(std::ifstream * __inf)204 void newFile(std::ifstream *__inf) {
205 _in = NULL;
206 _zIn = NULL;
207 _inf = __inf;
208 _ins = NULL;
209 #ifdef WITH_ZSTD
210 _zstdIn = NULL;
211 #endif
212 _cur = BUF_SZ;
213 _buf_sz = BUF_SZ;
214 _done = false;
215 }
216
217 /**
218 * Initialize the buffer with a new istream.
219 */
newFile(std::istream * __ins)220 void newFile(std::istream *__ins) {
221 _in = NULL;
222 _zIn = NULL;
223 _inf = NULL;
224 _ins = __ins;
225 #ifdef WITH_ZSTD
226 _zstdIn = NULL;
227 #endif
228 _cur = BUF_SZ;
229 _buf_sz = BUF_SZ;
230 _done = false;
231 }
232
233 /**
234 * Restore state as though we just started reading the input
235 * stream.
236 */
reset()237 void reset() {
238 if(_inf != NULL) {
239 _inf->clear();
240 _inf->seekg(0, std::ios::beg);
241 } else if(_ins != NULL) {
242 _ins->clear();
243 _ins->seekg(0, std::ios::beg);
244 } else if (_zIn != NULL) {
245 gzrewind(_zIn);
246 #ifdef WITH_ZSTD
247 } else if (_zstdIn) {
248 zstdRewind(_zstdIn);
249 #endif
250 } else {
251 rewind(_in);
252 }
253 _cur = BUF_SZ;
254 _buf_sz = BUF_SZ;
255 _done = false;
256 }
257
258 /**
259 * Peek at the next character of the input stream without
260 * advancing. Typically we can simple read it from the buffer.
261 * Occasionally we'll need to read in a new buffer's worth of data.
262 */
peek()263 int peek() {
264 assert(_in != NULL || _zIn != NULL || _inf != NULL || _ins != NULL);
265 #ifdef WITH_ZSTD
266 assert(zstdIn != NULL);
267 #endif
268 assert_leq(_cur, _buf_sz);
269 if(_cur == _buf_sz) {
270 if(_done) {
271 // We already exhausted the input stream
272 return -1;
273 }
274 // Read a new buffer's worth of data
275 else {
276 // Get the next chunk
277 if(_inf != NULL) {
278 _inf->read((char*)_buf, BUF_SZ);
279 _buf_sz = _inf->gcount();
280 } else if(_zIn != NULL) {
281 _buf_sz = gzread(_zIn, (void *)_buf, BUF_SZ);
282 } else if(_ins != NULL) {
283 _ins->read((char*)_buf, BUF_SZ);
284 _buf_sz = _ins->gcount();
285 #ifdef WITH_ZSTD
286 } else if (_zstdIn != NULL) {
287 _buf_sz = zstdRead(_zstdIn, (void *)_buf, BUF_SZ);
288 #endif
289 } else {
290 assert(_in != NULL);
291 // TODO: consider an _unlocked function
292 _buf_sz = fread(_buf, 1, BUF_SZ, _in);
293 }
294 _cur = 0;
295 if(_buf_sz == 0) {
296 // Exhausted, and we have nothing to return to the
297 // caller
298 _done = true;
299 return -1;
300 } else if(_buf_sz < BUF_SZ) {
301 // Exhausted
302 _done = true;
303 }
304 }
305 }
306 return (int)_buf[_cur];
307 }
308
309 /**
310 * Store a string of characters from the input file into 'buf',
311 * until we see a newline, EOF, or until 'len' characters have been
312 * read.
313 */
gets(char * buf,size_t len)314 size_t gets(char *buf, size_t len) {
315 size_t stored = 0;
316 while(true) {
317 int c = get();
318 if(c == -1) {
319 // End-of-file
320 buf[stored] = '\0';
321 return stored;
322 }
323 if(stored == len-1 || isnewline(c)) {
324 // End of string
325 buf[stored] = '\0';
326 // Skip over all end-of-line characters
327 int pc = peek();
328 while(isnewline(pc)) {
329 get(); // discard
330 pc = peek();
331 }
332 // Next get() will be after all newline characters
333 return stored;
334 }
335 buf[stored++] = (char)c;
336 }
337 }
338
339 /**
340 * Store a string of characters from the input file into 'buf',
341 * until we see a newline, EOF, or until 'len' characters have been
342 * read.
343 */
get(char * buf,size_t len)344 size_t get(char *buf, size_t len) {
345 size_t stored = 0;
346 for(size_t i = 0; i < len; i++) {
347 int c = get();
348 if(c == -1) return i;
349 buf[stored++] = (char)c;
350 }
351 return len;
352 }
353
354 static const size_t LASTN_BUF_SZ = 8 * 1024;
355
356 /**
357 * Keep get()ing characters until a non-whitespace character (or
358 * -1) is reached, and return it.
359 */
getPastWhitespace()360 int getPastWhitespace() {
361 int c;
362 while(isspace(c = get()) && c != -1);
363 return c;
364 }
365
366 /**
367 * Keep get()ing characters until a we've passed over the next
368 * string of newline characters (\r's and \n's) or -1 is reached,
369 * and return it.
370 */
getPastNewline()371 int getPastNewline() {
372 int c = get();
373 while(!isnewline(c) && c != -1) c = get();
374 while(isnewline(c)) c = get();
375 assert_neq(c, '\r');
376 assert_neq(c, '\n');
377 return c;
378 }
379
380 /**
381 * Keep get()ing characters until a we've passed over the next
382 * string of newline characters (\r's and \n's) or -1 is reached,
383 * and return it.
384 */
peekPastNewline()385 int peekPastNewline() {
386 int c = peek();
387 while(!isnewline(c) && c != -1) c = get();
388 while(isnewline(c)) c = get();
389 assert_neq(c, '\r');
390 assert_neq(c, '\n');
391 return c;
392 }
393
394 /**
395 * Keep peek()ing then get()ing characters until the next return
396 * from peek() is just after the last newline of the line.
397 */
peekUptoNewline()398 int peekUptoNewline() {
399 int c = peek();
400 while(!isnewline(c) && c != -1) {
401 get(); c = peek();
402 }
403 while(isnewline(c)) {
404 get();
405 c = peek();
406 }
407 assert_neq(c, '\r');
408 assert_neq(c, '\n');
409 return c;
410 }
411
412 /**
413 * Parse a FASTA record. Append name characters to 'name' and and append
414 * all sequence characters to 'seq'. If gotCaret is true, assuming the
415 * file cursor has already moved just past the starting '>' character.
416 */
417 template <typename TNameStr, typename TSeqStr>
418 void parseFastaRecord(
419 TNameStr& name,
420 TSeqStr& seq,
421 bool gotCaret = false)
422 {
423 int c;
424 if(!gotCaret) {
425 // Skip over caret and non-newline whitespace
426 c = peek();
427 while(isspace_notnl(c) || c == '>') { get(); c = peek(); }
428 } else {
429 // Skip over non-newline whitespace
430 c = peek();
431 while(isspace_notnl(c)) { get(); c = peek(); }
432 }
433 size_t namecur = 0, seqcur = 0;
434 // c is the first character of the fasta name record, or is the first
435 // newline character if the name record is empty
436 while(!isnewline(c) && c != -1) {
437 name[namecur++] = c; get(); c = peek();
438 }
439 // sequence consists of all the non-whitespace characters between here
440 // and the next caret
441 while(true) {
442 // skip over whitespace
443 while(isspace(c)) { get(); c = peek(); }
444 // if we see caret or EOF, break
445 if(c == '>' || c == -1) break;
446 // append and continue
447 seq[seqcur++] = c;
448 get(); c = peek();
449 }
450 }
451
452 /**
453 * Parse a FASTA record and return its length. If gotCaret is true,
454 * assuming the file cursor has already moved just past the starting '>'
455 * character.
456 */
457 void parseFastaRecordLength(
458 size_t& nameLen,
459 size_t& seqLen,
460 bool gotCaret = false)
461 {
462 int c;
463 nameLen = seqLen = 0;
464 if(!gotCaret) {
465 // Skip over caret and non-newline whitespace
466 c = peek();
467 while(isspace_notnl(c) || c == '>') { get(); c = peek(); }
468 } else {
469 // Skip over non-newline whitespace
470 c = peek();
471 while(isspace_notnl(c)) { get(); c = peek(); }
472 }
473 // c is the first character of the fasta name record, or is the first
474 // newline character if the name record is empty
475 while(!isnewline(c) && c != -1) {
476 nameLen++; get(); c = peek();
477 }
478 // sequence consists of all the non-whitespace characters between here
479 // and the next caret
480 while(true) {
481 // skip over whitespace
482 while(isspace(c)) { get(); c = peek(); }
483 // if we see caret or EOF, break
484 if(c == '>' || c == -1) break;
485 // append and continue
486 seqLen++;
487 get(); c = peek();
488 }
489 }
490
491 /**
492 * Reset to the beginning of the last-N-chars buffer.
493 */
resetLastN()494 void resetLastN() {
495 _lastn_cur = 0;
496 }
497
498 /**
499 * Copy the last several characters in the last-N-chars buffer
500 * (since the last reset) into the provided buffer.
501 */
copyLastN(char * buf)502 size_t copyLastN(char *buf) {
503 memcpy(buf, _lastn_buf, _lastn_cur);
504 return _lastn_cur;
505 }
506
507 /**
508 * Get const pointer to the last-N-chars buffer.
509 */
lastN()510 const char *lastN() const {
511 return _lastn_buf;
512 }
513
514 /**
515 * Get current size of the last-N-chars buffer.
516 */
lastNLen()517 size_t lastNLen() const {
518 return _lastn_cur;
519 }
520
521 private:
522
init()523 void init() {
524 _in = NULL;
525 _zIn = NULL;
526 _inf = NULL;
527 _ins = NULL;
528 #ifdef WITH_ZSTD
529 _zstdIn = NULL;
530 #endif
531 _cur = _buf_sz = BUF_SZ;
532 _done = false;
533 _lastn_cur = 0;
534 // no need to clear _buf[]
535 }
536
537 static const size_t BUF_SZ = 256 * 1024;
538 FILE *_in;
539 gzFile _zIn;
540 #ifdef WITH_ZSTD
541 zstdStrm *_zstdIn;
542 #endif
543 std::ifstream *_inf;
544 std::istream *_ins;
545 size_t _cur;
546 size_t _buf_sz;
547 bool _done;
548 uint8_t _buf[BUF_SZ]; // (large) input buffer
549 size_t _lastn_cur;
550 char _lastn_buf[LASTN_BUF_SZ]; // buffer of the last N chars dispensed
551 };
552
553 /**
554 * Wrapper for a buffered output stream that writes bitpairs.
555 */
556 class BitpairOutFileBuf {
557 public:
558 /**
559 * Open a new output stream to a file with given name.
560 */
BitpairOutFileBuf(const char * in)561 BitpairOutFileBuf(const char *in) : bpPtr_(0), cur_(0) {
562 assert(in != NULL);
563 out_ = fopen(in, "wb");
564 if(out_ == NULL) {
565 std::cerr << "Error: Could not open bitpair-output file " << in << std::endl;
566 throw 1;
567 }
568 memset(buf_, 0, BUF_SZ);
569 }
570
571 /**
572 * Write a single bitpair into the buf. Flush the buffer if it's
573 * full.
574 */
write(int bp)575 void write(int bp) {
576 assert_lt(bp, 4);
577 assert_geq(bp, 0);
578 buf_[cur_] |= (bp << bpPtr_);
579 if(bpPtr_ == 6) {
580 bpPtr_ = 0;
581 cur_++;
582 if(cur_ == BUF_SZ) {
583 // Flush the buffer
584 if(!fwrite((const void *)buf_, BUF_SZ, 1, out_)) {
585 std::cerr << "Error writing to the reference index file (.4.ebwt)" << std::endl;
586 throw 1;
587 }
588 // Reset to beginning of the buffer
589 cur_ = 0;
590 }
591 // Initialize next octet to 0
592 buf_[cur_] = 0;
593 } else {
594 bpPtr_ += 2;
595 }
596 }
597
598 /**
599 * Write any remaining bitpairs and then close the input
600 */
close()601 void close() {
602 if(cur_ > 0 || bpPtr_ > 0) {
603 if(bpPtr_ == 0) cur_--;
604 if(!fwrite((const void *)buf_, cur_ + 1, 1, out_)) {
605 std::cerr << "Error writing to the reference index file (.4.ebwt)" << std::endl;
606 throw 1;
607 }
608 }
609 fclose(out_);
610 }
611 private:
612 static const size_t BUF_SZ = 128 * 1024;
613 FILE *out_;
614 int bpPtr_;
615 size_t cur_;
616 char buf_[BUF_SZ]; // (large) input buffer
617 };
618
619 /**
620 * Wrapper for a buffered output stream that writes characters and
621 * other data types. This class is *not* synchronized; the caller is
622 * responsible for synchronization.
623 */
624 class OutFileBuf {
625
626 public:
627
628 /**
629 * Open a new output stream to a file with given name.
630 */
631 OutFileBuf(const std::string& out, bool binary = false) :
632 name_(out.c_str()), cur_(0), closed_(false)
633 {
634 out_ = fopen(out.c_str(), binary ? "wb" : "w");
635 if(out_ == NULL) {
636 std::cerr << "Error: Could not open alignment output file " << out.c_str() << std::endl;
637 throw 1;
638 }
639 if(setvbuf(out_, NULL, _IOFBF, 10* 1024* 1024))
640 std::cerr << "Warning: Could not allocate the proper buffer size for output file stream. " << std::endl;
641 }
642
643 /**
644 * Open a new output stream to a file with given name.
645 */
646 OutFileBuf(const char *out, bool binary = false) :
name_(out)647 name_(out), cur_(0), closed_(false)
648 {
649 assert(out != NULL);
650 out_ = fopen(out, binary ? "wb" : "w");
651 if(out_ == NULL) {
652 std::cerr << "Error: Could not open alignment output file " << out << std::endl;
653 throw 1;
654 }
655 }
656
657 /**
658 * Open a new output stream to standard out.
659 */
OutFileBuf()660 OutFileBuf() : name_("cout"), cur_(0), closed_(false) {
661 out_ = stdout;
662 }
663
664 /**
665 * Close buffer when object is destroyed.
666 */
~OutFileBuf()667 ~OutFileBuf() { close(); }
668
669 /**
670 * Open a new output stream to a file with given name.
671 */
672 void setFile(const char *out, bool binary = false) {
673 assert(out != NULL);
674 out_ = fopen(out, binary ? "wb" : "w");
675 if(out_ == NULL) {
676 std::cerr << "Error: Could not open alignment output file " << out << std::endl;
677 throw 1;
678 }
679 reset();
680 }
681
682 /**
683 * Write a single character into the write buffer and, if
684 * necessary, flush.
685 */
write(char c)686 void write(char c) {
687 assert(!closed_);
688 if(cur_ == BUF_SZ) flush();
689 buf_[cur_++] = c;
690 }
691
692 /**
693 * Write a c++ string to the write buffer and, if necessary, flush.
694 */
writeString(const std::string & s)695 void writeString(const std::string& s) {
696 assert(!closed_);
697 size_t slen = s.length();
698 if(cur_ + slen > BUF_SZ) {
699 if(cur_ > 0) flush();
700 if(slen >= BUF_SZ) {
701 if (slen != fwrite(s.c_str(), 1, slen, out_)) {
702 std::cerr << "Error: outputting data" << std::endl;
703 throw 1;
704 }
705 } else {
706 memcpy(&buf_[cur_], s.data(), slen);
707 assert_eq(0, cur_);
708 cur_ = slen;
709 }
710 } else {
711 memcpy(&buf_[cur_], s.data(), slen);
712 cur_ += slen;
713 }
714 assert_leq(cur_, BUF_SZ);
715 }
716
717 /**
718 * Write a c++ string to the write buffer and, if necessary, flush.
719 */
720 template<typename T>
writeString(const T & s)721 void writeString(const T& s) {
722 assert(!closed_);
723 size_t slen = s.length();
724 if(cur_ + slen > BUF_SZ) {
725 if(cur_ > 0) flush();
726 if(slen >= BUF_SZ) {
727 if (slen != fwrite(s.toZBuf(), 1, slen, out_)) {
728 std::cerr << "Error outputting data" << std::endl;
729 throw 1;
730 }
731 } else {
732 memcpy(&buf_[cur_], s.toZBuf(), slen);
733 assert_eq(0, cur_);
734 cur_ = slen;
735 }
736 } else {
737 memcpy(&buf_[cur_], s.toZBuf(), slen);
738 cur_ += slen;
739 }
740 assert_leq(cur_, BUF_SZ);
741 }
742
743 /**
744 * Write a c++ string to the write buffer and, if necessary, flush.
745 */
writeChars(const char * s,size_t len)746 void writeChars(const char * s, size_t len) {
747 assert(!closed_);
748 if(cur_ + len > BUF_SZ) {
749 if(cur_ > 0) flush();
750 if(len >= BUF_SZ) {
751 if (fwrite(s, len, 1, out_) != 1) {
752 std::cerr << "Error outputting data" << std::endl;
753 throw 1;
754 }
755 } else {
756 memcpy(&buf_[cur_], s, len);
757 assert_eq(0, cur_);
758 cur_ = len;
759 }
760 } else {
761 memcpy(&buf_[cur_], s, len);
762 cur_ += len;
763 }
764 assert_leq(cur_, BUF_SZ);
765 }
766
767 /**
768 * Write a 0-terminated C string to the output stream.
769 */
writeChars(const char * s)770 void writeChars(const char * s) {
771 writeChars(s, strlen(s));
772 }
773
774 /**
775 * Write any remaining bitpairs and then close the input
776 */
close()777 void close() {
778 if(closed_) return;
779 if(cur_ > 0) flush();
780 closed_ = true;
781 if(out_ != stdout) {
782 fclose(out_);
783 }
784 }
785
786 /**
787 * Reset so that the next write is as though it's the first.
788 */
reset()789 void reset() {
790 cur_ = 0;
791 closed_ = false;
792 }
793
flush()794 void flush() {
795 if(cur_ != fwrite((const void *)buf_, 1, cur_, out_)) {
796 if (errno == EPIPE) {
797 exit(EXIT_SUCCESS);
798 }
799 std::cerr << "Error while flushing and closing output" << std::endl;
800 throw 1;
801 }
802 cur_ = 0;
803 }
804
805 /**
806 * Return true iff this stream is closed.
807 */
closed()808 bool closed() const {
809 return closed_;
810 }
811
812 /**
813 * Return the filename.
814 */
name()815 const char *name() {
816 return name_;
817 }
818
819 private:
820
821 static const size_t BUF_SZ = 16 * 1024;
822
823 const char *name_;
824 FILE *out_;
825 size_t cur_;
826 char buf_[BUF_SZ]; // (large) input buffer
827 bool closed_;
828 };
829
830 #endif /*ndef FILEBUF_H_*/
831