1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include <cmath>
21 #include <stdio.h>
22 #include <iostream>
23 #include <sstream>
24 #include <string>
25 #include <stdexcept>
26 #include <string.h>
27 #include <fcntl.h>
28 #include "sstring.h"
29
30 #include "pat.h"
31 #include "filebuf.h"
32 #include "formats.h"
33 #include "util.h"
34 #include "str_util.h"
35 #include "tokenize.h"
36 #include "endian_swap.h"
37
38 using namespace std;
39
40 /**
41 * Calculate a per-read random seed based on a combination of
42 * the read data (incl. sequence, name, quals) and the global
43 * seed in '_randSeed'.
44 */
genRandSeed(const BTDnaString & qry,const BTString & qual,const BTString & name,uint32_t seed)45 static uint32_t genRandSeed(
46 const BTDnaString& qry,
47 const BTString& qual,
48 const BTString& name,
49 uint32_t seed)
50 {
51 // Calculate a per-read random seed based on a combination of
52 // the read data (incl. sequence, name, quals) and the global
53 // seed
54 uint32_t rseed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83;
55 size_t qlen = qry.length();
56 // Throw all the characters of the read into the random seed
57 for(size_t i = 0; i < qlen; i++) {
58 int p = (int)qry[i];
59 assert_leq(p, 4);
60 size_t off = ((i & 15) << 1);
61 rseed ^= ((uint32_t)p << off);
62 }
63 // Throw all the quality values for the read into the random
64 // seed
65 for(size_t i = 0; i < qlen; i++) {
66 int p = (int)qual[i];
67 assert_leq(p, 255);
68 size_t off = ((i & 3) << 3);
69 rseed ^= (p << off);
70 }
71 // Throw all the characters in the read name into the random
72 // seed
73 size_t namelen = name.length();
74 for(size_t i = 0; i < namelen; i++) {
75 int p = (int)name[i];
76 if(p == '/') break;
77 assert_leq(p, 255);
78 size_t off = ((i & 3) << 3);
79 rseed ^= (p << off);
80 }
81 return rseed;
82 }
83
84 /**
85 * Return a new dynamically allocated PatternSource for the given
86 * format, using the given list of strings as the filenames to read
87 * from or as the sequences themselves (i.e. if -c was used).
88 */
patsrcFromStrings(const PatternParams & p,const EList<string> & qs)89 PatternSource* PatternSource::patsrcFromStrings(
90 const PatternParams& p,
91 const EList<string>& qs)
92 {
93 switch(p.format) {
94 case FASTA: return new FastaPatternSource(qs, p, p.interleaved);
95 case FASTA_CONT: return new FastaContinuousPatternSource(qs, p);
96 case RAW: return new RawPatternSource(qs, p);
97 case FASTQ: return new FastqPatternSource(qs, p, p.interleaved);
98 case BAM: return new BAMPatternSource(qs, p);
99 case TAB_MATE5: return new TabbedPatternSource(qs, p, false);
100 case TAB_MATE6: return new TabbedPatternSource(qs, p, true);
101 case CMDLINE: return new VectorPatternSource(qs, p);
102 case QSEQ: return new QseqPatternSource(qs, p);
103 #ifdef USE_SRA
104 case SRA_FASTA:
105 case SRA_FASTQ: return new SRAPatternSource(qs, p);
106 #endif
107 default: {
108 cerr << "Internal error; bad patsrc format: " << p.format << endl;
109 throw 1;
110 }
111 }
112 }
113
114 /**
115 * Once name/sequence/qualities have been parsed for an
116 * unpaired read, set all the other key fields of the Read
117 * struct.
118 */
finalize(Read & ra)119 void PatternSourcePerThread::finalize(Read& ra) {
120 ra.mate = 1;
121 ra.rdid = buf_.rdid();
122 ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, pp_.seed);
123 ra.finalize();
124 if(pp_.fixName) {
125 ra.fixMateName(1);
126 }
127 }
128
129 /**
130 * Once name/sequence/qualities have been parsed for a
131 * paired-end read, set all the other key fields of the Read
132 * structs.
133 */
finalizePair(Read & ra,Read & rb)134 void PatternSourcePerThread::finalizePair(Read& ra, Read& rb) {
135 ra.mate = 1;
136 rb.mate = 2;
137 ra.rdid = rb.rdid = buf_.rdid();
138 ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, pp_.seed);
139 rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, pp_.seed);
140 ra.finalize();
141 rb.finalize();
142 if(pp_.fixName) {
143 ra.fixMateName(1);
144 rb.fixMateName(2);
145 }
146 }
147
148 /**
149 * Get the next paired or unpaired read from the wrapped
150 * PatternComposer. Returns a pair of bools; first indicates
151 * whether we were successful, second indicates whether we're
152 * done.
153 */
nextReadPair()154 pair<bool, bool> PatternSourcePerThread::nextReadPair() {
155 // Prepare batch
156 if(buf_.exhausted()) {
157 pair<bool, int> res = nextBatch(); // more parsing needed!
158 if(res.first && res.second == 0) {
159 return make_pair(false, true);
160 }
161 last_batch_ = res.first;
162 last_batch_size_ = res.second;
163 assert_eq(0, buf_.cur_buf_);
164 } else {
165 buf_.next(); // advance cursor; no parsing or locking needed
166 assert_gt(buf_.cur_buf_, 0);
167 }
168 // Now fully parse read/pair *outside* the critical section
169 assert(!buf_.read_a().readOrigBuf.empty());
170 assert(buf_.read_a().empty());
171 if(!parse(buf_.read_a(), buf_.read_b())) {
172 return make_pair(false, false);
173 }
174 // Finalize read/pair
175 if(!buf_.read_b().patFw.empty()) {
176 trim(buf_.read_a());
177 trim(buf_.read_b());
178 finalizePair(buf_.read_a(), buf_.read_b());
179 } else {
180 trim(buf_.read_a());
181 finalize(buf_.read_a());
182 }
183 bool this_is_last = buf_.cur_buf_ == static_cast<unsigned int>(last_batch_size_-1);
184 return make_pair(true, this_is_last ? last_batch_ : false);
185 }
186
187 /**
188 * The main member function for dispensing pairs of reads or
189 * singleton reads. Returns true iff ra and rb contain a new
190 * pair; returns false if ra contains a new unpaired read.
191 */
nextBatch(PerThreadReadBuf & pt)192 pair<bool, int> SoloPatternComposer::nextBatch(PerThreadReadBuf& pt) {
193 size_t cur = cur_;
194 while(cur < src_->size()) {
195 // Patterns from srca_[cur_] are unpaired
196 pair<bool, int> res;
197 do {
198 res = (*src_)[cur]->nextBatch(
199 pt,
200 true, // batch A (or pairs)
201 lock_); // grab lock below
202 } while(!res.first && res.second == 0);
203 if(res.second == 0) {
204 ThreadSafe ts(mutex_m);
205 if(cur + 1 > cur_) {
206 cur_++;
207 }
208 cur = cur_;
209 continue; // on to next pair of PatternSources
210 }
211 return res;
212 }
213 assert_leq(cur, src_->size());
214 return make_pair(true, 0);
215 }
216
217 /**
218 * The main member function for dispensing pairs of reads or
219 * singleton reads. Returns true iff ra and rb contain a new
220 * pair; returns false if ra contains a new unpaired read.
221 */
nextBatch(PerThreadReadBuf & pt)222 pair<bool, int> DualPatternComposer::nextBatch(PerThreadReadBuf& pt) {
223 // 'cur' indexes the current pair of PatternSources
224 size_t cur = cur_;
225 while(cur < srca_->size()) {
226 if((*srcb_)[cur] == NULL) {
227 // Patterns from srca_ are unpaired
228 pair<bool, int> res = (*srca_)[cur]->nextBatch(
229 pt,
230 true, // batch A (or pairs)
231 lock_); // grab lock below
232 if(res.second == 0 && cur < srca_->size() - 1) {
233 ThreadSafe ts(mutex_m);
234 if(cur + 1 > cur_) cur_++;
235 cur = cur_; // Move on to next PatternSource
236 continue; // on to next pair of PatternSources
237 }
238 return make_pair(res.first && cur == srca_->size() - 1, res.second);
239 } else {
240 pair<bool, int> resa, resb;
241 // Lock to ensure that this thread gets parallel reads
242 // in the two mate files
243 {
244 ThreadSafe ts(mutex_m);
245 resa = (*srca_)[cur]->nextBatch(
246 pt,
247 true, // batch A
248 false); // don't grab lock below
249 resb = (*srcb_)[cur]->nextBatch(
250 pt,
251 false, // batch B
252 false); // don't grab lock below
253 assert_eq((*srca_)[cur]->readCount(),
254 (*srcb_)[cur]->readCount());
255 }
256 if(resa.second < resb.second) {
257 cerr << "Error, fewer reads in file specified with -1 "
258 << "than in file specified with -2" << endl;
259 throw 1;
260 } else if(resa.second == 0 && resb.second == 0) {
261 ThreadSafe ts(mutex_m);
262 if(cur + 1 > cur_) {
263 cur_++;
264 }
265 cur = cur_; // Move on to next PatternSource
266 continue; // on to next pair of PatternSources
267 } else if(resb.second < resa.second) {
268 cerr << "Error, fewer reads in file specified with -2 "
269 << "than in file specified with -1" << endl;
270 throw 1;
271 }
272 assert_eq(resa.first, resb.first);
273 assert_eq(resa.second, resb.second);
274 return make_pair(resa.first && cur == srca_->size() - 1, resa.second);
275 }
276 }
277 assert_leq(cur, srca_->size());
278 return make_pair(true, 0);
279 }
280
281 /**
282 * Given the values for all of the various arguments used to specify
283 * the read and quality input, create a list of pattern sources to
284 * dispense them.
285 */
setupPatternComposer(const EList<string> & si,const EList<string> & m1,const EList<string> & m2,const EList<string> & m12,const EList<string> & q,const EList<string> & q1,const EList<string> & q2,const EList<string> & sra_accs,PatternParams & p,bool verbose)286 PatternComposer* PatternComposer::setupPatternComposer(
287 const EList<string>& si, // singles, from argv
288 const EList<string>& m1, // mate1's, from -1 arg
289 const EList<string>& m2, // mate2's, from -2 arg
290 const EList<string>& m12, // both mates on each line, from --12 arg
291 const EList<string>& q, // qualities associated with singles
292 const EList<string>& q1, // qualities associated with m1
293 const EList<string>& q2, // qualities associated with m2
294 #ifdef USE_SRA
295 const EList<string>& sra_accs, // SRA accessions
296 #endif
297 PatternParams& p, // read-in parameters
298 bool verbose) // be talkative?
299 {
300 EList<PatternSource*>* a = new EList<PatternSource*>();
301 EList<PatternSource*>* b = new EList<PatternSource*>();
302 // Create list of pattern sources for paired reads appearing
303 // interleaved in a single file
304 for(size_t i = 0; i < m12.size(); i++) {
305 const EList<string>* qs = &m12;
306 EList<string> tmp;
307 if(p.fileParallel) {
308 // Feed query files one to each PatternSource
309 qs = &tmp;
310 tmp.push_back(m12[i]);
311 assert_eq(1, tmp.size());
312 }
313 a->push_back(PatternSource::patsrcFromStrings(p, *qs));
314 b->push_back(NULL);
315 p.interleaved = false;
316 if(!p.fileParallel) {
317 break;
318 }
319 }
320
321 #ifdef USE_SRA
322 for(size_t i = 0; i < sra_accs.size(); i++) {
323 const EList<string>* qs = &sra_accs;
324 EList<string> tmp;
325 if(p.fileParallel) {
326 // Feed query files one to each PatternSource
327 qs = &tmp;
328 tmp.push_back(sra_accs[i]);
329 assert_eq(1, tmp.size());
330 }
331 a->push_back(PatternSource::patsrcFromStrings(p, *qs));
332 b->push_back(NULL);
333 if(!p.fileParallel) {
334 break;
335 }
336 }
337 #endif
338
339 // Create list of pattern sources for paired reads
340 for(size_t i = 0; i < m1.size(); i++) {
341 const EList<string>* qs = &m1;
342 EList<string> tmpSeq;
343 EList<string> tmpQual;
344 if(p.fileParallel) {
345 // Feed query files one to each PatternSource
346 qs = &tmpSeq;
347 tmpSeq.push_back(m1[i]);
348 assert_eq(1, tmpSeq.size());
349 }
350 a->push_back(PatternSource::patsrcFromStrings(p, *qs));
351 if(!p.fileParallel) {
352 break;
353 }
354 }
355
356 // Create list of pattern sources for paired reads
357 for(size_t i = 0; i < m2.size(); i++) {
358 const EList<string>* qs = &m2;
359 EList<string> tmpSeq;
360 EList<string> tmpQual;
361 if(p.fileParallel) {
362 // Feed query files one to each PatternSource
363 qs = &tmpSeq;
364 tmpSeq.push_back(m2[i]);
365 assert_eq(1, tmpSeq.size());
366 }
367 b->push_back(PatternSource::patsrcFromStrings(p, *qs));
368 if(!p.fileParallel) {
369 break;
370 }
371 }
372 // All mates/mate files must be paired
373 assert_eq(a->size(), b->size());
374
375 // Create list of pattern sources for the unpaired reads
376 for(size_t i = 0; i < si.size(); i++) {
377 const EList<string>* qs = &si;
378 PatternSource* patsrc = NULL;
379 EList<string> tmpSeq;
380 EList<string> tmpQual;
381 if(p.fileParallel) {
382 // Feed query files one to each PatternSource
383 qs = &tmpSeq;
384 tmpSeq.push_back(si[i]);
385 assert_eq(1, tmpSeq.size());
386 }
387 patsrc = PatternSource::patsrcFromStrings(p, *qs);
388 assert(patsrc != NULL);
389 a->push_back(patsrc);
390 b->push_back(NULL);
391 if(!p.fileParallel) {
392 break;
393 }
394 }
395
396 PatternComposer *patsrc = NULL;
397 patsrc = new DualPatternComposer(a, b, p);
398 return patsrc;
399 }
400
free_EList_pmembers(const EList<PatternSource * > & elist)401 void PatternComposer::free_EList_pmembers( const EList<PatternSource*> &elist) {
402 for (size_t i = 0; i < elist.size(); i++)
403 if (elist[i] != NULL)
404 delete elist[i];
405 }
406
407 /**
408 * Fill Read with the sequence, quality and name for the next
409 * read in the list of read files. This function gets called by
410 * all the search threads, so we must handle synchronization.
411 *
412 * Returns pair<bool, int> where bool indicates whether we're
413 * completely done, and int indicates how many reads were read.
414 */
nextBatchImpl(PerThreadReadBuf & pt,bool batch_a)415 pair<bool, int> CFilePatternSource::nextBatchImpl(
416 PerThreadReadBuf& pt,
417 bool batch_a)
418 {
419 bool done = false;
420 unsigned nread = 0;
421 pt.setReadId(readCnt_);
422 while(true) { // loop that moves on to next file when needed
423 do {
424 pair<bool, int> ret = nextBatchFromFile(pt, batch_a, nread);
425 done = ret.first;
426 nread = ret.second;
427 } while(!done && nread == 0); // not sure why this would happen
428 if(done && filecur_ < infiles_.size()) { // finished with this file
429 open();
430 resetForNextFile(); // reset state to handle a fresh file
431 filecur_++;
432 if(nread == 0 || (nread < pt.max_buf_)) {
433 continue;
434 }
435 done = false;
436 }
437 break;
438 }
439 assert_geq(nread, 0);
440 readCnt_ += nread;
441 return make_pair(done, nread);
442 }
443
nextBatch(PerThreadReadBuf & pt,bool batch_a,bool lock)444 pair<bool, int> CFilePatternSource::nextBatch(
445 PerThreadReadBuf& pt,
446 bool batch_a,
447 bool lock)
448 {
449 if(lock) {
450 // synchronization at this level because both reading and manipulation of
451 // current file pointer have to be protected
452 ThreadSafe ts(mutex);
453 return nextBatchImpl(pt, batch_a);
454 } else {
455 return nextBatchImpl(pt, batch_a);
456 }
457 }
458
459 /**
460 * Open the next file in the list of input files.
461 */
open()462 void CFilePatternSource::open() {
463 if(is_open_) {
464 is_open_ = false;
465 switch (compressionType_) {
466 case CompressionType::GZIP:
467 gzclose(zfp_);
468 zfp_ = NULL;
469 break;
470 #ifdef WITH_ZSTD
471 case CompressionType::ZSTD:
472 zstdClose(zstdfp_);
473 zstdfp_ = NULL;
474 break;
475 #endif
476 case CompressionType::NONE:
477 fclose(fp_);
478 fp_ = NULL;
479 }
480 }
481 while(filecur_ < infiles_.size()) {
482 if(infiles_[filecur_] == "-") {
483 // always assume that data from stdin is compressed
484 compressionType_ = CompressionType::GZIP;
485 int fd = dup(fileno(stdin));
486 zfp_ = gzdopen(fd, "rb");
487
488 if (zfp_ == NULL) {
489 close(fd);
490 }
491 }
492 else {
493 const char* filename = infiles_[filecur_].c_str();
494
495 int fd = ::open(filename, O_RDONLY);
496 bool is_fifo = false;
497
498 #ifndef _WIN32
499 struct stat st;
500 if (fstat(fd, &st) != 0) {
501 perror("stat");
502 }
503
504 is_fifo = S_ISFIFO(st.st_mode) != 0;
505 #endif
506 #define CHECK_ERROR(exp) ((exp) == NULL) ? true : false
507
508 bool err = false;
509 if (pp_.format == BAM) {
510 err = CHECK_ERROR(fp_ = fdopen(fd, "rb"));
511 compressionType_ = CompressionType::NONE;
512 } else if (is_fifo) {
513 err = CHECK_ERROR(zfp_ = gzdopen(fd, "rb"));
514 compressionType_ = CompressionType::GZIP;
515 } else if (is_gzipped_file(fd)) {
516 err = CHECK_ERROR(zfp_ = gzdopen(fd, "rb"));
517 compressionType_ = CompressionType::GZIP;
518 #ifdef WITH_ZSTD
519 } else if (is_zstd_file(fd)) {
520 err = CHECK_ERROR(zstdfp_ = zstdFdOpen(fd));
521 compressionType_ = CompressionType::ZSTD;
522 #endif
523 } else {
524 err = CHECK_ERROR(fp_ = fdopen(fd, "r"));
525 compressionType_ = CompressionType::NONE;
526 }
527
528 if(err) {
529 if (fd != -1) {
530 close(fd);
531 }
532
533 if(!errs_[filecur_]) {
534 cerr << "Warning: Could not open read file \""
535 << filename
536 << "\" for reading; skipping..." << endl;
537 errs_[filecur_] = true;
538 }
539 filecur_++;
540 compressionType_ = CompressionType::NONE;
541 continue;
542 }
543 }
544 is_open_ = true;
545 if (compressionType_ == CompressionType::GZIP) {
546 #if ZLIB_VERNUM < 0x1235
547 cerr << "Warning: gzbuffer added in zlib v1.2.3.5. Unable to change "
548 "buffer size from default of 8192." << endl;
549 #else
550 gzbuffer(zfp_, 128*1024);
551 #endif
552 }
553 else if (compressionType_ == CompressionType::NONE) {
554 setvbuf(fp_, buf_, _IOFBF, 64*1024);
555 }
556 return;
557 }
558 cerr << "Error: No input read files were valid" << endl;
559 exit(1);
560 return;
561 }
562
563 /**
564 * Constructor for vector pattern source, used when the user has
565 * specified the input strings on the command line using the -c
566 * option.
567 */
VectorPatternSource(const EList<string> & seqs,const PatternParams & p)568 VectorPatternSource::VectorPatternSource(
569 const EList<string>& seqs,
570 const PatternParams& p) :
571 PatternSource(p),
572 cur_(p.skip),
573 skip_(p.skip),
574 paired_(false),
575 tokbuf_(),
576 bufs_()
577 {
578 // Install sequences in buffers, ready for immediate copying in
579 // nextBatch(). Formatting of the buffer is just like
580 // TabbedPatternSource.
581 const size_t seqslen = seqs.size();
582 for(size_t i = 0; i < seqslen; i++) {
583 tokbuf_.clear();
584 tokenize(seqs[i], ":", tokbuf_, 2);
585 assert_gt(tokbuf_.size(), 0);
586 assert_leq(tokbuf_.size(), 2);
587 // Get another buffer ready
588 bufs_.expand();
589 bufs_.back().clear();
590 // Install name
591 itoa10<TReadId>(static_cast<TReadId>(i), nametmp_);
592 bufs_.back().install(nametmp_);
593 bufs_.back().append('\t');
594 // Install sequence
595 bufs_.back().append(tokbuf_[0].c_str());
596 bufs_.back().append('\t');
597 // Install qualities
598 if(tokbuf_.size() > 1) {
599 bufs_.back().append(tokbuf_[1].c_str());
600 } else {
601 const size_t len = tokbuf_[0].length();
602 for(size_t i = 0; i < len; i++) {
603 bufs_.back().append('I');
604 }
605 }
606 }
607 }
608
609 /**
610 * Read next batch. However, batch concept is not very applicable for this
611 * PatternSource where all the info has already been parsed into the fields
612 * in the contsructor. This essentially modifies the pt as though we read
613 * in some number of patterns.
614 */
nextBatchImpl(PerThreadReadBuf & pt,bool batch_a)615 pair<bool, int> VectorPatternSource::nextBatchImpl(
616 PerThreadReadBuf& pt,
617 bool batch_a)
618 {
619 pt.setReadId(cur_);
620 EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
621 size_t readi = 0;
622 for(; readi < pt.max_buf_ && cur_ < bufs_.size(); readi++, cur_++) {
623 readbuf[readi].readOrigBuf = bufs_[cur_];
624 }
625 readCnt_ += readi;
626 return make_pair(cur_ == bufs_.size(), readi);
627 }
628
nextBatch(PerThreadReadBuf & pt,bool batch_a,bool lock)629 pair<bool, int> VectorPatternSource::nextBatch(
630 PerThreadReadBuf& pt,
631 bool batch_a,
632 bool lock)
633 {
634 if(lock) {
635 ThreadSafe ts(mutex);
636 return nextBatchImpl(pt, batch_a);
637 } else {
638 return nextBatchImpl(pt, batch_a);
639 }
640 }
641
642 /**
643 * Finishes parsing outside the critical section.
644 */
parse(Read & ra,Read & rb,TReadId rdid) const645 bool VectorPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
646 // Very similar to TabbedPatternSource
647
648 // Light parser (nextBatchFromFile) puts unparsed data
649 // into Read& r, even when the read is paired.
650 assert(ra.empty());
651 assert(!ra.readOrigBuf.empty()); // raw data for read/pair is here
652 int c = '\t';
653 size_t cur = 0;
654 const size_t buflen = ra.readOrigBuf.length();
655
656 // Loop over the two ends
657 for(int endi = 0; endi < 2 && c == '\t'; endi++) {
658 Read& r = ((endi == 0) ? ra : rb);
659 assert(r.name.empty());
660 // Parse name if (a) this is the first end, or
661 // (b) this is tab6
662 if(endi < 1 || paired_) {
663 // Parse read name
664 c = ra.readOrigBuf[cur++];
665 while(c != '\t' && cur < buflen) {
666 r.name.append(c);
667 c = ra.readOrigBuf[cur++];
668 }
669 assert_eq('\t', c);
670 if(cur >= buflen) {
671 return false; // record ended prematurely
672 }
673 } else if(endi > 0) {
674 // if this is the second end and we're parsing
675 // tab5, copy name from first end
676 rb.name = ra.name;
677 }
678
679 // Parse sequence
680 assert(r.patFw.empty());
681 c = ra.readOrigBuf[cur++];
682 int nchar = 0;
683 while(c != '\t' && cur < buflen) {
684 if(isalpha(c)) {
685 assert_in(toupper(c), "ACGTN");
686 if(nchar++ >= pp_.trim5) {
687 assert_neq(0, asc2dnacat[c]);
688 r.patFw.append(asc2dna[c]); // ascii to int
689 }
690 }
691 c = ra.readOrigBuf[cur++];
692 }
693 assert_eq('\t', c);
694 if(cur >= buflen) {
695 return false; // record ended prematurely
696 }
697 // record amt trimmed from 5' end due to --trim5
698 r.trimmed5 = (int)(nchar - r.patFw.length());
699 // record amt trimmed from 3' end due to --trim3
700 r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
701
702 // Parse qualities
703 assert(r.qual.empty());
704 c = ra.readOrigBuf[cur++];
705 int nqual = 0;
706 while(c != '\t' && c != '\n' && c != '\r') {
707 if(c == ' ') {
708 wrongQualityFormat(r.name);
709 return false;
710 }
711 char cadd = charToPhred33(c, false, false);
712 if(++nqual > pp_.trim5) {
713 r.qual.append(cadd);
714 }
715 if(cur >= buflen) break;
716 c = ra.readOrigBuf[cur++];
717 }
718 if(nchar > nqual) {
719 tooFewQualities(r.name);
720 return false;
721 } else if(nqual > nchar) {
722 tooManyQualities(r.name);
723 return false;
724 }
725 r.qual.trimEnd(pp_.trim3);
726 assert(c == '\t' || c == '\n' || c == '\r' || cur >= buflen);
727 assert_eq(r.patFw.length(), r.qual.length());
728 }
729 ra.parsed = true;
730 if(!rb.parsed && !rb.readOrigBuf.empty()) {
731 return parse(rb, ra, rdid);
732 }
733 return true;
734 }
735
736 /**
737 * Light-parse a FASTA batch into the given buffer.
738 */
nextBatchFromFile(PerThreadReadBuf & pt,bool batch_a,unsigned readi)739 pair<bool, int> FastaPatternSource::nextBatchFromFile(
740 PerThreadReadBuf& pt,
741 bool batch_a, unsigned readi)
742 {
743 int c;
744 EList<Read>* readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
745 if(first_) {
746 c = getc_wrapper();
747 if (c == EOF) {
748 return make_pair(true, 0);
749 }
750 while(c == '\r' || c == '\n') {
751 c = getc_wrapper();
752 }
753 if(c != '>') {
754 cerr << "Error: reads file does not look like a FASTA file" << endl;
755 throw 1;
756 }
757 first_ = false;
758 }
759 bool done = false;
760 // Read until we run out of input or until we've filled the buffer
761 while (readi < pt.max_buf_ && !done) {
762 Read::TBuf& buf = (*readbuf)[readi].readOrigBuf;
763 buf.clear();
764 buf.append('>');
765 while(true) {
766 c = getc_wrapper();
767 if(c < 0 || c == '>') {
768 done = c < 0;
769 break;
770 }
771 buf.append(c);
772 }
773 if (interleaved_) {
774 // alternate between read buffers
775 batch_a = !batch_a;
776 readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
777 // increment read counter after each pair gets read
778 readi = batch_a ? readi+1 : readi;
779 } else {
780 readi++;
781 }
782
783 }
784 // Immediate EOF case
785 if(done && (*readbuf)[readi-1].readOrigBuf.length() == 1) {
786 readi--;
787 }
788 return make_pair(done, readi);
789 }
790
791 /**
792 * Finalize FASTA parsing outside critical section.
793 */
parse(Read & r,Read & rb,TReadId rdid) const794 bool FastaPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
795 // We assume the light parser has put the raw data for the separate ends
796 // into separate Read objects. That doesn't have to be the case, but
797 // that's how we've chosen to do it for FastqPatternSource
798 assert(!r.readOrigBuf.empty());
799 assert(r.empty());
800 int c = -1;
801 size_t cur = 1;
802 const size_t buflen = r.readOrigBuf.length();
803
804 // Parse read name
805 assert(r.name.empty());
806 while(cur < buflen) {
807 c = r.readOrigBuf[cur++];
808 if(c == '\n' || c == '\r') {
809 do {
810 c = r.readOrigBuf[cur++];
811 } while((c == '\n' || c == '\r') && cur < buflen);
812 break;
813 }
814 r.name.append(c);
815 }
816 if(cur >= buflen) {
817 return false; // FASTA ended prematurely
818 }
819
820 // Parse sequence
821 int nchar = 0;
822 assert(r.patFw.empty());
823 assert(c != '\n' && c != '\r');
824 assert_lt(cur, buflen);
825 while(cur < buflen) {
826 if(c == '.') {
827 c = 'N';
828 }
829 if(isalpha(c)) {
830 // If it's past the 5'-end trim point
831 if(nchar++ >= pp_.trim5) {
832 r.patFw.append(asc2dna[c]);
833 }
834 }
835 assert_lt(cur, buflen);
836 c = r.readOrigBuf[cur++];
837 if ((c == '\n' || c == '\r')
838 && cur < buflen
839 && r.readOrigBuf[cur] != '>') {
840 c = r.readOrigBuf[cur++];
841 }
842 }
843 r.trimmed5 = (int)(nchar - r.patFw.length());
844 r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
845
846 for(size_t i = 0; i < r.patFw.length(); i++) {
847 r.qual.append('I');
848 }
849
850 // Set up a default name if one hasn't been set
851 if(r.name.empty()) {
852 char cbuf[20];
853 itoa10<TReadId>(static_cast<TReadId>(rdid), cbuf);
854 r.name.install(cbuf);
855 }
856 r.parsed = true;
857 if(!rb.parsed && !rb.readOrigBuf.empty()) {
858 return parse(rb, r, rdid);
859 }
860 return true;
861 }
862
863 /**
864 * Light-parse a FASTA-continuous batch into the given buffer.
865 * This is trickier for FASTA-continuous than for other formats,
866 * for several reasons:
867 *
868 * 1. Reads are substrings of a longer FASTA input string
869 * 2. Reads may overlap w/r/t the longer FASTA string
870 * 3. Read names depend on the most recently observed FASTA
871 * record name
872 */
nextBatchFromFile(PerThreadReadBuf & pt,bool batch_a,unsigned readi)873 pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile(
874 PerThreadReadBuf& pt,
875 bool batch_a, unsigned readi)
876 {
877 int c = -1;
878 EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
879 while(readi < pt.max_buf_) {
880 c = getc_wrapper();
881 if(c < 0) {
882 break;
883 }
884 if(c == '>') {
885 resetForNextFile();
886 c = getc_wrapper();
887 bool sawSpace = false;
888 while(c != '\n' && c != '\r') {
889 if(!sawSpace) {
890 sawSpace = isspace(c);
891 }
892 if(!sawSpace) {
893 name_prefix_buf_.append(c);
894 }
895 c = getc_wrapper();
896 }
897 while(c == '\n' || c == '\r') {
898 c = getc_wrapper();
899 }
900 if(c < 0) {
901 break;
902 }
903 name_prefix_buf_.append('_');
904 }
905 int cat = asc2dnacat[c];
906 if(cat >= 2) c = 'N';
907 if(cat == 0) {
908 // Non-DNA, non-IUPAC char; skip
909 continue;
910 } else {
911 // DNA char
912 buf_[bufCur_++] = c;
913 if(bufCur_ == 1024) {
914 bufCur_ = 0; // wrap around circular buf
915 }
916 if(eat_ > 0) {
917 eat_--;
918 // Try to keep cur_ aligned with the offset
919 // into the reference; that lets us see where
920 // the sampling gaps are by looking at the read
921 // name
922 if(!beginning_) {
923 cur_++;
924 }
925 continue;
926 }
927 // install name
928 readbuf[readi].readOrigBuf = name_prefix_buf_;
929 itoa10<TReadId>(cur_ - last_, name_int_buf_);
930 readbuf[readi].readOrigBuf.append(name_int_buf_);
931 readbuf[readi].readOrigBuf.append('\t');
932 // install sequence
933 for(size_t i = 0; i < length_; i++) {
934 if(length_ - i <= bufCur_) {
935 c = buf_[bufCur_ - (length_ - i)];
936 } else {
937 // Rotate
938 c = buf_[bufCur_ - (length_ - i) + 1024];
939 }
940 readbuf[readi].readOrigBuf.append(c);
941 }
942 eat_ = freq_-1;
943 cur_++;
944 beginning_ = false;
945 readi++;
946 }
947 }
948 return make_pair(c < 0, readi);
949 }
950
951 /**
952 * Finalize FASTA-continuous parsing outside critical section.
953 */
parse(Read & ra,Read & rb,TReadId rdid) const954 bool FastaContinuousPatternSource::parse(
955 Read& ra,
956 Read& rb,
957 TReadId rdid) const
958 {
959 // Light parser (nextBatchFromFile) puts unparsed data
960 // into Read& r, even when the read is paired.
961 assert(ra.empty());
962 assert(rb.empty());
963 assert(!ra.readOrigBuf.empty()); // raw data for read/pair is here
964 assert(rb.readOrigBuf.empty());
965 int c = '\t';
966 size_t cur = 0;
967 const size_t buflen = ra.readOrigBuf.length();
968
969 // Parse read name
970 c = ra.readOrigBuf[cur++];
971 while(c != '\t' && cur < buflen) {
972 ra.name.append(c);
973 c = ra.readOrigBuf[cur++];
974 }
975 assert_eq('\t', c);
976 if(cur >= buflen) {
977 return false; // record ended prematurely
978 }
979
980 // Parse sequence
981 assert(ra.patFw.empty());
982 int nchar = 0;
983 while(cur < buflen) {
984 c = ra.readOrigBuf[cur++];
985 if(isalpha(c)) {
986 assert_in(toupper(c), "ACGTN");
987 if(nchar++ >= pp_.trim5) {
988 assert_neq(0, asc2dnacat[c]);
989 ra.patFw.append(asc2dna[c]); // ascii to int
990 }
991 }
992 }
993 // record amt trimmed from 5' end due to --trim5
994 ra.trimmed5 = (int)(nchar - ra.patFw.length());
995 // record amt trimmed from 3' end due to --trim3
996 ra.trimmed3 = (int)(ra.patFw.trimEnd(pp_.trim3));
997
998 // Make fake qualities
999 assert(ra.qual.empty());
1000 const size_t len = ra.patFw.length();
1001 for(size_t i = 0; i < len; i++) {
1002 ra.qual.append('I');
1003 }
1004 return true;
1005 }
1006
1007
1008 /**
1009 * "Light" parser. This is inside the critical section, so the key is to do
1010 * just enough parsing so that another function downstream (finalize()) can do
1011 * the rest of the parsing. Really this function's only job is to stick every
1012 * for lines worth of the input file into a buffer (r.readOrigBuf). finalize()
1013 * then parses the contents of r.readOrigBuf later.
1014 */
nextBatchFromFile(PerThreadReadBuf & pt,bool batch_a,unsigned readi)1015 pair<bool, int> FastqPatternSource::nextBatchFromFile(
1016 PerThreadReadBuf& pt,
1017 bool batch_a, unsigned readi)
1018 {
1019 int c = -1;
1020 EList<Read>* readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
1021 if(first_) {
1022 c = getc_wrapper();
1023 if (c == EOF) {
1024 return make_pair(true, 0);
1025 }
1026 while(c == '\r' || c == '\n') {
1027 c = getc_wrapper();
1028 }
1029 if(c != '@') {
1030 cerr << "Error: reads file does not look like a FASTQ file" << endl;
1031 throw 1;
1032 }
1033 first_ = false;
1034 (*readbuf)[readi].readOrigBuf.append('@');
1035 }
1036
1037 bool done = false, aborted = false;
1038 // Read until we run out of input or until we've filled the buffer
1039 while (readi < pt.max_buf_ && !done) {
1040 Read::TBuf& buf = (*readbuf)[readi].readOrigBuf;
1041 int newlines = 4;
1042 while(newlines) {
1043 c = getc_wrapper();
1044 done = c < 0;
1045 if(c == '\n' || (done && newlines == 1)) {
1046 // Saw newline, or EOF that we're
1047 // interpreting as final newline
1048 newlines--;
1049 c = '\n';
1050 } else if(done) {
1051 // account for newline at the end of the file
1052 if (newlines == 4) {
1053 newlines = 0;
1054 }
1055 else {
1056 aborted = true; // Unexpected EOF
1057 }
1058 break;
1059 }
1060 buf.append(c);
1061 }
1062 if (c > 0) {
1063 if (interleaved_) {
1064 // alternate between read buffers
1065 batch_a = !batch_a;
1066 readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
1067 // increment read counter after each pair gets read
1068 readi = batch_a ? readi+1 : readi;
1069 }
1070 else {
1071 readi++;
1072 }
1073 }
1074 }
1075 if(aborted) {
1076 readi--;
1077 }
1078 return make_pair(done, readi);
1079 }
1080
1081 /**
1082 * Finalize FASTQ parsing outside critical section.
1083 */
parse(Read & r,Read & rb,TReadId rdid) const1084 bool FastqPatternSource::parse(Read &r, Read& rb, TReadId rdid) const {
1085 // We assume the light parser has put the raw data for the separate ends
1086 // into separate Read objects. That doesn't have to be the case, but
1087 // that's how we've chosen to do it for FastqPatternSource
1088 assert(!r.readOrigBuf.empty());
1089 assert(r.empty());
1090 int c;
1091 size_t cur = 1;
1092 const size_t buflen = r.readOrigBuf.length();
1093
1094 // Parse read name
1095 assert(r.name.empty());
1096 while(true) {
1097 assert_lt(cur, buflen);
1098 c = r.readOrigBuf[cur++];
1099 if(c == '\n' || c == '\r') {
1100 do {
1101 c = r.readOrigBuf[cur++];
1102 } while(c == '\n' || c == '\r');
1103 break;
1104 }
1105 r.name.append(c);
1106 }
1107
1108 // Parse sequence
1109 int nchar = 0;
1110 assert(r.patFw.empty());
1111 while(c != '+') {
1112 if(c == '.') {
1113 c = 'N';
1114 }
1115 if(isalpha(c)) {
1116 // If it's past the 5'-end trim point
1117 if(nchar++ >= pp_.trim5) {
1118 r.patFw.append(asc2dna[c]);
1119 }
1120 }
1121 assert(cur < r.readOrigBuf.length());
1122 c = r.readOrigBuf[cur++];
1123 }
1124 r.trimmed5 = (int)(nchar - r.patFw.length());
1125 r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
1126
1127 assert_eq('+', c);
1128 do {
1129 assert(cur < r.readOrigBuf.length());
1130 c = r.readOrigBuf[cur++];
1131 } while(c != '\n' && c != '\r');
1132 while(cur < buflen && (c == '\n' || c == '\r')) {
1133 c = r.readOrigBuf[cur++];
1134 }
1135
1136 assert(r.qual.empty());
1137 if(nchar > 0) {
1138 int nqual = 0;
1139 if (pp_.intQuals) {
1140 int cur_int = 0;
1141 while(c != '\t' && c != '\n' && c != '\r') {
1142 cur_int *= 10;
1143 cur_int += (int)(c - '0');
1144 c = r.readOrigBuf[cur++];
1145 if(c == ' ' || c == '\t' || c == '\n' || c == '\r') {
1146 char cadd = intToPhred33(cur_int, pp_.solexa64);
1147 cur_int = 0;
1148 assert_geq(cadd, 33);
1149 if(++nqual > pp_.trim5) {
1150 r.qual.append(cadd);
1151 }
1152 }
1153 }
1154 } else {
1155 c = charToPhred33(c, pp_.solexa64, pp_.phred64);
1156 if(nqual++ >= r.trimmed5) {
1157 r.qual.append(c);
1158 }
1159 while(cur < r.readOrigBuf.length()) {
1160 c = r.readOrigBuf[cur++];
1161 if (c == ' ') {
1162 wrongQualityFormat(r.name);
1163 return false;
1164 }
1165 if(c == '\r' || c == '\n') {
1166 break;
1167 }
1168 c = charToPhred33(c, pp_.solexa64, pp_.phred64);
1169 if(nqual++ >= r.trimmed5) {
1170 r.qual.append(c);
1171 }
1172 }
1173 r.qual.trimEnd(r.trimmed3);
1174 if(r.qual.length() < r.patFw.length()) {
1175 tooFewQualities(r.name);
1176 return false;
1177 } else if(r.qual.length() > r.patFw.length()) {
1178 tooManyQualities(r.name);
1179 return false;
1180 }
1181 }
1182 }
1183 // Set up a default name if one hasn't been set
1184 if(r.name.empty()) {
1185 char cbuf[20];
1186 itoa10<TReadId>(static_cast<TReadId>(rdid), cbuf);
1187 r.name.install(cbuf);
1188 }
1189 r.parsed = true;
1190 if(!rb.parsed && !rb.readOrigBuf.empty()) {
1191 return parse(rb, r, rdid);
1192 }
1193 return true;
1194 }
1195
1196 const int BAMPatternSource::offset[] = {
1197 0, //refID
1198 4, //pos
1199 8, //l_read_name
1200 9, //mapq
1201 10, //bin
1202 12, //n_cigar_op
1203 14, //flag
1204 16, //l_seq
1205 20, //next_refID
1206 24, //next_pos
1207 28, //tlen
1208 32, //read_name
1209 };
1210
1211 const uint8_t BAMPatternSource::EOF_MARKER[] = {
1212 0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
1213 0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x1b, 0x00, 0x03, 0x00,
1214 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1215 };
1216
nextBGZFBlockFromFile(BGZF & b)1217 uint16_t BAMPatternSource::nextBGZFBlockFromFile(BGZF& b) {
1218 if (fread(&b.hdr, sizeof(b.hdr), 1, fp_) != 1) {
1219 if (feof(fp_))
1220 return 0;
1221 std::cerr << "Error while reading BAM header" << std::endl;
1222 exit(EXIT_FAILURE);
1223 }
1224 if (currentlyBigEndian()) {
1225 b.hdr.mtime = endianSwapU32(b.hdr.mtime);
1226 b.hdr.xlen = endianSwapU16(b.hdr.xlen);
1227 }
1228 uint8_t *extra = new uint8_t[b.hdr.xlen];
1229 if (fread(extra, b.hdr.xlen, 1, fp_) != 1) {
1230 std::cerr << "Error while reading BAM extra subfields" << std::endl;
1231 exit(EXIT_FAILURE);
1232 }
1233 if (memcmp(EOF_MARKER, &b.hdr, sizeof(b.hdr)) == 0 &&
1234 memcmp(EOF_MARKER + sizeof(b.hdr), extra, 6 /* sizeof BAM subfield */) == 0)
1235 {
1236 delete[] extra;
1237 return 0;
1238 }
1239 uint16_t bsize = 0;
1240 for (uint16_t i = 0; i < b.hdr.xlen;) {
1241 if (extra[0] == 66 && extra[1] == 67) {
1242 bsize = *((uint16_t *)(extra + 4));
1243 if (currentlyBigEndian())
1244 bsize = endianSwapU16(bsize);
1245 bsize -= (b.hdr.xlen + 19);
1246 break;
1247 }
1248 i = i + 2;
1249 uint16_t sub_field_len = *((uint16_t *)(extra + 2));
1250 if (currentlyBigEndian())
1251 sub_field_len = endianSwapU16(sub_field_len);
1252 i = i + 2 + sub_field_len;
1253 }
1254 delete[] extra;
1255 if (bsize == 0)
1256 return 0;
1257 if (fread(b.cdata, bsize, 1, fp_) != 1) {
1258 std::cerr << "Error while reading BAM CDATA (compressed data)" << std::endl;
1259 exit(EXIT_FAILURE);
1260 }
1261 if (fread(&b.ftr, sizeof(b.ftr), 1, fp_) != 1) {
1262 std::cerr << "Error while reading BAM footer" << std::endl;
1263 exit(EXIT_FAILURE);
1264 }
1265 if (currentlyBigEndian()) {
1266 b.ftr.crc32 = endianSwapU32(b.ftr.crc32);
1267 b.ftr.isize = endianSwapU32(b.ftr.isize);
1268 }
1269 return bsize;
1270 }
1271
nextBatch(PerThreadReadBuf & pt,bool batch_a,bool lock)1272 std::pair<bool, int> BAMPatternSource::nextBatch(PerThreadReadBuf& pt, bool batch_a, bool lock) {
1273 bool done = false;
1274 uint16_t cdata_len;
1275 unsigned nread = 0;
1276
1277 do {
1278 if (bam_batch_indexes_[pt.tid_] >= bam_batches_[pt.tid_].size()) {
1279 BGZF block;
1280 std::vector<uint8_t>& batch = bam_batches_[pt.tid_];
1281 if (lock) {
1282 ThreadSafe ts(mutex);
1283 if (first_) {
1284 // parse the BAM header;
1285 nextBGZFBlockFromFile(block);
1286 first_ = false;
1287 }
1288 cdata_len = nextBGZFBlockFromFile(block);
1289 } else {
1290 if (first_) {
1291 // parse the BAM header
1292 nextBGZFBlockFromFile(block);
1293 first_ = false;
1294 }
1295 cdata_len = nextBGZFBlockFromFile(block);
1296 }
1297 if (cdata_len == 0) {
1298 done = nread == 0;
1299 break;
1300 }
1301 bam_batch_indexes_[pt.tid_] = 0;
1302 batch.resize(block.ftr.isize);
1303 int ret_code = decompress_bgzf_block(&batch[0], block.ftr.isize, block.cdata, cdata_len);
1304 if (ret_code != Z_OK) {
1305 return make_pair(true, 0);
1306 }
1307 uLong crc = crc32(0L, Z_NULL, 0);
1308 crc = crc32(crc, &batch[0], batch.size());
1309 assert(crc == block.ftr.crc32);
1310 }
1311 std::pair<bool, int> ret = get_alignments(pt, batch_a, nread, lock);
1312 done = ret.first;
1313 } while (!done && nread < pt.max_buf_);
1314
1315 if (lock) {
1316 ThreadSafe ts(mutex);
1317 pt.setReadId(readCnt_);
1318 readCnt_ += nread;
1319 } else {
1320 pt.setReadId(readCnt_);
1321 readCnt_ += nread;
1322 }
1323 return make_pair(done, nread);
1324 }
1325
get_alignments(PerThreadReadBuf & pt,bool batch_a,unsigned & readi,bool lock)1326 std::pair<bool, int> BAMPatternSource::get_alignments(PerThreadReadBuf& pt, bool batch_a, unsigned& readi, bool lock) {
1327 size_t& i = bam_batch_indexes_[pt.tid_];
1328 bool done = false;
1329 bool read1 = true;
1330
1331 while (readi < pt.max_buf_) {
1332 if (i >= bam_batches_[pt.tid_].size()) {
1333 return make_pair(false, readi);
1334 }
1335
1336 uint16_t flag;
1337 int32_t block_size;
1338 EList<Read>& readbuf = pp_.align_paired_reads && !read1 ? pt.bufb_ : pt.bufa_;
1339
1340 memcpy(&block_size, &bam_batches_[pt.tid_][0] + i, sizeof(block_size));
1341 if (currentlyBigEndian())
1342 block_size = endianSwapU32(block_size);
1343 if (block_size <= 0) {
1344 return make_pair(done, readi);
1345 }
1346 i += sizeof(block_size);
1347
1348 memcpy(&flag, &bam_batches_[pt.tid_][0] + i + offset[BAMField::flag], sizeof(flag));
1349 if (currentlyBigEndian())
1350 flag = endianSwapU16(flag);
1351 if (!pp_.align_paired_reads && ((flag & 0x40) != 0 || (flag & 0x80) != 0)) {
1352 readbuf[readi].readOrigBuf.clear();
1353 i += block_size;
1354 continue;
1355 }
1356
1357 if (pp_.align_paired_reads && ((flag & 0x40) == 0 && (flag & 0x80) == 0)) {
1358 readbuf[readi].readOrigBuf.clear();
1359 i += block_size;
1360 continue;
1361 }
1362
1363 if (pp_.align_paired_reads &&
1364 (((flag & 0x40) != 0 && i + block_size == bam_batches_[pt.tid_].size()) ||
1365 ((flag & 0x80) != 0 && i == sizeof(block_size))))
1366 {
1367 if (lock) {
1368 ThreadSafe ts(orphan_mates_mutex_);
1369 get_or_store_orhaned_mate(pt.bufa_, pt.bufb_, readi, &bam_batches_[pt.tid_][0] + i, block_size);
1370 i += block_size;
1371 } else {
1372 get_or_store_orhaned_mate(pt.bufa_, pt.bufb_, readi, &bam_batches_[pt.tid_][0] + i, block_size);
1373 i += block_size;
1374 }
1375
1376 } else {
1377 readbuf[readi].readOrigBuf.resize(block_size);
1378
1379 memcpy(readbuf[readi].readOrigBuf.wbuf(), &bam_batches_[pt.tid_][0] + i, block_size);
1380 i += block_size;
1381
1382 read1 = !read1;
1383 readi = (pp_.align_paired_reads &&
1384 pt.bufb_[readi].readOrigBuf.length() == 0) ? readi : readi + 1;
1385 }
1386 }
1387
1388 return make_pair(done, readi);
1389 }
1390
get_or_store_orhaned_mate(EList<Read> & buf_a,EList<Read> & buf_b,unsigned & readi,const uint8_t * mate,size_t mate_len)1391 void BAMPatternSource::get_or_store_orhaned_mate(EList<Read>& buf_a, EList<Read>& buf_b, unsigned& readi, const uint8_t *mate, size_t mate_len) {
1392 const char *read_name =
1393 (const char *)(mate + offset[BAMField::read_name]);
1394 size_t i;
1395 uint32_t hash = hash_str(read_name);
1396 orphan_mate_t *empty_slot = NULL;
1397
1398 for (i = 0; i < orphan_mates.size(); i++) {
1399 if (empty_slot == NULL && orphan_mates[i].empty())
1400 empty_slot = &orphan_mates[i];
1401 if (orphan_mates[i].hash == hash)
1402 break;
1403 }
1404 if (i == orphan_mates.size()) {
1405 // vector is full
1406 if (empty_slot == NULL) {
1407 orphan_mates.push_back(orphan_mate_t());
1408 empty_slot = &orphan_mates.back();
1409 }
1410 empty_slot->hash = hash;
1411 if (empty_slot->cap < mate_len) {
1412 delete[] empty_slot->data;
1413 empty_slot->data = NULL;
1414 }
1415 if (empty_slot->data == NULL) {
1416 empty_slot->data = new uint8_t[mate_len];
1417 empty_slot->cap = mate_len;
1418 }
1419 memcpy(empty_slot->data, mate, mate_len);
1420 empty_slot->size = mate_len;
1421 } else {
1422 uint8_t flag;
1423 Read& ra = buf_a[readi];
1424 Read& rb = buf_b[readi];
1425
1426 memcpy(&flag, mate + offset[BAMField::flag], sizeof(flag));
1427 if ((flag & 0x40) != 0) {
1428 ra.readOrigBuf.resize(mate_len);
1429 memcpy(ra.readOrigBuf.wbuf(), mate, mate_len);
1430 rb.readOrigBuf.resize(orphan_mates[i].size);
1431 memcpy(rb.readOrigBuf.wbuf(), orphan_mates[i].data, orphan_mates[i].size);
1432 } else {
1433 rb.readOrigBuf.resize(mate_len);
1434 memcpy(rb.readOrigBuf.wbuf(), mate, mate_len);
1435 ra.readOrigBuf.resize(orphan_mates[i].size);
1436 memcpy(ra.readOrigBuf.wbuf(), orphan_mates[i].data, orphan_mates[i].size);
1437 }
1438 readi++;
1439 orphan_mates[i].reset();
1440 }
1441 }
1442
decompress_bgzf_block(uint8_t * dst,size_t dst_len,uint8_t * src,size_t src_len)1443 int BAMPatternSource::decompress_bgzf_block(uint8_t *dst, size_t dst_len, uint8_t *src, size_t src_len) {
1444 z_stream strm;
1445
1446 strm.zalloc = Z_NULL;
1447 strm.zfree = Z_NULL;
1448 strm.opaque = Z_NULL;
1449
1450 strm.avail_in = src_len;
1451 strm.next_in = src;
1452 strm.avail_out = dst_len;
1453 strm.next_out = dst;
1454
1455 int ret = inflateInit2(&strm, -8);
1456 if (ret != Z_OK) {
1457 return ret;
1458 }
1459
1460 ret = inflate(&strm, Z_FINISH);
1461 if (ret != Z_STREAM_END) {
1462 return ret;
1463 }
1464
1465 return inflateEnd(&strm);
1466 }
1467
parse(Read & ra,Read & rb,TReadId rdid) const1468 bool BAMPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
1469 uint8_t l_read_name;
1470 int32_t l_seq;
1471 uint16_t n_cigar_op;
1472 const char* buf = ra.readOrigBuf.buf();
1473 int block_size = ra.readOrigBuf.length();
1474
1475 memcpy(&l_read_name, buf + offset[BAMField::l_read_name], sizeof(l_read_name));
1476 memcpy(&n_cigar_op, buf + offset[BAMField::n_cigar_op], sizeof(n_cigar_op));
1477 memcpy(&l_seq, buf + offset[BAMField::l_seq], sizeof(l_seq));
1478 if (currentlyBigEndian()) {
1479 n_cigar_op = endianSwapU16(n_cigar_op);
1480 l_seq = endianSwapU32(l_seq);
1481 }
1482
1483 int off = offset[BAMField::read_name];
1484 ra.name.install(buf + off, l_read_name-1);
1485 off += (l_read_name + sizeof(uint32_t) * n_cigar_op);
1486 const char* seq = buf + off;
1487 off += (l_seq+1)/2;
1488 const char* qual = buf + off;
1489 for (int i = 0; i < l_seq; i++) {
1490 if (i < pp_.trim5) {
1491 ra.trimmed5 += 1;
1492 } else {
1493 ra.qual.append(qual[i] + 33);
1494 int base = "=ACMGRSVTWYHKDBN"[static_cast<uint8_t>(seq[i/2]) >> 4*(1-(i%2)) & 0xf];
1495 ra.patFw.append(asc2dna[base]);
1496 }
1497 }
1498 ra.trimmed3 = (int)(ra.patFw.trimEnd(pp_.trim3));
1499 ra.qual.trimEnd(ra.trimmed3);
1500
1501 if (pp_.preserve_tags) {
1502 off += l_seq;
1503 ra.preservedOptFlags.install(buf + off, block_size - off);
1504 }
1505
1506 ra.parsed = true;
1507 if (!rb.parsed && rb.readOrigBuf.length() != 0) {
1508 return parse(rb, ra, rdid);
1509 }
1510
1511 return true;
1512 }
1513
1514 /**
1515 * Light-parse a batch of tabbed-format reads into given buffer.
1516 */
nextBatchFromFile(PerThreadReadBuf & pt,bool batch_a,unsigned readi)1517 pair<bool, int> TabbedPatternSource::nextBatchFromFile(
1518 PerThreadReadBuf& pt,
1519 bool batch_a, unsigned readi)
1520 {
1521 int c = getc_wrapper();
1522 while(c >= 0 && (c == '\n' || c == '\r')) {
1523 c = getc_wrapper();
1524 }
1525 EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
1526 // Read until we run out of input or until we've filled the buffer
1527 for(; readi < pt.max_buf_ && c >= 0; readi++) {
1528 readbuf[readi].readOrigBuf.clear();
1529 while(c >= 0 && c != '\n' && c != '\r') {
1530 readbuf[readi].readOrigBuf.append(c);
1531 c = getc_wrapper();
1532 }
1533 while(c >= 0 && (c == '\n' || c == '\r') && readi < pt.max_buf_ - 1) {
1534 c = getc_wrapper();
1535 }
1536 }
1537 return make_pair(c < 0, readi);
1538 }
1539
1540 /**
1541 * Finalize tabbed parsing outside critical section.
1542 */
parse(Read & ra,Read & rb,TReadId rdid) const1543 bool TabbedPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
1544 // Light parser (nextBatchFromFile) puts unparsed data
1545 // into Read& r, even when the read is paired.
1546 assert(ra.empty());
1547 assert(rb.empty());
1548 assert(!ra.readOrigBuf.empty()); // raw data for read/pair is here
1549 assert(rb.readOrigBuf.empty());
1550 int c = '\t';
1551 size_t cur = 0;
1552 const size_t buflen = ra.readOrigBuf.length();
1553
1554 // Loop over the two ends
1555 for(int endi = 0; endi < 2 && c == '\t'; endi++) {
1556 Read& r = ((endi == 0) ? ra : rb);
1557 assert(r.name.empty());
1558 // Parse name if (a) this is the first end, or
1559 // (b) this is tab6
1560 if(endi < 1 || secondName_) {
1561 // Parse read name
1562 c = ra.readOrigBuf[cur++];
1563 while(c != '\t' && cur < buflen) {
1564 r.name.append(c);
1565 c = ra.readOrigBuf[cur++];
1566 }
1567 assert_eq('\t', c);
1568 if(cur >= buflen) {
1569 return false; // record ended prematurely
1570 }
1571 } else if(endi > 0) {
1572 // if this is the second end and we're parsing
1573 // tab5, copy name from first end
1574 rb.name = ra.name;
1575 }
1576
1577 // Parse sequence
1578 assert(r.patFw.empty());
1579 c = ra.readOrigBuf[cur++];
1580 int nchar = 0;
1581 while(c != '\t' && cur < buflen) {
1582 if(isalpha(c)) {
1583 assert_in(toupper(c), "ACGTN");
1584 if(nchar++ >= pp_.trim5) {
1585 assert_neq(0, asc2dnacat[c]);
1586 r.patFw.append(asc2dna[c]); // ascii to int
1587 }
1588 }
1589 c = ra.readOrigBuf[cur++];
1590 }
1591 assert_eq('\t', c);
1592 if(cur >= buflen) {
1593 return false; // record ended prematurely
1594 }
1595 // record amt trimmed from 5' end due to --trim5
1596 r.trimmed5 = (int)(nchar - r.patFw.length());
1597 // record amt trimmed from 3' end due to --trim3
1598 r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
1599
1600 // Parse qualities
1601 assert(r.qual.empty());
1602 c = ra.readOrigBuf[cur++];
1603 int nqual = 0;
1604 if (pp_.intQuals) {
1605 int cur_int = 0;
1606 while(c != '\t' && c != '\n' && c != '\r' && cur < buflen) {
1607 cur_int *= 10;
1608 cur_int += (int)(c - '0');
1609 c = ra.readOrigBuf[cur++];
1610 if(c == ' ' || c == '\t' || c == '\n' || c == '\r') {
1611 char cadd = intToPhred33(cur_int, pp_.solexa64);
1612 cur_int = 0;
1613 assert_geq(cadd, 33);
1614 if(++nqual > pp_.trim5) {
1615 r.qual.append(cadd);
1616 }
1617 }
1618 }
1619 } else {
1620 while(c != '\t' && c != '\n' && c != '\r') {
1621 if(c == ' ') {
1622 wrongQualityFormat(r.name);
1623 return false;
1624 }
1625 char cadd = charToPhred33(c, pp_.solexa64, pp_.phred64);
1626 if(++nqual > pp_.trim5) {
1627 r.qual.append(cadd);
1628 }
1629 if(cur >= buflen) break;
1630 c = ra.readOrigBuf[cur++];
1631 }
1632 }
1633 if(nchar > nqual) {
1634 tooFewQualities(r.name);
1635 return false;
1636 } else if(nqual > nchar) {
1637 tooManyQualities(r.name);
1638 return false;
1639 }
1640 r.qual.trimEnd(pp_.trim3);
1641 assert(c == '\t' || c == '\n' || c == '\r' || cur >= buflen);
1642 assert_eq(r.patFw.length(), r.qual.length());
1643 }
1644 return true;
1645 }
1646
1647 /**
1648 * Light-parse a batch of raw-format reads into given buffer.
1649 */
nextBatchFromFile(PerThreadReadBuf & pt,bool batch_a,unsigned readi)1650 pair<bool, int> RawPatternSource::nextBatchFromFile(
1651 PerThreadReadBuf& pt,
1652 bool batch_a,
1653 unsigned readi)
1654 {
1655 int c = getc_wrapper();
1656 while(c >= 0 && (c == '\n' || c == '\r')) {
1657 c = getc_wrapper();
1658 }
1659 EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
1660 // Read until we run out of input or until we've filled the buffer
1661 for(; readi < pt.max_buf_ && c >= 0; readi++) {
1662 readbuf[readi].readOrigBuf.clear();
1663 while(c >= 0 && c != '\n' && c != '\r') {
1664 readbuf[readi].readOrigBuf.append(c);
1665 c = getc_wrapper();
1666 }
1667 while(c >= 0 && (c == '\n' || c == '\r')) {
1668 c = getc_wrapper();
1669 }
1670 }
1671 // incase a valid character is consumed between batches
1672 if (c >= 0 && c != '\n' && c != '\r') {
1673 ungetc_wrapper(c);
1674 }
1675 return make_pair(c < 0, readi);
1676 }
1677
1678 /**
1679 * Finalize raw parsing outside critical section.
1680 */
parse(Read & r,Read & rb,TReadId rdid) const1681 bool RawPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
1682 assert(r.empty());
1683 assert(!r.readOrigBuf.empty()); // raw data for read/pair is here
1684 int c = '\n';
1685 size_t cur = 0;
1686 const size_t buflen = r.readOrigBuf.length();
1687
1688 // Parse sequence
1689 assert(r.patFw.empty());
1690 int nchar = 0;
1691 while(cur < buflen) {
1692 c = r.readOrigBuf[cur++];
1693 assert(c != '\r' && c != '\n');
1694 if(isalpha(c)) {
1695 assert_in(toupper(c), "ACGTN");
1696 if(nchar++ >= pp_.trim5) {
1697 assert_neq(0, asc2dnacat[c]);
1698 r.patFw.append(asc2dna[c]); // ascii to int
1699 }
1700 }
1701 }
1702 assert_eq(cur, buflen);
1703 // record amt trimmed from 5' end due to --trim5
1704 r.trimmed5 = (int)(nchar - r.patFw.length());
1705 // record amt trimmed from 3' end due to --trim3
1706 r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
1707
1708 // Give the name field a dummy value
1709 char cbuf[20];
1710 itoa10<TReadId>(rdid, cbuf);
1711 r.name.install(cbuf);
1712
1713 // Give the base qualities dummy values
1714 assert(r.qual.empty());
1715 const size_t len = r.patFw.length();
1716 for(size_t i = 0; i < len; i++) {
1717 r.qual.append('I');
1718 }
1719 r.parsed = true;
1720 if(!rb.parsed && !rb.readOrigBuf.empty()) {
1721 return parse(rb, r, rdid);
1722 }
1723 return true;
1724 }
1725
1726
wrongQualityFormat(const BTString & read_name)1727 void wrongQualityFormat(const BTString& read_name) {
1728 cerr << "Error: Encountered one or more spaces while parsing the quality "
1729 << "string for read " << read_name << ". If this is a FASTQ file "
1730 << "with integer (non-ASCII-encoded) qualities, try re-running with "
1731 << "the --integer-quals option." << endl;
1732 throw 1;
1733 }
1734
tooFewQualities(const BTString & read_name)1735 void tooFewQualities(const BTString& read_name) {
1736 cerr << "Error: Read " << read_name << " has more read characters than "
1737 << "quality values." << endl;
1738 throw 1;
1739 }
1740
tooManyQualities(const BTString & read_name)1741 void tooManyQualities(const BTString& read_name) {
1742 cerr << "Error: Read " << read_name << " has more quality values than read "
1743 << "characters." << endl;
1744 throw 1;
1745 }
1746
1747 #ifdef USE_SRA
1748
nextBatchImpl(PerThreadReadBuf & pt,bool batch_a)1749 std::pair<bool, int> SRAPatternSource::nextBatchImpl(
1750 PerThreadReadBuf& pt,
1751 bool batch_a)
1752 {
1753 EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
1754 size_t readi = 0;
1755 bool done = false;
1756
1757 for(; readi < pt.max_buf_; readi++) {
1758 if(!sra_its_[pt.tid_]->nextRead() || !sra_its_[pt.tid_]->nextFragment()) {
1759 done = true;
1760 break;
1761 }
1762 const ngs::StringRef rname = sra_its_[pt.tid_]->getReadId();
1763 const ngs::StringRef ra_seq = sra_its_[pt.tid_]->getFragmentBases();
1764 const ngs::StringRef ra_qual = sra_its_[pt.tid_]->getFragmentQualities();
1765 readbuf[readi].readOrigBuf.install(rname.data(), rname.size());
1766 readbuf[readi].readOrigBuf.append('\t');
1767 readbuf[readi].readOrigBuf.append(ra_seq.data(), ra_seq.size());
1768 readbuf[readi].readOrigBuf.append('\t');
1769 readbuf[readi].readOrigBuf.append(ra_qual.data(), ra_qual.size());
1770 if(sra_its_[pt.tid_]->nextFragment()) {
1771 const ngs::StringRef rb_seq = sra_its_[pt.tid_]->getFragmentBases();
1772 const ngs::StringRef rb_qual = sra_its_[pt.tid_]->getFragmentQualities();
1773 readbuf[readi].readOrigBuf.append('\t');
1774 readbuf[readi].readOrigBuf.append(rb_seq.data(), rb_seq.size());
1775 readbuf[readi].readOrigBuf.append('\t');
1776 readbuf[readi].readOrigBuf.append(rb_qual.data(), rb_qual.size());
1777 }
1778 readbuf[readi].readOrigBuf.append('\n');
1779 }
1780
1781 pt.setReadId(readCnt_);
1782
1783 {
1784 ThreadSafe ts(mutex);
1785 readCnt_ += readi;
1786 }
1787
1788 return make_pair(done, readi);
1789 }
1790
1791 /**
1792 * TODO: need to think about whether this can be done in a sensible way
1793 */
nextBatch(PerThreadReadBuf & pt,bool batch_a,bool lock)1794 std::pair<bool, int> SRAPatternSource::nextBatch(
1795 PerThreadReadBuf& pt,
1796 bool batch_a,
1797 bool lock)
1798 {
1799 if(lock) {
1800 // synchronization at this level because both reading and manipulation of
1801 // current file pointer have to be protected
1802 return nextBatchImpl(pt, batch_a);
1803 } else {
1804 return nextBatchImpl(pt, batch_a);
1805 }
1806 }
1807
1808 /**
1809 * Finalize tabbed parsing outside critical section.
1810 */
parse(Read & ra,Read & rb,TReadId rdid) const1811 bool SRAPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
1812 // Light parser (nextBatchFromFile) puts unparsed data
1813 // into Read& r, even when the read is paired.
1814 assert(ra.empty());
1815 assert(rb.empty());
1816 assert(!ra.readOrigBuf.empty()); // raw data for read/pair is here
1817 assert(rb.readOrigBuf.empty());
1818 int c = '\t';
1819 size_t cur = 0;
1820 const size_t buflen = ra.readOrigBuf.length();
1821
1822 // Loop over the two ends
1823 for(int endi = 0; endi < 2 && c == '\t'; endi++) {
1824 Read& r = ((endi == 0) ? ra : rb);
1825 assert(r.name.empty());
1826 // Parse name if (a) this is the first end, or
1827 // (b) this is tab6
1828 if(endi < 1) {
1829 // Parse read name
1830 c = ra.readOrigBuf[cur++];
1831 while(c != '\t' && cur < buflen) {
1832 r.name.append(c);
1833 c = ra.readOrigBuf[cur++];
1834 }
1835 assert_eq('\t', c);
1836 if(cur >= buflen) {
1837 return false; // record ended prematurely
1838 }
1839 } else if(endi > 0) {
1840 // if this is the second end and we're parsing
1841 // tab5, copy name from first end
1842 rb.name = ra.name;
1843 }
1844
1845 // Parse sequence
1846 assert(r.patFw.empty());
1847 c = ra.readOrigBuf[cur++];
1848 int nchar = 0;
1849 while(c != '\t' && cur < buflen) {
1850 if(isalpha(c)) {
1851 assert_in(toupper(c), "ACGTN");
1852 if(nchar++ >= pp_.trim5) {
1853 assert_neq(0, asc2dnacat[c]);
1854 r.patFw.append(asc2dna[c]); // ascii to int
1855 }
1856 }
1857 c = ra.readOrigBuf[cur++];
1858 }
1859 assert_eq('\t', c);
1860 if(cur >= buflen) {
1861 return false; // record ended prematurely
1862 }
1863 // record amt trimmed from 5' end due to --trim5
1864 r.trimmed5 = (int)(nchar - r.patFw.length());
1865 // record amt trimmed from 3' end due to --trim3
1866 r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
1867
1868 // Parse qualities
1869 assert(r.qual.empty());
1870 c = ra.readOrigBuf[cur++];
1871 int nqual = 0;
1872 if (pp_.intQuals) {
1873 int cur_int = 0;
1874 while(c != '\t' && c != '\n' && c != '\r' && cur < buflen) {
1875 cur_int *= 10;
1876 cur_int += (int)(c - '0');
1877 c = ra.readOrigBuf[cur++];
1878 if(c == ' ' || c == '\t' || c == '\n' || c == '\r') {
1879 char cadd = intToPhred33(cur_int, pp_.solexa64);
1880 cur_int = 0;
1881 assert_geq(cadd, 33);
1882 if(++nqual > pp_.trim5) {
1883 r.qual.append(cadd);
1884 }
1885 }
1886 }
1887 } else {
1888 while(c != '\t' && c != '\n' && c != '\r') {
1889 if(c == ' ') {
1890 wrongQualityFormat(r.name);
1891 return false;
1892 }
1893 char cadd = charToPhred33(c, pp_.solexa64, pp_.phred64);
1894 if(++nqual > pp_.trim5) {
1895 r.qual.append(cadd);
1896 }
1897 if(cur >= buflen) break;
1898 c = ra.readOrigBuf[cur++];
1899 }
1900 }
1901 if(nchar > nqual) {
1902 tooFewQualities(r.name);
1903 return false;
1904 } else if(nqual > nchar) {
1905 tooManyQualities(r.name);
1906 return false;
1907 }
1908 r.qual.trimEnd(pp_.trim3);
1909 assert(c == '\t' || c == '\n' || c == '\r' || cur >= buflen);
1910 assert_eq(r.patFw.length(), r.qual.length());
1911 }
1912 return true;
1913 }
1914
open()1915 void SRAPatternSource::open() {
1916 const string& sra_acc = sra_accs_[sra_acc_cur_];
1917 string version = "bowtie2";
1918 ncbi::NGS::setAppVersionString(version);
1919 assert(!sra_acc.empty());
1920 try {
1921 // open requested accession using SRA implementation of the API
1922 ngs::ReadCollection sra_run = ncbi::NGS::openReadCollection(sra_acc);
1923
1924 // compute window to iterate through
1925 size_t MAX_ROW = sra_run.getReadCount();
1926 pp_.upto -= pp_.skip;
1927
1928 if (pp_.upto <= MAX_ROW) {
1929 MAX_ROW = pp_.upto;
1930 }
1931 if(MAX_ROW < 0) {
1932 return;
1933 }
1934
1935 size_t window_size = MAX_ROW / sra_its_.size();
1936 size_t remainder = MAX_ROW % sra_its_.size();
1937 size_t i = 0, start = 1;
1938
1939 if (pp_.skip > 0) {
1940 start = pp_.skip + 1;
1941 readCnt_ = pp_.skip;
1942 }
1943
1944 while (i < sra_its_.size()) {
1945 sra_its_[i] = new ngs::ReadIterator(sra_run.getReadRange(start, window_size, ngs::Read::all));
1946 assert(sra_its_[i] != NULL);
1947
1948 i++;
1949 start += window_size;
1950 if (i == sra_its_.size() - 1) {
1951 window_size += remainder;
1952 }
1953 }
1954
1955 } catch(...) {
1956 cerr << "Warning: Could not access \"" << sra_acc << "\" for reading; skipping..." << endl;
1957 }
1958 }
1959
1960 #endif
1961