1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <cmath>
21 #include <stdio.h>
22 #include <iostream>
23 #include <sstream>
24 #include <string>
25 #include <stdexcept>
26 #include <string.h>
27 #include <fcntl.h>
28 #include "sstring.h"
29 
30 #include "pat.h"
31 #include "filebuf.h"
32 #include "formats.h"
33 #include "util.h"
34 #include "str_util.h"
35 #include "tokenize.h"
36 #include "endian_swap.h"
37 
38 using namespace std;
39 
40 /**
41  * Calculate a per-read random seed based on a combination of
42  * the read data (incl. sequence, name, quals) and the global
43  * seed in '_randSeed'.
44  */
genRandSeed(const BTDnaString & qry,const BTString & qual,const BTString & name,uint32_t seed)45 static uint32_t genRandSeed(
46 	const BTDnaString& qry,
47 	const BTString& qual,
48 	const BTString& name,
49 	uint32_t seed)
50 {
51 	// Calculate a per-read random seed based on a combination of
52 	// the read data (incl. sequence, name, quals) and the global
53 	// seed
54 	uint32_t rseed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83;
55 	size_t qlen = qry.length();
56 	// Throw all the characters of the read into the random seed
57 	for(size_t i = 0; i < qlen; i++) {
58 		int p = (int)qry[i];
59 		assert_leq(p, 4);
60 		size_t off = ((i & 15) << 1);
61 		rseed ^= ((uint32_t)p << off);
62 	}
63 	// Throw all the quality values for the read into the random
64 	// seed
65 	for(size_t i = 0; i < qlen; i++) {
66 		int p = (int)qual[i];
67 		assert_leq(p, 255);
68 		size_t off = ((i & 3) << 3);
69 		rseed ^= (p << off);
70 	}
71 	// Throw all the characters in the read name into the random
72 	// seed
73 	size_t namelen = name.length();
74 	for(size_t i = 0; i < namelen; i++) {
75 		int p = (int)name[i];
76 		if(p == '/') break;
77 		assert_leq(p, 255);
78 		size_t off = ((i & 3) << 3);
79 		rseed ^= (p << off);
80 	}
81 	return rseed;
82 }
83 
84 /**
85  * Return a new dynamically allocated PatternSource for the given
86  * format, using the given list of strings as the filenames to read
87  * from or as the sequences themselves (i.e. if -c was used).
88  */
patsrcFromStrings(const PatternParams & p,const EList<string> & qs)89 PatternSource* PatternSource::patsrcFromStrings(
90 	const PatternParams& p,
91 	const EList<string>& qs)
92 {
93 	switch(p.format) {
94 		case FASTA:       return new FastaPatternSource(qs, p, p.interleaved);
95 		case FASTA_CONT:  return new FastaContinuousPatternSource(qs, p);
96 		case RAW:         return new RawPatternSource(qs, p);
97 		case FASTQ:       return new FastqPatternSource(qs, p, p.interleaved);
98 		case BAM:         return new BAMPatternSource(qs, p);
99 		case TAB_MATE5:   return new TabbedPatternSource(qs, p, false);
100 		case TAB_MATE6:   return new TabbedPatternSource(qs, p, true);
101 		case CMDLINE:     return new VectorPatternSource(qs, p);
102 		case QSEQ:        return new QseqPatternSource(qs, p);
103 #ifdef USE_SRA
104 		case SRA_FASTA:
105 		case SRA_FASTQ:   return new SRAPatternSource(qs, p);
106 #endif
107 		default: {
108 			cerr << "Internal error; bad patsrc format: " << p.format << endl;
109 			throw 1;
110 		}
111 	}
112 }
113 
114 /**
115  * Once name/sequence/qualities have been parsed for an
116  * unpaired read, set all the other key fields of the Read
117  * struct.
118  */
finalize(Read & ra)119 void PatternSourcePerThread::finalize(Read& ra) {
120 	ra.mate = 1;
121 	ra.rdid = buf_.rdid();
122 	ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, pp_.seed);
123 	ra.finalize();
124 	if(pp_.fixName) {
125 		ra.fixMateName(1);
126 	}
127 }
128 
129 /**
130  * Once name/sequence/qualities have been parsed for a
131  * paired-end read, set all the other key fields of the Read
132  * structs.
133  */
finalizePair(Read & ra,Read & rb)134 void PatternSourcePerThread::finalizePair(Read& ra, Read& rb) {
135 	ra.mate = 1;
136 	rb.mate = 2;
137 	ra.rdid = rb.rdid = buf_.rdid();
138 	ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, pp_.seed);
139 	rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, pp_.seed);
140 	ra.finalize();
141 	rb.finalize();
142 	if(pp_.fixName) {
143 		ra.fixMateName(1);
144 		rb.fixMateName(2);
145 	}
146 }
147 
148 /**
149  * Get the next paired or unpaired read from the wrapped
150  * PatternComposer.  Returns a pair of bools; first indicates
151  * whether we were successful, second indicates whether we're
152  * done.
153  */
nextReadPair()154 pair<bool, bool> PatternSourcePerThread::nextReadPair() {
155 	// Prepare batch
156 	if(buf_.exhausted()) {
157 		pair<bool, int> res = nextBatch(); // more parsing needed!
158 		if(res.first && res.second == 0) {
159 			return make_pair(false, true);
160 		}
161 		last_batch_ = res.first;
162 		last_batch_size_ = res.second;
163 		assert_eq(0, buf_.cur_buf_);
164 	} else {
165 		buf_.next(); // advance cursor; no parsing or locking needed
166 		assert_gt(buf_.cur_buf_, 0);
167 	}
168 	// Now fully parse read/pair *outside* the critical section
169 	assert(!buf_.read_a().readOrigBuf.empty());
170 	assert(buf_.read_a().empty());
171 	if(!parse(buf_.read_a(), buf_.read_b())) {
172 		return make_pair(false, false);
173 	}
174 	// Finalize read/pair
175 	if(!buf_.read_b().patFw.empty()) {
176 		trim(buf_.read_a());
177 		trim(buf_.read_b());
178 		finalizePair(buf_.read_a(), buf_.read_b());
179 	} else {
180 		trim(buf_.read_a());
181 		finalize(buf_.read_a());
182 	}
183 	bool this_is_last = buf_.cur_buf_ == static_cast<unsigned int>(last_batch_size_-1);
184 	return make_pair(true, this_is_last ? last_batch_ : false);
185 }
186 
187 /**
188  * The main member function for dispensing pairs of reads or
189  * singleton reads.  Returns true iff ra and rb contain a new
190  * pair; returns false if ra contains a new unpaired read.
191  */
nextBatch(PerThreadReadBuf & pt)192 pair<bool, int> SoloPatternComposer::nextBatch(PerThreadReadBuf& pt) {
193 	size_t cur = cur_;
194 	while(cur < src_->size()) {
195 		// Patterns from srca_[cur_] are unpaired
196 		pair<bool, int> res;
197 		do {
198 			res = (*src_)[cur]->nextBatch(
199 				pt,
200 				true,  // batch A (or pairs)
201 				lock_); // grab lock below
202 		} while(!res.first && res.second == 0);
203 		if(res.second == 0) {
204 			ThreadSafe ts(mutex_m);
205 			if(cur + 1 > cur_) {
206 				cur_++;
207 			}
208 			cur = cur_;
209 			continue; // on to next pair of PatternSources
210 		}
211 		return res;
212 	}
213 	assert_leq(cur, src_->size());
214 	return make_pair(true, 0);
215 }
216 
217 /**
218  * The main member function for dispensing pairs of reads or
219  * singleton reads.  Returns true iff ra and rb contain a new
220  * pair; returns false if ra contains a new unpaired read.
221  */
nextBatch(PerThreadReadBuf & pt)222 pair<bool, int> DualPatternComposer::nextBatch(PerThreadReadBuf& pt) {
223 	// 'cur' indexes the current pair of PatternSources
224 	size_t cur = cur_;
225 	while(cur < srca_->size()) {
226 		if((*srcb_)[cur] == NULL) {
227 			// Patterns from srca_ are unpaired
228 			pair<bool, int> res = (*srca_)[cur]->nextBatch(
229 				pt,
230 				true,  // batch A (or pairs)
231 				lock_); // grab lock below
232 			if(res.second == 0 && cur < srca_->size() - 1) {
233 				ThreadSafe ts(mutex_m);
234 				if(cur + 1 > cur_) cur_++;
235 				cur = cur_; // Move on to next PatternSource
236 				continue; // on to next pair of PatternSources
237 			}
238 			return make_pair(res.first && cur == srca_->size() - 1, res.second);
239 		} else {
240 			pair<bool, int> resa, resb;
241 			// Lock to ensure that this thread gets parallel reads
242 			// in the two mate files
243 			{
244 				ThreadSafe ts(mutex_m);
245 				resa = (*srca_)[cur]->nextBatch(
246 					pt,
247 					true,   // batch A
248 					false); // don't grab lock below
249 				resb = (*srcb_)[cur]->nextBatch(
250 					pt,
251 					false,  // batch B
252 					false); // don't grab lock below
253 				assert_eq((*srca_)[cur]->readCount(),
254 				          (*srcb_)[cur]->readCount());
255 			}
256 			if(resa.second < resb.second) {
257 				cerr << "Error, fewer reads in file specified with -1 "
258 				     << "than in file specified with -2" << endl;
259 				throw 1;
260 			} else if(resa.second == 0 && resb.second == 0) {
261 				ThreadSafe ts(mutex_m);
262 				if(cur + 1 > cur_) {
263 					cur_++;
264 				}
265 				cur = cur_; // Move on to next PatternSource
266 				continue; // on to next pair of PatternSources
267 			} else if(resb.second < resa.second) {
268 				cerr << "Error, fewer reads in file specified with -2 "
269 				     << "than in file specified with -1" << endl;
270 				throw 1;
271 			}
272 			assert_eq(resa.first, resb.first);
273 			assert_eq(resa.second, resb.second);
274 			return make_pair(resa.first && cur == srca_->size() - 1, resa.second);
275 		}
276 	}
277 	assert_leq(cur, srca_->size());
278 	return make_pair(true, 0);
279 }
280 
281 /**
282  * Given the values for all of the various arguments used to specify
283  * the read and quality input, create a list of pattern sources to
284  * dispense them.
285  */
setupPatternComposer(const EList<string> & si,const EList<string> & m1,const EList<string> & m2,const EList<string> & m12,const EList<string> & q,const EList<string> & q1,const EList<string> & q2,const EList<string> & sra_accs,PatternParams & p,bool verbose)286 PatternComposer* PatternComposer::setupPatternComposer(
287 	const EList<string>& si,   // singles, from argv
288 	const EList<string>& m1,   // mate1's, from -1 arg
289 	const EList<string>& m2,   // mate2's, from -2 arg
290 	const EList<string>& m12,  // both mates on each line, from --12 arg
291 	const EList<string>& q,    // qualities associated with singles
292 	const EList<string>& q1,   // qualities associated with m1
293 	const EList<string>& q2,   // qualities associated with m2
294 #ifdef USE_SRA
295 	const EList<string>& sra_accs, // SRA accessions
296 #endif
297 	PatternParams& p,    // read-in parameters
298 	bool verbose)              // be talkative?
299 {
300 	EList<PatternSource*>* a  = new EList<PatternSource*>();
301 	EList<PatternSource*>* b  = new EList<PatternSource*>();
302 	// Create list of pattern sources for paired reads appearing
303 	// interleaved in a single file
304 	for(size_t i = 0; i < m12.size(); i++) {
305 		const EList<string>* qs = &m12;
306 		EList<string> tmp;
307 		if(p.fileParallel) {
308 			// Feed query files one to each PatternSource
309 			qs = &tmp;
310 			tmp.push_back(m12[i]);
311 			assert_eq(1, tmp.size());
312 		}
313 		a->push_back(PatternSource::patsrcFromStrings(p, *qs));
314 		b->push_back(NULL);
315 		p.interleaved = false;
316 		if(!p.fileParallel) {
317 			break;
318 		}
319 	}
320 
321 #ifdef USE_SRA
322 	for(size_t i = 0; i < sra_accs.size(); i++) {
323 		const EList<string>* qs = &sra_accs;
324 		EList<string> tmp;
325 		if(p.fileParallel) {
326 			// Feed query files one to each PatternSource
327 			qs = &tmp;
328 			tmp.push_back(sra_accs[i]);
329 			assert_eq(1, tmp.size());
330 		}
331 		a->push_back(PatternSource::patsrcFromStrings(p, *qs));
332 		b->push_back(NULL);
333 		if(!p.fileParallel) {
334 			break;
335 		}
336 	}
337 #endif
338 
339 	// Create list of pattern sources for paired reads
340 	for(size_t i = 0; i < m1.size(); i++) {
341 		const EList<string>* qs = &m1;
342 		EList<string> tmpSeq;
343 		EList<string> tmpQual;
344 		if(p.fileParallel) {
345 			// Feed query files one to each PatternSource
346 			qs = &tmpSeq;
347 			tmpSeq.push_back(m1[i]);
348 			assert_eq(1, tmpSeq.size());
349 		}
350 		a->push_back(PatternSource::patsrcFromStrings(p, *qs));
351 		if(!p.fileParallel) {
352 			break;
353 		}
354 	}
355 
356 	// Create list of pattern sources for paired reads
357 	for(size_t i = 0; i < m2.size(); i++) {
358 		const EList<string>* qs = &m2;
359 		EList<string> tmpSeq;
360 		EList<string> tmpQual;
361 		if(p.fileParallel) {
362 			// Feed query files one to each PatternSource
363 			qs = &tmpSeq;
364 			tmpSeq.push_back(m2[i]);
365 			assert_eq(1, tmpSeq.size());
366 		}
367 		b->push_back(PatternSource::patsrcFromStrings(p, *qs));
368 		if(!p.fileParallel) {
369 			break;
370 		}
371 	}
372 	// All mates/mate files must be paired
373 	assert_eq(a->size(), b->size());
374 
375 	// Create list of pattern sources for the unpaired reads
376 	for(size_t i = 0; i < si.size(); i++) {
377 		const EList<string>* qs = &si;
378 		PatternSource* patsrc = NULL;
379 		EList<string> tmpSeq;
380 		EList<string> tmpQual;
381 		if(p.fileParallel) {
382 			// Feed query files one to each PatternSource
383 			qs = &tmpSeq;
384 			tmpSeq.push_back(si[i]);
385 			assert_eq(1, tmpSeq.size());
386 		}
387 		patsrc = PatternSource::patsrcFromStrings(p, *qs);
388 		assert(patsrc != NULL);
389 		a->push_back(patsrc);
390 		b->push_back(NULL);
391 		if(!p.fileParallel) {
392 			break;
393 		}
394 	}
395 
396 	PatternComposer *patsrc = NULL;
397 	patsrc = new DualPatternComposer(a, b, p);
398 	return patsrc;
399 }
400 
free_EList_pmembers(const EList<PatternSource * > & elist)401 void PatternComposer::free_EList_pmembers( const EList<PatternSource*> &elist) {
402 	for (size_t i = 0; i < elist.size(); i++)
403 		if (elist[i] != NULL)
404 			delete elist[i];
405 }
406 
407 /**
408  * Fill Read with the sequence, quality and name for the next
409  * read in the list of read files. This function gets called by
410  * all the search threads, so we must handle synchronization.
411  *
412  * Returns pair<bool, int> where bool indicates whether we're
413  * completely done, and int indicates how many reads were read.
414  */
nextBatchImpl(PerThreadReadBuf & pt,bool batch_a)415 pair<bool, int> CFilePatternSource::nextBatchImpl(
416 	PerThreadReadBuf& pt,
417 	bool batch_a)
418 {
419 	bool done = false;
420 	unsigned nread = 0;
421 	pt.setReadId(readCnt_);
422 	while(true) { // loop that moves on to next file when needed
423 		do {
424 			pair<bool, int> ret = nextBatchFromFile(pt, batch_a, nread);
425 			done = ret.first;
426 			nread = ret.second;
427 		} while(!done && nread == 0); // not sure why this would happen
428 		if(done && filecur_ < infiles_.size()) { // finished with this file
429 			open();
430 			resetForNextFile(); // reset state to handle a fresh file
431 			filecur_++;
432 			if(nread == 0 || (nread < pt.max_buf_)) {
433 				continue;
434 			}
435 			done = false;
436 		}
437 		break;
438 	}
439 	assert_geq(nread, 0);
440 	readCnt_ += nread;
441 	return make_pair(done, nread);
442 }
443 
nextBatch(PerThreadReadBuf & pt,bool batch_a,bool lock)444 pair<bool, int> CFilePatternSource::nextBatch(
445 	PerThreadReadBuf& pt,
446 	bool batch_a,
447 	bool lock)
448 {
449 	if(lock) {
450 		// synchronization at this level because both reading and manipulation of
451 		// current file pointer have to be protected
452 		ThreadSafe ts(mutex);
453 		return nextBatchImpl(pt, batch_a);
454 	} else {
455 		return nextBatchImpl(pt, batch_a);
456 	}
457 }
458 
459 /**
460  * Open the next file in the list of input files.
461  */
open()462 void CFilePatternSource::open() {
463 	if(is_open_) {
464 		is_open_ = false;
465 		switch (compressionType_) {
466 		case CompressionType::GZIP:
467 			gzclose(zfp_);
468 			zfp_ = NULL;
469 			break;
470 #ifdef WITH_ZSTD
471 		case CompressionType::ZSTD:
472 			zstdClose(zstdfp_);
473 			zstdfp_ = NULL;
474 			break;
475 #endif
476 		case CompressionType::NONE:
477 			fclose(fp_);
478 			fp_ = NULL;
479 		}
480 	}
481 	while(filecur_ < infiles_.size()) {
482 		if(infiles_[filecur_] == "-") {
483 			// always assume that data from stdin is compressed
484 			compressionType_ = CompressionType::GZIP;
485 			int fd = dup(fileno(stdin));
486 			zfp_ = gzdopen(fd, "rb");
487 
488 			if (zfp_ == NULL) {
489 				close(fd);
490 			}
491 		}
492 		else {
493 			const char* filename = infiles_[filecur_].c_str();
494 
495 			int fd = ::open(filename, O_RDONLY);
496 			bool is_fifo = false;
497 
498 #ifndef _WIN32
499 			struct stat st;
500 			if (fstat(fd, &st) != 0) {
501 				perror("stat");
502 			}
503 
504 			is_fifo = S_ISFIFO(st.st_mode) != 0;
505 #endif
506 #define CHECK_ERROR(exp) ((exp) == NULL) ? true : false
507 
508 			bool err = false;
509                         if (pp_.format == BAM) {
510 				err = CHECK_ERROR(fp_ = fdopen(fd, "rb"));
511 				compressionType_ = CompressionType::NONE;
512                         } else if (is_fifo) {
513 				err = CHECK_ERROR(zfp_ = gzdopen(fd, "rb"));
514 				compressionType_ = CompressionType::GZIP;
515                         } else if (is_gzipped_file(fd)) {
516 				err = CHECK_ERROR(zfp_ = gzdopen(fd, "rb"));
517 				compressionType_ = CompressionType::GZIP;
518 #ifdef WITH_ZSTD
519                         } else if (is_zstd_file(fd)) {
520 				err = CHECK_ERROR(zstdfp_ = zstdFdOpen(fd));
521 				compressionType_ = CompressionType::ZSTD;
522 #endif
523                         } else {
524 				err = CHECK_ERROR(fp_ = fdopen(fd, "r"));
525 				compressionType_ = CompressionType::NONE;
526                         }
527 
528 			if(err) {
529 				if (fd != -1) {
530 					close(fd);
531 				}
532 
533 				if(!errs_[filecur_]) {
534 					cerr << "Warning: Could not open read file \""
535 					     << filename
536 					     << "\" for reading; skipping..." << endl;
537 					errs_[filecur_] = true;
538 				}
539 				filecur_++;
540 				compressionType_ = CompressionType::NONE;
541 				continue;
542 			}
543 		}
544 		is_open_ = true;
545 		if (compressionType_ == CompressionType::GZIP) {
546 #if ZLIB_VERNUM < 0x1235
547 			cerr << "Warning: gzbuffer added in zlib v1.2.3.5. Unable to change "
548 				"buffer size from default of 8192." << endl;
549 #else
550 			gzbuffer(zfp_, 128*1024);
551 #endif
552 		}
553 		else if (compressionType_ == CompressionType::NONE) {
554 			setvbuf(fp_, buf_, _IOFBF, 64*1024);
555 		}
556 		return;
557 	}
558 	cerr << "Error: No input read files were valid" << endl;
559 	exit(1);
560 	return;
561 }
562 
563 /**
564  * Constructor for vector pattern source, used when the user has
565  * specified the input strings on the command line using the -c
566  * option.
567  */
VectorPatternSource(const EList<string> & seqs,const PatternParams & p)568 VectorPatternSource::VectorPatternSource(
569 	const EList<string>& seqs,
570 	const PatternParams& p) :
571 	PatternSource(p),
572 	cur_(p.skip),
573 	skip_(p.skip),
574 	paired_(false),
575 	tokbuf_(),
576 	bufs_()
577 {
578 	// Install sequences in buffers, ready for immediate copying in
579 	// nextBatch().  Formatting of the buffer is just like
580 	// TabbedPatternSource.
581 	const size_t seqslen = seqs.size();
582 	for(size_t i = 0; i < seqslen; i++) {
583 		tokbuf_.clear();
584 		tokenize(seqs[i], ":", tokbuf_, 2);
585 		assert_gt(tokbuf_.size(), 0);
586 		assert_leq(tokbuf_.size(), 2);
587 		// Get another buffer ready
588 		bufs_.expand();
589 		bufs_.back().clear();
590 		// Install name
591 		itoa10<TReadId>(static_cast<TReadId>(i), nametmp_);
592 		bufs_.back().install(nametmp_);
593 		bufs_.back().append('\t');
594 		// Install sequence
595 		bufs_.back().append(tokbuf_[0].c_str());
596 		bufs_.back().append('\t');
597 		// Install qualities
598 		if(tokbuf_.size() > 1) {
599 			bufs_.back().append(tokbuf_[1].c_str());
600 		} else {
601 			const size_t len = tokbuf_[0].length();
602 			for(size_t i = 0; i < len; i++) {
603 				bufs_.back().append('I');
604 			}
605 		}
606 	}
607 }
608 
609 /**
610  * Read next batch.  However, batch concept is not very applicable for this
611  * PatternSource where all the info has already been parsed into the fields
612  * in the contsructor.	This essentially modifies the pt as though we read
613  * in some number of patterns.
614  */
nextBatchImpl(PerThreadReadBuf & pt,bool batch_a)615 pair<bool, int> VectorPatternSource::nextBatchImpl(
616 	PerThreadReadBuf& pt,
617 	bool batch_a)
618 {
619 	pt.setReadId(cur_);
620 	EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
621 	size_t readi = 0;
622 	for(; readi < pt.max_buf_ && cur_ < bufs_.size(); readi++, cur_++) {
623 		readbuf[readi].readOrigBuf = bufs_[cur_];
624 	}
625 	readCnt_ += readi;
626 	return make_pair(cur_ == bufs_.size(), readi);
627 }
628 
nextBatch(PerThreadReadBuf & pt,bool batch_a,bool lock)629 pair<bool, int> VectorPatternSource::nextBatch(
630 	PerThreadReadBuf& pt,
631 	bool batch_a,
632 	bool lock)
633 {
634 	if(lock) {
635 		ThreadSafe ts(mutex);
636 		return nextBatchImpl(pt, batch_a);
637 	} else {
638 		return nextBatchImpl(pt, batch_a);
639 	}
640 }
641 
642 /**
643  * Finishes parsing outside the critical section.
644  */
parse(Read & ra,Read & rb,TReadId rdid) const645 bool VectorPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
646 	// Very similar to TabbedPatternSource
647 
648 	// Light parser (nextBatchFromFile) puts unparsed data
649 	// into Read& r, even when the read is paired.
650 	assert(ra.empty());
651 	assert(!ra.readOrigBuf.empty()); // raw data for read/pair is here
652 	int c = '\t';
653 	size_t cur = 0;
654 	const size_t buflen = ra.readOrigBuf.length();
655 
656 	// Loop over the two ends
657 	for(int endi = 0; endi < 2 && c == '\t'; endi++) {
658 		Read& r = ((endi == 0) ? ra : rb);
659 		assert(r.name.empty());
660 		// Parse name if (a) this is the first end, or
661 		// (b) this is tab6
662 		if(endi < 1 || paired_) {
663 			// Parse read name
664 			c = ra.readOrigBuf[cur++];
665 			while(c != '\t' && cur < buflen) {
666 				r.name.append(c);
667 				c = ra.readOrigBuf[cur++];
668 			}
669 			assert_eq('\t', c);
670 			if(cur >= buflen) {
671 				return false; // record ended prematurely
672 			}
673 		} else if(endi > 0) {
674 			// if this is the second end and we're parsing
675 			// tab5, copy name from first end
676 			rb.name = ra.name;
677 		}
678 
679 		// Parse sequence
680 		assert(r.patFw.empty());
681 		c = ra.readOrigBuf[cur++];
682 		int nchar = 0;
683 		while(c != '\t' && cur < buflen) {
684 			if(isalpha(c)) {
685 				assert_in(toupper(c), "ACGTN");
686 				if(nchar++ >= pp_.trim5) {
687 					assert_neq(0, asc2dnacat[c]);
688 					r.patFw.append(asc2dna[c]); // ascii to int
689 				}
690 			}
691 			c = ra.readOrigBuf[cur++];
692 		}
693 		assert_eq('\t', c);
694 		if(cur >= buflen) {
695 			return false; // record ended prematurely
696 		}
697 		// record amt trimmed from 5' end due to --trim5
698 		r.trimmed5 = (int)(nchar - r.patFw.length());
699 		// record amt trimmed from 3' end due to --trim3
700 		r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
701 
702 		// Parse qualities
703 		assert(r.qual.empty());
704 		c = ra.readOrigBuf[cur++];
705 		int nqual = 0;
706 		while(c != '\t' && c != '\n' && c != '\r') {
707 			if(c == ' ') {
708 				wrongQualityFormat(r.name);
709 				return false;
710 			}
711 			char cadd = charToPhred33(c, false, false);
712 			if(++nqual > pp_.trim5) {
713 				r.qual.append(cadd);
714 			}
715 			if(cur >= buflen) break;
716 			c = ra.readOrigBuf[cur++];
717 		}
718 		if(nchar > nqual) {
719 			tooFewQualities(r.name);
720 			return false;
721 		} else if(nqual > nchar) {
722 			tooManyQualities(r.name);
723 			return false;
724 		}
725 		r.qual.trimEnd(pp_.trim3);
726 		assert(c == '\t' || c == '\n' || c == '\r' || cur >= buflen);
727 		assert_eq(r.patFw.length(), r.qual.length());
728 	}
729 	ra.parsed = true;
730 	if(!rb.parsed && !rb.readOrigBuf.empty()) {
731 		return parse(rb, ra, rdid);
732 	}
733 	return true;
734 }
735 
736 /**
737  * Light-parse a FASTA batch into the given buffer.
738  */
nextBatchFromFile(PerThreadReadBuf & pt,bool batch_a,unsigned readi)739 pair<bool, int> FastaPatternSource::nextBatchFromFile(
740 	PerThreadReadBuf& pt,
741 	bool batch_a, unsigned readi)
742 {
743 	int c;
744 	EList<Read>* readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
745 	if(first_) {
746 		c = getc_wrapper();
747 		if (c == EOF) {
748 			return make_pair(true, 0);
749 		}
750 		while(c == '\r' || c == '\n') {
751 			c = getc_wrapper();
752 		}
753 		if(c != '>') {
754 			cerr << "Error: reads file does not look like a FASTA file" << endl;
755 			throw 1;
756 		}
757 		first_ = false;
758 	}
759 	bool done = false;
760 	// Read until we run out of input or until we've filled the buffer
761 	while (readi < pt.max_buf_ && !done) {
762 		Read::TBuf& buf = (*readbuf)[readi].readOrigBuf;
763 		buf.clear();
764 		buf.append('>');
765 		while(true) {
766 			c = getc_wrapper();
767 			if(c < 0 || c == '>') {
768 				done = c < 0;
769 				break;
770 			}
771 			buf.append(c);
772 		}
773 		if (interleaved_) {
774 			// alternate between read buffers
775 			batch_a = !batch_a;
776 			readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
777 			// increment read counter after each pair gets read
778 			readi = batch_a ? readi+1 : readi;
779 		} else {
780 			readi++;
781 		}
782 
783 	}
784 	// Immediate EOF case
785 	if(done && (*readbuf)[readi-1].readOrigBuf.length() == 1) {
786 		readi--;
787 	}
788 	return make_pair(done, readi);
789 }
790 
791 /**
792  * Finalize FASTA parsing outside critical section.
793  */
parse(Read & r,Read & rb,TReadId rdid) const794 bool FastaPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
795 	// We assume the light parser has put the raw data for the separate ends
796 	// into separate Read objects.	That doesn't have to be the case, but
797 	// that's how we've chosen to do it for FastqPatternSource
798 	assert(!r.readOrigBuf.empty());
799 	assert(r.empty());
800 	int c = -1;
801 	size_t cur = 1;
802 	const size_t buflen = r.readOrigBuf.length();
803 
804 	// Parse read name
805 	assert(r.name.empty());
806 	while(cur < buflen) {
807 		c = r.readOrigBuf[cur++];
808 		if(c == '\n' || c == '\r') {
809 			do {
810 				c = r.readOrigBuf[cur++];
811 			} while((c == '\n' || c == '\r') && cur < buflen);
812 			break;
813 		}
814 		r.name.append(c);
815 	}
816 	if(cur >= buflen) {
817 		return false; // FASTA ended prematurely
818 	}
819 
820 	// Parse sequence
821 	int nchar = 0;
822 	assert(r.patFw.empty());
823 	assert(c != '\n' && c != '\r');
824 	assert_lt(cur, buflen);
825 	while(cur < buflen) {
826 		if(c == '.') {
827 			c = 'N';
828 		}
829 		if(isalpha(c)) {
830 			// If it's past the 5'-end trim point
831 			if(nchar++ >= pp_.trim5) {
832 				r.patFw.append(asc2dna[c]);
833 			}
834 		}
835 		assert_lt(cur, buflen);
836 		c = r.readOrigBuf[cur++];
837 		if ((c == '\n' || c == '\r')
838 				&& cur < buflen
839 				&& r.readOrigBuf[cur] != '>') {
840 			c = r.readOrigBuf[cur++];
841 		}
842 	}
843 	r.trimmed5 = (int)(nchar - r.patFw.length());
844 	r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
845 
846 	for(size_t i = 0; i < r.patFw.length(); i++) {
847 		r.qual.append('I');
848 	}
849 
850 	// Set up a default name if one hasn't been set
851 	if(r.name.empty()) {
852 		char cbuf[20];
853 		itoa10<TReadId>(static_cast<TReadId>(rdid), cbuf);
854 		r.name.install(cbuf);
855 	}
856 	r.parsed = true;
857 	if(!rb.parsed && !rb.readOrigBuf.empty()) {
858 		return parse(rb, r, rdid);
859 	}
860 	return true;
861 }
862 
863 /**
864  * Light-parse a FASTA-continuous batch into the given buffer.
865  * This is trickier for FASTA-continuous than for other formats,
866  * for several reasons:
867  *
868  * 1. Reads are substrings of a longer FASTA input string
869  * 2. Reads may overlap w/r/t the longer FASTA string
870  * 3. Read names depend on the most recently observed FASTA
871  *	  record name
872  */
nextBatchFromFile(PerThreadReadBuf & pt,bool batch_a,unsigned readi)873 pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile(
874 	PerThreadReadBuf& pt,
875 	bool batch_a, unsigned readi)
876 {
877 	int c = -1;
878 	EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
879 	while(readi < pt.max_buf_) {
880 		c = getc_wrapper();
881 		if(c < 0) {
882 			break;
883 		}
884 		if(c == '>') {
885 			resetForNextFile();
886 			c = getc_wrapper();
887 			bool sawSpace = false;
888 			while(c != '\n' && c != '\r') {
889 				if(!sawSpace) {
890 					sawSpace = isspace(c);
891 				}
892 				if(!sawSpace) {
893 					name_prefix_buf_.append(c);
894 				}
895 				c = getc_wrapper();
896 			}
897 			while(c == '\n' || c == '\r') {
898 				c = getc_wrapper();
899 			}
900 			if(c < 0) {
901 				break;
902 			}
903 			name_prefix_buf_.append('_');
904 		}
905 		int cat = asc2dnacat[c];
906 		if(cat >= 2) c = 'N';
907 		if(cat == 0) {
908 			// Non-DNA, non-IUPAC char; skip
909 			continue;
910 		} else {
911 			// DNA char
912 			buf_[bufCur_++] = c;
913 			if(bufCur_ == 1024) {
914 				bufCur_ = 0; // wrap around circular buf
915 			}
916 			if(eat_ > 0) {
917 				eat_--;
918 				// Try to keep cur_ aligned with the offset
919 				// into the reference; that lets us see where
920 				// the sampling gaps are by looking at the read
921 				// name
922 				if(!beginning_) {
923 					cur_++;
924 				}
925 				continue;
926 			}
927 			// install name
928 			readbuf[readi].readOrigBuf = name_prefix_buf_;
929 			itoa10<TReadId>(cur_ - last_, name_int_buf_);
930 			readbuf[readi].readOrigBuf.append(name_int_buf_);
931 			readbuf[readi].readOrigBuf.append('\t');
932 			// install sequence
933 			for(size_t i = 0; i < length_; i++) {
934 				if(length_ - i <= bufCur_) {
935 					c = buf_[bufCur_ - (length_ - i)];
936 				} else {
937 					// Rotate
938 					c = buf_[bufCur_ - (length_ - i) + 1024];
939 				}
940 				readbuf[readi].readOrigBuf.append(c);
941 			}
942 			eat_ = freq_-1;
943 			cur_++;
944 			beginning_ = false;
945 			readi++;
946 		}
947 	}
948 	return make_pair(c < 0, readi);
949 }
950 
951 /**
952  * Finalize FASTA-continuous parsing outside critical section.
953  */
parse(Read & ra,Read & rb,TReadId rdid) const954 bool FastaContinuousPatternSource::parse(
955 	Read& ra,
956 	Read& rb,
957 	TReadId rdid) const
958 {
959 	// Light parser (nextBatchFromFile) puts unparsed data
960 	// into Read& r, even when the read is paired.
961 	assert(ra.empty());
962 	assert(rb.empty());
963 	assert(!ra.readOrigBuf.empty()); // raw data for read/pair is here
964 	assert(rb.readOrigBuf.empty());
965 	int c = '\t';
966 	size_t cur = 0;
967 	const size_t buflen = ra.readOrigBuf.length();
968 
969 	// Parse read name
970 	c = ra.readOrigBuf[cur++];
971 	while(c != '\t' && cur < buflen) {
972 		ra.name.append(c);
973 		c = ra.readOrigBuf[cur++];
974 	}
975 	assert_eq('\t', c);
976 	if(cur >= buflen) {
977 		return false; // record ended prematurely
978 	}
979 
980 	// Parse sequence
981 	assert(ra.patFw.empty());
982 	int nchar = 0;
983 	while(cur < buflen) {
984 		c = ra.readOrigBuf[cur++];
985 		if(isalpha(c)) {
986 			assert_in(toupper(c), "ACGTN");
987 			if(nchar++ >= pp_.trim5) {
988 				assert_neq(0, asc2dnacat[c]);
989 				ra.patFw.append(asc2dna[c]); // ascii to int
990 			}
991 		}
992 	}
993 	// record amt trimmed from 5' end due to --trim5
994 	ra.trimmed5 = (int)(nchar - ra.patFw.length());
995 	// record amt trimmed from 3' end due to --trim3
996 	ra.trimmed3 = (int)(ra.patFw.trimEnd(pp_.trim3));
997 
998 	// Make fake qualities
999 	assert(ra.qual.empty());
1000 	const size_t len = ra.patFw.length();
1001 	for(size_t i = 0; i < len; i++) {
1002 		ra.qual.append('I');
1003 	}
1004 	return true;
1005 }
1006 
1007 
1008 /**
1009  * "Light" parser. This is inside the critical section, so the key is to do
1010  * just enough parsing so that another function downstream (finalize()) can do
1011  * the rest of the parsing.  Really this function's only job is to stick every
1012  * for lines worth of the input file into a buffer (r.readOrigBuf).  finalize()
1013  * then parses the contents of r.readOrigBuf later.
1014  */
nextBatchFromFile(PerThreadReadBuf & pt,bool batch_a,unsigned readi)1015 pair<bool, int> FastqPatternSource::nextBatchFromFile(
1016 	PerThreadReadBuf& pt,
1017 	bool batch_a, unsigned readi)
1018 {
1019 	int c = -1;
1020 	EList<Read>* readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
1021 	if(first_) {
1022 		c = getc_wrapper();
1023 		if (c == EOF) {
1024 			return make_pair(true, 0);
1025 		}
1026 		while(c == '\r' || c == '\n') {
1027 			c = getc_wrapper();
1028 		}
1029 		if(c != '@') {
1030 			cerr << "Error: reads file does not look like a FASTQ file" << endl;
1031 			throw 1;
1032 		}
1033 		first_ = false;
1034 		(*readbuf)[readi].readOrigBuf.append('@');
1035 	}
1036 
1037 	bool done = false, aborted = false;
1038 	// Read until we run out of input or until we've filled the buffer
1039 	while (readi < pt.max_buf_ && !done) {
1040 		Read::TBuf& buf = (*readbuf)[readi].readOrigBuf;
1041 		int newlines = 4;
1042 		while(newlines) {
1043 			c = getc_wrapper();
1044 			done = c < 0;
1045 			if(c == '\n' || (done && newlines == 1)) {
1046 				// Saw newline, or EOF that we're
1047 				// interpreting as final newline
1048 				newlines--;
1049 				c = '\n';
1050 			} else if(done) {
1051 				// account for newline at the end of the file
1052 				if (newlines == 4) {
1053 					newlines = 0;
1054 				}
1055 				else {
1056 					aborted = true; // Unexpected EOF
1057 				}
1058 				break;
1059 			}
1060 			buf.append(c);
1061 		}
1062 		if (c > 0) {
1063 			if (interleaved_) {
1064 				// alternate between read buffers
1065 				batch_a = !batch_a;
1066 				readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
1067 				// increment read counter after each pair gets read
1068 				readi = batch_a ? readi+1 : readi;
1069 			}
1070 			else {
1071 				readi++;
1072 			}
1073 		}
1074 	}
1075 	if(aborted) {
1076 		readi--;
1077 	}
1078 	return make_pair(done, readi);
1079 }
1080 
1081 /**
1082  * Finalize FASTQ parsing outside critical section.
1083  */
parse(Read & r,Read & rb,TReadId rdid) const1084 bool FastqPatternSource::parse(Read &r, Read& rb, TReadId rdid) const {
1085 	// We assume the light parser has put the raw data for the separate ends
1086 	// into separate Read objects. That doesn't have to be the case, but
1087 	// that's how we've chosen to do it for FastqPatternSource
1088 	assert(!r.readOrigBuf.empty());
1089 	assert(r.empty());
1090 	int c;
1091 	size_t cur = 1;
1092 	const size_t buflen = r.readOrigBuf.length();
1093 
1094 	// Parse read name
1095 	assert(r.name.empty());
1096 	while(true) {
1097 		assert_lt(cur, buflen);
1098 		c = r.readOrigBuf[cur++];
1099 		if(c == '\n' || c == '\r') {
1100 			do {
1101 				c = r.readOrigBuf[cur++];
1102 			} while(c == '\n' || c == '\r');
1103 			break;
1104 		}
1105 		r.name.append(c);
1106 	}
1107 
1108 	// Parse sequence
1109 	int nchar = 0;
1110 	assert(r.patFw.empty());
1111 	while(c != '+') {
1112 		if(c == '.') {
1113 			c = 'N';
1114 		}
1115 		if(isalpha(c)) {
1116 			// If it's past the 5'-end trim point
1117 			if(nchar++ >= pp_.trim5) {
1118 				r.patFw.append(asc2dna[c]);
1119 			}
1120 		}
1121 		assert(cur < r.readOrigBuf.length());
1122 		c = r.readOrigBuf[cur++];
1123 	}
1124 	r.trimmed5 = (int)(nchar - r.patFw.length());
1125 	r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
1126 
1127 	assert_eq('+', c);
1128 	do {
1129 		assert(cur < r.readOrigBuf.length());
1130 		c = r.readOrigBuf[cur++];
1131 	} while(c != '\n' && c != '\r');
1132 	while(cur < buflen && (c == '\n' || c == '\r')) {
1133 		c = r.readOrigBuf[cur++];
1134 	}
1135 
1136 	assert(r.qual.empty());
1137 	if(nchar > 0) {
1138 		int nqual = 0;
1139 		if (pp_.intQuals) {
1140 			int cur_int = 0;
1141 			while(c != '\t' && c != '\n' && c != '\r') {
1142 				cur_int *= 10;
1143 				cur_int += (int)(c - '0');
1144 				c = r.readOrigBuf[cur++];
1145 				if(c == ' ' || c == '\t' || c == '\n' || c == '\r') {
1146 					char cadd = intToPhred33(cur_int, pp_.solexa64);
1147 					cur_int = 0;
1148 					assert_geq(cadd, 33);
1149 					if(++nqual > pp_.trim5) {
1150 						r.qual.append(cadd);
1151 					}
1152 				}
1153 			}
1154 		} else {
1155 			c = charToPhred33(c, pp_.solexa64, pp_.phred64);
1156 			if(nqual++ >= r.trimmed5) {
1157 				r.qual.append(c);
1158 			}
1159 			while(cur < r.readOrigBuf.length()) {
1160 				c = r.readOrigBuf[cur++];
1161 				if (c == ' ') {
1162 					wrongQualityFormat(r.name);
1163 					return false;
1164 				}
1165 				if(c == '\r' || c == '\n') {
1166 					break;
1167 				}
1168 				c = charToPhred33(c, pp_.solexa64, pp_.phred64);
1169 				if(nqual++ >= r.trimmed5) {
1170 					r.qual.append(c);
1171 				}
1172 			}
1173 			r.qual.trimEnd(r.trimmed3);
1174 			if(r.qual.length() < r.patFw.length()) {
1175 				tooFewQualities(r.name);
1176 				return false;
1177 			} else if(r.qual.length() > r.patFw.length()) {
1178 				tooManyQualities(r.name);
1179 				return false;
1180 			}
1181 		}
1182 	}
1183 	// Set up a default name if one hasn't been set
1184 	if(r.name.empty()) {
1185 		char cbuf[20];
1186 		itoa10<TReadId>(static_cast<TReadId>(rdid), cbuf);
1187 		r.name.install(cbuf);
1188 	}
1189 	r.parsed = true;
1190 	if(!rb.parsed && !rb.readOrigBuf.empty()) {
1191 		return parse(rb, r, rdid);
1192 	}
1193 	return true;
1194 }
1195 
1196 const int BAMPatternSource::offset[] = {
1197 	0,   //refID
1198 	4,   //pos
1199 	8,   //l_read_name
1200 	9,   //mapq
1201 	10,  //bin
1202 	12,  //n_cigar_op
1203 	14,  //flag
1204 	16,  //l_seq
1205 	20,  //next_refID
1206 	24,  //next_pos
1207 	28,  //tlen
1208 	32,  //read_name
1209 };
1210 
1211 const uint8_t BAMPatternSource::EOF_MARKER[] = {
1212 	0x1f,  0x8b,  0x08,  0x04,  0x00,  0x00,  0x00,  0x00,  0x00,  0xff,
1213 	0x06,  0x00,  0x42,  0x43,  0x02,  0x00,  0x1b,  0x00,  0x03,  0x00,
1214 	0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00
1215 };
1216 
nextBGZFBlockFromFile(BGZF & b)1217 uint16_t BAMPatternSource::nextBGZFBlockFromFile(BGZF& b) {
1218         if (fread(&b.hdr, sizeof(b.hdr), 1, fp_) != 1) {
1219 		if (feof(fp_))
1220 			return 0;
1221                 std::cerr << "Error while reading BAM header" << std::endl;
1222                 exit(EXIT_FAILURE);
1223         }
1224 	if (currentlyBigEndian()) {
1225 		b.hdr.mtime = endianSwapU32(b.hdr.mtime);
1226 		b.hdr.xlen  = endianSwapU16(b.hdr.xlen);
1227 	}
1228 	uint8_t *extra = new uint8_t[b.hdr.xlen];
1229         if (fread(extra, b.hdr.xlen, 1, fp_) != 1) {
1230                 std::cerr << "Error while reading BAM extra subfields" << std::endl;
1231                 exit(EXIT_FAILURE);
1232         }
1233 	if (memcmp(EOF_MARKER, &b.hdr, sizeof(b.hdr)) == 0 &&
1234             memcmp(EOF_MARKER + sizeof(b.hdr), extra, 6 /* sizeof BAM subfield */) == 0)
1235 	{
1236 		delete[] extra;
1237 		return 0;
1238 	}
1239 	uint16_t bsize = 0;
1240 	for (uint16_t i = 0; i < b.hdr.xlen;) {
1241 		if (extra[0] == 66 && extra[1] == 67) {
1242 			bsize = *((uint16_t *)(extra + 4));
1243 			if (currentlyBigEndian())
1244 				bsize = endianSwapU16(bsize);
1245 			bsize -= (b.hdr.xlen + 19);
1246 			break;
1247 		}
1248 		i = i + 2;
1249 		uint16_t sub_field_len = *((uint16_t *)(extra + 2));
1250 		if (currentlyBigEndian())
1251 			sub_field_len = endianSwapU16(sub_field_len);
1252 		i = i + 2 + sub_field_len;
1253 	}
1254 	delete[] extra;
1255 	if (bsize == 0)
1256 		return 0;
1257         if (fread(b.cdata, bsize, 1, fp_) != 1) {
1258                 std::cerr << "Error while reading BAM CDATA (compressed data)" << std::endl;
1259                 exit(EXIT_FAILURE);
1260         }
1261         if (fread(&b.ftr, sizeof(b.ftr), 1, fp_) != 1) {
1262                 std::cerr << "Error while reading BAM footer" << std::endl;
1263                 exit(EXIT_FAILURE);
1264         }
1265 	if (currentlyBigEndian()) {
1266 		b.ftr.crc32 = endianSwapU32(b.ftr.crc32);
1267 		b.ftr.isize = endianSwapU32(b.ftr.isize);
1268 	}
1269 	return bsize;
1270 }
1271 
nextBatch(PerThreadReadBuf & pt,bool batch_a,bool lock)1272 std::pair<bool, int> BAMPatternSource::nextBatch(PerThreadReadBuf& pt, bool batch_a, bool lock) {
1273         bool done = false;
1274 	uint16_t cdata_len;
1275 	unsigned nread = 0;
1276 
1277 	do {
1278 		if (bam_batch_indexes_[pt.tid_] >= bam_batches_[pt.tid_].size()) {
1279 			BGZF block;
1280 			std::vector<uint8_t>& batch = bam_batches_[pt.tid_];
1281 			if (lock) {
1282 				ThreadSafe ts(mutex);
1283 				if (first_) {
1284                                         // parse the BAM header;
1285 					nextBGZFBlockFromFile(block);
1286 					first_ = false;
1287 				}
1288 				cdata_len = nextBGZFBlockFromFile(block);
1289 			} else {
1290 				if (first_) {
1291 					// parse the BAM header
1292 					nextBGZFBlockFromFile(block);
1293 					first_ = false;
1294 				}
1295 				cdata_len = nextBGZFBlockFromFile(block);
1296 			}
1297 			if (cdata_len == 0) {
1298 				done = nread == 0;
1299 				break;
1300 			}
1301 			bam_batch_indexes_[pt.tid_] = 0;
1302 			batch.resize(block.ftr.isize);
1303 			int ret_code = decompress_bgzf_block(&batch[0], block.ftr.isize, block.cdata, cdata_len);
1304 			if (ret_code != Z_OK) {
1305 				return make_pair(true, 0);
1306 			}
1307 			uLong crc = crc32(0L, Z_NULL, 0);
1308 			crc = crc32(crc, &batch[0], batch.size());
1309 			assert(crc == block.ftr.crc32);
1310 		}
1311 		std::pair<bool, int> ret = get_alignments(pt, batch_a, nread, lock);
1312 		done = ret.first;
1313 	} while (!done && nread < pt.max_buf_);
1314 
1315 	if (lock) {
1316 		ThreadSafe ts(mutex);
1317 		pt.setReadId(readCnt_);
1318 		readCnt_ += nread;
1319 	} else {
1320 		pt.setReadId(readCnt_);
1321 		readCnt_ += nread;
1322 	}
1323 	return make_pair(done, nread);
1324 }
1325 
get_alignments(PerThreadReadBuf & pt,bool batch_a,unsigned & readi,bool lock)1326 std::pair<bool, int> BAMPatternSource::get_alignments(PerThreadReadBuf& pt, bool batch_a, unsigned& readi, bool lock) {
1327 	size_t& i = bam_batch_indexes_[pt.tid_];
1328 	bool done = false;
1329 	bool read1 = true;
1330 
1331 	while (readi < pt.max_buf_) {
1332 		if (i >= bam_batches_[pt.tid_].size()) {
1333 			return make_pair(false, readi);
1334 		}
1335 
1336 		uint16_t flag;
1337 		int32_t block_size;
1338 		EList<Read>& readbuf = pp_.align_paired_reads && !read1 ? pt.bufb_ : pt.bufa_;
1339 
1340 		memcpy(&block_size, &bam_batches_[pt.tid_][0] + i, sizeof(block_size));
1341 		if (currentlyBigEndian())
1342 			block_size = endianSwapU32(block_size);
1343 		if (block_size <= 0) {
1344 			return make_pair(done, readi);
1345 		}
1346 		i += sizeof(block_size);
1347 
1348 		memcpy(&flag, &bam_batches_[pt.tid_][0] + i + offset[BAMField::flag], sizeof(flag));
1349 		if (currentlyBigEndian())
1350 			flag = endianSwapU16(flag);
1351 		if (!pp_.align_paired_reads && ((flag & 0x40) != 0 || (flag & 0x80) != 0)) {
1352 			readbuf[readi].readOrigBuf.clear();
1353 			i += block_size;
1354 			continue;
1355 		}
1356 
1357 		if (pp_.align_paired_reads && ((flag & 0x40) == 0 && (flag & 0x80) == 0)) {
1358 			readbuf[readi].readOrigBuf.clear();
1359 			i += block_size;
1360 			continue;
1361 		}
1362 
1363 		if (pp_.align_paired_reads &&
1364                     (((flag & 0x40) != 0 && i + block_size == bam_batches_[pt.tid_].size()) ||
1365                      ((flag & 0x80) != 0 && i == sizeof(block_size))))
1366 		{
1367 			if (lock) {
1368 				ThreadSafe ts(orphan_mates_mutex_);
1369 				get_or_store_orhaned_mate(pt.bufa_, pt.bufb_, readi, &bam_batches_[pt.tid_][0] + i, block_size);
1370 				i += block_size;
1371 			} else {
1372 				get_or_store_orhaned_mate(pt.bufa_, pt.bufb_, readi, &bam_batches_[pt.tid_][0] + i, block_size);
1373 				i += block_size;
1374 			}
1375 
1376 		} else {
1377 			readbuf[readi].readOrigBuf.resize(block_size);
1378 
1379 			memcpy(readbuf[readi].readOrigBuf.wbuf(), &bam_batches_[pt.tid_][0] + i, block_size);
1380 			i += block_size;
1381 
1382 			read1 = !read1;
1383 			readi = (pp_.align_paired_reads &&
1384 				 pt.bufb_[readi].readOrigBuf.length() == 0) ? readi : readi + 1;
1385 		}
1386 	}
1387 
1388 	return make_pair(done, readi);
1389 }
1390 
get_or_store_orhaned_mate(EList<Read> & buf_a,EList<Read> & buf_b,unsigned & readi,const uint8_t * mate,size_t mate_len)1391 void BAMPatternSource::get_or_store_orhaned_mate(EList<Read>& buf_a, EList<Read>& buf_b, unsigned& readi, const uint8_t *mate, size_t mate_len) {
1392 	const char *read_name =
1393 		(const char *)(mate + offset[BAMField::read_name]);
1394 	size_t i;
1395 	uint32_t hash = hash_str(read_name);
1396 	orphan_mate_t *empty_slot = NULL;
1397 
1398 	for (i = 0; i < orphan_mates.size(); i++) {
1399 		if (empty_slot == NULL && orphan_mates[i].empty())
1400 			empty_slot = &orphan_mates[i];
1401 		if (orphan_mates[i].hash == hash)
1402 			break;
1403 	}
1404 	if (i == orphan_mates.size()) {
1405 		// vector is full
1406 		if (empty_slot == NULL) {
1407 			orphan_mates.push_back(orphan_mate_t());
1408 			empty_slot = &orphan_mates.back();
1409 		}
1410 		empty_slot->hash = hash;
1411 		if (empty_slot->cap < mate_len) {
1412 			delete[] empty_slot->data;
1413 			empty_slot->data = NULL;
1414 		}
1415 		if (empty_slot->data == NULL) {
1416 			empty_slot->data = new uint8_t[mate_len];
1417 			empty_slot->cap = mate_len;
1418 		}
1419 		memcpy(empty_slot->data, mate, mate_len);
1420 		empty_slot->size = mate_len;
1421 	} else {
1422 		uint8_t flag;
1423 		Read& ra = buf_a[readi];
1424 		Read& rb = buf_b[readi];
1425 
1426 		memcpy(&flag, mate + offset[BAMField::flag], sizeof(flag));
1427 		if ((flag & 0x40) != 0) {
1428 			ra.readOrigBuf.resize(mate_len);
1429 			memcpy(ra.readOrigBuf.wbuf(), mate, mate_len);
1430 			rb.readOrigBuf.resize(orphan_mates[i].size);
1431 			memcpy(rb.readOrigBuf.wbuf(), orphan_mates[i].data, orphan_mates[i].size);
1432 		} else {
1433 			rb.readOrigBuf.resize(mate_len);
1434 			memcpy(rb.readOrigBuf.wbuf(), mate, mate_len);
1435 			ra.readOrigBuf.resize(orphan_mates[i].size);
1436 			memcpy(ra.readOrigBuf.wbuf(), orphan_mates[i].data, orphan_mates[i].size);
1437 		}
1438 		readi++;
1439 		orphan_mates[i].reset();
1440 	}
1441 }
1442 
decompress_bgzf_block(uint8_t * dst,size_t dst_len,uint8_t * src,size_t src_len)1443 int BAMPatternSource::decompress_bgzf_block(uint8_t *dst, size_t dst_len, uint8_t *src, size_t src_len) {
1444 	z_stream strm;
1445 
1446 	strm.zalloc = Z_NULL;
1447 	strm.zfree = Z_NULL;
1448 	strm.opaque = Z_NULL;
1449 
1450 	strm.avail_in = src_len;
1451 	strm.next_in = src;
1452 	strm.avail_out = dst_len;
1453 	strm.next_out = dst;
1454 
1455 	int ret  = inflateInit2(&strm, -8);
1456 	if (ret != Z_OK) {
1457 		return ret;
1458 	}
1459 
1460 	ret = inflate(&strm, Z_FINISH);
1461 	if (ret != Z_STREAM_END) {
1462 		return ret;
1463 	}
1464 
1465 	return inflateEnd(&strm);
1466 }
1467 
parse(Read & ra,Read & rb,TReadId rdid) const1468 bool BAMPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
1469 	uint8_t l_read_name;
1470 	int32_t l_seq;
1471 	uint16_t n_cigar_op;
1472 	const char* buf = ra.readOrigBuf.buf();
1473 	int block_size = ra.readOrigBuf.length();
1474 
1475 	memcpy(&l_read_name, buf + offset[BAMField::l_read_name], sizeof(l_read_name));
1476 	memcpy(&n_cigar_op, buf + offset[BAMField::n_cigar_op], sizeof(n_cigar_op));
1477 	memcpy(&l_seq, buf + offset[BAMField::l_seq], sizeof(l_seq));
1478 	if (currentlyBigEndian()) {
1479 		n_cigar_op = endianSwapU16(n_cigar_op);
1480 		l_seq = endianSwapU32(l_seq);
1481 	}
1482 
1483 	int off = offset[BAMField::read_name];
1484 	ra.name.install(buf + off, l_read_name-1);
1485 	off += (l_read_name + sizeof(uint32_t) * n_cigar_op);
1486 	const char* seq = buf + off;
1487 	off += (l_seq+1)/2;
1488 	const char* qual = buf + off;
1489 	for (int i = 0; i < l_seq; i++) {
1490 		if (i < pp_.trim5) {
1491 			ra.trimmed5 += 1;
1492 		} else {
1493 			ra.qual.append(qual[i] + 33);
1494 			int base = "=ACMGRSVTWYHKDBN"[static_cast<uint8_t>(seq[i/2]) >> 4*(1-(i%2)) & 0xf];
1495 			ra.patFw.append(asc2dna[base]);
1496 		}
1497 	}
1498 	ra.trimmed3 = (int)(ra.patFw.trimEnd(pp_.trim3));
1499 	ra.qual.trimEnd(ra.trimmed3);
1500 
1501 	if (pp_.preserve_tags) {
1502 		off += l_seq;
1503 		ra.preservedOptFlags.install(buf + off, block_size - off);
1504 	}
1505 
1506 	ra.parsed = true;
1507 	if (!rb.parsed && rb.readOrigBuf.length() != 0) {
1508 		return parse(rb, ra, rdid);
1509 	}
1510 
1511 	return true;
1512 }
1513 
1514 /**
1515  * Light-parse a batch of tabbed-format reads into given buffer.
1516  */
nextBatchFromFile(PerThreadReadBuf & pt,bool batch_a,unsigned readi)1517 pair<bool, int> TabbedPatternSource::nextBatchFromFile(
1518 	PerThreadReadBuf& pt,
1519 	bool batch_a, unsigned readi)
1520 {
1521 	int c = getc_wrapper();
1522 	while(c >= 0 && (c == '\n' || c == '\r')) {
1523 		c = getc_wrapper();
1524 	}
1525 	EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
1526 	// Read until we run out of input or until we've filled the buffer
1527 	for(; readi < pt.max_buf_ && c >= 0; readi++) {
1528 		readbuf[readi].readOrigBuf.clear();
1529 		while(c >= 0 && c != '\n' && c != '\r') {
1530 			readbuf[readi].readOrigBuf.append(c);
1531 			c = getc_wrapper();
1532 		}
1533 		while(c >= 0 && (c == '\n' || c == '\r') && readi < pt.max_buf_ - 1) {
1534 			c = getc_wrapper();
1535 		}
1536 	}
1537 	return make_pair(c < 0, readi);
1538 }
1539 
1540 /**
1541  * Finalize tabbed parsing outside critical section.
1542  */
parse(Read & ra,Read & rb,TReadId rdid) const1543 bool TabbedPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
1544 	// Light parser (nextBatchFromFile) puts unparsed data
1545 	// into Read& r, even when the read is paired.
1546 	assert(ra.empty());
1547 	assert(rb.empty());
1548 	assert(!ra.readOrigBuf.empty()); // raw data for read/pair is here
1549 	assert(rb.readOrigBuf.empty());
1550 	int c = '\t';
1551 	size_t cur = 0;
1552 	const size_t buflen = ra.readOrigBuf.length();
1553 
1554 	// Loop over the two ends
1555 	for(int endi = 0; endi < 2 && c == '\t'; endi++) {
1556 		Read& r = ((endi == 0) ? ra : rb);
1557 		assert(r.name.empty());
1558 		// Parse name if (a) this is the first end, or
1559 		// (b) this is tab6
1560 		if(endi < 1 || secondName_) {
1561 			// Parse read name
1562 			c = ra.readOrigBuf[cur++];
1563 			while(c != '\t' && cur < buflen) {
1564 				r.name.append(c);
1565 				c = ra.readOrigBuf[cur++];
1566 			}
1567 			assert_eq('\t', c);
1568 			if(cur >= buflen) {
1569 				return false; // record ended prematurely
1570 			}
1571 		} else if(endi > 0) {
1572 			// if this is the second end and we're parsing
1573 			// tab5, copy name from first end
1574 			rb.name = ra.name;
1575 		}
1576 
1577 		// Parse sequence
1578 		assert(r.patFw.empty());
1579 		c = ra.readOrigBuf[cur++];
1580 		int nchar = 0;
1581 		while(c != '\t' && cur < buflen) {
1582 			if(isalpha(c)) {
1583 				assert_in(toupper(c), "ACGTN");
1584 				if(nchar++ >= pp_.trim5) {
1585 					assert_neq(0, asc2dnacat[c]);
1586 					r.patFw.append(asc2dna[c]); // ascii to int
1587 				}
1588 			}
1589 			c = ra.readOrigBuf[cur++];
1590 		}
1591 		assert_eq('\t', c);
1592 		if(cur >= buflen) {
1593 			return false; // record ended prematurely
1594 		}
1595 		// record amt trimmed from 5' end due to --trim5
1596 		r.trimmed5 = (int)(nchar - r.patFw.length());
1597 		// record amt trimmed from 3' end due to --trim3
1598 		r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
1599 
1600 		// Parse qualities
1601 		assert(r.qual.empty());
1602 		c = ra.readOrigBuf[cur++];
1603 		int nqual = 0;
1604 		if (pp_.intQuals) {
1605 			int cur_int = 0;
1606 			while(c != '\t' && c != '\n' && c != '\r' && cur < buflen) {
1607 				cur_int *= 10;
1608 				cur_int += (int)(c - '0');
1609 				c = ra.readOrigBuf[cur++];
1610 				if(c == ' ' || c == '\t' || c == '\n' || c == '\r') {
1611 					char cadd = intToPhred33(cur_int, pp_.solexa64);
1612 					cur_int = 0;
1613 					assert_geq(cadd, 33);
1614 					if(++nqual > pp_.trim5) {
1615 						r.qual.append(cadd);
1616 					}
1617 				}
1618 			}
1619 		} else {
1620 			while(c != '\t' && c != '\n' && c != '\r') {
1621 				if(c == ' ') {
1622 					wrongQualityFormat(r.name);
1623 					return false;
1624 				}
1625 				char cadd = charToPhred33(c, pp_.solexa64, pp_.phred64);
1626 				if(++nqual > pp_.trim5) {
1627 					r.qual.append(cadd);
1628 				}
1629 				if(cur >= buflen) break;
1630 				c = ra.readOrigBuf[cur++];
1631 			}
1632 		}
1633 		if(nchar > nqual) {
1634 			tooFewQualities(r.name);
1635 			return false;
1636 		} else if(nqual > nchar) {
1637 			tooManyQualities(r.name);
1638 			return false;
1639 		}
1640 		r.qual.trimEnd(pp_.trim3);
1641 		assert(c == '\t' || c == '\n' || c == '\r' || cur >= buflen);
1642 		assert_eq(r.patFw.length(), r.qual.length());
1643 	}
1644 	return true;
1645 }
1646 
1647 /**
1648  * Light-parse a batch of raw-format reads into given buffer.
1649  */
nextBatchFromFile(PerThreadReadBuf & pt,bool batch_a,unsigned readi)1650 pair<bool, int> RawPatternSource::nextBatchFromFile(
1651 	PerThreadReadBuf& pt,
1652 	bool batch_a,
1653     unsigned readi)
1654 {
1655 	int c = getc_wrapper();
1656 	while(c >= 0 && (c == '\n' || c == '\r')) {
1657 		c = getc_wrapper();
1658 	}
1659 	EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
1660 	// Read until we run out of input or until we've filled the buffer
1661 	for(; readi < pt.max_buf_ && c >= 0; readi++) {
1662 		readbuf[readi].readOrigBuf.clear();
1663 		while(c >= 0 && c != '\n' && c != '\r') {
1664 			readbuf[readi].readOrigBuf.append(c);
1665 			c = getc_wrapper();
1666 		}
1667 		while(c >= 0 && (c == '\n' || c == '\r')) {
1668 			c = getc_wrapper();
1669 		}
1670 	}
1671 	// incase a valid character is consumed between batches
1672 	if (c >= 0 && c != '\n' && c != '\r') {
1673 		ungetc_wrapper(c);
1674 	}
1675 	return make_pair(c < 0, readi);
1676 }
1677 
1678 /**
1679  * Finalize raw parsing outside critical section.
1680  */
parse(Read & r,Read & rb,TReadId rdid) const1681 bool RawPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
1682 	assert(r.empty());
1683 	assert(!r.readOrigBuf.empty()); // raw data for read/pair is here
1684 	int c = '\n';
1685 	size_t cur = 0;
1686 	const size_t buflen = r.readOrigBuf.length();
1687 
1688 	// Parse sequence
1689 	assert(r.patFw.empty());
1690 	int nchar = 0;
1691 	while(cur < buflen) {
1692 		c = r.readOrigBuf[cur++];
1693 		assert(c != '\r' && c != '\n');
1694 		if(isalpha(c)) {
1695 			assert_in(toupper(c), "ACGTN");
1696 			if(nchar++ >= pp_.trim5) {
1697 				assert_neq(0, asc2dnacat[c]);
1698 				r.patFw.append(asc2dna[c]); // ascii to int
1699 			}
1700 		}
1701 	}
1702 	assert_eq(cur, buflen);
1703 	// record amt trimmed from 5' end due to --trim5
1704 	r.trimmed5 = (int)(nchar - r.patFw.length());
1705 	// record amt trimmed from 3' end due to --trim3
1706 	r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
1707 
1708 	// Give the name field a dummy value
1709 	char cbuf[20];
1710 	itoa10<TReadId>(rdid, cbuf);
1711 	r.name.install(cbuf);
1712 
1713 	// Give the base qualities dummy values
1714 	assert(r.qual.empty());
1715 	const size_t len = r.patFw.length();
1716 	for(size_t i = 0; i < len; i++) {
1717 		r.qual.append('I');
1718 	}
1719 	r.parsed = true;
1720 	if(!rb.parsed && !rb.readOrigBuf.empty()) {
1721 		return parse(rb, r, rdid);
1722 	}
1723 	return true;
1724 }
1725 
1726 
wrongQualityFormat(const BTString & read_name)1727 void wrongQualityFormat(const BTString& read_name) {
1728 	cerr << "Error: Encountered one or more spaces while parsing the quality "
1729 		 << "string for read " << read_name << ".  If this is a FASTQ file "
1730 		 << "with integer (non-ASCII-encoded) qualities, try re-running with "
1731 		 << "the --integer-quals option." << endl;
1732 	throw 1;
1733 }
1734 
tooFewQualities(const BTString & read_name)1735 void tooFewQualities(const BTString& read_name) {
1736 	cerr << "Error: Read " << read_name << " has more read characters than "
1737 		 << "quality values." << endl;
1738 	throw 1;
1739 }
1740 
tooManyQualities(const BTString & read_name)1741 void tooManyQualities(const BTString& read_name) {
1742 	cerr << "Error: Read " << read_name << " has more quality values than read "
1743 		 << "characters." << endl;
1744 	throw 1;
1745 }
1746 
1747 #ifdef USE_SRA
1748 
nextBatchImpl(PerThreadReadBuf & pt,bool batch_a)1749 std::pair<bool, int> SRAPatternSource::nextBatchImpl(
1750 	PerThreadReadBuf& pt,
1751 	bool batch_a)
1752 {
1753 	EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
1754 	size_t readi = 0;
1755 	bool done = false;
1756 
1757 	for(; readi < pt.max_buf_; readi++) {
1758 		if(!sra_its_[pt.tid_]->nextRead() || !sra_its_[pt.tid_]->nextFragment()) {
1759 			done = true;
1760 			break;
1761 		}
1762 		const ngs::StringRef rname = sra_its_[pt.tid_]->getReadId();
1763 		const ngs::StringRef ra_seq = sra_its_[pt.tid_]->getFragmentBases();
1764 		const ngs::StringRef ra_qual = sra_its_[pt.tid_]->getFragmentQualities();
1765 		readbuf[readi].readOrigBuf.install(rname.data(), rname.size());
1766 		readbuf[readi].readOrigBuf.append('\t');
1767 		readbuf[readi].readOrigBuf.append(ra_seq.data(), ra_seq.size());
1768 		readbuf[readi].readOrigBuf.append('\t');
1769 		readbuf[readi].readOrigBuf.append(ra_qual.data(), ra_qual.size());
1770 		if(sra_its_[pt.tid_]->nextFragment()) {
1771 			const ngs::StringRef rb_seq = sra_its_[pt.tid_]->getFragmentBases();
1772 			const ngs::StringRef rb_qual = sra_its_[pt.tid_]->getFragmentQualities();
1773 			readbuf[readi].readOrigBuf.append('\t');
1774 			readbuf[readi].readOrigBuf.append(rb_seq.data(), rb_seq.size());
1775 			readbuf[readi].readOrigBuf.append('\t');
1776 			readbuf[readi].readOrigBuf.append(rb_qual.data(), rb_qual.size());
1777 		}
1778 		readbuf[readi].readOrigBuf.append('\n');
1779 	}
1780 
1781 	pt.setReadId(readCnt_);
1782 
1783 	{
1784 		ThreadSafe ts(mutex);
1785 		readCnt_ += readi;
1786 	}
1787 
1788 	return make_pair(done, readi);
1789 }
1790 
1791 /**
1792  * TODO: need to think about whether this can be done in a sensible way
1793  */
nextBatch(PerThreadReadBuf & pt,bool batch_a,bool lock)1794 std::pair<bool, int> SRAPatternSource::nextBatch(
1795 	PerThreadReadBuf& pt,
1796 	bool batch_a,
1797 	bool lock)
1798 {
1799 	if(lock) {
1800 		// synchronization at this level because both reading and manipulation of
1801 		// current file pointer have to be protected
1802 		return nextBatchImpl(pt, batch_a);
1803 	} else {
1804 		return nextBatchImpl(pt, batch_a);
1805 	}
1806 }
1807 
1808 /**
1809  * Finalize tabbed parsing outside critical section.
1810  */
parse(Read & ra,Read & rb,TReadId rdid) const1811 bool SRAPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
1812 	// Light parser (nextBatchFromFile) puts unparsed data
1813 	// into Read& r, even when the read is paired.
1814 	assert(ra.empty());
1815 	assert(rb.empty());
1816 	assert(!ra.readOrigBuf.empty()); // raw data for read/pair is here
1817 	assert(rb.readOrigBuf.empty());
1818 	int c = '\t';
1819 	size_t cur = 0;
1820 	const size_t buflen = ra.readOrigBuf.length();
1821 
1822 	// Loop over the two ends
1823 	for(int endi = 0; endi < 2 && c == '\t'; endi++) {
1824 		Read& r = ((endi == 0) ? ra : rb);
1825 		assert(r.name.empty());
1826 		// Parse name if (a) this is the first end, or
1827 		// (b) this is tab6
1828 		if(endi < 1) {
1829 			// Parse read name
1830 			c = ra.readOrigBuf[cur++];
1831 			while(c != '\t' && cur < buflen) {
1832 				r.name.append(c);
1833 				c = ra.readOrigBuf[cur++];
1834 			}
1835 			assert_eq('\t', c);
1836 			if(cur >= buflen) {
1837 				return false; // record ended prematurely
1838 			}
1839 		} else if(endi > 0) {
1840 			// if this is the second end and we're parsing
1841 			// tab5, copy name from first end
1842 			rb.name = ra.name;
1843 		}
1844 
1845 		// Parse sequence
1846 		assert(r.patFw.empty());
1847 		c = ra.readOrigBuf[cur++];
1848 		int nchar = 0;
1849 		while(c != '\t' && cur < buflen) {
1850 			if(isalpha(c)) {
1851 				assert_in(toupper(c), "ACGTN");
1852 				if(nchar++ >= pp_.trim5) {
1853 					assert_neq(0, asc2dnacat[c]);
1854 					r.patFw.append(asc2dna[c]); // ascii to int
1855 				}
1856 			}
1857 			c = ra.readOrigBuf[cur++];
1858 		}
1859 		assert_eq('\t', c);
1860 		if(cur >= buflen) {
1861 			return false; // record ended prematurely
1862 		}
1863 		// record amt trimmed from 5' end due to --trim5
1864 		r.trimmed5 = (int)(nchar - r.patFw.length());
1865 		// record amt trimmed from 3' end due to --trim3
1866 		r.trimmed3 = (int)(r.patFw.trimEnd(pp_.trim3));
1867 
1868 		// Parse qualities
1869 		assert(r.qual.empty());
1870 		c = ra.readOrigBuf[cur++];
1871 		int nqual = 0;
1872 		if (pp_.intQuals) {
1873 			int cur_int = 0;
1874 			while(c != '\t' && c != '\n' && c != '\r' && cur < buflen) {
1875 				cur_int *= 10;
1876 				cur_int += (int)(c - '0');
1877 				c = ra.readOrigBuf[cur++];
1878 				if(c == ' ' || c == '\t' || c == '\n' || c == '\r') {
1879 					char cadd = intToPhred33(cur_int, pp_.solexa64);
1880 					cur_int = 0;
1881 					assert_geq(cadd, 33);
1882 					if(++nqual > pp_.trim5) {
1883 						r.qual.append(cadd);
1884 					}
1885 				}
1886 			}
1887 		} else {
1888 			while(c != '\t' && c != '\n' && c != '\r') {
1889 				if(c == ' ') {
1890 					wrongQualityFormat(r.name);
1891 					return false;
1892 				}
1893 				char cadd = charToPhred33(c, pp_.solexa64, pp_.phred64);
1894 				if(++nqual > pp_.trim5) {
1895 					r.qual.append(cadd);
1896 				}
1897 				if(cur >= buflen) break;
1898 				c = ra.readOrigBuf[cur++];
1899 			}
1900 		}
1901 		if(nchar > nqual) {
1902 			tooFewQualities(r.name);
1903 			return false;
1904 		} else if(nqual > nchar) {
1905 			tooManyQualities(r.name);
1906 			return false;
1907 		}
1908 		r.qual.trimEnd(pp_.trim3);
1909 		assert(c == '\t' || c == '\n' || c == '\r' || cur >= buflen);
1910 		assert_eq(r.patFw.length(), r.qual.length());
1911 	}
1912 	return true;
1913 }
1914 
open()1915 void SRAPatternSource::open() {
1916 	const string& sra_acc = sra_accs_[sra_acc_cur_];
1917 	string version = "bowtie2";
1918 	ncbi::NGS::setAppVersionString(version);
1919 	assert(!sra_acc.empty());
1920 	try {
1921 		// open requested accession using SRA implementation of the API
1922 		ngs::ReadCollection sra_run = ncbi::NGS::openReadCollection(sra_acc);
1923 
1924 		// compute window to iterate through
1925 		size_t MAX_ROW = sra_run.getReadCount();
1926 		pp_.upto -= pp_.skip;
1927 
1928 		if (pp_.upto <= MAX_ROW) {
1929 			MAX_ROW = pp_.upto;
1930 		}
1931 		if(MAX_ROW < 0) {
1932 			return;
1933 		}
1934 
1935 		size_t window_size = MAX_ROW / sra_its_.size();
1936 		size_t remainder = MAX_ROW % sra_its_.size();
1937 		size_t i = 0, start = 1;
1938 
1939 		if (pp_.skip > 0) {
1940 			start = pp_.skip + 1;
1941 			readCnt_ = pp_.skip;
1942 		}
1943 
1944 		while (i < sra_its_.size()) {
1945 			sra_its_[i] = new ngs::ReadIterator(sra_run.getReadRange(start, window_size, ngs::Read::all));
1946 			assert(sra_its_[i] != NULL);
1947 
1948 			i++;
1949 			start += window_size;
1950 			if (i == sra_its_.size() - 1) {
1951 				window_size += remainder;
1952 			}
1953 		}
1954 
1955 	} catch(...) {
1956 		cerr << "Warning: Could not access \"" << sra_acc << "\" for reading; skipping..." << endl;
1957 	}
1958 }
1959 
1960 #endif
1961