1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #ifndef PAT_H_
21 #define PAT_H_
22 
23 #include <cassert>
24 #include <cmath>
25 #include <stdexcept>
26 #include <vector>
27 #include <string>
28 #include <cstring>
29 #include <ctype.h>
30 #include <fstream>
31 #include "alphabet.h"
32 #include "assert_helpers.h"
33 #include "tokenize.h"
34 #include "random_source.h"
35 #include "threading.h"
36 #include "filebuf.h"
37 #include "qual.h"
38 #include "search_globals.h"
39 #include "sstring.h"
40 #include "ds.h"
41 #include "read.h"
42 #include "util.h"
43 
44 /**
45  * Classes and routines for reading reads from various input sources.
46  */
47 
48 using namespace std;
49 
50 /**
51  * Calculate a per-read random seed based on a combination of
52  * the read data (incl. sequence, name, quals) and the global
53  * seed in '_randSeed'.
54  */
genRandSeed(const BTDnaString & qry,const BTString & qual,const BTString & name,uint32_t seed)55 static inline uint32_t genRandSeed(const BTDnaString& qry,
56                                    const BTString& qual,
57                                    const BTString& name,
58                                    uint32_t seed)
59 {
60 	// Calculate a per-read random seed based on a combination of
61 	// the read data (incl. sequence, name, quals) and the global
62 	// seed
63 	uint32_t rseed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83;
64 	size_t qlen = qry.length();
65 	// Throw all the characters of the read into the random seed
66 	for(size_t i = 0; i < qlen; i++) {
67 		int p = (int)qry[i];
68 		assert_leq(p, 4);
69 		size_t off = ((i & 15) << 1);
70 		rseed ^= (p << off);
71 	}
72 	// Throw all the quality values for the read into the random
73 	// seed
74 	for(size_t i = 0; i < qlen; i++) {
75 		int p = (int)qual[i];
76 		assert_leq(p, 255);
77 		size_t off = ((i & 3) << 3);
78 		rseed ^= (p << off);
79 	}
80 	// Throw all the characters in the read name into the random
81 	// seed
82 	size_t namelen = name.length();
83 	for(size_t i = 0; i < namelen; i++) {
84 		int p = (int)name[i];
85 		if(p == '/') break;
86 		assert_leq(p, 255);
87 		size_t off = ((i & 3) << 3);
88 		rseed ^= (p << off);
89 	}
90 	return rseed;
91 }
92 
93 /**
94  * Parameters affecting how reads and read in.
95  */
96 struct PatternParams {
PatternParamsPatternParams97 	PatternParams(
98 		int format_,
99 		bool fileParallel_,
100 		uint32_t seed_,
101 		bool useSpinlock_,
102 		bool solexa64_,
103 		bool phred64_,
104 		bool intQuals_,
105 		bool fuzzy_,
106 		int sampleLen_,
107 		int sampleFreq_,
108 		uint32_t skip_) :
109 		format(format_),
110 		fileParallel(fileParallel_),
111 		seed(seed_),
112 		useSpinlock(useSpinlock_),
113 		solexa64(solexa64_),
114 		phred64(phred64_),
115 		intQuals(intQuals_),
116 		fuzzy(fuzzy_),
117 		sampleLen(sampleLen_),
118 		sampleFreq(sampleFreq_),
119 		skip(skip_) { }
120 
121 	int format;           // file format
122 	bool fileParallel;    // true -> wrap files with separate PairedPatternSources
123 	uint32_t seed;        // pseudo-random seed
124 	bool useSpinlock;     // use spin locks instead of pthreads
125 	bool solexa64;        // true -> qualities are on solexa64 scale
126 	bool phred64;         // true -> qualities are on phred64 scale
127 	bool intQuals;        // true -> qualities are space-separated numbers
128 	bool fuzzy;           // true -> try to parse fuzzy fastq
129 	int sampleLen;        // length of sampled reads for FastaContinuous...
130 	int sampleFreq;       // frequency of sampled reads for FastaContinuous...
131 	uint32_t skip;        // skip the first 'skip' patterns
132 };
133 
134 /**
135  * Encapsulates a synchronized source of patterns; usually a file.
136  * Optionally reverses reads and quality strings before returning them,
137  * though that is usually more efficiently done by the concrete
138  * subclass.  Concrete subclasses should delimit critical sections with
139  * calls to lock() and unlock().
140  */
141 class PatternSource {
142 
143 public:
144 
PatternSource(const PatternParams & p)145 	PatternSource(const PatternParams& p) :
146 		seed_(p.seed),
147 		readCnt_(0),
148 		numWrappers_(0),
149 		doLocking_(true),
150 		useSpinlock_(p.useSpinlock),
151 		mutex()
152 	{
153 	}
154 
~PatternSource()155 	virtual ~PatternSource() { }
156 
157 	/**
158 	 * Call this whenever this PatternSource is wrapped by a new
159 	 * WrappedPatternSourcePerThread.  This helps us keep track of
160 	 * whether locks will be contended.
161 	 */
addWrapper()162 	void addWrapper() {
163 		lock();
164 		numWrappers_++;
165 		unlock();
166 	}
167 
168 	/**
169 	 * The main member function for dispensing patterns.
170 	 *
171 	 * Returns true iff a pair was parsed succesfully.
172 	 */
173 	virtual bool nextReadPair(
174 		Read& ra,
175 		Read& rb,
176 		TReadId& rdid,
177 		TReadId& endid,
178 		bool& success,
179 		bool& done,
180 		bool& paired,
181 		bool fixName);
182 
183 	/**
184 	 * The main member function for dispensing patterns.
185 	 */
186 	virtual bool nextRead(
187 		Read& r,
188 		TReadId& rdid,
189 		TReadId& endid,
190 		bool& success,
191 		bool& done);
192 
193 	/**
194 	 * Implementation to be provided by concrete subclasses.  An
195 	 * implementation for this member is only relevant for formats that
196 	 * can read in a pair of reads in a single transaction with a
197 	 * single input source.  If paired-end input is given as a pair of
198 	 * parallel files, this member should throw an error and exit.
199 	 */
200 	virtual bool nextReadPairImpl(
201 		Read& ra,
202 		Read& rb,
203 		TReadId& rdid,
204 		TReadId& endid,
205 		bool& success,
206 		bool& done,
207 		bool& paired) = 0;
208 
209 	/**
210 	 * Implementation to be provided by concrete subclasses.  An
211 	 * implementation for this member is only relevant for formats
212 	 * where individual input sources look like single-end-read
213 	 * sources, e.g., formats where paired-end reads are specified in
214 	 * parallel read files.
215 	 */
216 	virtual bool nextReadImpl(
217 		Read& r,
218 		TReadId& rdid,
219 		TReadId& endid,
220 		bool& success,
221 		bool& done) = 0;
222 
223 	/// Reset state to start over again with the first read
reset()224 	virtual void reset() { readCnt_ = 0; }
225 
226 	/**
227 	 * Concrete subclasses call lock() to enter a critical region.
228 	 * What constitutes a critical region depends on the subclass.
229 	 */
lock()230 	void lock() {
231 		if(!doLocking_) return; // no contention
232         mutex.lock();
233 	}
234 
235 	/**
236 	 * Concrete subclasses call unlock() to exit a critical region
237 	 * What constitutes a critical region depends on the subclass.
238 	 */
unlock()239 	void unlock() {
240 		if(!doLocking_) return; // no contention
241         mutex.unlock();
242 	}
243 
244 	/**
245 	 * Return a new dynamically allocated PatternSource for the given
246 	 * format, using the given list of strings as the filenames to read
247 	 * from or as the sequences themselves (i.e. if -c was used).
248 	 */
249 	static PatternSource* patsrcFromStrings(
250 		const PatternParams& p,
251 		const EList<string>& qs,
252         size_t nthreads = 1);
253 
254 	/**
255 	 * Return the number of reads attempted.
256 	 */
readCnt()257 	TReadId readCnt() const { return readCnt_ - 1; }
258 
259 protected:
260 
261 	uint32_t seed_;
262 
263 	/// The number of reads read by this PatternSource
264 	TReadId readCnt_;
265 
266 	int numWrappers_;      /// # threads that own a wrapper for this PatternSource
267 	bool doLocking_;       /// override whether to lock (true = don't override)
268 	/// User can ask to use the normal pthreads-style lock even if
269 	/// spinlocks is enabled and compiled in.  This is sometimes better
270 	/// if we expect bad I/O latency on some reads.
271 	bool useSpinlock_;
272 	MUTEX_T mutex;
273 };
274 
275 /**
276  * Abstract parent class for synhconized sources of paired-end reads
277  * (and possibly also single-end reads).
278  */
279 class PairedPatternSource {
280 public:
PairedPatternSource(const PatternParams & p)281 	PairedPatternSource(const PatternParams& p) : mutex_m(), seed_(p.seed) {}
~PairedPatternSource()282 	virtual ~PairedPatternSource() { }
283 
284 	virtual void addWrapper() = 0;
285 	virtual void reset() = 0;
286 
287 	virtual bool nextReadPair(
288 		Read& ra,
289 		Read& rb,
290 		TReadId& rdid,
291 		TReadId& endid,
292 		bool& success,
293 		bool& done,
294 		bool& paired,
295 		bool fixName) = 0;
296 
297 	virtual pair<TReadId, TReadId> readCnt() const = 0;
298 
299 	/**
300 	 * Lock this PairedPatternSource, usually because one of its shared
301 	 * fields is being updated.
302 	 */
lock()303 	void lock() {
304 		mutex_m.lock();
305 	}
306 
307 	/**
308 	 * Unlock this PairedPatternSource.
309 	 */
unlock()310 	void unlock() {
311 		mutex_m.unlock();
312 	}
313 
314 	/**
315 	 * Given the values for all of the various arguments used to specify
316 	 * the read and quality input, create a list of pattern sources to
317 	 * dispense them.
318 	 */
319 	static PairedPatternSource* setupPatternSources(
320 		const EList<string>& si,    // singles, from argv
321 		const EList<string>& m1,    // mate1's, from -1 arg
322 		const EList<string>& m2,    // mate2's, from -2 arg
323 		const EList<string>& m12,   // both mates on each line, from --12 arg
324 #ifdef USE_SRA
325         const EList<string>& sra_accs,
326 #endif
327 		const EList<string>& q,     // qualities associated with singles
328 		const EList<string>& q1,    // qualities associated with m1
329 		const EList<string>& q2,    // qualities associated with m2
330 		const PatternParams& p,     // read-in params
331         size_t nthreads,
332 		bool verbose);              // be talkative?
333 
334 protected:
335 
336 	MUTEX_T mutex_m; /// mutex for syncing over critical regions
337 	uint32_t seed_;
338 };
339 
340 /**
341  * Encapsulates a synchronized source of both paired-end reads and
342  * unpaired reads, where the paired-end must come from parallel files.
343  */
344 class PairedSoloPatternSource : public PairedPatternSource {
345 
346 public:
347 
PairedSoloPatternSource(const EList<PatternSource * > * src,const PatternParams & p)348 	PairedSoloPatternSource(
349 		const EList<PatternSource*>* src,
350 		const PatternParams& p) :
351 		PairedPatternSource(p),
352 		cur_(0),
353 		src_(src)
354 	{
355 		assert(src_ != NULL);
356 		for(size_t i = 0; i < src_->size(); i++) {
357 			assert((*src_)[i] != NULL);
358 		}
359 	}
360 
~PairedSoloPatternSource()361 	virtual ~PairedSoloPatternSource() { delete src_; }
362 
363 	/**
364 	 * Call this whenever this PairedPatternSource is wrapped by a new
365 	 * WrappedPatternSourcePerThread.  This helps us keep track of
366 	 * whether locks within PatternSources will be contended.
367 	 */
addWrapper()368 	virtual void addWrapper() {
369 		for(size_t i = 0; i < src_->size(); i++) {
370 			(*src_)[i]->addWrapper();
371 		}
372 	}
373 
374 	/**
375 	 * Reset this object and all the PatternSources under it so that
376 	 * the next call to nextReadPair gets the very first read pair.
377 	 */
reset()378 	virtual void reset() {
379 		for(size_t i = 0; i < src_->size(); i++) {
380 			(*src_)[i]->reset();
381 		}
382 		cur_ = 0;
383 	}
384 
385 	/**
386 	 * The main member function for dispensing pairs of reads or
387 	 * singleton reads.  Returns true iff ra and rb contain a new
388 	 * pair; returns false if ra contains a new unpaired read.
389 	 */
390 	virtual bool nextReadPair(
391 		Read& ra,
392 		Read& rb,
393 		TReadId& rdid,
394 		TReadId& endid,
395 		bool& success,
396 		bool& done,
397 		bool& paired,
398 		bool fixName);
399 
400 	/**
401 	 * Return the number of reads attempted.
402 	 */
readCnt()403 	virtual pair<TReadId, TReadId> readCnt() const {
404 		uint64_t ret = 0llu;
405 		for(size_t i = 0; i < src_->size(); i++) ret += (*src_)[i]->readCnt();
406 		return make_pair(ret, 0llu);
407 	}
408 
409 protected:
410 
411 	volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
412 	const EList<PatternSource*>* src_; /// PatternSources for paired-end reads
413 };
414 
415 /**
416  * Encapsulates a synchronized source of both paired-end reads and
417  * unpaired reads, where the paired-end must come from parallel files.
418  */
419 class PairedDualPatternSource : public PairedPatternSource {
420 
421 public:
422 
PairedDualPatternSource(const EList<PatternSource * > * srca,const EList<PatternSource * > * srcb,const PatternParams & p)423 	PairedDualPatternSource(
424 		const EList<PatternSource*>* srca,
425 		const EList<PatternSource*>* srcb,
426 		const PatternParams& p) :
427 		PairedPatternSource(p), cur_(0), srca_(srca), srcb_(srcb)
428 	{
429 		assert(srca_ != NULL);
430 		assert(srcb_ != NULL);
431 		// srca_ and srcb_ must be parallel
432 		assert_eq(srca_->size(), srcb_->size());
433 		for(size_t i = 0; i < srca_->size(); i++) {
434 			// Can't have NULL first-mate sources.  Second-mate sources
435 			// can be NULL, in the case when the corresponding first-
436 			// mate source is unpaired.
437 			assert((*srca_)[i] != NULL);
438 			for(size_t j = 0; j < srcb_->size(); j++) {
439 				assert_neq((*srca_)[i], (*srcb_)[j]);
440 			}
441 		}
442 	}
443 
~PairedDualPatternSource()444 	virtual ~PairedDualPatternSource() {
445 		delete srca_;
446 		delete srcb_;
447 	}
448 
449 	/**
450 	 * Call this whenever this PairedPatternSource is wrapped by a new
451 	 * WrappedPatternSourcePerThread.  This helps us keep track of
452 	 * whether locks within PatternSources will be contended.
453 	 */
addWrapper()454 	virtual void addWrapper() {
455 		for(size_t i = 0; i < srca_->size(); i++) {
456 			(*srca_)[i]->addWrapper();
457 			if((*srcb_)[i] != NULL) {
458 				(*srcb_)[i]->addWrapper();
459 			}
460 		}
461 	}
462 
463 	/**
464 	 * Reset this object and all the PatternSources under it so that
465 	 * the next call to nextReadPair gets the very first read pair.
466 	 */
reset()467 	virtual void reset() {
468 		for(size_t i = 0; i < srca_->size(); i++) {
469 			(*srca_)[i]->reset();
470 			if((*srcb_)[i] != NULL) {
471 				(*srcb_)[i]->reset();
472 			}
473 		}
474 		cur_ = 0;
475 	}
476 
477 	/**
478 	 * The main member function for dispensing pairs of reads or
479 	 * singleton reads.  Returns true iff ra and rb contain a new
480 	 * pair; returns false if ra contains a new unpaired read.
481 	 */
482 	virtual bool nextReadPair(
483 		Read& ra,
484 		Read& rb,
485 		TReadId& rdid,
486 		TReadId& endid,
487 		bool& success,
488 		bool& done,
489 		bool& paired,
490 		bool fixName);
491 
492 	/**
493 	 * Return the number of reads attempted.
494 	 */
495 	virtual pair<TReadId, TReadId> readCnt() const;
496 
497 protected:
498 
499 	volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
500 	const EList<PatternSource*>* srca_; /// PatternSources for 1st mates and/or unpaired reads
501 	const EList<PatternSource*>* srcb_; /// PatternSources for 2nd mates
502 };
503 
504 /**
505  * Encapsulates a single thread's interaction with the PatternSource.
506  * Most notably, this class holds the buffers into which the
507  * PatterSource will write sequences.  This class is *not* threadsafe
508  * - it doesn't need to be since there's one per thread.  PatternSource
509  * is thread-safe.
510  */
511 class PatternSourcePerThread {
512 
513 public:
514 
PatternSourcePerThread()515 	PatternSourcePerThread() :
516 		buf1_(), buf2_(), rdid_(0xffffffff), endid_(0xffffffff) { }
517 
~PatternSourcePerThread()518 	virtual ~PatternSourcePerThread() { }
519 
520 	/**
521 	 * Read the next read pair.
522 	 */
nextReadPair(bool & success,bool & done,bool & paired,bool fixName)523 	virtual bool nextReadPair(
524 		bool& success,
525 		bool& done,
526 		bool& paired,
527 		bool fixName)
528 	{
529 		return success;
530 	}
531 
bufa()532 	Read& bufa()             { return buf1_;    }
bufb()533 	Read& bufb()             { return buf2_;    }
bufa()534 	const Read& bufa() const { return buf1_;    }
bufb()535 	const Read& bufb() const { return buf2_;    }
536 
rdid()537 	TReadId       rdid()  const { return rdid_;  }
endid()538 	TReadId       endid() const { return endid_; }
reset()539 	virtual void  reset()       { rdid_ = endid_ = 0xffffffff;  }
540 
541 	/**
542 	 * Return the length of mate 1 or mate 2.
543 	 */
length(int mate)544 	size_t length(int mate) const {
545 		return (mate == 1) ? buf1_.length() : buf2_.length();
546 	}
547 
548 protected:
549 
550 	Read  buf1_;    // read buffer for mate a
551 	Read  buf2_;    // read buffer for mate b
552 	TReadId rdid_;  // index of read just read
553 	TReadId endid_; // index of read just read
554 };
555 
556 /**
557  * Abstract parent factory for PatternSourcePerThreads.
558  */
559 class PatternSourcePerThreadFactory {
560 public:
~PatternSourcePerThreadFactory()561 	virtual ~PatternSourcePerThreadFactory() { }
562 	virtual PatternSourcePerThread* create() const = 0;
563 	virtual EList<PatternSourcePerThread*>* create(uint32_t n) const = 0;
564 
565 	/// Free memory associated with a pattern source
destroy(PatternSourcePerThread * patsrc)566 	virtual void destroy(PatternSourcePerThread* patsrc) const {
567 		assert(patsrc != NULL);
568 		// Free the PatternSourcePerThread
569 		delete patsrc;
570 	}
571 
572 	/// Free memory associated with a pattern source list
destroy(EList<PatternSourcePerThread * > * patsrcs)573 	virtual void destroy(EList<PatternSourcePerThread*>* patsrcs) const {
574 		assert(patsrcs != NULL);
575 		// Free all of the PatternSourcePerThreads
576 		for(size_t i = 0; i < patsrcs->size(); i++) {
577 			if((*patsrcs)[i] != NULL) {
578 				delete (*patsrcs)[i];
579 				(*patsrcs)[i] = NULL;
580 			}
581 		}
582 		// Free the vector
583 		delete patsrcs;
584 	}
585 };
586 
587 /**
588  * A per-thread wrapper for a PairedPatternSource.
589  */
590 class WrappedPatternSourcePerThread : public PatternSourcePerThread {
591 public:
WrappedPatternSourcePerThread(PairedPatternSource & __patsrc)592 	WrappedPatternSourcePerThread(PairedPatternSource& __patsrc) :
593 		patsrc_(__patsrc)
594 	{
595 		patsrc_.addWrapper();
596 	}
597 
598 	/**
599 	 * Get the next paired or unpaired read from the wrapped
600 	 * PairedPatternSource.
601 	 */
602 	virtual bool nextReadPair(
603 		bool& success,
604 		bool& done,
605 		bool& paired,
606 		bool fixName);
607 
608 private:
609 
610 	/// Container for obtaining paired reads from PatternSources
611 	PairedPatternSource& patsrc_;
612 };
613 
614 /**
615  * Abstract parent factory for PatternSourcePerThreads.
616  */
617 class WrappedPatternSourcePerThreadFactory : public PatternSourcePerThreadFactory {
618 public:
WrappedPatternSourcePerThreadFactory(PairedPatternSource & patsrc)619 	WrappedPatternSourcePerThreadFactory(PairedPatternSource& patsrc) :
620 		patsrc_(patsrc) { }
621 
622 	/**
623 	 * Create a new heap-allocated WrappedPatternSourcePerThreads.
624 	 */
create()625 	virtual PatternSourcePerThread* create() const {
626 		return new WrappedPatternSourcePerThread(patsrc_);
627 	}
628 
629 	/**
630 	 * Create a new heap-allocated vector of heap-allocated
631 	 * WrappedPatternSourcePerThreads.
632 	 */
create(uint32_t n)633 	virtual EList<PatternSourcePerThread*>* create(uint32_t n) const {
634 		EList<PatternSourcePerThread*>* v = new EList<PatternSourcePerThread*>;
635 		for(size_t i = 0; i < n; i++) {
636 			v->push_back(new WrappedPatternSourcePerThread(patsrc_));
637 			assert(v->back() != NULL);
638 		}
639 		return v;
640 	}
641 
642 private:
643 	/// Container for obtaining paired reads from PatternSources
644 	PairedPatternSource& patsrc_;
645 };
646 
647 /// Skip to the end of the current string of newline chars and return
648 /// the first character after the newline chars, or -1 for EOF
getOverNewline(FileBuf & in)649 static inline int getOverNewline(FileBuf& in) {
650 	int c;
651 	while(isspace(c = in.get()));
652 	return c;
653 }
654 
655 /// Skip to the end of the current string of newline chars such that
656 /// the next call to get() returns the first character after the
657 /// whitespace
peekOverNewline(FileBuf & in)658 static inline int peekOverNewline(FileBuf& in) {
659 	while(true) {
660 		int c = in.peek();
661 		if(c != '\r' && c != '\n') {
662 			return c;
663 		}
664 		in.get();
665 	}
666 }
667 
668 /// Skip to the end of the current line; return the first character
669 /// of the next line or -1 for EOF
getToEndOfLine(FileBuf & in)670 static inline int getToEndOfLine(FileBuf& in) {
671 	while(true) {
672 		int c = in.get(); if(c < 0) return -1;
673 		if(c == '\n' || c == '\r') {
674 			while(c == '\n' || c == '\r') {
675 				c = in.get(); if(c < 0) return -1;
676 			}
677 			// c now holds first character of next line
678 			return c;
679 		}
680 	}
681 }
682 
683 /// Skip to the end of the current line such that the next call to
684 /// get() returns the first character on the next line
peekToEndOfLine(FileBuf & in)685 static inline int peekToEndOfLine(FileBuf& in) {
686 	while(true) {
687 		int c = in.get(); if(c < 0) return c;
688 		if(c == '\n' || c == '\r') {
689 			c = in.peek();
690 			while(c == '\n' || c == '\r') {
691 				in.get(); if(c < 0) return c; // consume \r or \n
692 				c = in.peek();
693 			}
694 			// next get() gets first character of next line
695 			return c;
696 		}
697 	}
698 }
699 
700 extern void wrongQualityFormat(const BTString& read_name);
701 extern void tooFewQualities(const BTString& read_name);
702 extern void tooManyQualities(const BTString& read_name);
703 
704 /**
705  * Encapsulates a source of patterns which is an in-memory vector.
706  */
707 class VectorPatternSource : public PatternSource {
708 
709 public:
710 
711 	VectorPatternSource(
712 		const EList<string>& v,
713 		const PatternParams& p);
714 
~VectorPatternSource()715 	virtual ~VectorPatternSource() { }
716 
717 	virtual bool nextReadImpl(
718 		Read& r,
719 		TReadId& rdid,
720 		TReadId& endid,
721 		bool& success,
722 		bool& done);
723 
724 	/**
725 	 * This is unused, but implementation is given for completeness.
726 	 */
727 	virtual bool nextReadPairImpl(
728 		Read& ra,
729 		Read& rb,
730 		TReadId& rdid,
731 		TReadId& endid,
732 		bool& success,
733 		bool& done,
734 		bool& paired);
735 
reset()736 	virtual void reset() {
737 		PatternSource::reset();
738 		cur_ = skip_;
739 		paired_ = false;
740 	}
741 
742 private:
743 
744 	size_t cur_;
745 	uint32_t skip_;
746 	bool paired_;
747 	EList<BTDnaString> v_;  // forward sequences
748 	EList<BTString> quals_; // forward qualities
749 	EList<BTString> names_; // names
750 	EList<int> trimmed3_;   // names
751 	EList<int> trimmed5_;   // names
752 };
753 
754 /**
755  *
756  */
757 class BufferedFilePatternSource : public PatternSource {
758 public:
BufferedFilePatternSource(const EList<string> & infiles,const PatternParams & p)759 	BufferedFilePatternSource(
760 		const EList<string>& infiles,
761 		const PatternParams& p) :
762 		PatternSource(p),
763 		infiles_(infiles),
764 		filecur_(0),
765 		fb_(),
766 		skip_(p.skip),
767 		first_(true)
768 	{
769 		assert_gt(infiles.size(), 0);
770 		errs_.resize(infiles_.size());
771 		errs_.fill(0, infiles_.size(), false);
772 		assert(!fb_.isOpen());
773 		open(); // open first file in the list
774 		filecur_++;
775 	}
776 
~BufferedFilePatternSource()777 	virtual ~BufferedFilePatternSource() {
778 		if(fb_.isOpen()) fb_.close();
779 	}
780 
781 	/**
782 	 * Fill Read with the sequence, quality and name for the next
783 	 * read in the list of read files.  This function gets called by
784 	 * all the search threads, so we must handle synchronization.
785 	 */
nextReadImpl(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)786 	virtual bool nextReadImpl(
787 		Read& r,
788 		TReadId& rdid,
789 		TReadId& endid,
790 		bool& success,
791 		bool& done)
792 	{
793 		// We'll be manipulating our file handle/filecur_ state
794 		lock();
795 		while(true) {
796 			do { read(r, rdid, endid, success, done); }
797 			while(!success && !done);
798 			if(!success && filecur_ < infiles_.size()) {
799 				assert(done);
800 				open();
801 				resetForNextFile(); // reset state to handle a fresh file
802 				filecur_++;
803 				continue;
804 			}
805 			break;
806 		}
807 		assert(r.repOk());
808 		// Leaving critical region
809 		unlock();
810 		return success;
811 	}
812 
813 	/**
814 	 *
815 	 */
nextReadPairImpl(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)816 	virtual bool nextReadPairImpl(
817 		Read& ra,
818 		Read& rb,
819 		TReadId& rdid,
820 		TReadId& endid,
821 		bool& success,
822 		bool& done,
823 		bool& paired)
824 	{
825 		// We'll be manipulating our file handle/filecur_ state
826 		lock();
827 		while(true) {
828 			do { readPair(ra, rb, rdid, endid, success, done, paired); }
829 			while(!success && !done);
830 			if(!success && filecur_ < infiles_.size()) {
831 				assert(done);
832 				open();
833 				resetForNextFile(); // reset state to handle a fresh file
834 				filecur_++;
835 				continue;
836 			}
837 			break;
838 		}
839 		assert(ra.repOk());
840 		assert(rb.repOk());
841 		// Leaving critical region
842 		unlock();
843 		return success;
844 	}
845 
846 	/**
847 	 * Reset state so that we read start reading again from the
848 	 * beginning of the first file.  Should only be called by the
849 	 * master thread.
850 	 */
reset()851 	virtual void reset() {
852 		PatternSource::reset();
853         filecur_ = 0;
854 		open();
855 		filecur_++;
856 	}
857 
858 protected:
859 
860 	/// Read another pattern from the input file; this is overridden
861 	/// to deal with specific file formats
862 	virtual bool read(
863 		Read& r,
864 		TReadId& rdid,
865 		TReadId& endid,
866 		bool& success,
867 		bool& done) = 0;
868 
869 	/// Read another pattern pair from the input file; this is
870 	/// overridden to deal with specific file formats
871 	virtual bool readPair(
872 		Read& ra,
873 		Read& rb,
874 		TReadId& rdid,
875 		TReadId& endid,
876 		bool& success,
877 		bool& done,
878 		bool& paired) = 0;
879 
880 	/// Reset state to handle a fresh file
resetForNextFile()881 	virtual void resetForNextFile() { }
882 
open()883 	void open() {
884 		if(fb_.isOpen()) fb_.close();
885 		while(filecur_ < infiles_.size()) {
886 			// Open read
887 			FILE *in;
888 			if(infiles_[filecur_] == "-") {
889 				in = stdin;
890 			} else if((in = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) {
891 				if(!errs_[filecur_]) {
892 					cerr << "Warning: Could not open read file \"" << infiles_[filecur_].c_str() << "\" for reading; skipping..." << endl;
893 					errs_[filecur_] = true;
894 				}
895 				filecur_++;
896 				continue;
897 			}
898 			fb_.newFile(in);
899 			return;
900 		}
901 		cerr << "Error: No input read files were valid" << endl;
902 		exit(1);
903 		return;
904 	}
905 
906 	EList<string> infiles_;  // filenames for read files
907 	EList<bool> errs_;       // whether we've already printed an error for each file
908 	size_t filecur_;         // index into infiles_ of next file to read
909 	FileBuf fb_;             // read file currently being read from
910 	TReadId skip_;           // number of reads to skip
911 	bool first_;
912 };
913 
914 /**
915  * Parse a single quality string from fb and store qualities in r.
916  * Assume the next character obtained via fb.get() is the first
917  * character of the quality string.  When returning, the next
918  * character returned by fb.peek() or fb.get() should be the first
919  * character of the following line.
920  */
921 int parseQuals(
922 	Read& r,
923 	FileBuf& fb,
924 	int firstc,
925 	int readLen,
926 	int trim3,
927 	int trim5,
928 	bool intQuals,
929 	bool phred64,
930 	bool solexa64);
931 
932 /**
933  * Synchronized concrete pattern source for a list of FASTA or CSFASTA
934  * (if color = true) files.
935  */
936 class FastaPatternSource : public BufferedFilePatternSource {
937 public:
FastaPatternSource(const EList<string> & infiles,const PatternParams & p)938 	FastaPatternSource(const EList<string>& infiles,
939 	                   const PatternParams& p) :
940 		BufferedFilePatternSource(infiles, p),
941 		first_(true), solexa64_(p.solexa64), phred64_(p.phred64), intQuals_(p.intQuals)
942 	{ }
reset()943 	virtual void reset() {
944 		first_ = true;
945 		BufferedFilePatternSource::reset();
946 	}
947 protected:
948 	/**
949 	 * Scan to the next FASTA record (starting with >) and return the first
950 	 * character of the record (which will always be >).
951 	 */
skipToNextFastaRecord(FileBuf & in)952 	static int skipToNextFastaRecord(FileBuf& in) {
953 		int c;
954 		while((c = in.get()) != '>') {
955 			if(in.eof()) return -1;
956 		}
957 		return c;
958 	}
959 
960 	/// Called when we have to bail without having parsed a read.
bail(Read & r)961 	void bail(Read& r) {
962 		r.reset();
963 		fb_.resetLastN();
964 	}
965 
966 	/// Read another pattern from a FASTA input file
967 	virtual bool read(
968 		Read& r,
969 		TReadId& rdid,
970 		TReadId& endid,
971 		bool& success,
972 		bool& done);
973 
974 	/// Read another pair of patterns from a FASTA input file
readPair(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)975 	virtual bool readPair(
976 		Read& ra,
977 		Read& rb,
978 		TReadId& rdid,
979 		TReadId& endid,
980 		bool& success,
981 		bool& done,
982 		bool& paired)
983 	{
984 		// (For now, we shouldn't ever be here)
985 		cerr << "In FastaPatternSource.readPair()" << endl;
986 		throw 1;
987 		return false;
988 	}
989 
resetForNextFile()990 	virtual void resetForNextFile() {
991 		first_ = true;
992 	}
993 
994 private:
995 	bool first_;
996 
997 public:
998 	bool solexa64_;
999 	bool phred64_;
1000 	bool intQuals_;
1001 };
1002 
1003 
1004 /**
1005  * Tokenize a line of space-separated integer quality values.
1006  */
tokenizeQualLine(FileBuf & filebuf,char * buf,size_t buflen,EList<string> & toks)1007 static inline bool tokenizeQualLine(
1008 	FileBuf& filebuf,
1009 	char *buf,
1010 	size_t buflen,
1011 	EList<string>& toks)
1012 {
1013 	size_t rd = filebuf.gets(buf, buflen);
1014 	if(rd == 0) return false;
1015 	assert(NULL == strrchr(buf, '\n'));
1016 	tokenize(string(buf), " ", toks);
1017 	return true;
1018 }
1019 
1020 /**
1021  * Synchronized concrete pattern source for a list of files with tab-
1022  * delimited name, seq, qual fields (or, for paired-end reads,
1023  * basename, seq1, qual1, seq2, qual2).
1024  */
1025 class TabbedPatternSource : public BufferedFilePatternSource {
1026 
1027 public:
1028 
TabbedPatternSource(const EList<string> & infiles,const PatternParams & p,bool secondName)1029 	TabbedPatternSource(
1030 		const EList<string>& infiles,
1031 		const PatternParams& p,
1032 		bool  secondName) :
1033 		BufferedFilePatternSource(infiles, p),
1034 		solQuals_(p.solexa64),
1035 		phred64Quals_(p.phred64),
1036 		intQuals_(p.intQuals),
1037 		secondName_(secondName) { }
1038 
1039 protected:
1040 
1041 	/// Read another pattern from a FASTA input file
1042 	virtual bool read(
1043 		Read& r,
1044 		TReadId& rdid,
1045 		TReadId& endid,
1046 		bool& success,
1047 		bool& done);
1048 
1049 	/// Read another pair of patterns from a FASTA input file
1050 	virtual bool readPair(
1051 		Read& ra,
1052 		Read& rb,
1053 		TReadId& rdid,
1054 		TReadId& endid,
1055 		bool& success,
1056 		bool& done,
1057 		bool& paired);
1058 
1059 private:
1060 
1061 	/**
1062 	 * Parse a name from fb_ and store in r.  Assume that the next
1063 	 * character obtained via fb_.get() is the first character of
1064 	 * the sequence and the string stops at the next char upto (could
1065 	 * be tab, newline, etc.).
1066 	 */
1067 	int parseName(Read& r, Read* r2, char upto = '\t');
1068 
1069 	/**
1070 	 * Parse a single sequence from fb_ and store in r.  Assume
1071 	 * that the next character obtained via fb_.get() is the first
1072 	 * character of the sequence and the sequence stops at the next
1073 	 * char upto (could be tab, newline, etc.).
1074 	 */
1075 	int parseSeq(Read& r, int& charsRead, int& trim5, char upto = '\t');
1076 
1077 	/**
1078 	 * Parse a single quality string from fb_ and store in r.
1079 	 * Assume that the next character obtained via fb_.get() is
1080 	 * the first character of the quality string and the string stops
1081 	 * at the next char upto (could be tab, newline, etc.).
1082 	 */
1083 	int parseQuals(Read& r, int charsRead, int dstLen, int trim5,
1084 	               char& c2, char upto = '\t', char upto2 = -1);
1085 
1086 	bool solQuals_;
1087 	bool phred64Quals_;
1088 	bool intQuals_;
1089 	EList<string> qualToks_;
1090 	bool secondName_;
1091 };
1092 
1093 /**
1094  * Synchronized concrete pattern source for Illumina Qseq files.  In
1095  * Qseq files, each read appears on a separate line and the tab-
1096  * delimited fields are:
1097  *
1098  * 1. Machine name
1099  * 2. Run number
1100  * 3. Lane number
1101  * 4. Tile number
1102  * 5. X coordinate of spot
1103  * 6. Y coordinate of spot
1104  * 7. Index: "Index sequence or 0. For no indexing, or for a file that
1105  *    has not been demultiplexed yet, this field should have a value of
1106  *    0."
1107  * 8. Read number: 1 for unpaired, 1 or 2 for paired
1108  * 9. Sequence
1109  * 10. Quality
1110  * 11. Filter: 1 = passed, 0 = didn't
1111  */
1112 class QseqPatternSource : public BufferedFilePatternSource {
1113 
1114 public:
1115 
QseqPatternSource(const EList<string> & infiles,const PatternParams & p)1116 	QseqPatternSource(
1117 		const EList<string>& infiles,
1118 	    const PatternParams& p) :
1119 		BufferedFilePatternSource(infiles, p),
1120 		solQuals_(p.solexa64),
1121 		phred64Quals_(p.phred64),
1122 		intQuals_(p.intQuals) { }
1123 
1124 protected:
1125 
1126 #define BAIL_UNPAIRED() { \
1127 	peekOverNewline(fb_); \
1128 	r.reset(); \
1129 	success = false; \
1130 	done = true; \
1131 	return success; \
1132 }
1133 
1134 	/**
1135 	 * Parse a name from fb_ and store in r.  Assume that the next
1136 	 * character obtained via fb_.get() is the first character of
1137 	 * the sequence and the string stops at the next char upto (could
1138 	 * be tab, newline, etc.).
1139 	 */
1140 	int parseName(
1141 		Read& r,      // buffer for mate 1
1142 		Read* r2,     // buffer for mate 2 (NULL if mate2 is read separately)
1143 		bool append,     // true -> append characters, false -> skip them
1144 		bool clearFirst, // clear the name buffer first
1145 		bool warnEmpty,  // emit a warning if nothing was added to the name
1146 		bool useDefault, // if nothing is read, put readCnt_ as a default value
1147 		int upto);       // stop parsing when we first reach character 'upto'
1148 
1149 	/**
1150 	 * Parse a single sequence from fb_ and store in r.  Assume
1151 	 * that the next character obtained via fb_.get() is the first
1152 	 * character of the sequence and the sequence stops at the next
1153 	 * char upto (could be tab, newline, etc.).
1154 	 */
1155 	int parseSeq(
1156 		Read& r,      // buffer for read
1157 		int& charsRead,
1158 		int& trim5,
1159 		char upto);
1160 
1161 	/**
1162 	 * Parse a single quality string from fb_ and store in r.
1163 	 * Assume that the next character obtained via fb_.get() is
1164 	 * the first character of the quality string and the string stops
1165 	 * at the next char upto (could be tab, newline, etc.).
1166 	 */
1167 	int parseQuals(
1168 		Read& r,      // buffer for read
1169 		int charsRead,
1170 		int dstLen,
1171 		int trim5,
1172 		char& c2,
1173 		char upto,
1174 		char upto2);
1175 
1176 	/**
1177 	 * Read another pattern from a Qseq input file.
1178 	 */
1179 	virtual bool read(
1180 		Read& r,
1181 		TReadId& rdid,
1182 		TReadId& endid,
1183 		bool& success,
1184 		bool& done);
1185 
1186 	/**
1187 	 * Read a pair of patterns from 1 Qseq file.  Note: this is never used.
1188 	 */
readPair(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)1189 	virtual bool readPair(
1190 		Read& ra,
1191 		Read& rb,
1192 		TReadId& rdid,
1193 		TReadId& endid,
1194 		bool& success,
1195 		bool& done,
1196 		bool& paired)
1197 	{
1198 		// (For now, we shouldn't ever be here)
1199 		cerr << "In QseqPatternSource.readPair()" << endl;
1200 		throw 1;
1201 		return false;
1202 	}
1203 
1204 	bool solQuals_;
1205 	bool phred64Quals_;
1206 	bool intQuals_;
1207 	EList<string> qualToks_;
1208 };
1209 
1210 /**
1211  * Synchronized concrete pattern source for a list of FASTA files where
1212  * reads need to be extracted from long continuous sequences.
1213  */
1214 class FastaContinuousPatternSource : public BufferedFilePatternSource {
1215 public:
FastaContinuousPatternSource(const EList<string> & infiles,const PatternParams & p)1216 	FastaContinuousPatternSource(const EList<string>& infiles, const PatternParams& p) :
1217 		BufferedFilePatternSource(infiles, p),
1218 		length_(p.sampleLen), freq_(p.sampleFreq),
1219 		eat_(length_-1), beginning_(true),
1220 		bufCur_(0), subReadCnt_(0llu)
1221 	{
1222 		resetForNextFile();
1223 	}
1224 
reset()1225 	virtual void reset() {
1226 		BufferedFilePatternSource::reset();
1227 		resetForNextFile();
1228 	}
1229 
1230 protected:
1231 
1232 	/// Read another pattern from a FASTA input file
read(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)1233 	virtual bool read(
1234 		Read& r,
1235 		TReadId& rdid,
1236 		TReadId& endid,
1237 		bool& success,
1238 		bool& done)
1239 	{
1240 		success = true;
1241 		done = false;
1242 		r.reset();
1243 		while(true) {
1244 			r.color = gColor;
1245 			int c = fb_.get();
1246 			if(c < 0) { success = false; done = true; return success; }
1247 			if(c == '>') {
1248 				resetForNextFile();
1249 				c = fb_.peek();
1250 				bool sawSpace = false;
1251 				while(c != '\n' && c != '\r') {
1252 					if(!sawSpace) {
1253 						sawSpace = isspace(c);
1254 					}
1255 					if(!sawSpace) {
1256 						nameBuf_.append(c);
1257 					}
1258 					fb_.get();
1259 					c = fb_.peek();
1260 				}
1261 				while(c == '\n' || c == '\r') {
1262 					fb_.get();
1263 					c = fb_.peek();
1264 				}
1265 				nameBuf_.append('_');
1266 			} else {
1267 				int cat = asc2dnacat[c];
1268 				if(cat >= 2) c = 'N';
1269 				if(cat == 0) {
1270 					// Encountered non-DNA, non-IUPAC char; skip it
1271 					continue;
1272 				} else {
1273 					// DNA char
1274 					buf_[bufCur_++] = c;
1275 					if(bufCur_ == 1024) bufCur_ = 0;
1276 					if(eat_ > 0) {
1277 						eat_--;
1278 						// Try to keep readCnt_ aligned with the offset
1279 						// into the reference; that lets us see where
1280 						// the sampling gaps are by looking at the read
1281 						// name
1282 						if(!beginning_) readCnt_++;
1283 						continue;
1284 					}
1285 					for(size_t i = 0; i < length_; i++) {
1286 						if(length_ - i <= bufCur_) {
1287 							c = buf_[bufCur_ - (length_ - i)];
1288 						} else {
1289 							// Rotate
1290 							c = buf_[bufCur_ - (length_ - i) + 1024];
1291 						}
1292 						r.patFw.append(asc2dna[c]);
1293 						r.qual.append('I');
1294 					}
1295 					// Set up a default name if one hasn't been set
1296 					r.name = nameBuf_;
1297 					char cbuf[20];
1298 					itoa10<TReadId>(readCnt_ - subReadCnt_, cbuf);
1299 					r.name.append(cbuf);
1300 					eat_ = freq_-1;
1301 					readCnt_++;
1302 					beginning_ = false;
1303 					rdid = endid = readCnt_-1;
1304 					break;
1305 				}
1306 			}
1307 		}
1308 		return true;
1309 	}
1310 
1311 	/// Shouldn't ever be here; it's not sensible to obtain read pairs
1312 	// from a continuous input.
readPair(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)1313 	virtual bool readPair(
1314 		Read& ra,
1315 		Read& rb,
1316 		TReadId& rdid,
1317 		TReadId& endid,
1318 		bool& success,
1319 		bool& done,
1320 		bool& paired)
1321 	{
1322 		cerr << "In FastaContinuousPatternSource.readPair()" << endl;
1323 		throw 1;
1324 		return false;
1325 	}
1326 
1327 	/**
1328 	 * Reset state to be read for the next file.
1329 	 */
resetForNextFile()1330 	virtual void resetForNextFile() {
1331 		eat_ = length_-1;
1332 		beginning_ = true;
1333 		bufCur_ = 0;
1334 		nameBuf_.clear();
1335 		subReadCnt_ = readCnt_;
1336 	}
1337 
1338 private:
1339 	size_t length_;     /// length of reads to generate
1340 	size_t freq_;       /// frequency to sample reads
1341 	size_t eat_;        /// number of characters we need to skip before
1342 	                    /// we have flushed all of the ambiguous or
1343 	                    /// non-existent characters out of our read
1344 	                    /// window
1345 	bool beginning_;    /// skipping over the first read length?
1346 	char buf_[1024];    /// read buffer
1347 	BTString nameBuf_;  /// read buffer for name of fasta record being
1348 	                    /// split into mers
1349 	size_t bufCur_;     /// buffer cursor; points to where we should
1350 	                    /// insert the next character
1351 	uint64_t subReadCnt_;/// number to subtract from readCnt_ to get
1352 	                    /// the pat id to output (so it resets to 0 for
1353 	                    /// each new sequence)
1354 };
1355 
1356 /**
1357  * Read a FASTQ-format file.
1358  * See: http://maq.sourceforge.net/fastq.shtml
1359  */
1360 class FastqPatternSource : public BufferedFilePatternSource {
1361 
1362 public:
1363 
FastqPatternSource(const EList<string> & infiles,const PatternParams & p)1364 	FastqPatternSource(const EList<string>& infiles, const PatternParams& p) :
1365 		BufferedFilePatternSource(infiles, p),
1366 		first_(true),
1367 		solQuals_(p.solexa64),
1368 		phred64Quals_(p.phred64),
1369 		intQuals_(p.intQuals),
1370 		fuzzy_(p.fuzzy)
1371 	{ }
1372 
reset()1373 	virtual void reset() {
1374 		first_ = true;
1375 		fb_.resetLastN();
1376 		BufferedFilePatternSource::reset();
1377 	}
1378 
1379 protected:
1380 
1381 	/**
1382 	 * Scan to the next FASTQ record (starting with @) and return the first
1383 	 * character of the record (which will always be @).  Since the quality
1384 	 * line may start with @, we keep scanning until we've seen a line
1385 	 * beginning with @ where the line two lines back began with +.
1386 	 */
skipToNextFastqRecord(FileBuf & in,bool sawPlus)1387 	static int skipToNextFastqRecord(FileBuf& in, bool sawPlus) {
1388 		int line = 0;
1389 		int plusLine = -1;
1390 		int c = in.get();
1391 		int firstc = c;
1392 		while(true) {
1393 			if(line > 20) {
1394 				// If we couldn't find our desired '@' in the first 20
1395 				// lines, it's time to give up
1396 				if(firstc == '>') {
1397 					// That firstc is '>' may be a hint that this is
1398 					// actually a FASTA file, so return it intact
1399 					return '>';
1400 				}
1401 				// Return an error
1402 				return -1;
1403 			}
1404 			if(c == -1) return -1;
1405 			if(c == '\n') {
1406 				c = in.get();
1407 				if(c == '@' && sawPlus && plusLine == (line-2)) {
1408 					return '@';
1409 				}
1410 				else if(c == '+') {
1411 					// Saw a '+' at the beginning of a line; remember where
1412 					// we saw it
1413 					sawPlus = true;
1414 					plusLine = line;
1415 				}
1416 				else if(c == -1) {
1417 					return -1;
1418 				}
1419 				line++;
1420 			}
1421 			c = in.get();
1422 		}
1423 	}
1424 
1425 	/// Read another pattern from a FASTQ input file
1426 	virtual bool read(
1427 		Read& r,
1428 		TReadId& rdid,
1429 		TReadId& endid,
1430 		bool& success,
1431 		bool& done);
1432 
1433 	/// Read another read pair from a FASTQ input file
readPair(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)1434 	virtual bool readPair(
1435 		Read& ra,
1436 		Read& rb,
1437 		TReadId& rdid,
1438 		TReadId& endid,
1439 		bool& success,
1440 		bool& done,
1441 		bool& paired)
1442 	{
1443 		// (For now, we shouldn't ever be here)
1444 		cerr << "In FastqPatternSource.readPair()" << endl;
1445 		throw 1;
1446 		return false;
1447 	}
1448 
resetForNextFile()1449 	virtual void resetForNextFile() {
1450 		first_ = true;
1451 	}
1452 
1453 private:
1454 
1455 	/**
1456 	 * Do things we need to do if we have to bail in the middle of a
1457 	 * read, usually because we reached the end of the input without
1458 	 * finishing.
1459 	 */
bail(Read & r)1460 	void bail(Read& r) {
1461 		r.patFw.clear();
1462 		fb_.resetLastN();
1463 	}
1464 
1465 	bool first_;
1466 	bool solQuals_;
1467 	bool phred64Quals_;
1468 	bool intQuals_;
1469 	bool fuzzy_;
1470 	EList<string> qualToks_;
1471 };
1472 
1473 /**
1474  * Read a Raw-format file (one sequence per line).  No quality strings
1475  * allowed.  All qualities are assumed to be 'I' (40 on the Phred-33
1476  * scale).
1477  */
1478 class RawPatternSource : public BufferedFilePatternSource {
1479 
1480 public:
1481 
RawPatternSource(const EList<string> & infiles,const PatternParams & p)1482 	RawPatternSource(const EList<string>& infiles, const PatternParams& p) :
1483 		BufferedFilePatternSource(infiles, p), first_(true) { }
1484 
reset()1485 	virtual void reset() {
1486 		first_ = true;
1487 		BufferedFilePatternSource::reset();
1488 	}
1489 
1490 protected:
1491 
1492 	/// Read another pattern from a Raw input file
read(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)1493 	virtual bool read(
1494 		Read& r,
1495 		TReadId& rdid,
1496 		TReadId& endid,
1497 		bool& success,
1498 		bool& done)
1499 	{
1500 		int c;
1501 		success = true;
1502 		done = false;
1503 		r.reset();
1504 		c = getOverNewline(this->fb_);
1505 		if(c < 0) {
1506 			bail(r); success = false; done = true; return success;
1507 		}
1508 		assert(!isspace(c));
1509 		r.color = gColor;
1510 		int mytrim5 = gTrim5;
1511 		if(first_) {
1512 			// Check that the first character is sane for a raw file
1513 			int cc = c;
1514 			if(gColor) {
1515 				if(cc >= '0' && cc <= '4') cc = "ACGTN"[(int)cc - '0'];
1516 				if(cc == '.') cc = 'N';
1517 			}
1518 			if(asc2dnacat[cc] == 0) {
1519 				cerr << "Error: reads file does not look like a Raw file" << endl;
1520 				if(c == '>') {
1521 					cerr << "Reads file looks like a FASTA file; please use -f" << endl;
1522 				}
1523 				if(c == '@') {
1524 					cerr << "Reads file looks like a FASTQ file; please use -q" << endl;
1525 				}
1526 				throw 1;
1527 			}
1528 			first_ = false;
1529 		}
1530 		if(gColor) {
1531 			// This may be a primer character.  If so, keep it in the
1532 			// 'primer' field of the read buf and parse the rest of the
1533 			// read without it.
1534 			c = toupper(c);
1535 			if(asc2dnacat[c] > 0) {
1536 				// First char is a DNA char
1537 				int c2 = toupper(fb_.peek());
1538 				// Second char is a color char
1539 				if(asc2colcat[c2] > 0) {
1540 					r.primer = c;
1541 					r.trimc = c2;
1542 					mytrim5 += 2; // trim primer and first color
1543 				}
1544 			}
1545 			if(c < 0) {
1546 				bail(r); success = false; done = true; return success;
1547 			}
1548 		}
1549 		// _in now points just past the first character of a sequence
1550 		// line, and c holds the first character
1551 		int chs = 0;
1552 		while(!isspace(c) && c >= 0) {
1553 			if(gColor) {
1554 				if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
1555 				if(c == '.') c = 'N';
1556 			}
1557 			// 5' trimming
1558 			if(isalpha(c) && chs >= mytrim5) {
1559 				//size_t len = chs - mytrim5;
1560 				//if(len >= 1024) tooManyQualities(BTString("(no name)"));
1561 				r.patFw.append(asc2dna[c]);
1562 				r.qual.append('I');
1563 			}
1564 			chs++;
1565 			if(isspace(fb_.peek())) break;
1566 			c = fb_.get();
1567 		}
1568 		// 3' trimming
1569 		r.patFw.trimEnd(gTrim3);
1570 		r.qual.trimEnd(gTrim3);
1571 		c = peekToEndOfLine(fb_);
1572 		r.trimmed3 = gTrim3;
1573 		r.trimmed5 = mytrim5;
1574 		r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
1575 		fb_.resetLastN();
1576 
1577 		// Set up name
1578 		char cbuf[20];
1579 		itoa10<TReadId>(readCnt_, cbuf);
1580 		r.name.install(cbuf);
1581 		readCnt_++;
1582 
1583 		rdid = endid = readCnt_-1;
1584 		return success;
1585 	}
1586 
1587 	/// Read another read pair from a FASTQ input file
readPair(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)1588 	virtual bool readPair(
1589 		Read& ra,
1590 		Read& rb,
1591 		TReadId& rdid,
1592 		TReadId& endid,
1593 		bool& success,
1594 		bool& done,
1595 		bool& paired)
1596 	{
1597 		// (For now, we shouldn't ever be here)
1598 		cerr << "In RawPatternSource.readPair()" << endl;
1599 		throw 1;
1600 		return false;
1601 	}
1602 
resetForNextFile()1603 	virtual void resetForNextFile() {
1604 		first_ = true;
1605 	}
1606 
1607 private:
1608 
1609 	/**
1610 	 * Do things we need to do if we have to bail in the middle of a
1611 	 * read, usually because we reached the end of the input without
1612 	 * finishing.
1613 	 */
bail(Read & r)1614 	void bail(Read& r) {
1615 		r.patFw.clear();
1616 		fb_.resetLastN();
1617 	}
1618 
1619 	bool first_;
1620 };
1621 
1622 #ifdef USE_SRA
1623 
1624 namespace ngs {
1625     class ReadCollection;
1626     class ReadIterator;
1627 }
1628 
1629 namespace tthread {
1630     class thread;
1631 };
1632 
1633 struct SRA_Data;
1634 
1635 /**
1636  *
1637  */
1638 class SRAPatternSource : public PatternSource {
1639 public:
1640     SRAPatternSource(
1641                      const EList<string>& sra_accs,
1642                      const PatternParams& p,
1643                      const size_t nthreads = 1) :
PatternSource(p)1644     PatternSource(p),
1645     sra_accs_(sra_accs),
1646     sra_acc_cur_(0),
1647     skip_(p.skip),
1648     first_(true),
1649     nthreads_(nthreads),
1650     sra_run_(NULL),
1651     sra_it_(NULL),
1652     sra_data_(NULL),
1653     io_thread_(NULL)
1654     {
1655         assert_gt(sra_accs_.size(), 0);
1656         errs_.resize(sra_accs_.size());
1657         errs_.fill(0, sra_accs_.size(), false);
1658         open(); // open first file in the list
1659         sra_acc_cur_++;
1660     }
1661 
1662     virtual ~SRAPatternSource();
1663 
1664     /**
1665      * Fill Read with the sequence, quality and name for the next
1666      * read in the list of read files.  This function gets called by
1667      * all the search threads, so we must handle synchronization.
1668      */
nextReadImpl(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)1669     virtual bool nextReadImpl(
1670                               Read& r,
1671                               TReadId& rdid,
1672                               TReadId& endid,
1673                               bool& success,
1674                               bool& done)
1675     {
1676         // We'll be manipulating our file handle/filecur_ state
1677         lock();
1678         while(true) {
1679             do { read(r, rdid, endid, success, done); }
1680             while(!success && !done);
1681             if(!success && sra_acc_cur_ < sra_accs_.size()) {
1682                 assert(done);
1683                 open();
1684                 resetForNextFile(); // reset state to handle a fresh file
1685                 sra_acc_cur_++;
1686                 continue;
1687             }
1688             break;
1689         }
1690         assert(r.repOk());
1691         // Leaving critical region
1692         unlock();
1693         return success;
1694     }
1695 
1696     /**
1697      *
1698      */
nextReadPairImpl(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)1699     virtual bool nextReadPairImpl(
1700                                   Read& ra,
1701                                   Read& rb,
1702                                   TReadId& rdid,
1703                                   TReadId& endid,
1704                                   bool& success,
1705                                   bool& done,
1706                                   bool& paired)
1707     {
1708         // We'll be manipulating our file handle/filecur_ state
1709         lock();
1710         while(true) {
1711             do { readPair(ra, rb, rdid, endid, success, done, paired); }
1712             while(!success && !done);
1713             if(!success && sra_acc_cur_ < sra_accs_.size()) {
1714                 assert(done);
1715                 open();
1716                 resetForNextFile(); // reset state to handle a fresh file
1717                 sra_acc_cur_++;
1718                 continue;
1719             }
1720             break;
1721         }
1722         assert(ra.repOk());
1723         assert(rb.repOk());
1724         // Leaving critical region
1725         unlock();
1726         return success;
1727     }
1728 
1729     /**
1730      * Reset state so that we read start reading again from the
1731      * beginning of the first file.  Should only be called by the
1732      * master thread.
1733      */
reset()1734     virtual void reset() {
1735         PatternSource::reset();
1736         sra_acc_cur_ = 0,
1737         open();
1738         sra_acc_cur_++;
1739     }
1740 
1741     /// Read another pattern from the input file; this is overridden
1742     /// to deal with specific file formats
read(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)1743     virtual bool read(
1744                       Read& r,
1745                       TReadId& rdid,
1746                       TReadId& endid,
1747                       bool& success,
1748                       bool& done)
1749     {
1750         return true;
1751     }
1752 
1753     /// Read another pattern pair from the input file; this is
1754     /// overridden to deal with specific file formats
1755     virtual bool readPair(
1756                           Read& ra,
1757                           Read& rb,
1758                           TReadId& rdid,
1759                           TReadId& endid,
1760                           bool& success,
1761                           bool& done,
1762                           bool& paired);
1763 
1764 protected:
1765 
1766     /// Reset state to handle a fresh file
resetForNextFile()1767     virtual void resetForNextFile() { }
1768 
1769     void open();
1770 
1771     EList<string> sra_accs_; // filenames for read files
1772     EList<bool> errs_;       // whether we've already printed an error for each file
1773     size_t sra_acc_cur_;     // index into infiles_ of next file to read
1774     TReadId skip_;           // number of reads to skip
1775     bool first_;
1776 
1777     size_t nthreads_;
1778 
1779     ngs::ReadCollection* sra_run_;
1780     ngs::ReadIterator* sra_it_;
1781 
1782     SRA_Data* sra_data_;
1783     tthread::thread* io_thread_;
1784 };
1785 
1786 #endif
1787 
1788 #endif /*PAT_H_*/
1789