1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef PAT_H_
21 #define PAT_H_
22
23 #include <cassert>
24 #include <cmath>
25 #include <stdexcept>
26 #include <vector>
27 #include <string>
28 #include <cstring>
29 #include <ctype.h>
30 #include <fstream>
31 #include "alphabet.h"
32 #include "assert_helpers.h"
33 #include "tokenize.h"
34 #include "random_source.h"
35 #include "threading.h"
36 #include "filebuf.h"
37 #include "qual.h"
38 #include "search_globals.h"
39 #include "sstring.h"
40 #include "ds.h"
41 #include "read.h"
42 #include "util.h"
43
44 /**
45 * Classes and routines for reading reads from various input sources.
46 */
47
48 using namespace std;
49
50 /**
51 * Calculate a per-read random seed based on a combination of
52 * the read data (incl. sequence, name, quals) and the global
53 * seed in '_randSeed'.
54 */
genRandSeed(const BTDnaString & qry,const BTString & qual,const BTString & name,uint32_t seed)55 static inline uint32_t genRandSeed(const BTDnaString& qry,
56 const BTString& qual,
57 const BTString& name,
58 uint32_t seed)
59 {
60 // Calculate a per-read random seed based on a combination of
61 // the read data (incl. sequence, name, quals) and the global
62 // seed
63 uint32_t rseed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83;
64 size_t qlen = qry.length();
65 // Throw all the characters of the read into the random seed
66 for(size_t i = 0; i < qlen; i++) {
67 int p = (int)qry[i];
68 assert_leq(p, 4);
69 size_t off = ((i & 15) << 1);
70 rseed ^= (p << off);
71 }
72 // Throw all the quality values for the read into the random
73 // seed
74 for(size_t i = 0; i < qlen; i++) {
75 int p = (int)qual[i];
76 assert_leq(p, 255);
77 size_t off = ((i & 3) << 3);
78 rseed ^= (p << off);
79 }
80 // Throw all the characters in the read name into the random
81 // seed
82 size_t namelen = name.length();
83 for(size_t i = 0; i < namelen; i++) {
84 int p = (int)name[i];
85 if(p == '/') break;
86 assert_leq(p, 255);
87 size_t off = ((i & 3) << 3);
88 rseed ^= (p << off);
89 }
90 return rseed;
91 }
92
93 /**
94 * Parameters affecting how reads and read in.
95 */
96 struct PatternParams {
PatternParamsPatternParams97 PatternParams(
98 int format_,
99 bool fileParallel_,
100 uint32_t seed_,
101 bool useSpinlock_,
102 bool solexa64_,
103 bool phred64_,
104 bool intQuals_,
105 bool fuzzy_,
106 int sampleLen_,
107 int sampleFreq_,
108 uint32_t skip_) :
109 format(format_),
110 fileParallel(fileParallel_),
111 seed(seed_),
112 useSpinlock(useSpinlock_),
113 solexa64(solexa64_),
114 phred64(phred64_),
115 intQuals(intQuals_),
116 fuzzy(fuzzy_),
117 sampleLen(sampleLen_),
118 sampleFreq(sampleFreq_),
119 skip(skip_) { }
120
121 int format; // file format
122 bool fileParallel; // true -> wrap files with separate PairedPatternSources
123 uint32_t seed; // pseudo-random seed
124 bool useSpinlock; // use spin locks instead of pthreads
125 bool solexa64; // true -> qualities are on solexa64 scale
126 bool phred64; // true -> qualities are on phred64 scale
127 bool intQuals; // true -> qualities are space-separated numbers
128 bool fuzzy; // true -> try to parse fuzzy fastq
129 int sampleLen; // length of sampled reads for FastaContinuous...
130 int sampleFreq; // frequency of sampled reads for FastaContinuous...
131 uint32_t skip; // skip the first 'skip' patterns
132 };
133
134 /**
135 * Encapsulates a synchronized source of patterns; usually a file.
136 * Optionally reverses reads and quality strings before returning them,
137 * though that is usually more efficiently done by the concrete
138 * subclass. Concrete subclasses should delimit critical sections with
139 * calls to lock() and unlock().
140 */
141 class PatternSource {
142
143 public:
144
PatternSource(const PatternParams & p)145 PatternSource(const PatternParams& p) :
146 seed_(p.seed),
147 readCnt_(0),
148 numWrappers_(0),
149 doLocking_(true),
150 useSpinlock_(p.useSpinlock),
151 mutex()
152 {
153 }
154
~PatternSource()155 virtual ~PatternSource() { }
156
157 /**
158 * Call this whenever this PatternSource is wrapped by a new
159 * WrappedPatternSourcePerThread. This helps us keep track of
160 * whether locks will be contended.
161 */
addWrapper()162 void addWrapper() {
163 lock();
164 numWrappers_++;
165 unlock();
166 }
167
168 /**
169 * The main member function for dispensing patterns.
170 *
171 * Returns true iff a pair was parsed succesfully.
172 */
173 virtual bool nextReadPair(
174 Read& ra,
175 Read& rb,
176 TReadId& rdid,
177 TReadId& endid,
178 bool& success,
179 bool& done,
180 bool& paired,
181 bool fixName);
182
183 /**
184 * The main member function for dispensing patterns.
185 */
186 virtual bool nextRead(
187 Read& r,
188 TReadId& rdid,
189 TReadId& endid,
190 bool& success,
191 bool& done);
192
193 /**
194 * Implementation to be provided by concrete subclasses. An
195 * implementation for this member is only relevant for formats that
196 * can read in a pair of reads in a single transaction with a
197 * single input source. If paired-end input is given as a pair of
198 * parallel files, this member should throw an error and exit.
199 */
200 virtual bool nextReadPairImpl(
201 Read& ra,
202 Read& rb,
203 TReadId& rdid,
204 TReadId& endid,
205 bool& success,
206 bool& done,
207 bool& paired) = 0;
208
209 /**
210 * Implementation to be provided by concrete subclasses. An
211 * implementation for this member is only relevant for formats
212 * where individual input sources look like single-end-read
213 * sources, e.g., formats where paired-end reads are specified in
214 * parallel read files.
215 */
216 virtual bool nextReadImpl(
217 Read& r,
218 TReadId& rdid,
219 TReadId& endid,
220 bool& success,
221 bool& done) = 0;
222
223 /// Reset state to start over again with the first read
reset()224 virtual void reset() { readCnt_ = 0; }
225
226 /**
227 * Concrete subclasses call lock() to enter a critical region.
228 * What constitutes a critical region depends on the subclass.
229 */
lock()230 void lock() {
231 if(!doLocking_) return; // no contention
232 mutex.lock();
233 }
234
235 /**
236 * Concrete subclasses call unlock() to exit a critical region
237 * What constitutes a critical region depends on the subclass.
238 */
unlock()239 void unlock() {
240 if(!doLocking_) return; // no contention
241 mutex.unlock();
242 }
243
244 /**
245 * Return a new dynamically allocated PatternSource for the given
246 * format, using the given list of strings as the filenames to read
247 * from or as the sequences themselves (i.e. if -c was used).
248 */
249 static PatternSource* patsrcFromStrings(
250 const PatternParams& p,
251 const EList<string>& qs,
252 size_t nthreads = 1);
253
254 /**
255 * Return the number of reads attempted.
256 */
readCnt()257 TReadId readCnt() const { return readCnt_ - 1; }
258
259 protected:
260
261 uint32_t seed_;
262
263 /// The number of reads read by this PatternSource
264 TReadId readCnt_;
265
266 int numWrappers_; /// # threads that own a wrapper for this PatternSource
267 bool doLocking_; /// override whether to lock (true = don't override)
268 /// User can ask to use the normal pthreads-style lock even if
269 /// spinlocks is enabled and compiled in. This is sometimes better
270 /// if we expect bad I/O latency on some reads.
271 bool useSpinlock_;
272 MUTEX_T mutex;
273 };
274
275 /**
276 * Abstract parent class for synhconized sources of paired-end reads
277 * (and possibly also single-end reads).
278 */
279 class PairedPatternSource {
280 public:
PairedPatternSource(const PatternParams & p)281 PairedPatternSource(const PatternParams& p) : mutex_m(), seed_(p.seed) {}
~PairedPatternSource()282 virtual ~PairedPatternSource() { }
283
284 virtual void addWrapper() = 0;
285 virtual void reset() = 0;
286
287 virtual bool nextReadPair(
288 Read& ra,
289 Read& rb,
290 TReadId& rdid,
291 TReadId& endid,
292 bool& success,
293 bool& done,
294 bool& paired,
295 bool fixName) = 0;
296
297 virtual pair<TReadId, TReadId> readCnt() const = 0;
298
299 /**
300 * Lock this PairedPatternSource, usually because one of its shared
301 * fields is being updated.
302 */
lock()303 void lock() {
304 mutex_m.lock();
305 }
306
307 /**
308 * Unlock this PairedPatternSource.
309 */
unlock()310 void unlock() {
311 mutex_m.unlock();
312 }
313
314 /**
315 * Given the values for all of the various arguments used to specify
316 * the read and quality input, create a list of pattern sources to
317 * dispense them.
318 */
319 static PairedPatternSource* setupPatternSources(
320 const EList<string>& si, // singles, from argv
321 const EList<string>& m1, // mate1's, from -1 arg
322 const EList<string>& m2, // mate2's, from -2 arg
323 const EList<string>& m12, // both mates on each line, from --12 arg
324 #ifdef USE_SRA
325 const EList<string>& sra_accs,
326 #endif
327 const EList<string>& q, // qualities associated with singles
328 const EList<string>& q1, // qualities associated with m1
329 const EList<string>& q2, // qualities associated with m2
330 const PatternParams& p, // read-in params
331 size_t nthreads,
332 bool verbose); // be talkative?
333
334 protected:
335
336 MUTEX_T mutex_m; /// mutex for syncing over critical regions
337 uint32_t seed_;
338 };
339
340 /**
341 * Encapsulates a synchronized source of both paired-end reads and
342 * unpaired reads, where the paired-end must come from parallel files.
343 */
344 class PairedSoloPatternSource : public PairedPatternSource {
345
346 public:
347
PairedSoloPatternSource(const EList<PatternSource * > * src,const PatternParams & p)348 PairedSoloPatternSource(
349 const EList<PatternSource*>* src,
350 const PatternParams& p) :
351 PairedPatternSource(p),
352 cur_(0),
353 src_(src)
354 {
355 assert(src_ != NULL);
356 for(size_t i = 0; i < src_->size(); i++) {
357 assert((*src_)[i] != NULL);
358 }
359 }
360
~PairedSoloPatternSource()361 virtual ~PairedSoloPatternSource() { delete src_; }
362
363 /**
364 * Call this whenever this PairedPatternSource is wrapped by a new
365 * WrappedPatternSourcePerThread. This helps us keep track of
366 * whether locks within PatternSources will be contended.
367 */
addWrapper()368 virtual void addWrapper() {
369 for(size_t i = 0; i < src_->size(); i++) {
370 (*src_)[i]->addWrapper();
371 }
372 }
373
374 /**
375 * Reset this object and all the PatternSources under it so that
376 * the next call to nextReadPair gets the very first read pair.
377 */
reset()378 virtual void reset() {
379 for(size_t i = 0; i < src_->size(); i++) {
380 (*src_)[i]->reset();
381 }
382 cur_ = 0;
383 }
384
385 /**
386 * The main member function for dispensing pairs of reads or
387 * singleton reads. Returns true iff ra and rb contain a new
388 * pair; returns false if ra contains a new unpaired read.
389 */
390 virtual bool nextReadPair(
391 Read& ra,
392 Read& rb,
393 TReadId& rdid,
394 TReadId& endid,
395 bool& success,
396 bool& done,
397 bool& paired,
398 bool fixName);
399
400 /**
401 * Return the number of reads attempted.
402 */
readCnt()403 virtual pair<TReadId, TReadId> readCnt() const {
404 uint64_t ret = 0llu;
405 for(size_t i = 0; i < src_->size(); i++) ret += (*src_)[i]->readCnt();
406 return make_pair(ret, 0llu);
407 }
408
409 protected:
410
411 volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
412 const EList<PatternSource*>* src_; /// PatternSources for paired-end reads
413 };
414
415 /**
416 * Encapsulates a synchronized source of both paired-end reads and
417 * unpaired reads, where the paired-end must come from parallel files.
418 */
419 class PairedDualPatternSource : public PairedPatternSource {
420
421 public:
422
PairedDualPatternSource(const EList<PatternSource * > * srca,const EList<PatternSource * > * srcb,const PatternParams & p)423 PairedDualPatternSource(
424 const EList<PatternSource*>* srca,
425 const EList<PatternSource*>* srcb,
426 const PatternParams& p) :
427 PairedPatternSource(p), cur_(0), srca_(srca), srcb_(srcb)
428 {
429 assert(srca_ != NULL);
430 assert(srcb_ != NULL);
431 // srca_ and srcb_ must be parallel
432 assert_eq(srca_->size(), srcb_->size());
433 for(size_t i = 0; i < srca_->size(); i++) {
434 // Can't have NULL first-mate sources. Second-mate sources
435 // can be NULL, in the case when the corresponding first-
436 // mate source is unpaired.
437 assert((*srca_)[i] != NULL);
438 for(size_t j = 0; j < srcb_->size(); j++) {
439 assert_neq((*srca_)[i], (*srcb_)[j]);
440 }
441 }
442 }
443
~PairedDualPatternSource()444 virtual ~PairedDualPatternSource() {
445 delete srca_;
446 delete srcb_;
447 }
448
449 /**
450 * Call this whenever this PairedPatternSource is wrapped by a new
451 * WrappedPatternSourcePerThread. This helps us keep track of
452 * whether locks within PatternSources will be contended.
453 */
addWrapper()454 virtual void addWrapper() {
455 for(size_t i = 0; i < srca_->size(); i++) {
456 (*srca_)[i]->addWrapper();
457 if((*srcb_)[i] != NULL) {
458 (*srcb_)[i]->addWrapper();
459 }
460 }
461 }
462
463 /**
464 * Reset this object and all the PatternSources under it so that
465 * the next call to nextReadPair gets the very first read pair.
466 */
reset()467 virtual void reset() {
468 for(size_t i = 0; i < srca_->size(); i++) {
469 (*srca_)[i]->reset();
470 if((*srcb_)[i] != NULL) {
471 (*srcb_)[i]->reset();
472 }
473 }
474 cur_ = 0;
475 }
476
477 /**
478 * The main member function for dispensing pairs of reads or
479 * singleton reads. Returns true iff ra and rb contain a new
480 * pair; returns false if ra contains a new unpaired read.
481 */
482 virtual bool nextReadPair(
483 Read& ra,
484 Read& rb,
485 TReadId& rdid,
486 TReadId& endid,
487 bool& success,
488 bool& done,
489 bool& paired,
490 bool fixName);
491
492 /**
493 * Return the number of reads attempted.
494 */
495 virtual pair<TReadId, TReadId> readCnt() const;
496
497 protected:
498
499 volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
500 const EList<PatternSource*>* srca_; /// PatternSources for 1st mates and/or unpaired reads
501 const EList<PatternSource*>* srcb_; /// PatternSources for 2nd mates
502 };
503
504 /**
505 * Encapsulates a single thread's interaction with the PatternSource.
506 * Most notably, this class holds the buffers into which the
507 * PatterSource will write sequences. This class is *not* threadsafe
508 * - it doesn't need to be since there's one per thread. PatternSource
509 * is thread-safe.
510 */
511 class PatternSourcePerThread {
512
513 public:
514
PatternSourcePerThread()515 PatternSourcePerThread() :
516 buf1_(), buf2_(), rdid_(0xffffffff), endid_(0xffffffff) { }
517
~PatternSourcePerThread()518 virtual ~PatternSourcePerThread() { }
519
520 /**
521 * Read the next read pair.
522 */
nextReadPair(bool & success,bool & done,bool & paired,bool fixName)523 virtual bool nextReadPair(
524 bool& success,
525 bool& done,
526 bool& paired,
527 bool fixName)
528 {
529 return success;
530 }
531
bufa()532 Read& bufa() { return buf1_; }
bufb()533 Read& bufb() { return buf2_; }
bufa()534 const Read& bufa() const { return buf1_; }
bufb()535 const Read& bufb() const { return buf2_; }
536
rdid()537 TReadId rdid() const { return rdid_; }
endid()538 TReadId endid() const { return endid_; }
reset()539 virtual void reset() { rdid_ = endid_ = 0xffffffff; }
540
541 /**
542 * Return the length of mate 1 or mate 2.
543 */
length(int mate)544 size_t length(int mate) const {
545 return (mate == 1) ? buf1_.length() : buf2_.length();
546 }
547
548 protected:
549
550 Read buf1_; // read buffer for mate a
551 Read buf2_; // read buffer for mate b
552 TReadId rdid_; // index of read just read
553 TReadId endid_; // index of read just read
554 };
555
556 /**
557 * Abstract parent factory for PatternSourcePerThreads.
558 */
559 class PatternSourcePerThreadFactory {
560 public:
~PatternSourcePerThreadFactory()561 virtual ~PatternSourcePerThreadFactory() { }
562 virtual PatternSourcePerThread* create() const = 0;
563 virtual EList<PatternSourcePerThread*>* create(uint32_t n) const = 0;
564
565 /// Free memory associated with a pattern source
destroy(PatternSourcePerThread * patsrc)566 virtual void destroy(PatternSourcePerThread* patsrc) const {
567 assert(patsrc != NULL);
568 // Free the PatternSourcePerThread
569 delete patsrc;
570 }
571
572 /// Free memory associated with a pattern source list
destroy(EList<PatternSourcePerThread * > * patsrcs)573 virtual void destroy(EList<PatternSourcePerThread*>* patsrcs) const {
574 assert(patsrcs != NULL);
575 // Free all of the PatternSourcePerThreads
576 for(size_t i = 0; i < patsrcs->size(); i++) {
577 if((*patsrcs)[i] != NULL) {
578 delete (*patsrcs)[i];
579 (*patsrcs)[i] = NULL;
580 }
581 }
582 // Free the vector
583 delete patsrcs;
584 }
585 };
586
587 /**
588 * A per-thread wrapper for a PairedPatternSource.
589 */
590 class WrappedPatternSourcePerThread : public PatternSourcePerThread {
591 public:
WrappedPatternSourcePerThread(PairedPatternSource & __patsrc)592 WrappedPatternSourcePerThread(PairedPatternSource& __patsrc) :
593 patsrc_(__patsrc)
594 {
595 patsrc_.addWrapper();
596 }
597
598 /**
599 * Get the next paired or unpaired read from the wrapped
600 * PairedPatternSource.
601 */
602 virtual bool nextReadPair(
603 bool& success,
604 bool& done,
605 bool& paired,
606 bool fixName);
607
608 private:
609
610 /// Container for obtaining paired reads from PatternSources
611 PairedPatternSource& patsrc_;
612 };
613
614 /**
615 * Abstract parent factory for PatternSourcePerThreads.
616 */
617 class WrappedPatternSourcePerThreadFactory : public PatternSourcePerThreadFactory {
618 public:
WrappedPatternSourcePerThreadFactory(PairedPatternSource & patsrc)619 WrappedPatternSourcePerThreadFactory(PairedPatternSource& patsrc) :
620 patsrc_(patsrc) { }
621
622 /**
623 * Create a new heap-allocated WrappedPatternSourcePerThreads.
624 */
create()625 virtual PatternSourcePerThread* create() const {
626 return new WrappedPatternSourcePerThread(patsrc_);
627 }
628
629 /**
630 * Create a new heap-allocated vector of heap-allocated
631 * WrappedPatternSourcePerThreads.
632 */
create(uint32_t n)633 virtual EList<PatternSourcePerThread*>* create(uint32_t n) const {
634 EList<PatternSourcePerThread*>* v = new EList<PatternSourcePerThread*>;
635 for(size_t i = 0; i < n; i++) {
636 v->push_back(new WrappedPatternSourcePerThread(patsrc_));
637 assert(v->back() != NULL);
638 }
639 return v;
640 }
641
642 private:
643 /// Container for obtaining paired reads from PatternSources
644 PairedPatternSource& patsrc_;
645 };
646
647 /// Skip to the end of the current string of newline chars and return
648 /// the first character after the newline chars, or -1 for EOF
getOverNewline(FileBuf & in)649 static inline int getOverNewline(FileBuf& in) {
650 int c;
651 while(isspace(c = in.get()));
652 return c;
653 }
654
655 /// Skip to the end of the current string of newline chars such that
656 /// the next call to get() returns the first character after the
657 /// whitespace
peekOverNewline(FileBuf & in)658 static inline int peekOverNewline(FileBuf& in) {
659 while(true) {
660 int c = in.peek();
661 if(c != '\r' && c != '\n') {
662 return c;
663 }
664 in.get();
665 }
666 }
667
668 /// Skip to the end of the current line; return the first character
669 /// of the next line or -1 for EOF
getToEndOfLine(FileBuf & in)670 static inline int getToEndOfLine(FileBuf& in) {
671 while(true) {
672 int c = in.get(); if(c < 0) return -1;
673 if(c == '\n' || c == '\r') {
674 while(c == '\n' || c == '\r') {
675 c = in.get(); if(c < 0) return -1;
676 }
677 // c now holds first character of next line
678 return c;
679 }
680 }
681 }
682
683 /// Skip to the end of the current line such that the next call to
684 /// get() returns the first character on the next line
peekToEndOfLine(FileBuf & in)685 static inline int peekToEndOfLine(FileBuf& in) {
686 while(true) {
687 int c = in.get(); if(c < 0) return c;
688 if(c == '\n' || c == '\r') {
689 c = in.peek();
690 while(c == '\n' || c == '\r') {
691 in.get(); if(c < 0) return c; // consume \r or \n
692 c = in.peek();
693 }
694 // next get() gets first character of next line
695 return c;
696 }
697 }
698 }
699
700 extern void wrongQualityFormat(const BTString& read_name);
701 extern void tooFewQualities(const BTString& read_name);
702 extern void tooManyQualities(const BTString& read_name);
703
704 /**
705 * Encapsulates a source of patterns which is an in-memory vector.
706 */
707 class VectorPatternSource : public PatternSource {
708
709 public:
710
711 VectorPatternSource(
712 const EList<string>& v,
713 const PatternParams& p);
714
~VectorPatternSource()715 virtual ~VectorPatternSource() { }
716
717 virtual bool nextReadImpl(
718 Read& r,
719 TReadId& rdid,
720 TReadId& endid,
721 bool& success,
722 bool& done);
723
724 /**
725 * This is unused, but implementation is given for completeness.
726 */
727 virtual bool nextReadPairImpl(
728 Read& ra,
729 Read& rb,
730 TReadId& rdid,
731 TReadId& endid,
732 bool& success,
733 bool& done,
734 bool& paired);
735
reset()736 virtual void reset() {
737 PatternSource::reset();
738 cur_ = skip_;
739 paired_ = false;
740 }
741
742 private:
743
744 size_t cur_;
745 uint32_t skip_;
746 bool paired_;
747 EList<BTDnaString> v_; // forward sequences
748 EList<BTString> quals_; // forward qualities
749 EList<BTString> names_; // names
750 EList<int> trimmed3_; // names
751 EList<int> trimmed5_; // names
752 };
753
754 /**
755 *
756 */
757 class BufferedFilePatternSource : public PatternSource {
758 public:
BufferedFilePatternSource(const EList<string> & infiles,const PatternParams & p)759 BufferedFilePatternSource(
760 const EList<string>& infiles,
761 const PatternParams& p) :
762 PatternSource(p),
763 infiles_(infiles),
764 filecur_(0),
765 fb_(),
766 skip_(p.skip),
767 first_(true)
768 {
769 assert_gt(infiles.size(), 0);
770 errs_.resize(infiles_.size());
771 errs_.fill(0, infiles_.size(), false);
772 assert(!fb_.isOpen());
773 open(); // open first file in the list
774 filecur_++;
775 }
776
~BufferedFilePatternSource()777 virtual ~BufferedFilePatternSource() {
778 if(fb_.isOpen()) fb_.close();
779 }
780
781 /**
782 * Fill Read with the sequence, quality and name for the next
783 * read in the list of read files. This function gets called by
784 * all the search threads, so we must handle synchronization.
785 */
nextReadImpl(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)786 virtual bool nextReadImpl(
787 Read& r,
788 TReadId& rdid,
789 TReadId& endid,
790 bool& success,
791 bool& done)
792 {
793 // We'll be manipulating our file handle/filecur_ state
794 lock();
795 while(true) {
796 do { read(r, rdid, endid, success, done); }
797 while(!success && !done);
798 if(!success && filecur_ < infiles_.size()) {
799 assert(done);
800 open();
801 resetForNextFile(); // reset state to handle a fresh file
802 filecur_++;
803 continue;
804 }
805 break;
806 }
807 assert(r.repOk());
808 // Leaving critical region
809 unlock();
810 return success;
811 }
812
813 /**
814 *
815 */
nextReadPairImpl(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)816 virtual bool nextReadPairImpl(
817 Read& ra,
818 Read& rb,
819 TReadId& rdid,
820 TReadId& endid,
821 bool& success,
822 bool& done,
823 bool& paired)
824 {
825 // We'll be manipulating our file handle/filecur_ state
826 lock();
827 while(true) {
828 do { readPair(ra, rb, rdid, endid, success, done, paired); }
829 while(!success && !done);
830 if(!success && filecur_ < infiles_.size()) {
831 assert(done);
832 open();
833 resetForNextFile(); // reset state to handle a fresh file
834 filecur_++;
835 continue;
836 }
837 break;
838 }
839 assert(ra.repOk());
840 assert(rb.repOk());
841 // Leaving critical region
842 unlock();
843 return success;
844 }
845
846 /**
847 * Reset state so that we read start reading again from the
848 * beginning of the first file. Should only be called by the
849 * master thread.
850 */
reset()851 virtual void reset() {
852 PatternSource::reset();
853 filecur_ = 0;
854 open();
855 filecur_++;
856 }
857
858 protected:
859
860 /// Read another pattern from the input file; this is overridden
861 /// to deal with specific file formats
862 virtual bool read(
863 Read& r,
864 TReadId& rdid,
865 TReadId& endid,
866 bool& success,
867 bool& done) = 0;
868
869 /// Read another pattern pair from the input file; this is
870 /// overridden to deal with specific file formats
871 virtual bool readPair(
872 Read& ra,
873 Read& rb,
874 TReadId& rdid,
875 TReadId& endid,
876 bool& success,
877 bool& done,
878 bool& paired) = 0;
879
880 /// Reset state to handle a fresh file
resetForNextFile()881 virtual void resetForNextFile() { }
882
open()883 void open() {
884 if(fb_.isOpen()) fb_.close();
885 while(filecur_ < infiles_.size()) {
886 // Open read
887 FILE *in;
888 if(infiles_[filecur_] == "-") {
889 in = stdin;
890 } else if((in = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) {
891 if(!errs_[filecur_]) {
892 cerr << "Warning: Could not open read file \"" << infiles_[filecur_].c_str() << "\" for reading; skipping..." << endl;
893 errs_[filecur_] = true;
894 }
895 filecur_++;
896 continue;
897 }
898 fb_.newFile(in);
899 return;
900 }
901 cerr << "Error: No input read files were valid" << endl;
902 exit(1);
903 return;
904 }
905
906 EList<string> infiles_; // filenames for read files
907 EList<bool> errs_; // whether we've already printed an error for each file
908 size_t filecur_; // index into infiles_ of next file to read
909 FileBuf fb_; // read file currently being read from
910 TReadId skip_; // number of reads to skip
911 bool first_;
912 };
913
914 /**
915 * Parse a single quality string from fb and store qualities in r.
916 * Assume the next character obtained via fb.get() is the first
917 * character of the quality string. When returning, the next
918 * character returned by fb.peek() or fb.get() should be the first
919 * character of the following line.
920 */
921 int parseQuals(
922 Read& r,
923 FileBuf& fb,
924 int firstc,
925 int readLen,
926 int trim3,
927 int trim5,
928 bool intQuals,
929 bool phred64,
930 bool solexa64);
931
932 /**
933 * Synchronized concrete pattern source for a list of FASTA or CSFASTA
934 * (if color = true) files.
935 */
936 class FastaPatternSource : public BufferedFilePatternSource {
937 public:
FastaPatternSource(const EList<string> & infiles,const PatternParams & p)938 FastaPatternSource(const EList<string>& infiles,
939 const PatternParams& p) :
940 BufferedFilePatternSource(infiles, p),
941 first_(true), solexa64_(p.solexa64), phred64_(p.phred64), intQuals_(p.intQuals)
942 { }
reset()943 virtual void reset() {
944 first_ = true;
945 BufferedFilePatternSource::reset();
946 }
947 protected:
948 /**
949 * Scan to the next FASTA record (starting with >) and return the first
950 * character of the record (which will always be >).
951 */
skipToNextFastaRecord(FileBuf & in)952 static int skipToNextFastaRecord(FileBuf& in) {
953 int c;
954 while((c = in.get()) != '>') {
955 if(in.eof()) return -1;
956 }
957 return c;
958 }
959
960 /// Called when we have to bail without having parsed a read.
bail(Read & r)961 void bail(Read& r) {
962 r.reset();
963 fb_.resetLastN();
964 }
965
966 /// Read another pattern from a FASTA input file
967 virtual bool read(
968 Read& r,
969 TReadId& rdid,
970 TReadId& endid,
971 bool& success,
972 bool& done);
973
974 /// Read another pair of patterns from a FASTA input file
readPair(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)975 virtual bool readPair(
976 Read& ra,
977 Read& rb,
978 TReadId& rdid,
979 TReadId& endid,
980 bool& success,
981 bool& done,
982 bool& paired)
983 {
984 // (For now, we shouldn't ever be here)
985 cerr << "In FastaPatternSource.readPair()" << endl;
986 throw 1;
987 return false;
988 }
989
resetForNextFile()990 virtual void resetForNextFile() {
991 first_ = true;
992 }
993
994 private:
995 bool first_;
996
997 public:
998 bool solexa64_;
999 bool phred64_;
1000 bool intQuals_;
1001 };
1002
1003
1004 /**
1005 * Tokenize a line of space-separated integer quality values.
1006 */
tokenizeQualLine(FileBuf & filebuf,char * buf,size_t buflen,EList<string> & toks)1007 static inline bool tokenizeQualLine(
1008 FileBuf& filebuf,
1009 char *buf,
1010 size_t buflen,
1011 EList<string>& toks)
1012 {
1013 size_t rd = filebuf.gets(buf, buflen);
1014 if(rd == 0) return false;
1015 assert(NULL == strrchr(buf, '\n'));
1016 tokenize(string(buf), " ", toks);
1017 return true;
1018 }
1019
1020 /**
1021 * Synchronized concrete pattern source for a list of files with tab-
1022 * delimited name, seq, qual fields (or, for paired-end reads,
1023 * basename, seq1, qual1, seq2, qual2).
1024 */
1025 class TabbedPatternSource : public BufferedFilePatternSource {
1026
1027 public:
1028
TabbedPatternSource(const EList<string> & infiles,const PatternParams & p,bool secondName)1029 TabbedPatternSource(
1030 const EList<string>& infiles,
1031 const PatternParams& p,
1032 bool secondName) :
1033 BufferedFilePatternSource(infiles, p),
1034 solQuals_(p.solexa64),
1035 phred64Quals_(p.phred64),
1036 intQuals_(p.intQuals),
1037 secondName_(secondName) { }
1038
1039 protected:
1040
1041 /// Read another pattern from a FASTA input file
1042 virtual bool read(
1043 Read& r,
1044 TReadId& rdid,
1045 TReadId& endid,
1046 bool& success,
1047 bool& done);
1048
1049 /// Read another pair of patterns from a FASTA input file
1050 virtual bool readPair(
1051 Read& ra,
1052 Read& rb,
1053 TReadId& rdid,
1054 TReadId& endid,
1055 bool& success,
1056 bool& done,
1057 bool& paired);
1058
1059 private:
1060
1061 /**
1062 * Parse a name from fb_ and store in r. Assume that the next
1063 * character obtained via fb_.get() is the first character of
1064 * the sequence and the string stops at the next char upto (could
1065 * be tab, newline, etc.).
1066 */
1067 int parseName(Read& r, Read* r2, char upto = '\t');
1068
1069 /**
1070 * Parse a single sequence from fb_ and store in r. Assume
1071 * that the next character obtained via fb_.get() is the first
1072 * character of the sequence and the sequence stops at the next
1073 * char upto (could be tab, newline, etc.).
1074 */
1075 int parseSeq(Read& r, int& charsRead, int& trim5, char upto = '\t');
1076
1077 /**
1078 * Parse a single quality string from fb_ and store in r.
1079 * Assume that the next character obtained via fb_.get() is
1080 * the first character of the quality string and the string stops
1081 * at the next char upto (could be tab, newline, etc.).
1082 */
1083 int parseQuals(Read& r, int charsRead, int dstLen, int trim5,
1084 char& c2, char upto = '\t', char upto2 = -1);
1085
1086 bool solQuals_;
1087 bool phred64Quals_;
1088 bool intQuals_;
1089 EList<string> qualToks_;
1090 bool secondName_;
1091 };
1092
1093 /**
1094 * Synchronized concrete pattern source for Illumina Qseq files. In
1095 * Qseq files, each read appears on a separate line and the tab-
1096 * delimited fields are:
1097 *
1098 * 1. Machine name
1099 * 2. Run number
1100 * 3. Lane number
1101 * 4. Tile number
1102 * 5. X coordinate of spot
1103 * 6. Y coordinate of spot
1104 * 7. Index: "Index sequence or 0. For no indexing, or for a file that
1105 * has not been demultiplexed yet, this field should have a value of
1106 * 0."
1107 * 8. Read number: 1 for unpaired, 1 or 2 for paired
1108 * 9. Sequence
1109 * 10. Quality
1110 * 11. Filter: 1 = passed, 0 = didn't
1111 */
1112 class QseqPatternSource : public BufferedFilePatternSource {
1113
1114 public:
1115
QseqPatternSource(const EList<string> & infiles,const PatternParams & p)1116 QseqPatternSource(
1117 const EList<string>& infiles,
1118 const PatternParams& p) :
1119 BufferedFilePatternSource(infiles, p),
1120 solQuals_(p.solexa64),
1121 phred64Quals_(p.phred64),
1122 intQuals_(p.intQuals) { }
1123
1124 protected:
1125
1126 #define BAIL_UNPAIRED() { \
1127 peekOverNewline(fb_); \
1128 r.reset(); \
1129 success = false; \
1130 done = true; \
1131 return success; \
1132 }
1133
1134 /**
1135 * Parse a name from fb_ and store in r. Assume that the next
1136 * character obtained via fb_.get() is the first character of
1137 * the sequence and the string stops at the next char upto (could
1138 * be tab, newline, etc.).
1139 */
1140 int parseName(
1141 Read& r, // buffer for mate 1
1142 Read* r2, // buffer for mate 2 (NULL if mate2 is read separately)
1143 bool append, // true -> append characters, false -> skip them
1144 bool clearFirst, // clear the name buffer first
1145 bool warnEmpty, // emit a warning if nothing was added to the name
1146 bool useDefault, // if nothing is read, put readCnt_ as a default value
1147 int upto); // stop parsing when we first reach character 'upto'
1148
1149 /**
1150 * Parse a single sequence from fb_ and store in r. Assume
1151 * that the next character obtained via fb_.get() is the first
1152 * character of the sequence and the sequence stops at the next
1153 * char upto (could be tab, newline, etc.).
1154 */
1155 int parseSeq(
1156 Read& r, // buffer for read
1157 int& charsRead,
1158 int& trim5,
1159 char upto);
1160
1161 /**
1162 * Parse a single quality string from fb_ and store in r.
1163 * Assume that the next character obtained via fb_.get() is
1164 * the first character of the quality string and the string stops
1165 * at the next char upto (could be tab, newline, etc.).
1166 */
1167 int parseQuals(
1168 Read& r, // buffer for read
1169 int charsRead,
1170 int dstLen,
1171 int trim5,
1172 char& c2,
1173 char upto,
1174 char upto2);
1175
1176 /**
1177 * Read another pattern from a Qseq input file.
1178 */
1179 virtual bool read(
1180 Read& r,
1181 TReadId& rdid,
1182 TReadId& endid,
1183 bool& success,
1184 bool& done);
1185
1186 /**
1187 * Read a pair of patterns from 1 Qseq file. Note: this is never used.
1188 */
readPair(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)1189 virtual bool readPair(
1190 Read& ra,
1191 Read& rb,
1192 TReadId& rdid,
1193 TReadId& endid,
1194 bool& success,
1195 bool& done,
1196 bool& paired)
1197 {
1198 // (For now, we shouldn't ever be here)
1199 cerr << "In QseqPatternSource.readPair()" << endl;
1200 throw 1;
1201 return false;
1202 }
1203
1204 bool solQuals_;
1205 bool phred64Quals_;
1206 bool intQuals_;
1207 EList<string> qualToks_;
1208 };
1209
1210 /**
1211 * Synchronized concrete pattern source for a list of FASTA files where
1212 * reads need to be extracted from long continuous sequences.
1213 */
1214 class FastaContinuousPatternSource : public BufferedFilePatternSource {
1215 public:
FastaContinuousPatternSource(const EList<string> & infiles,const PatternParams & p)1216 FastaContinuousPatternSource(const EList<string>& infiles, const PatternParams& p) :
1217 BufferedFilePatternSource(infiles, p),
1218 length_(p.sampleLen), freq_(p.sampleFreq),
1219 eat_(length_-1), beginning_(true),
1220 bufCur_(0), subReadCnt_(0llu)
1221 {
1222 resetForNextFile();
1223 }
1224
reset()1225 virtual void reset() {
1226 BufferedFilePatternSource::reset();
1227 resetForNextFile();
1228 }
1229
1230 protected:
1231
1232 /// Read another pattern from a FASTA input file
read(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)1233 virtual bool read(
1234 Read& r,
1235 TReadId& rdid,
1236 TReadId& endid,
1237 bool& success,
1238 bool& done)
1239 {
1240 success = true;
1241 done = false;
1242 r.reset();
1243 while(true) {
1244 r.color = gColor;
1245 int c = fb_.get();
1246 if(c < 0) { success = false; done = true; return success; }
1247 if(c == '>') {
1248 resetForNextFile();
1249 c = fb_.peek();
1250 bool sawSpace = false;
1251 while(c != '\n' && c != '\r') {
1252 if(!sawSpace) {
1253 sawSpace = isspace(c);
1254 }
1255 if(!sawSpace) {
1256 nameBuf_.append(c);
1257 }
1258 fb_.get();
1259 c = fb_.peek();
1260 }
1261 while(c == '\n' || c == '\r') {
1262 fb_.get();
1263 c = fb_.peek();
1264 }
1265 nameBuf_.append('_');
1266 } else {
1267 int cat = asc2dnacat[c];
1268 if(cat >= 2) c = 'N';
1269 if(cat == 0) {
1270 // Encountered non-DNA, non-IUPAC char; skip it
1271 continue;
1272 } else {
1273 // DNA char
1274 buf_[bufCur_++] = c;
1275 if(bufCur_ == 1024) bufCur_ = 0;
1276 if(eat_ > 0) {
1277 eat_--;
1278 // Try to keep readCnt_ aligned with the offset
1279 // into the reference; that lets us see where
1280 // the sampling gaps are by looking at the read
1281 // name
1282 if(!beginning_) readCnt_++;
1283 continue;
1284 }
1285 for(size_t i = 0; i < length_; i++) {
1286 if(length_ - i <= bufCur_) {
1287 c = buf_[bufCur_ - (length_ - i)];
1288 } else {
1289 // Rotate
1290 c = buf_[bufCur_ - (length_ - i) + 1024];
1291 }
1292 r.patFw.append(asc2dna[c]);
1293 r.qual.append('I');
1294 }
1295 // Set up a default name if one hasn't been set
1296 r.name = nameBuf_;
1297 char cbuf[20];
1298 itoa10<TReadId>(readCnt_ - subReadCnt_, cbuf);
1299 r.name.append(cbuf);
1300 eat_ = freq_-1;
1301 readCnt_++;
1302 beginning_ = false;
1303 rdid = endid = readCnt_-1;
1304 break;
1305 }
1306 }
1307 }
1308 return true;
1309 }
1310
1311 /// Shouldn't ever be here; it's not sensible to obtain read pairs
1312 // from a continuous input.
readPair(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)1313 virtual bool readPair(
1314 Read& ra,
1315 Read& rb,
1316 TReadId& rdid,
1317 TReadId& endid,
1318 bool& success,
1319 bool& done,
1320 bool& paired)
1321 {
1322 cerr << "In FastaContinuousPatternSource.readPair()" << endl;
1323 throw 1;
1324 return false;
1325 }
1326
1327 /**
1328 * Reset state to be read for the next file.
1329 */
resetForNextFile()1330 virtual void resetForNextFile() {
1331 eat_ = length_-1;
1332 beginning_ = true;
1333 bufCur_ = 0;
1334 nameBuf_.clear();
1335 subReadCnt_ = readCnt_;
1336 }
1337
1338 private:
1339 size_t length_; /// length of reads to generate
1340 size_t freq_; /// frequency to sample reads
1341 size_t eat_; /// number of characters we need to skip before
1342 /// we have flushed all of the ambiguous or
1343 /// non-existent characters out of our read
1344 /// window
1345 bool beginning_; /// skipping over the first read length?
1346 char buf_[1024]; /// read buffer
1347 BTString nameBuf_; /// read buffer for name of fasta record being
1348 /// split into mers
1349 size_t bufCur_; /// buffer cursor; points to where we should
1350 /// insert the next character
1351 uint64_t subReadCnt_;/// number to subtract from readCnt_ to get
1352 /// the pat id to output (so it resets to 0 for
1353 /// each new sequence)
1354 };
1355
1356 /**
1357 * Read a FASTQ-format file.
1358 * See: http://maq.sourceforge.net/fastq.shtml
1359 */
1360 class FastqPatternSource : public BufferedFilePatternSource {
1361
1362 public:
1363
FastqPatternSource(const EList<string> & infiles,const PatternParams & p)1364 FastqPatternSource(const EList<string>& infiles, const PatternParams& p) :
1365 BufferedFilePatternSource(infiles, p),
1366 first_(true),
1367 solQuals_(p.solexa64),
1368 phred64Quals_(p.phred64),
1369 intQuals_(p.intQuals),
1370 fuzzy_(p.fuzzy)
1371 { }
1372
reset()1373 virtual void reset() {
1374 first_ = true;
1375 fb_.resetLastN();
1376 BufferedFilePatternSource::reset();
1377 }
1378
1379 protected:
1380
1381 /**
1382 * Scan to the next FASTQ record (starting with @) and return the first
1383 * character of the record (which will always be @). Since the quality
1384 * line may start with @, we keep scanning until we've seen a line
1385 * beginning with @ where the line two lines back began with +.
1386 */
skipToNextFastqRecord(FileBuf & in,bool sawPlus)1387 static int skipToNextFastqRecord(FileBuf& in, bool sawPlus) {
1388 int line = 0;
1389 int plusLine = -1;
1390 int c = in.get();
1391 int firstc = c;
1392 while(true) {
1393 if(line > 20) {
1394 // If we couldn't find our desired '@' in the first 20
1395 // lines, it's time to give up
1396 if(firstc == '>') {
1397 // That firstc is '>' may be a hint that this is
1398 // actually a FASTA file, so return it intact
1399 return '>';
1400 }
1401 // Return an error
1402 return -1;
1403 }
1404 if(c == -1) return -1;
1405 if(c == '\n') {
1406 c = in.get();
1407 if(c == '@' && sawPlus && plusLine == (line-2)) {
1408 return '@';
1409 }
1410 else if(c == '+') {
1411 // Saw a '+' at the beginning of a line; remember where
1412 // we saw it
1413 sawPlus = true;
1414 plusLine = line;
1415 }
1416 else if(c == -1) {
1417 return -1;
1418 }
1419 line++;
1420 }
1421 c = in.get();
1422 }
1423 }
1424
1425 /// Read another pattern from a FASTQ input file
1426 virtual bool read(
1427 Read& r,
1428 TReadId& rdid,
1429 TReadId& endid,
1430 bool& success,
1431 bool& done);
1432
1433 /// Read another read pair from a FASTQ input file
readPair(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)1434 virtual bool readPair(
1435 Read& ra,
1436 Read& rb,
1437 TReadId& rdid,
1438 TReadId& endid,
1439 bool& success,
1440 bool& done,
1441 bool& paired)
1442 {
1443 // (For now, we shouldn't ever be here)
1444 cerr << "In FastqPatternSource.readPair()" << endl;
1445 throw 1;
1446 return false;
1447 }
1448
resetForNextFile()1449 virtual void resetForNextFile() {
1450 first_ = true;
1451 }
1452
1453 private:
1454
1455 /**
1456 * Do things we need to do if we have to bail in the middle of a
1457 * read, usually because we reached the end of the input without
1458 * finishing.
1459 */
bail(Read & r)1460 void bail(Read& r) {
1461 r.patFw.clear();
1462 fb_.resetLastN();
1463 }
1464
1465 bool first_;
1466 bool solQuals_;
1467 bool phred64Quals_;
1468 bool intQuals_;
1469 bool fuzzy_;
1470 EList<string> qualToks_;
1471 };
1472
1473 /**
1474 * Read a Raw-format file (one sequence per line). No quality strings
1475 * allowed. All qualities are assumed to be 'I' (40 on the Phred-33
1476 * scale).
1477 */
1478 class RawPatternSource : public BufferedFilePatternSource {
1479
1480 public:
1481
RawPatternSource(const EList<string> & infiles,const PatternParams & p)1482 RawPatternSource(const EList<string>& infiles, const PatternParams& p) :
1483 BufferedFilePatternSource(infiles, p), first_(true) { }
1484
reset()1485 virtual void reset() {
1486 first_ = true;
1487 BufferedFilePatternSource::reset();
1488 }
1489
1490 protected:
1491
1492 /// Read another pattern from a Raw input file
read(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)1493 virtual bool read(
1494 Read& r,
1495 TReadId& rdid,
1496 TReadId& endid,
1497 bool& success,
1498 bool& done)
1499 {
1500 int c;
1501 success = true;
1502 done = false;
1503 r.reset();
1504 c = getOverNewline(this->fb_);
1505 if(c < 0) {
1506 bail(r); success = false; done = true; return success;
1507 }
1508 assert(!isspace(c));
1509 r.color = gColor;
1510 int mytrim5 = gTrim5;
1511 if(first_) {
1512 // Check that the first character is sane for a raw file
1513 int cc = c;
1514 if(gColor) {
1515 if(cc >= '0' && cc <= '4') cc = "ACGTN"[(int)cc - '0'];
1516 if(cc == '.') cc = 'N';
1517 }
1518 if(asc2dnacat[cc] == 0) {
1519 cerr << "Error: reads file does not look like a Raw file" << endl;
1520 if(c == '>') {
1521 cerr << "Reads file looks like a FASTA file; please use -f" << endl;
1522 }
1523 if(c == '@') {
1524 cerr << "Reads file looks like a FASTQ file; please use -q" << endl;
1525 }
1526 throw 1;
1527 }
1528 first_ = false;
1529 }
1530 if(gColor) {
1531 // This may be a primer character. If so, keep it in the
1532 // 'primer' field of the read buf and parse the rest of the
1533 // read without it.
1534 c = toupper(c);
1535 if(asc2dnacat[c] > 0) {
1536 // First char is a DNA char
1537 int c2 = toupper(fb_.peek());
1538 // Second char is a color char
1539 if(asc2colcat[c2] > 0) {
1540 r.primer = c;
1541 r.trimc = c2;
1542 mytrim5 += 2; // trim primer and first color
1543 }
1544 }
1545 if(c < 0) {
1546 bail(r); success = false; done = true; return success;
1547 }
1548 }
1549 // _in now points just past the first character of a sequence
1550 // line, and c holds the first character
1551 int chs = 0;
1552 while(!isspace(c) && c >= 0) {
1553 if(gColor) {
1554 if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
1555 if(c == '.') c = 'N';
1556 }
1557 // 5' trimming
1558 if(isalpha(c) && chs >= mytrim5) {
1559 //size_t len = chs - mytrim5;
1560 //if(len >= 1024) tooManyQualities(BTString("(no name)"));
1561 r.patFw.append(asc2dna[c]);
1562 r.qual.append('I');
1563 }
1564 chs++;
1565 if(isspace(fb_.peek())) break;
1566 c = fb_.get();
1567 }
1568 // 3' trimming
1569 r.patFw.trimEnd(gTrim3);
1570 r.qual.trimEnd(gTrim3);
1571 c = peekToEndOfLine(fb_);
1572 r.trimmed3 = gTrim3;
1573 r.trimmed5 = mytrim5;
1574 r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
1575 fb_.resetLastN();
1576
1577 // Set up name
1578 char cbuf[20];
1579 itoa10<TReadId>(readCnt_, cbuf);
1580 r.name.install(cbuf);
1581 readCnt_++;
1582
1583 rdid = endid = readCnt_-1;
1584 return success;
1585 }
1586
1587 /// Read another read pair from a FASTQ input file
readPair(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)1588 virtual bool readPair(
1589 Read& ra,
1590 Read& rb,
1591 TReadId& rdid,
1592 TReadId& endid,
1593 bool& success,
1594 bool& done,
1595 bool& paired)
1596 {
1597 // (For now, we shouldn't ever be here)
1598 cerr << "In RawPatternSource.readPair()" << endl;
1599 throw 1;
1600 return false;
1601 }
1602
resetForNextFile()1603 virtual void resetForNextFile() {
1604 first_ = true;
1605 }
1606
1607 private:
1608
1609 /**
1610 * Do things we need to do if we have to bail in the middle of a
1611 * read, usually because we reached the end of the input without
1612 * finishing.
1613 */
bail(Read & r)1614 void bail(Read& r) {
1615 r.patFw.clear();
1616 fb_.resetLastN();
1617 }
1618
1619 bool first_;
1620 };
1621
1622 #ifdef USE_SRA
1623
1624 namespace ngs {
1625 class ReadCollection;
1626 class ReadIterator;
1627 }
1628
1629 namespace tthread {
1630 class thread;
1631 };
1632
1633 struct SRA_Data;
1634
1635 /**
1636 *
1637 */
1638 class SRAPatternSource : public PatternSource {
1639 public:
1640 SRAPatternSource(
1641 const EList<string>& sra_accs,
1642 const PatternParams& p,
1643 const size_t nthreads = 1) :
PatternSource(p)1644 PatternSource(p),
1645 sra_accs_(sra_accs),
1646 sra_acc_cur_(0),
1647 skip_(p.skip),
1648 first_(true),
1649 nthreads_(nthreads),
1650 sra_run_(NULL),
1651 sra_it_(NULL),
1652 sra_data_(NULL),
1653 io_thread_(NULL)
1654 {
1655 assert_gt(sra_accs_.size(), 0);
1656 errs_.resize(sra_accs_.size());
1657 errs_.fill(0, sra_accs_.size(), false);
1658 open(); // open first file in the list
1659 sra_acc_cur_++;
1660 }
1661
1662 virtual ~SRAPatternSource();
1663
1664 /**
1665 * Fill Read with the sequence, quality and name for the next
1666 * read in the list of read files. This function gets called by
1667 * all the search threads, so we must handle synchronization.
1668 */
nextReadImpl(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)1669 virtual bool nextReadImpl(
1670 Read& r,
1671 TReadId& rdid,
1672 TReadId& endid,
1673 bool& success,
1674 bool& done)
1675 {
1676 // We'll be manipulating our file handle/filecur_ state
1677 lock();
1678 while(true) {
1679 do { read(r, rdid, endid, success, done); }
1680 while(!success && !done);
1681 if(!success && sra_acc_cur_ < sra_accs_.size()) {
1682 assert(done);
1683 open();
1684 resetForNextFile(); // reset state to handle a fresh file
1685 sra_acc_cur_++;
1686 continue;
1687 }
1688 break;
1689 }
1690 assert(r.repOk());
1691 // Leaving critical region
1692 unlock();
1693 return success;
1694 }
1695
1696 /**
1697 *
1698 */
nextReadPairImpl(Read & ra,Read & rb,TReadId & rdid,TReadId & endid,bool & success,bool & done,bool & paired)1699 virtual bool nextReadPairImpl(
1700 Read& ra,
1701 Read& rb,
1702 TReadId& rdid,
1703 TReadId& endid,
1704 bool& success,
1705 bool& done,
1706 bool& paired)
1707 {
1708 // We'll be manipulating our file handle/filecur_ state
1709 lock();
1710 while(true) {
1711 do { readPair(ra, rb, rdid, endid, success, done, paired); }
1712 while(!success && !done);
1713 if(!success && sra_acc_cur_ < sra_accs_.size()) {
1714 assert(done);
1715 open();
1716 resetForNextFile(); // reset state to handle a fresh file
1717 sra_acc_cur_++;
1718 continue;
1719 }
1720 break;
1721 }
1722 assert(ra.repOk());
1723 assert(rb.repOk());
1724 // Leaving critical region
1725 unlock();
1726 return success;
1727 }
1728
1729 /**
1730 * Reset state so that we read start reading again from the
1731 * beginning of the first file. Should only be called by the
1732 * master thread.
1733 */
reset()1734 virtual void reset() {
1735 PatternSource::reset();
1736 sra_acc_cur_ = 0,
1737 open();
1738 sra_acc_cur_++;
1739 }
1740
1741 /// Read another pattern from the input file; this is overridden
1742 /// to deal with specific file formats
read(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)1743 virtual bool read(
1744 Read& r,
1745 TReadId& rdid,
1746 TReadId& endid,
1747 bool& success,
1748 bool& done)
1749 {
1750 return true;
1751 }
1752
1753 /// Read another pattern pair from the input file; this is
1754 /// overridden to deal with specific file formats
1755 virtual bool readPair(
1756 Read& ra,
1757 Read& rb,
1758 TReadId& rdid,
1759 TReadId& endid,
1760 bool& success,
1761 bool& done,
1762 bool& paired);
1763
1764 protected:
1765
1766 /// Reset state to handle a fresh file
resetForNextFile()1767 virtual void resetForNextFile() { }
1768
1769 void open();
1770
1771 EList<string> sra_accs_; // filenames for read files
1772 EList<bool> errs_; // whether we've already printed an error for each file
1773 size_t sra_acc_cur_; // index into infiles_ of next file to read
1774 TReadId skip_; // number of reads to skip
1775 bool first_;
1776
1777 size_t nthreads_;
1778
1779 ngs::ReadCollection* sra_run_;
1780 ngs::ReadIterator* sra_it_;
1781
1782 SRA_Data* sra_data_;
1783 tthread::thread* io_thread_;
1784 };
1785
1786 #endif
1787
1788 #endif /*PAT_H_*/
1789