1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #ifndef READ_H_
21 #define READ_H_
22 
23 #include <iostream>
24 #include <stdint.h>
25 #include <sys/time.h>
26 
27 #include "ds.h"
28 #include "filebuf.h"
29 #include "hit_set.h"
30 #include "sstring.h"
31 #include "util.h"
32 
33 typedef uint64_t TReadId;
34 typedef size_t TReadOff;
35 typedef int64_t TAlScore;
36 
37 struct HitSet;
38 
39 /**
40  * A buffer for keeping all relevant information about a single read.
41  */
42 struct Read {
43 
44 	typedef SStringExpandable<char, 0, 2, 2048> TBuf;
45 
ReadRead46 	Read() { reset(); }
47 
ReadRead48 	Read(const char *nm, const char *seq, const char *ql) { init(nm, seq, ql); }
49 
resetRead50 	void reset() {
51 		rdid = 0;
52 		patid = 0;
53 		trimmed5 = trimmed3 = 0;
54 		readOrigBuf.clear();
55 		patFw.clear();
56 		patRc.clear();
57 		qual.clear();
58 		patFwRev.clear();
59 		patRcRev.clear();
60 		qualRev.clear();
61 		name.clear();
62 		filter = '?';
63 		seed = 0;
64 		parsed = false;
65 		ns_ = 0;
66 	}
67 
68 	/**
69 	 * Finish initializing a new read.
70 	 */
finalizeRead71 	void finalize() {
72 		for(size_t i = 0; i < patFw.length(); i++) {
73 			if((int)patFw[i] > 3) {
74 				ns_++;
75 			}
76 		}
77 		constructRevComps();
78 		constructReverses();
79 	}
80 
81 	/**
82 	 * Simple init function, used for testing.
83 	 */
initRead84 	void init(
85 		const char *nm,
86 		const char *seq,
87 		const char *ql)
88 	{
89 		reset();
90 		patFw.installChars(seq);
91 		qual.install(ql);
92 		for(size_t i = 0; i < patFw.length(); i++) {
93 			if((int)patFw[i] > 3) {
94 				ns_++;
95 			}
96 		}
97 		constructRevComps();
98 		constructReverses();
99 		if(nm != NULL) name.install(nm);
100 	}
101 
102 	/// Return true iff the read (pair) is empty
emptyRead103 	bool empty() const {
104 		return patFw.empty();
105 	}
106 
107 	/// Return length of the read in the buffer
lengthRead108 	size_t length() const {
109 		return patFw.length();
110 	}
111 
112 	/**
113 	 * Return the number of Ns in the read.
114 	 */
nsRead115 	size_t ns() const {
116 		return ns_;
117 	}
118 
119 	/**
120 	 * Construct reverse complement of the pattern and the fuzzy
121 	 * alternative patters.
122 	 */
constructRevCompsRead123 	void constructRevComps() {
124 		patRc.installReverseComp(patFw);
125 	}
126 
127 	/**
128 	 * Given patFw, patRc, and qual, construct the *Rev versions in
129 	 * place.  Assumes constructRevComps() was called previously.
130 	 */
constructReversesRead131 	void constructReverses() {
132 		patFwRev.installReverse(patFw);
133 		patRcRev.installReverse(patRc);
134 		qualRev.installReverse(qual);
135 	}
136 
137 	/**
138 	 * Append a "/1" or "/2" string onto the end of the name buf if
139 	 * it's not already there.
140 	 */
fixMateNameRead141 	void fixMateName(int i) {
142 		assert(i == 1 || i == 2);
143 		size_t namelen = name.length();
144 		bool append = false;
145 		if(namelen < 2) {
146 			// Name is too short to possibly have /1 or /2 on the end
147 			append = true;
148 		} else {
149 			if(i == 1) {
150 				// append = true iff mate name does not already end in /1
151 				append =
152 					name[namelen-2] != '/' ||
153 					name[namelen-1] != '1';
154 			} else {
155 				// append = true iff mate name does not already end in /2
156 				append =
157 					name[namelen-2] != '/' ||
158 					name[namelen-1] != '2';
159 			}
160 		}
161 		if(append) {
162 			name.append('/');
163 			name.append("012"[i]);
164 		}
165 	}
166 
167 	/**
168 	 * Dump basic information about this read to the given ostream.
169 	 */
dumpRead170 	void dump(std::ostream& os) const {
171 		using namespace std;
172 		os << name << ' ';
173 		os << patFw;
174 		os << ' ';
175 		os << qual.toZBuf() << " ";
176 	}
177 
178 	/**
179 	 * Check whether two reads are the same in the sense that they will
180 	 * lead to us finding the same set of alignments.
181 	 */
sameRead182 	static bool same(
183 		const BTDnaString& seq1,
184 		const BTString&    qual1,
185 		const BTDnaString& seq2,
186 		const BTString&    qual2,
187 		bool qualitiesMatter)
188 	{
189 		if(seq1.length() != seq2.length()) {
190 			return false;
191 		}
192 		for(size_t i = 0; i < seq1.length(); i++) {
193 			if(seq1[i] != seq2[i]) return false;
194 		}
195 		if(qualitiesMatter) {
196 			if(qual1.length() != qual2.length()) {
197 				return false;
198 			}
199 			for(size_t i = 0; i < qual1.length(); i++) {
200 				if(qual1[i] != qual2[i]) return false;
201 			}
202 		}
203 		return true;
204 	}
205 
206 	/**
207 	 * Get the nucleotide and quality value at the given offset from 5' end.
208 	 * If 'fw' is false, get the reverse complement.
209 	 */
getRead210 	std::pair<int, int> get(TReadOff off5p, bool fw) const {
211 		assert_lt(off5p, length());
212 		int c = (int)patFw[off5p];
213         int q = qual[off5p];
214         assert_geq(q, 33);
215 		return make_pair((!fw && c < 4) ? (c ^ 3) : c, q - 33);
216 	}
217 
218 	/**
219 	 * Get the nucleotide at the given offset from 5' end.
220 	 * If 'fw' is false, get the reverse complement.
221 	 */
getcRead222 	int getc(TReadOff off5p, bool fw) const {
223 		assert_lt(off5p, length());
224 		int c = (int)patFw[off5p];
225 		return (!fw && c < 4) ? (c ^ 3) : c;
226 	}
227 
228 	/**
229 	 * Get the quality value at the given offset from 5' end.
230 	 */
getqRead231 	int getq(TReadOff off5p) const {
232 		assert_lt(off5p, length());
233         int q = qual[off5p];
234         assert_geq(q, 33);
235 		return q-33;
236 	}
237 
238 #ifndef NDEBUG
239 	/**
240 	 * Check that read info is internally consistent.
241 	 */
repOkRead242 	bool repOk() const {
243 		if(patFw.empty()) return true;
244 		assert_eq(qual.length(), patFw.length());
245 		return true;
246 	}
247 #endif
248 
249 	BTDnaString patFw;            // forward-strand sequence
250 	BTDnaString patRc;            // reverse-complement sequence
251 	BTString    qual;             // quality values
252 
253 	BTDnaString patFwRev;
254 	BTDnaString patRcRev;
255 	BTString    qualRev;
256 
257 	// For remembering the exact input text used to define a read
258 	TBuf readOrigBuf;
259 
260 	BTString name;      // read name
261 	TReadId  rdid;      // 0-based id based on pair's offset in read file(s)
262 	int      mate;      // 0 = single-end, 1 = mate1, 2 = mate2
263 	uint32_t seed;      // random seed
264 	bool     parsed;    // true iff read has been fully parsed
265 	size_t   ns_;       // # Ns
266 	char     filter;    // if read format permits filter char, set it here
267 	char     primer;
268 	char     trimc;
269 	uint32_t patid;
270 	int      trimmed5;  // amount actually trimmed off 5' end
271 	int      trimmed3;  // amount actually trimmed off 3' end
272 	HitSet   hitset;    // holds previously-found hits; for chaining
273 };
274 
275 /**
276  * A string of FmStringOps represent a string of tasks performed by the
277  * best-first alignment search.  We model the search as a series of FM ops
278  * interspersed with reported alignments.
279  */
280 struct FmStringOp {
281 	bool alignment;  // true -> found an alignment
282 	TAlScore pen;    // penalty of the FM op or alignment
283 	size_t n;        // number of FM ops (only relevant for non-alignment)
284 };
285 
286 /**
287  * A string that summarizes the progress of an FM-index-assistet best-first
288  * search.  Useful for trying to figure out what the aligner is spending its
289  * time doing for a given read.
290  */
291 struct FmString {
292 
293 	/**
294 	 * Add one or more FM index ops to the op string
295 	 */
addFmString296 	void add(bool alignment, TAlScore pen, size_t nops) {
297 		if(ops.empty() || ops.back().pen != pen) {
298 			ops.push_back(FmStringOp());
299 			ops.back().alignment = alignment;
300 			ops.back().pen = pen;
301 			ops.back().n = 0;
302 		}
303 		ops.back().n++;
304 	}
305 
306 	/**
307 	 * Reset FmString to uninitialized state.
308 	 */
resetFmString309 	void reset() {
310 		pen = std::numeric_limits<TAlScore>::max();
311 		ops.clear();
312 	}
313 
314 	/**
315 	 * Print a :Z optional field where certain characters (whitespace, colon
316 	 * and percent) are escaped using % escapes.
317 	 */
printFmString318 	void print(BTString& o, char *buf) const {
319 		for(size_t i = 0; i < ops.size(); i++) {
320 			if(i > 0) {
321 				o.append(';');
322 			}
323 			if(ops[i].alignment) {
324 				o.append("A,");
325 				itoa10(ops[i].pen, buf);
326 				o.append(buf);
327 			} else {
328 				o.append("F,");
329 				itoa10(ops[i].pen, buf); o.append(buf);
330 				o.append(',');
331 				itoa10(ops[i].n, buf); o.append(buf);
332 			}
333 		}
334 	}
335 
336 	TAlScore pen;          // current penalty
337 	EList<FmStringOp> ops; // op string
338 };
339 
340 /**
341  * Key per-read metrics.  These are used for thresholds, allowing us to bail
342  * for unproductive reads.  They also the basis of what's printed when the user
343  * specifies --read-times.
344  */
345 struct PerReadMetrics {
346 
PerReadMetricsPerReadMetrics347 	PerReadMetrics() { reset(); }
348 
resetPerReadMetrics349 	void reset() {
350 		nExIters =
351 		nExDps   = nExDpSuccs   = nExDpFails   =
352 		nMateDps = nMateDpSuccs = nMateDpFails =
353 		nExUgs   = nExUgSuccs   = nExUgFails   =
354 		nMateUgs = nMateUgSuccs = nMateUgFails =
355 		nExEes   = nExEeSuccs   = nExEeFails   =
356 		nRedundants =
357 		nEeFmops = nSdFmops = nExFmops =
358 		nDpFail = nDpFailStreak = nDpLastSucc =
359 		nUgFail = nUgFailStreak = nUgLastSucc =
360 		nEeFail = nEeFailStreak = nEeLastSucc =
361 		nFilt = 0;
362 		nFtabs = 0;
363 		nRedSkip = 0;
364 		nRedFail = 0;
365 		nRedIns = 0;
366 		doFmString = false;
367 		nSeedRanges = nSeedElts = 0;
368 		nSeedRangesFw = nSeedEltsFw = 0;
369 		nSeedRangesRc = nSeedEltsRc = 0;
370 		seedMedian = seedMean = 0;
371 		bestLtMinscMate1 =
372 		bestLtMinscMate2 = std::numeric_limits<TAlScore>::min();
373 		seedPctUnique = seedPctRep = seedsPerNuc = seedHitAvg = 0.0f;
374 		fmString.reset();
375 	}
376 
377 	struct timeval  tv_beg; // timer start to measure how long alignment takes
378 	struct timezone tz_beg; // timer start to measure how long alignment takes
379 
380 	uint64_t nExIters;      // iterations of seed hit extend loop
381 
382 	uint64_t nExDps;        // # extend DPs run on this read
383 	uint64_t nExDpSuccs;    // # extend DPs run on this read
384 	uint64_t nExDpFails;    // # extend DPs run on this read
385 
386 	uint64_t nExUgs;        // # extend ungapped alignments run on this read
387 	uint64_t nExUgSuccs;    // # extend ungapped alignments run on this read
388 	uint64_t nExUgFails;    // # extend ungapped alignments run on this read
389 
390 	uint64_t nExEes;        // # extend ungapped alignments run on this read
391 	uint64_t nExEeSuccs;    // # extend ungapped alignments run on this read
392 	uint64_t nExEeFails;    // # extend ungapped alignments run on this read
393 
394 	uint64_t nMateDps;      // # mate DPs run on this read
395 	uint64_t nMateDpSuccs;  // # mate DPs run on this read
396 	uint64_t nMateDpFails;  // # mate DPs run on this read
397 
398 	uint64_t nMateUgs;      // # mate ungapped alignments run on this read
399 	uint64_t nMateUgSuccs;  // # mate ungapped alignments run on this read
400 	uint64_t nMateUgFails;  // # mate ungapped alignments run on this read
401 
402 	uint64_t nRedundants;   // # redundant seed hits
403 
404 	uint64_t nSeedRanges;   // # BW ranges found for seeds
405 	uint64_t nSeedElts;     // # BW elements found for seeds
406 
407 	uint64_t nSeedRangesFw; // # BW ranges found for seeds from fw read
408 	uint64_t nSeedEltsFw;   // # BW elements found for seeds from fw read
409 
410 	uint64_t nSeedRangesRc; // # BW ranges found for seeds from fw read
411 	uint64_t nSeedEltsRc;   // # BW elements found for seeds from fw read
412 
413 	uint64_t seedMedian;    // median seed hit count
414 	uint64_t seedMean;      // rounded mean seed hit count
415 
416 	uint64_t nEeFmops;      // FM Index ops for end-to-end alignment
417 	uint64_t nSdFmops;      // FM Index ops used to align seeds
418 	uint64_t nExFmops;      // FM Index ops used to resolve offsets
419 
420 	uint64_t nFtabs;        // # ftab lookups
421 	uint64_t nRedSkip;      // # times redundant path was detected and aborted
422 	uint64_t nRedFail;      // # times a path was deemed non-redundant
423 	uint64_t nRedIns;       // # times a path was added to redundancy list
424 
425 	uint64_t nDpFail;       // number of dp failures in a row up until now
426 	uint64_t nDpFailStreak; // longest streak of dp failures
427 	uint64_t nDpLastSucc;   // index of last dp attempt that succeeded
428 
429 	uint64_t nUgFail;       // number of ungap failures in a row up until now
430 	uint64_t nUgFailStreak; // longest streak of ungap failures
431 	uint64_t nUgLastSucc;   // index of last ungap attempt that succeeded
432 
433 	uint64_t nEeFail;       // number of ungap failures in a row up until now
434 	uint64_t nEeFailStreak; // longest streak of ungap failures
435 	uint64_t nEeLastSucc;   // index of last ungap attempt that succeeded
436 
437 	uint64_t nFilt;         // # mates filtered
438 
439 	TAlScore bestLtMinscMate1; // best invalid score observed for mate 1
440 	TAlScore bestLtMinscMate2; // best invalid score observed for mate 2
441 
442 	float seedPctUnique;      // % of read covered by unique seed hits
443 	float seedPctUniqueMS[4]; // % of read covered by unique seed hits by mate and strand
444 	float seedPctRep;         // % of read covered by repetitive seed hits
445 	float seedPctRepMS[4];    // % of read covered by repetitive seed hits by mate and strand
446 	float seedHitAvg;         // avg # seed hits per hitting seed
447 	float seedHitAvgMS[4];    // avg # seed hits per hitting seed by mate and strand
448 	float seedsPerNuc;        // # seeds tried / # alignable nucleotides
449 	float seedsPerNucMS[4];   // # seeds tried / # alignable nucleotides by mate and strand
450 
451 	// For collecting information to go into an FM string
452 	bool doFmString;
453 	FmString fmString;
454 };
455 
456 #endif /*READ_H_*/
457