1 /* 2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu> 3 * 4 * This file is part of Bowtie 2. 5 * 6 * Bowtie 2 is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * Bowtie 2 is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #ifndef READ_H_ 21 #define READ_H_ 22 23 #include <iostream> 24 #include <stdint.h> 25 #include <sys/time.h> 26 27 #include "ds.h" 28 #include "filebuf.h" 29 #include "hit_set.h" 30 #include "sstring.h" 31 #include "util.h" 32 33 typedef uint64_t TReadId; 34 typedef size_t TReadOff; 35 typedef int64_t TAlScore; 36 37 struct HitSet; 38 39 /** 40 * A buffer for keeping all relevant information about a single read. 41 */ 42 struct Read { 43 44 typedef SStringExpandable<char, 0, 2, 2048> TBuf; 45 ReadRead46 Read() { reset(); } 47 ReadRead48 Read(const char *nm, const char *seq, const char *ql) { init(nm, seq, ql); } 49 resetRead50 void reset() { 51 rdid = 0; 52 patid = 0; 53 trimmed5 = trimmed3 = 0; 54 readOrigBuf.clear(); 55 patFw.clear(); 56 patRc.clear(); 57 qual.clear(); 58 patFwRev.clear(); 59 patRcRev.clear(); 60 qualRev.clear(); 61 name.clear(); 62 filter = '?'; 63 seed = 0; 64 parsed = false; 65 ns_ = 0; 66 } 67 68 /** 69 * Finish initializing a new read. 70 */ finalizeRead71 void finalize() { 72 for(size_t i = 0; i < patFw.length(); i++) { 73 if((int)patFw[i] > 3) { 74 ns_++; 75 } 76 } 77 constructRevComps(); 78 constructReverses(); 79 } 80 81 /** 82 * Simple init function, used for testing. 83 */ initRead84 void init( 85 const char *nm, 86 const char *seq, 87 const char *ql) 88 { 89 reset(); 90 patFw.installChars(seq); 91 qual.install(ql); 92 for(size_t i = 0; i < patFw.length(); i++) { 93 if((int)patFw[i] > 3) { 94 ns_++; 95 } 96 } 97 constructRevComps(); 98 constructReverses(); 99 if(nm != NULL) name.install(nm); 100 } 101 102 /// Return true iff the read (pair) is empty emptyRead103 bool empty() const { 104 return patFw.empty(); 105 } 106 107 /// Return length of the read in the buffer lengthRead108 size_t length() const { 109 return patFw.length(); 110 } 111 112 /** 113 * Return the number of Ns in the read. 114 */ nsRead115 size_t ns() const { 116 return ns_; 117 } 118 119 /** 120 * Construct reverse complement of the pattern and the fuzzy 121 * alternative patters. 122 */ constructRevCompsRead123 void constructRevComps() { 124 patRc.installReverseComp(patFw); 125 } 126 127 /** 128 * Given patFw, patRc, and qual, construct the *Rev versions in 129 * place. Assumes constructRevComps() was called previously. 130 */ constructReversesRead131 void constructReverses() { 132 patFwRev.installReverse(patFw); 133 patRcRev.installReverse(patRc); 134 qualRev.installReverse(qual); 135 } 136 137 /** 138 * Append a "/1" or "/2" string onto the end of the name buf if 139 * it's not already there. 140 */ fixMateNameRead141 void fixMateName(int i) { 142 assert(i == 1 || i == 2); 143 size_t namelen = name.length(); 144 bool append = false; 145 if(namelen < 2) { 146 // Name is too short to possibly have /1 or /2 on the end 147 append = true; 148 } else { 149 if(i == 1) { 150 // append = true iff mate name does not already end in /1 151 append = 152 name[namelen-2] != '/' || 153 name[namelen-1] != '1'; 154 } else { 155 // append = true iff mate name does not already end in /2 156 append = 157 name[namelen-2] != '/' || 158 name[namelen-1] != '2'; 159 } 160 } 161 if(append) { 162 name.append('/'); 163 name.append("012"[i]); 164 } 165 } 166 167 /** 168 * Dump basic information about this read to the given ostream. 169 */ dumpRead170 void dump(std::ostream& os) const { 171 using namespace std; 172 os << name << ' '; 173 os << patFw; 174 os << ' '; 175 os << qual.toZBuf() << " "; 176 } 177 178 /** 179 * Check whether two reads are the same in the sense that they will 180 * lead to us finding the same set of alignments. 181 */ sameRead182 static bool same( 183 const BTDnaString& seq1, 184 const BTString& qual1, 185 const BTDnaString& seq2, 186 const BTString& qual2, 187 bool qualitiesMatter) 188 { 189 if(seq1.length() != seq2.length()) { 190 return false; 191 } 192 for(size_t i = 0; i < seq1.length(); i++) { 193 if(seq1[i] != seq2[i]) return false; 194 } 195 if(qualitiesMatter) { 196 if(qual1.length() != qual2.length()) { 197 return false; 198 } 199 for(size_t i = 0; i < qual1.length(); i++) { 200 if(qual1[i] != qual2[i]) return false; 201 } 202 } 203 return true; 204 } 205 206 /** 207 * Get the nucleotide and quality value at the given offset from 5' end. 208 * If 'fw' is false, get the reverse complement. 209 */ getRead210 std::pair<int, int> get(TReadOff off5p, bool fw) const { 211 assert_lt(off5p, length()); 212 int c = (int)patFw[off5p]; 213 int q = qual[off5p]; 214 assert_geq(q, 33); 215 return make_pair((!fw && c < 4) ? (c ^ 3) : c, q - 33); 216 } 217 218 /** 219 * Get the nucleotide at the given offset from 5' end. 220 * If 'fw' is false, get the reverse complement. 221 */ getcRead222 int getc(TReadOff off5p, bool fw) const { 223 assert_lt(off5p, length()); 224 int c = (int)patFw[off5p]; 225 return (!fw && c < 4) ? (c ^ 3) : c; 226 } 227 228 /** 229 * Get the quality value at the given offset from 5' end. 230 */ getqRead231 int getq(TReadOff off5p) const { 232 assert_lt(off5p, length()); 233 int q = qual[off5p]; 234 assert_geq(q, 33); 235 return q-33; 236 } 237 238 #ifndef NDEBUG 239 /** 240 * Check that read info is internally consistent. 241 */ repOkRead242 bool repOk() const { 243 if(patFw.empty()) return true; 244 assert_eq(qual.length(), patFw.length()); 245 return true; 246 } 247 #endif 248 249 BTDnaString patFw; // forward-strand sequence 250 BTDnaString patRc; // reverse-complement sequence 251 BTString qual; // quality values 252 253 BTDnaString patFwRev; 254 BTDnaString patRcRev; 255 BTString qualRev; 256 257 // For remembering the exact input text used to define a read 258 TBuf readOrigBuf; 259 260 BTString name; // read name 261 TReadId rdid; // 0-based id based on pair's offset in read file(s) 262 int mate; // 0 = single-end, 1 = mate1, 2 = mate2 263 uint32_t seed; // random seed 264 bool parsed; // true iff read has been fully parsed 265 size_t ns_; // # Ns 266 char filter; // if read format permits filter char, set it here 267 char primer; 268 char trimc; 269 uint32_t patid; 270 int trimmed5; // amount actually trimmed off 5' end 271 int trimmed3; // amount actually trimmed off 3' end 272 HitSet hitset; // holds previously-found hits; for chaining 273 }; 274 275 /** 276 * A string of FmStringOps represent a string of tasks performed by the 277 * best-first alignment search. We model the search as a series of FM ops 278 * interspersed with reported alignments. 279 */ 280 struct FmStringOp { 281 bool alignment; // true -> found an alignment 282 TAlScore pen; // penalty of the FM op or alignment 283 size_t n; // number of FM ops (only relevant for non-alignment) 284 }; 285 286 /** 287 * A string that summarizes the progress of an FM-index-assistet best-first 288 * search. Useful for trying to figure out what the aligner is spending its 289 * time doing for a given read. 290 */ 291 struct FmString { 292 293 /** 294 * Add one or more FM index ops to the op string 295 */ addFmString296 void add(bool alignment, TAlScore pen, size_t nops) { 297 if(ops.empty() || ops.back().pen != pen) { 298 ops.push_back(FmStringOp()); 299 ops.back().alignment = alignment; 300 ops.back().pen = pen; 301 ops.back().n = 0; 302 } 303 ops.back().n++; 304 } 305 306 /** 307 * Reset FmString to uninitialized state. 308 */ resetFmString309 void reset() { 310 pen = std::numeric_limits<TAlScore>::max(); 311 ops.clear(); 312 } 313 314 /** 315 * Print a :Z optional field where certain characters (whitespace, colon 316 * and percent) are escaped using % escapes. 317 */ printFmString318 void print(BTString& o, char *buf) const { 319 for(size_t i = 0; i < ops.size(); i++) { 320 if(i > 0) { 321 o.append(';'); 322 } 323 if(ops[i].alignment) { 324 o.append("A,"); 325 itoa10(ops[i].pen, buf); 326 o.append(buf); 327 } else { 328 o.append("F,"); 329 itoa10(ops[i].pen, buf); o.append(buf); 330 o.append(','); 331 itoa10(ops[i].n, buf); o.append(buf); 332 } 333 } 334 } 335 336 TAlScore pen; // current penalty 337 EList<FmStringOp> ops; // op string 338 }; 339 340 /** 341 * Key per-read metrics. These are used for thresholds, allowing us to bail 342 * for unproductive reads. They also the basis of what's printed when the user 343 * specifies --read-times. 344 */ 345 struct PerReadMetrics { 346 PerReadMetricsPerReadMetrics347 PerReadMetrics() { reset(); } 348 resetPerReadMetrics349 void reset() { 350 nExIters = 351 nExDps = nExDpSuccs = nExDpFails = 352 nMateDps = nMateDpSuccs = nMateDpFails = 353 nExUgs = nExUgSuccs = nExUgFails = 354 nMateUgs = nMateUgSuccs = nMateUgFails = 355 nExEes = nExEeSuccs = nExEeFails = 356 nRedundants = 357 nEeFmops = nSdFmops = nExFmops = 358 nDpFail = nDpFailStreak = nDpLastSucc = 359 nUgFail = nUgFailStreak = nUgLastSucc = 360 nEeFail = nEeFailStreak = nEeLastSucc = 361 nFilt = 0; 362 nFtabs = 0; 363 nRedSkip = 0; 364 nRedFail = 0; 365 nRedIns = 0; 366 doFmString = false; 367 nSeedRanges = nSeedElts = 0; 368 nSeedRangesFw = nSeedEltsFw = 0; 369 nSeedRangesRc = nSeedEltsRc = 0; 370 seedMedian = seedMean = 0; 371 bestLtMinscMate1 = 372 bestLtMinscMate2 = std::numeric_limits<TAlScore>::min(); 373 seedPctUnique = seedPctRep = seedsPerNuc = seedHitAvg = 0.0f; 374 fmString.reset(); 375 } 376 377 struct timeval tv_beg; // timer start to measure how long alignment takes 378 struct timezone tz_beg; // timer start to measure how long alignment takes 379 380 uint64_t nExIters; // iterations of seed hit extend loop 381 382 uint64_t nExDps; // # extend DPs run on this read 383 uint64_t nExDpSuccs; // # extend DPs run on this read 384 uint64_t nExDpFails; // # extend DPs run on this read 385 386 uint64_t nExUgs; // # extend ungapped alignments run on this read 387 uint64_t nExUgSuccs; // # extend ungapped alignments run on this read 388 uint64_t nExUgFails; // # extend ungapped alignments run on this read 389 390 uint64_t nExEes; // # extend ungapped alignments run on this read 391 uint64_t nExEeSuccs; // # extend ungapped alignments run on this read 392 uint64_t nExEeFails; // # extend ungapped alignments run on this read 393 394 uint64_t nMateDps; // # mate DPs run on this read 395 uint64_t nMateDpSuccs; // # mate DPs run on this read 396 uint64_t nMateDpFails; // # mate DPs run on this read 397 398 uint64_t nMateUgs; // # mate ungapped alignments run on this read 399 uint64_t nMateUgSuccs; // # mate ungapped alignments run on this read 400 uint64_t nMateUgFails; // # mate ungapped alignments run on this read 401 402 uint64_t nRedundants; // # redundant seed hits 403 404 uint64_t nSeedRanges; // # BW ranges found for seeds 405 uint64_t nSeedElts; // # BW elements found for seeds 406 407 uint64_t nSeedRangesFw; // # BW ranges found for seeds from fw read 408 uint64_t nSeedEltsFw; // # BW elements found for seeds from fw read 409 410 uint64_t nSeedRangesRc; // # BW ranges found for seeds from fw read 411 uint64_t nSeedEltsRc; // # BW elements found for seeds from fw read 412 413 uint64_t seedMedian; // median seed hit count 414 uint64_t seedMean; // rounded mean seed hit count 415 416 uint64_t nEeFmops; // FM Index ops for end-to-end alignment 417 uint64_t nSdFmops; // FM Index ops used to align seeds 418 uint64_t nExFmops; // FM Index ops used to resolve offsets 419 420 uint64_t nFtabs; // # ftab lookups 421 uint64_t nRedSkip; // # times redundant path was detected and aborted 422 uint64_t nRedFail; // # times a path was deemed non-redundant 423 uint64_t nRedIns; // # times a path was added to redundancy list 424 425 uint64_t nDpFail; // number of dp failures in a row up until now 426 uint64_t nDpFailStreak; // longest streak of dp failures 427 uint64_t nDpLastSucc; // index of last dp attempt that succeeded 428 429 uint64_t nUgFail; // number of ungap failures in a row up until now 430 uint64_t nUgFailStreak; // longest streak of ungap failures 431 uint64_t nUgLastSucc; // index of last ungap attempt that succeeded 432 433 uint64_t nEeFail; // number of ungap failures in a row up until now 434 uint64_t nEeFailStreak; // longest streak of ungap failures 435 uint64_t nEeLastSucc; // index of last ungap attempt that succeeded 436 437 uint64_t nFilt; // # mates filtered 438 439 TAlScore bestLtMinscMate1; // best invalid score observed for mate 1 440 TAlScore bestLtMinscMate2; // best invalid score observed for mate 2 441 442 float seedPctUnique; // % of read covered by unique seed hits 443 float seedPctUniqueMS[4]; // % of read covered by unique seed hits by mate and strand 444 float seedPctRep; // % of read covered by repetitive seed hits 445 float seedPctRepMS[4]; // % of read covered by repetitive seed hits by mate and strand 446 float seedHitAvg; // avg # seed hits per hitting seed 447 float seedHitAvgMS[4]; // avg # seed hits per hitting seed by mate and strand 448 float seedsPerNuc; // # seeds tried / # alignable nucleotides 449 float seedsPerNucMS[4]; // # seeds tried / # alignable nucleotides by mate and strand 450 451 // For collecting information to go into an FM string 452 bool doFmString; 453 FmString fmString; 454 }; 455 456 #endif /*READ_H_*/ 457