1 /*
2  * Copyright 2013, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <getopt.h>
21 #include "assert_helpers.h"
22 #include "ds.h"
23 #include "simple_func.h"
24 #include "aligner_seed_policy.h"
25 #include "scoring.h"
26 #include "opts.h"
27 #include "aligner_sw.h"
28 
29 using namespace std;
30 
31 int gVerbose;               // be talkative
32 int gQuiet;                 // print nothing but the alignments
33 static int sanityCheck;     // enable expensive sanity checks
34 static int seed;            // srandom() seed
35 static bool showVersion;    // just print version and quit?
36 static uint64_t qUpto;      // max # of queries to read
37 static int nthreads;        // number of pthreads operating concurrently
38 static uint32_t skipReads;  // # reads/read pairs to skip
39 int gGapBarrier;            // # diags on top/bot only to be entered diagonally
40 static int bonusMatchType;  // how to reward matches
41 static int bonusMatch;      // constant reward if bonusMatchType=constant
42 static int penMmcType;      // how to penalize mismatches
43 static int penMmcMax;       // max mm penalty
44 static int penMmcMin;       // min mm penalty
45 static int penNType;        // how to penalize Ns in the read
46 static int penN;            // constant if N pelanty is a constant
47 static bool penNCatPair;    // concatenate mates before N filtering?
48 static bool localAlign;     // do local alignment in DP steps
49 static int   penRdGapConst;   // constant cost of extending a gap in the read
50 static int   penRfGapConst;   // constant cost of extending a gap in the reference
51 static int   penRdGapLinear;  // coeff of linear term for cost of gap extension in read
52 static int   penRfGapLinear;  // coeff of linear term for cost of gap extension in ref
53 static SimpleFunc scoreMin;   // minimum valid score as function of read len
54 static SimpleFunc nCeil;      // max # Ns allowed as function of read len
55 static SimpleFunc msIval;     // interval between seeds as function of read len
56 static bool enable8;          // use 8-bit SSE where possible?
57 static size_t cminlen;        // longer reads use checkpointing
58 static size_t cpow2;          // checkpoint interval log2
59 static bool doTri;            // do triangular mini-fills?
60 static bool ignoreQuals;      // all mms incur same penalty, regardless of qual
61 static EList<string> queries; // list of query files
62 static string outfile;        // write output to this file
63 
resetOptions()64 static void resetOptions() {
65 	gVerbose                = 0;
66 	gQuiet					= false;
67 	sanityCheck				= 0;  // enable expensive sanity checks
68 	seed					= 0; // srandom() seed
69 	showVersion				= false; // just print version and quit?
70 	qUpto					= 0xffffffffffffffff; // max # of queries to read
71 	nthreads				= 1;     // number of pthreads operating concurrently
72 	skipReads				= 0;     // # reads/read pairs to skip
73 	gGapBarrier				= 4;     // disallow gaps within this many chars of either end of alignment
74 	bonusMatchType  = DEFAULT_MATCH_BONUS_TYPE;
75 	bonusMatch      = DEFAULT_MATCH_BONUS;
76 	penMmcType      = DEFAULT_MM_PENALTY_TYPE;
77 	penMmcMax       = DEFAULT_MM_PENALTY_MAX;
78 	penMmcMin       = DEFAULT_MM_PENALTY_MIN;
79 	penNType        = DEFAULT_N_PENALTY_TYPE;
80 	penN            = DEFAULT_N_PENALTY;
81 	penNCatPair     = DEFAULT_N_CAT_PAIR; // concatenate mates before N filtering?
82 	localAlign      = false;     // do local alignment in DP steps
83 	penRdGapConst   = DEFAULT_READ_GAP_CONST;
84 	penRfGapConst   = DEFAULT_REF_GAP_CONST;
85 	penRdGapLinear  = DEFAULT_READ_GAP_LINEAR;
86 	penRfGapLinear  = DEFAULT_REF_GAP_LINEAR;
87 	scoreMin.init  (SIMPLE_FUNC_LINEAR, DEFAULT_MIN_CONST,   DEFAULT_MIN_LINEAR);
88 	nCeil.init     (SIMPLE_FUNC_LINEAR, 0.0f, std::numeric_limits<double>::max(), 2.0f, 0.1f);
89 	msIval.init    (SIMPLE_FUNC_LINEAR, 1.0f, std::numeric_limits<double>::max(), DEFAULT_IVAL_B, DEFAULT_IVAL_A);
90 	enable8            = true;  // use 8-bit SSE where possible?
91 	cminlen            = 2000;  // longer reads use checkpointing
92 	cpow2              = 4;     // checkpoint interval log2
93 	doTri              = false; // do triangular mini-fills?
94 	ignoreQuals = false;     // all mms incur same penalty, regardless of qual
95 	queries.clear();         // list of query files
96 	outfile.clear();         // write output to this file
97 }
98 
99 static const char *short_options = "u:hp:P:S:";
100 
101 static struct option long_options[] = {
102 	{(char*)"verbose",          no_argument,       0, ARG_VERBOSE},
103 	{(char*)"quiet",            no_argument,       0, ARG_QUIET},
104 	{(char*)"sanity",           no_argument,       0, ARG_SANITY},
105 	{(char*)"qupto",            required_argument, 0, 'u'},
106 	{(char*)"upto",             required_argument, 0, 'u'},
107 	{(char*)"version",          no_argument,       0, ARG_VERSION},
108 	{(char*)"help",             no_argument,       0, 'h'},
109 	{(char*)"threads",          required_argument, 0, 'p'},
110 	{(char*)"usage",            no_argument,       0, ARG_USAGE},
111 	{(char*)"gbar",             required_argument, 0, ARG_GAP_BAR},
112 	{(char*)"policy",           required_argument, 0, ARG_ALIGN_POLICY},
113 	{(char*)"454",              no_argument,       0, ARG_NOISY_HPOLY},
114 	{(char*)"ion-torrent",      no_argument,       0, ARG_NOISY_HPOLY},
115 	{(char*)"local",            no_argument,       0, ARG_LOCAL},
116 	{(char*)"end-to-end",       no_argument,       0, ARG_END_TO_END},
117 	{(char*)"sse8",             no_argument,       0, ARG_SSE8},
118 	{(char*)"no-sse8",          no_argument,       0, ARG_SSE8_NO},
119 	{(char*)"ma",               required_argument, 0, ARG_SCORE_MA},
120 	{(char*)"mp",               required_argument, 0, ARG_SCORE_MMP},
121 	{(char*)"np",               required_argument, 0, ARG_SCORE_NP},
122 	{(char*)"rdg",              required_argument, 0, ARG_SCORE_RDG},
123 	{(char*)"rfg",              required_argument, 0, ARG_SCORE_RFG},
124 	{(char*)"score-min",        required_argument, 0, ARG_SCORE_MIN},
125 	{(char*)"min-score",        required_argument, 0, ARG_SCORE_MIN},
126 	{(char*)"n-ceil",           required_argument, 0, ARG_N_CEIL},
127 	{(char*)"ignore-quals",     no_argument,       0, ARG_IGNORE_QUALS},
128 	{(char*)"output",           required_argument, 0, 'S'},
129 	{(char*)"cp-min",           required_argument, 0, ARG_CP_MIN},
130 	{(char*)"cp-ival",          required_argument, 0, ARG_CP_IVAL},
131 	{(char*)"tri",              no_argument,       0, ARG_TRI},
132 	{(char*)0, 0, 0, 0} // terminator
133 };
134 
135 /**
136  * Print a summary usage message to the provided output stream.
137  */
printUsage(ostream & out)138 static void printUsage(ostream& out) {
139 	out << "Bowtie 2 dynamic programming engine, by Ben Langmead (langmea@cs.jhu.edu, www.cs.jhu.edu/~langmea)" << endl;
140 	string tool_name = "bowtie2-dp";
141 	out << "Usage: " << endl
142 	    << "  " << tool_name.c_str() << " [options]* <in> <out>" << endl
143 	    << endl
144 	    <<     "  <in>           File with DP input problems (default: stdin)" << endl
145 	    <<     "  <out>          File with DP output solutions (default: stdout)" << endl
146 		<< endl
147 	    << "Options (defaults in parentheses):" << endl
148 		<< endl
149 	    << " Input:" << endl
150 	    << "  -s/--skip <int>    skip the first <int> problems in the input (none)" << endl
151 	    << "  -u/--upto <int>    stop after first <int> problems (no limit)" << endl
152 		<< endl
153 	    << " Alignment:" << endl
154 		<< "  --n-ceil <func>    func for max # non-A/C/G/Ts permitted in aln (L,0,0.15)" << endl
155 		<< "  --gbar <int>       disallow gaps within <int> nucs of read extremes (4)" << endl
156 		<< "  --ignore-quals     treat all quality values as 30 on Phred scale (off)" << endl
157 		<< endl
158 		<< "  --end-to-end       entire read must align; no clipping (on)" << endl
159 		<< "   OR" << endl
160 		<< "  --local            local alignment; ends might be soft clipped (off)" << endl
161 		<< endl
162 	    << " Scoring:" << endl
163 		<< "  --ma <int>         match bonus (0 for --end-to-end, 2 for --local) " << endl
164 		<< "  --mp <int>         max penalty for mismatch; lower qual = lower penalty (6)" << endl
165 		<< "  --np <int>         penalty for non-A/C/G/Ts in read/ref (1)" << endl
166 		<< "  --rdg <int>,<int>  read gap open, extend penalties (5,3)" << endl
167 		<< "  --rfg <int>,<int>  reference gap open, extend penalties (5,3)" << endl
168 		<< "  --score-min <func> min acceptable alignment score w/r/t read length" << endl
169 		<< "                     (G,20,8 for local, L,-0.6,-0.6 for end-to-end)" << endl
170 	    << "  --quiet            print nothing to stderr except serious errors" << endl
171 		<< endl
172 	    << " Performance:" << endl
173 	    << "  -p/--threads <int> number of alignment threads to launch (1)" << endl
174 		<< endl
175 	    << " Other:" << endl
176 	    << "  --version          print version information and quit" << endl
177 	    << "  -h/--help          print this usage message" << endl
178 	    ;
179 }
180 
181 /**
182  * Parse an int out of optarg and enforce that it be at least 'lower';
183  * if it is less than 'lower', than output the given error message and
184  * exit with an error and a usage message.
185  */
parseInt(int lower,int upper,const char * errmsg,const char * arg)186 static int parseInt(int lower, int upper, const char *errmsg, const char *arg) {
187 	long l;
188 	char *endPtr= NULL;
189 	l = strtol(arg, &endPtr, 10);
190 	if (endPtr != NULL) {
191 		if (l < lower || l > upper) {
192 			cerr << errmsg << endl;
193 			printUsage(cerr);
194 			throw 1;
195 		}
196 		return (int32_t)l;
197 	}
198 	cerr << errmsg << endl;
199 	printUsage(cerr);
200 	throw 1;
201 	return -1;
202 }
203 
204 /**
205  * Upper is maximum int by default.
206  */
parseInt(int lower,const char * errmsg,const char * arg)207 static int parseInt(int lower, const char *errmsg, const char *arg) {
208 	return parseInt(lower, std::numeric_limits<int>::max(), errmsg, arg);
209 }
210 
211 /**
212  * Parse a T string 'str'.
213  */
214 template<typename T>
parse(const char * s)215 T parse(const char *s) {
216 	T tmp;
217 	stringstream ss(s);
218 	ss >> tmp;
219 	return tmp;
220 }
221 
parseFuncType(const std::string & otype)222 static int parseFuncType(const std::string& otype) {
223 	string type = otype;
224 	if(type == "C" || type == "Constant") {
225 		return SIMPLE_FUNC_CONST;
226 	} else if(type == "L" || type == "Linear") {
227 		return SIMPLE_FUNC_LINEAR;
228 	} else if(type == "S" || type == "Sqrt") {
229 		return SIMPLE_FUNC_SQRT;
230 	} else if(type == "G" || type == "Log") {
231 		return SIMPLE_FUNC_LOG;
232 	}
233 	std::cerr << "Error: Bad function type '" << otype.c_str()
234 	          << "'.  Should be C (constant), L (linear), "
235 	          << "S (square root) or G (natural log)." << std::endl;
236 	throw 1;
237 }
238 
239 #define PARSE_FUNC(fv) { \
240 	if(args.size() >= 1) { \
241 		fv.setType(parseFuncType(args[0])); \
242 	} \
243 	if(args.size() >= 2) { \
244 		double co; \
245 		istringstream tmpss(args[1]); \
246 		tmpss >> co; \
247 		fv.setConst(co); \
248 	} \
249 	if(args.size() >= 3) { \
250 		double ce; \
251 		istringstream tmpss(args[2]); \
252 		tmpss >> ce; \
253 		fv.setCoeff(ce); \
254 	} \
255 	if(args.size() >= 4) { \
256 		double mn; \
257 		istringstream tmpss(args[3]); \
258 		tmpss >> mn; \
259 		fv.setMin(mn); \
260 	} \
261 	if(args.size() >= 5) { \
262 		double mx; \
263 		istringstream tmpss(args[4]); \
264 		tmpss >> mx; \
265 		fv.setMin(mx); \
266 	} \
267 }
268 
269 /**
270  * TODO: Argument parsing is very, very flawed.  The biggest problem is that
271  * there are two separate worlds of arguments, the ones set via polstr, and
272  * the ones set directly in variables.  This makes for nasty interactions,
273  * e.g., with the -M option being resolved at an awkward time relative to
274  * the -k and -a options.
275  */
parseOption(int next_option,const char * arg)276 static void parseOption(int next_option, const char *arg) {
277 	switch (next_option) {
278 		case 's':
279 			skipReads = (uint32_t)parseInt(0, "-s arg must be positive", arg);
280 			break;
281 		case ARG_GAP_BAR:
282 			gGapBarrier = parseInt(1, "--gbar must be no less than 1", arg);
283 			break;
284 		case 'u':
285 			qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1", arg);
286 			break;
287 		case 'p':
288 			nthreads = parseInt(1, "-p/--threads arg must be at least 1", arg);
289 			break;
290 		case 'h': printUsage(cout); throw 0; break;
291 		case ARG_USAGE: printUsage(cout); throw 0; break;
292 		case ARG_VERBOSE: gVerbose = 1; break;
293 		case ARG_QUIET: gQuiet = true; break;
294 		case ARG_SANITY: sanityCheck = true; break;
295 		case ARG_CP_MIN:
296 			cminlen = parse<size_t>(arg);
297 			break;
298 		case ARG_CP_IVAL:
299 			cpow2 = parse<size_t>(arg);
300 			break;
301 		case ARG_TRI:
302 			doTri = true;
303 			break;
304 		case ARG_LOCAL: localAlign = true; break;
305 		case ARG_END_TO_END: localAlign = false; break;
306 		case ARG_SSE8: enable8 = true; break;
307 		case ARG_SSE8_NO: enable8 = false; break;
308 		case ARG_IGNORE_QUALS: ignoreQuals = true; break;
309 		case ARG_N_CEIL: {
310 			// Split argument by comma
311 			EList<string> args;
312 			tokenize(arg, ",", args);
313 			if(args.size() > 3) {
314 				cerr << "Error: expected 3 or fewer comma-separated "
315 					 << "arguments to --n-ceil option, got "
316 					 << args.size() << endl;
317 				throw 1;
318 			}
319 			if(args.size() == 0) {
320 				cerr << "Error: expected at least one argument to --n-ceil option" << endl;
321 				throw 1;
322 			}
323 			PARSE_FUNC(nCeil);
324 			break;
325 		}
326 		case ARG_SCORE_MA: {
327 			// Split argument by comma
328 			EList<string> args;
329 			tokenize(arg, ",", args);
330 			if(args.size() != 1) {
331 				cerr << "Error parsing --ma; RHS must have 1 token" << endl;
332 				assert(false); throw 1;
333 			}
334 			string tmp = args[0];
335 			istringstream tmpss(tmp);
336 			tmpss >> bonusMatch;
337 			break;
338 		}
339 		case ARG_SCORE_MMP: {
340 			// Split argument by comma
341 			EList<string> args;
342 			tokenize(arg, ",", args);
343 			if(args.size() > 3) {
344 				cerr << "Error parsing --mmp "
345 				     << "; RHS must have at most 3 tokens" << endl;
346 				assert(false); throw 1;
347 			}
348 			if(args[0][0] == 'C') {
349 				string tmp = args[0].substr(1);
350 				// Parse constant penalty
351 				istringstream tmpss(tmp);
352 				tmpss >> penMmcMax;
353 				penMmcMin = penMmcMax;
354 				// Parse constant penalty
355 				penMmcType = COST_MODEL_CONSTANT;
356 			} else if(args[0][0] == 'Q') {
357 				if(args.size() >= 2) {
358 					string tmp = args[1];
359 					istringstream tmpss(tmp);
360 					tmpss >> penMmcMax;
361 				} else {
362 					penMmcMax = DEFAULT_MM_PENALTY_MAX;
363 				}
364 				if(args.size() >= 3) {
365 					string tmp = args[2];
366 					istringstream tmpss(tmp);
367 					tmpss >> penMmcMin;
368 				} else {
369 					penMmcMin = DEFAULT_MM_PENALTY_MIN;
370 				}
371 				if(penMmcMin > penMmcMax) {
372 					cerr << "Error: Maximum mismatch penalty (" << penMmcMax
373 					     << ") is less than minimum penalty (" << penMmcMin
374 						 << endl;
375 					throw 1;
376 				}
377 				// Set type to =quality
378 				penMmcType = COST_MODEL_QUAL;
379 			} else if(args[0][0] == 'R') {
380 				// Set type to=Maq-quality
381 				penMmcType = COST_MODEL_ROUNDED_QUAL;
382 			} else {
383 				cerr << "Error parsing --mmp "
384 				     << "; RHS must start with C, Q or R" << endl;
385 				assert(false); throw 1;
386 			}
387 			break;
388 		}
389 		case ARG_SCORE_NP: {
390 			// Split argument by comma
391 			EList<string> args;
392 			tokenize(arg, ",", args);
393 			if(args.size() != 1) {
394 				cerr << "Error parsing --np "
395 				     << "; RHS must have 1 token" << endl;
396 				assert(false); throw 1;
397 			}
398 			if(args[0][0] == 'C') {
399 				string tmp = args[0].substr(1);
400 				// Parse constant penalty
401 				istringstream tmpss(tmp);
402 				tmpss >> penN;
403 				// Parse constant penalty
404 				penNType = COST_MODEL_CONSTANT;
405 			} else if(args[0][0] == 'Q') {
406 				// Set type to =quality
407 				penNType = COST_MODEL_QUAL;
408 			} else if(args[0][0] == 'R') {
409 				// Set type to=Maq-quality
410 				penNType = COST_MODEL_ROUNDED_QUAL;
411 			} else {
412 				cerr << "Error parsing --np "
413 				     << "; RHS must start with C, Q or R" << endl;
414 				assert(false); throw 1;
415 			}
416 		}
417 		case ARG_SCORE_RDG: {
418 			EList<string> args;
419 			tokenize(arg, ",", args);
420 			if(args.size() >= 1) {
421 				istringstream tmpss(args[0]);
422 				tmpss >> penRdGapConst;
423 			} else {
424 				penRdGapConst = DEFAULT_READ_GAP_CONST;
425 			}
426 			if(args.size() >= 2) {
427 				istringstream tmpss(args[1]);
428 				tmpss >> penRdGapLinear;
429 			} else {
430 				penRdGapLinear = DEFAULT_READ_GAP_LINEAR;
431 			}
432 		}
433 		case ARG_SCORE_RFG: {
434 			EList<string> args;
435 			tokenize(arg, ",", args);
436 			if(args.size() >= 1) {
437 				istringstream tmpss(args[0]);
438 				tmpss >> penRfGapConst;
439 			} else {
440 				penRfGapConst = DEFAULT_REF_GAP_CONST;
441 			}
442 			if(args.size() >= 2) {
443 				istringstream tmpss(args[1]);
444 				tmpss >> penRfGapLinear;
445 			} else {
446 				penRfGapLinear = DEFAULT_REF_GAP_LINEAR;
447 			}
448 		}
449 		case ARG_SCORE_MIN: {
450 			EList<string> args;
451 			tokenize(arg, ",", args);
452 			if(args.size() > 3 && args.size() == 0) {
453 				cerr << "Error: expected 3 or fewer comma-separated "
454 					 << "arguments to --n-ceil option, got "
455 					 << args.size() << endl;
456 				throw 1;
457 			}
458 			PARSE_FUNC(scoreMin);
459 			break;
460 		}
461 		case 'S': outfile = arg; break;
462 		case 'U': {
463 			EList<string> args;
464 			tokenize(arg, ",", args);
465 			for(size_t i = 0; i < args.size(); i++) {
466 				queries.push_back(args[i]);
467 			}
468 			break;
469 		}
470 		case ARG_VERSION: showVersion = 1; break;
471 		default:
472 			printUsage(cerr);
473 			throw 1;
474 	}
475 }
476 
477 /**
478  * Read command-line arguments
479  */
parseOptions(int argc,const char ** argv)480 static void parseOptions(int argc, const char **argv) {
481 	int option_index = 0;
482 	int next_option;
483 	while(true) {
484 		next_option = getopt_long(
485 			argc, const_cast<char**>(argv),
486 			short_options, long_options, &option_index);
487 		const char * arg = optarg;
488 		if(next_option == EOF) {
489 			break;
490 		}
491 		parseOption(next_option, arg);
492 	}
493 	// If both -s and -u are used, we need to adjust qUpto accordingly
494 	// since it uses rdid to know if we've reached the -u limit (and
495 	// rdids are all shifted up by skipReads characters)
496 	if(qUpto + skipReads > qUpto) {
497 		qUpto += skipReads;
498 	}
499 	if(gGapBarrier < 1) {
500 		cerr << "Warning: --gbar was set less than 1 (=" << gGapBarrier
501 		     << "); setting to 1 instead" << endl;
502 		gGapBarrier = 1;
503 	}
504 #ifndef NDEBUG
505 	if(!gQuiet) {
506 		cerr << "Warning: Running in debug mode.  Please use debug mode only "
507 			 << "for diagnosing errors, and not for typical use of Bowtie 2."
508 			 << endl;
509 	}
510 #endif
511 }
512 
513 struct DpProblem {
resetDpProblem514 	void reset() {
515 		ref.clear();
516 	}
517 
518 	TRefId   refidx;
519 	TRefOff  reflen;
520 	TAlScore minsc;
521 	BTString ref;
522 	bool     fw;
523 	DPRect   rect;
524 	bool     aligned;
525 	TAlScore score;
526 };
527 
528 class DpLogReader {
529 
530 public:
531 
DpLogReader()532 	DpLogReader() { }
533 
~DpLogReader()534 	~DpLogReader() { reset(); }
535 
init(const string & fn)536 	void init(const string& fn) {
537 		reset();
538 		fn_ = fn;
539 		ih_.open(fn_.c_str());
540 		ih_.sync_with_stdio(false);
541 	}
542 
reset()543 	void reset() {
544 		if(ih_.is_open()) {
545 			ih_.close();
546 		}
547 	}
548 
nextRead(BTDnaString & seq,BTString & qual,EList<DpProblem> & refs)549 	bool nextRead(
550 		BTDnaString& seq,
551 		BTString& qual,
552 		EList<DpProblem>& refs)
553 	{
554 		if(done()) {
555 			return false;
556 		}
557 		ln_.clear();
558 		getline(ih_, ln_);
559 		while(ln_.empty() && ih_.good()) {
560 			getline(ih_, ln_);
561 		}
562 		if(ln_.empty() && !ih_.good()) {
563 			return false;
564 		}
565 		EList<string> buf;
566 		tokenize(ln_, '\t', buf);
567 		assert_gt(buf.size(), 2);
568 		seq.install(buf[0].c_str(), true);
569 		qual = buf[1];
570 		for(size_t i = 2; i < buf.size(); i++) {
571 			refs.expand();
572 			istringstream is(buf[i]);
573 			char comma, tmp;
574 			// ref idx
575 			is >> refs.back().refidx;
576 			is >> comma; assert_eq(',', comma);
577 			// ref length
578 			is >> refs.back().reflen;
579 			is >> comma; assert_eq(',', comma);
580 			// minimum score
581 			is >> refs.back().minsc;
582 			is >> comma; assert_eq(',', comma);
583 			// read orientation
584 			is >> tmp;
585 			assert(tmp == '-' || tmp == '+');
586 			refs.back().fw = (tmp == '+');
587 			is >> comma; assert_eq(',', comma);
588 			// DP rectangle
589 			is >> refs.back().rect.refl;
590 			is >> comma; assert_eq(',', comma);
591 			is >> refs.back().rect.refr;
592 			is >> comma; assert_eq(',', comma);
593 			is >> refs.back().rect.refl_pretrim;
594 			is >> comma; assert_eq(',', comma);
595 			is >> refs.back().rect.refr_pretrim;
596 			is >> comma; assert_eq(',', comma);
597 			is >> refs.back().rect.triml;
598 			is >> comma; assert_eq(',', comma);
599 			is >> refs.back().rect.trimr;
600 			is >> comma; assert_eq(',', comma);
601 			is >> refs.back().rect.corel;
602 			is >> comma; assert_eq(',', comma);
603 			is >> refs.back().rect.corer;
604 			is >> comma; assert_eq(',', comma);
605 			is >> refs.back().rect.maxgap;
606 			is >> comma; assert_eq(',', comma);
607 			// reference string
608 			string ref;
609 			while(true) {
610 				char c;
611 				is >> c;
612 				if(c == ',') break;
613 				ref.push_back(c);
614 			}
615 			refs.back().ref.install(ref.c_str());
616 			for(size_t i = 0; i < ref.length(); i++) {
617 				int m = asc2dnamask[(int)refs.back().ref[i]];
618 				if(m == 15) {
619 					m = 16; // N
620 				}
621 				refs.back().ref.set(m, i);
622 			}
623 			// whether the DP alignment was successful
624 			int aligned;
625 			is >> aligned;
626 			refs.back().aligned = (aligned == 1);
627 			// alignment score
628 			is >> comma; assert_eq(',', comma);
629 			is >> refs.back().score;
630 		}
631 		return true;
632 	}
633 
done() const634 	bool done() const {
635 		return !ih_.good();
636 	}
637 
638 protected:
639 
640 	string   fn_; // file name
641 	ifstream ih_; // file handle
642 	string   ln_; // line buffer
643 };
644 
main(int argc,const char ** argv)645 int main(int argc, const char **argv) {
646 	try {
647 		// Reset all global state, including getopt state
648 		opterr = optind = 1;
649 		resetOptions();
650 		parseOptions(argc, argv);
651 		if(showVersion) {
652 			if(sizeof(void*) == 4) {
653 				cout << "32-bit" << endl;
654 			} else if(sizeof(void*) == 8) {
655 				cout << "64-bit" << endl;
656 			} else {
657 				cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
658 			}
659 			cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
660 				 << sizeof(int)
661 				 << ", " << sizeof(long) << ", " << sizeof(long long)
662 				 << ", " << sizeof(void *) << ", " << sizeof(size_t)
663 				 << ", " << sizeof(off_t) << "}" << endl;
664 			return 0;
665 		}
666 		while(optind < argc) {
667 			queries.push_back(argv[optind++]);
668 		}
669 		{
670 			// Optionally summarize
671 			if(gVerbose) {
672 				cout << "DP inputs:" << endl;
673 				for(size_t i = 0; i < queries.size(); i++) {
674 					cout << "  " << queries[i].c_str() << endl;
675 				}
676 				cout << "Output file: \"" << outfile.c_str() << "\"" << endl;
677 				cout << "Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
678 			#ifdef NDEBUG
679 				cout << "Assertions: disabled" << endl;
680 			#else
681 				cout << "Assertions: enabled" << endl;
682 			#endif
683 			}
684 		}
685 		// Do stuff
686 		SwAligner sw(NULL);
687 		DpLogReader logrd;
688 		Scoring sc(
689 			bonusMatch,     // constant reward for match
690 			penMmcType,     // how to penalize mismatches
691 			penMmcMax,      // max mm pelanty
692 			penMmcMin,      // min mm pelanty
693 			scoreMin,       // min score as function of read len
694 			nCeil,          // max # Ns as function of read len
695 			penNType,       // how to penalize Ns in the read
696 			penN,           // constant if N pelanty is a constant
697 			penNCatPair,    // whether to concat mates before N filtering
698 			penRdGapConst,  // constant coeff for read gap cost
699 			penRfGapConst,  // constant coeff for ref gap cost
700 			penRdGapLinear, // linear coeff for read gap cost
701 			penRfGapLinear, // linear coeff for ref gap cost
702 			gGapBarrier);   // # rows at top/bot only entered diagonally
703 		RandomSource rnd(seed);
704 		{
705 			Timer tim(std::cerr, "Alignment ", true);
706 			BTDnaString seq, seqrc;
707 			BTString qual, qualrc;
708 			EList<DpProblem> probs;
709 			size_t qid = 0;
710 			size_t totnuc = 0, totcup = 0;
711 			for(size_t i = 0; i < queries.size(); i++) {
712 				logrd.init(queries[i]);
713 				while(logrd.nextRead(seq, qual, probs)) {
714 					totnuc += seq.length();
715 					seqrc = seq;
716 					seqrc.reverseComp();
717 					qualrc = qual;
718 					qualrc.reverse();
719 					//cerr << "Initing read with " << probs.size() << " problems" << endl;
720 					sw.initRead(seq, seqrc, qual, qualrc, 0, seq.length(), sc);
721 					// Calculate minimum score
722 					bool extend = true;
723 					for(size_t j = 0; j < probs.size(); j++) {
724 						sw.initRef(
725 							probs[j].fw,
726 							probs[j].refidx,
727 							probs[j].rect,
728 							const_cast<char *>(probs[j].ref.toZBuf()),
729 							0,
730 							probs[j].ref.length(),
731 							probs[j].reflen,
732 							sc,
733 							probs[j].minsc,
734 							enable8,
735 							cminlen,
736 							cpow2,
737 							doTri,
738 							extend);
739 						// Now fill the dynamic programming matrix and return true iff
740 						// there is at least one valid alignment
741 						TAlScore bestCell = std::numeric_limits<TAlScore>::min();
742 						ASSERT_ONLY(bool aligned =) sw.align(bestCell);
743 						assert(aligned == probs[j].aligned);
744 						assert(!aligned || bestCell == probs[j].score);
745 						totcup += (seq.length() * probs[j].ref.length());
746 					}
747 					seq.clear();  seqrc.clear();
748 					qual.clear(); qualrc.clear();
749 					probs.clear();
750 					qid++;
751 				}
752 			}
753 			size_t el = (size_t)tim.elapsed();
754 			double cups = 0.0;
755 			double totnucps = 0.0;
756 			double readps = 0.0;
757 			if(el > 0) {
758 				cups = totcup / (double)el;
759 				totnucps = totnuc / (double)el;
760 				readps = qid / (double)el;
761 			}
762 			cerr << qid << " reads" << endl;
763 			cerr << std::setprecision(4) << "  " << readps << " reads per second" << endl;
764 			cerr << totnuc << " nucleotides" << endl;
765 			cerr << std::setprecision(4) << "  " << totnucps << " nucleotides per second" << endl;
766 			cerr << totcup << " cell updates" << endl;
767 			cerr << std::setprecision(4) << "  " << cups << " cell updates per second (CUPS)" << endl;
768 		}
769 		return 0;
770 	} catch(std::exception& e) {
771 		cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
772 		cerr << "Command: ";
773 		for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
774 		cerr << endl;
775 		return 1;
776 	} catch(int e) {
777 		if(e != 0) {
778 			cerr << "Error: Encountered internal Bowtie 2 exception (#" << e << ")" << endl;
779 			cerr << "Command: ";
780 			for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
781 			cerr << endl;
782 		}
783 		return e;
784 	}
785 }
786 
787