1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <string>
21 #include <iostream>
22 #include <sstream>
23 #include <limits>
24 #include "ds.h"
25 #include "aligner_seed_policy.h"
26 #include "mem_ids.h"
27 
28 using namespace std;
29 
parseFuncType(const std::string & otype)30 static int parseFuncType(const std::string& otype) {
31 	string type = otype;
32 	if(type == "C" || type == "Constant") {
33 		return SIMPLE_FUNC_CONST;
34 	} else if(type == "L" || type == "Linear") {
35 		return SIMPLE_FUNC_LINEAR;
36 	} else if(type == "S" || type == "Sqrt") {
37 		return SIMPLE_FUNC_SQRT;
38 	} else if(type == "G" || type == "Log") {
39 		return SIMPLE_FUNC_LOG;
40 	}
41 	std::cerr << "Error: Bad function type '" << otype.c_str()
42 	          << "'.  Should be C (constant), L (linear), "
43 	          << "S (square root) or G (natural log)." << std::endl;
44 	throw 1;
45 }
46 
47 #define PARSE_FUNC(fv) { \
48 	if(ctoks.size() >= 1) { \
49 		fv.setType(parseFuncType(ctoks[0])); \
50 	} \
51 	if(ctoks.size() >= 2) { \
52 		double co; \
53 		istringstream tmpss(ctoks[1]); \
54 		tmpss >> co; \
55 		fv.setConst(co); \
56 	} \
57 	if(ctoks.size() >= 3) { \
58 		double ce; \
59 		istringstream tmpss(ctoks[2]); \
60 		tmpss >> ce; \
61 		fv.setCoeff(ce); \
62 	} \
63 	if(ctoks.size() >= 4) { \
64 		double mn; \
65 		istringstream tmpss(ctoks[3]); \
66 		tmpss >> mn; \
67 		fv.setMin(mn); \
68 	} \
69 	if(ctoks.size() >= 5) { \
70 		double mx; \
71 		istringstream tmpss(ctoks[4]); \
72 		tmpss >> mx; \
73 		fv.setMin(mx); \
74 	} \
75 }
76 
77 /**
78  * Parse alignment policy when provided in this format:
79  * <lab>=<val>;<lab>=<val>;<lab>=<val>...
80  *
81  * And label=value possibilities are:
82  *
83  * Bonus for a match
84  * -----------------
85  *
86  * MA=xx (default: MA=0, or MA=2 if --local is set)
87  *
88  *    xx = Each position where equal read and reference characters match up
89  *         in the alignment contriubtes this amount to the total score.
90  *
91  * Penalty for a mismatch
92  * ----------------------
93  *
94  * MMP={Cxx|Q|RQ} (default: MMP=C6)
95  *
96  *   Cxx = Each mismatch costs xx.  If MMP=Cxx is specified, quality
97  *         values are ignored when assessing penalities for mismatches.
98  *   Q   = Each mismatch incurs a penalty equal to the mismatched base's
99  *         value.
100  *   R   = Each mismatch incurs a penalty equal to the mismatched base's
101  *         rounded quality value.  Qualities are rounded off to the
102  *         nearest 10, and qualities greater than 30 are rounded to 30.
103  *
104  * Penalty for position with N (in either read or reference)
105  * ---------------------------------------------------------
106  *
107  * NP={Cxx|Q|RQ} (default: NP=C1)
108  *
109  *   Cxx = Each alignment position with an N in either the read or the
110  *         reference costs xx.  If NP=Cxx is specified, quality values are
111  *         ignored when assessing penalities for Ns.
112  *   Q   = Each alignment position with an N in either the read or the
113  *         reference incurs a penalty equal to the read base's quality
114  *         value.
115  *   R   = Each alignment position with an N in either the read or the
116  *         reference incurs a penalty equal to the read base's rounded
117  *         quality value.  Qualities are rounded off to the nearest 10,
118  *         and qualities greater than 30 are rounded to 30.
119  *
120  * Penalty for a read gap
121  * ----------------------
122  *
123  * RDG=xx,yy (default: RDG=5,3)
124  *
125  *   xx    = Read gap open penalty.
126  *   yy    = Read gap extension penalty.
127  *
128  * Total cost incurred by a read gap = xx + (yy * gap length)
129  *
130  * Penalty for a reference gap
131  * ---------------------------
132  *
133  * RFG=xx,yy (default: RFG=5,3)
134  *
135  *   xx    = Reference gap open penalty.
136  *   yy    = Reference gap extension penalty.
137  *
138  * Total cost incurred by a reference gap = xx + (yy * gap length)
139  *
140  * Minimum score for valid alignment
141  * ---------------------------------
142  *
143  * MIN=xx,yy (defaults: MIN=-0.6,-0.6, or MIN=0.0,0.66 if --local is set)
144  *
145  *   xx,yy = For a read of length N, the total score must be at least
146  *           xx + (read length * yy) for the alignment to be valid.  The
147  *           total score is the sum of all negative penalties (from
148  *           mismatches and gaps) and all positive bonuses.  The minimum
149  *           can be negative (and is by default in global alignment mode).
150  *
151  * Score floor for local alignment
152  * -------------------------------
153  *
154  * FL=xx,yy (defaults: FL=-Infinity,0.0, or FL=0.0,0.0 if --local is set)
155  *
156  *   xx,yy = If a cell in the dynamic programming table has a score less
157  *           than xx + (read length * yy), then no valid alignment can go
158  *           through it.  Defaults are highly recommended.
159  *
160  * N ceiling
161  * ---------
162  *
163  * NCEIL=xx,yy (default: NCEIL=0.0,0.15)
164  *
165  *   xx,yy = For a read of length N, the number of alignment
166  *           positions with an N in either the read or the
167  *           reference cannot exceed
168  *           ceiling = xx + (read length * yy).  If the ceiling is
169  *           exceeded, the alignment is considered invalid.
170  *
171  * Seeds
172  * -----
173  *
174  * SEED=mm,len,ival (default: SEED=0,22)
175  *
176  *   mm   = Maximum number of mismatches allowed within a seed.
177  *          Must be >= 0 and <= 2.  Note that 2-mismatch mode is
178  *          not fully sensitive; i.e. some 2-mismatch seed
179  *          alignments may be missed.
180  *   len  = Length of seed.
181  *   ival = Interval between seeds.  If not specified, seed
182  *          interval is determined by IVAL.
183  *
184  * Seed interval
185  * -------------
186  *
187  * IVAL={L|S|C},xx,yy (default: IVAL=S,1.0,0.0)
188  *
189  *   L  = let interval between seeds be a linear function of the
190  *        read length.  xx and yy are the constant and linear
191  *        coefficients respectively.  In other words, the interval
192  *        equals a * len + b, where len is the read length.
193  *        Intervals less than 1 are rounded up to 1.
194  *   S  = let interval between seeds be a function of the sqaure
195  *        root of the  read length.  xx and yy are the
196  *        coefficients.  In other words, the interval equals
197  *        a * sqrt(len) + b, where len is the read length.
198  *        Intervals less than 1 are rounded up to 1.
199  *   C  = Like S but uses cube root of length instead of square
200  *        root.
201  *
202  * Example 1:
203  *
204  *  SEED=1,10,5 and read sequence is TGCTATCGTACGATCGTAC:
205  *
206  *  The following seeds are extracted from the forward
207  *  representation of the read and aligned to the reference
208  *  allowing up to 1 mismatch:
209  *
210  *  Read:    TGCTATCGTACGATCGTACA
211  *
212  *  Seed 1+: TGCTATCGTA
213  *  Seed 2+:      TCGTACGATC
214  *  Seed 3+:           CGATCGTACA
215  *
216  *  ...and the following are extracted from the reverse-complement
217  *  representation of the read and align to the reference allowing
218  *  up to 1 mismatch:
219  *
220  *  Seed 1-: TACGATAGCA
221  *  Seed 2-:      GATCGTACGA
222  *  Seed 3-:           TGTACGATCG
223  *
224  * Example 2:
225  *
226  *  SEED=1,20,20 and read sequence is TGCTATCGTACGATC.  The seed
227  *  length is 20 but the read is only 15 characters long.  In this
228  *  case, Bowtie2 automatically shrinks the seed length to be equal
229  *  to the read length.
230  *
231  *  Read:    TGCTATCGTACGATC
232  *
233  *  Seed 1+: TGCTATCGTACGATC
234  *  Seed 1-: GATCGTACGATAGCA
235  *
236  * Example 3:
237  *
238  *  SEED=1,10,10 and read sequence is TGCTATCGTACGATC.  Only one seed
239  *  fits on the read; a second seed would overhang the end of the read
240  *  by 5 positions.  In this case, Bowtie2 extracts one seed.
241  *
242  *  Read:    TGCTATCGTACGATC
243  *
244  *  Seed 1+: TGCTATCGTA
245  *  Seed 1-: TACGATAGCA
246  */
parseString(const std::string & s,bool local,bool noisyHpolymer,bool ignoreQuals,int & bonusMatchType,int & bonusMatch,int & penMmcType,int & penMmcMax,int & penMmcMin,int & penScMax,int & penScMin,int & penNType,int & penN,int & penRdExConst,int & penRfExConst,int & penRdExLinear,int & penRfExLinear,SimpleFunc & costMin,SimpleFunc & nCeil,bool & nCatPair,int & multiseedMms,int & multiseedLen,SimpleFunc & multiseedIval,size_t & failStreak,size_t & seedRounds,SimpleFunc * penCanIntronLen,SimpleFunc * penNoncanIntronLen)247 void SeedAlignmentPolicy::parseString(
248                                       const       std::string& s,
249                                       bool        local,
250                                       bool        noisyHpolymer,
251                                       bool        ignoreQuals,
252                                       int&        bonusMatchType,
253                                       int&        bonusMatch,
254                                       int&        penMmcType,
255                                       int&        penMmcMax,
256                                       int&        penMmcMin,
257                                       int&        penScMax,
258                                       int&        penScMin,
259                                       int&        penNType,
260                                       int&        penN,
261                                       int&        penRdExConst,
262                                       int&        penRfExConst,
263                                       int&        penRdExLinear,
264                                       int&        penRfExLinear,
265                                       SimpleFunc& costMin,
266                                       SimpleFunc& nCeil,
267                                       bool&       nCatPair,
268                                       int&        multiseedMms,
269                                       int&        multiseedLen,
270                                       SimpleFunc& multiseedIval,
271                                       size_t&     failStreak,
272                                       size_t&     seedRounds,
273                                       SimpleFunc* penCanIntronLen,
274                                       SimpleFunc* penNoncanIntronLen)
275 {
276 
277 	bonusMatchType    = local ? DEFAULT_MATCH_BONUS_TYPE_LOCAL : DEFAULT_MATCH_BONUS_TYPE;
278 	bonusMatch        = local ? DEFAULT_MATCH_BONUS_LOCAL : DEFAULT_MATCH_BONUS;
279 	penMmcType        = ignoreQuals ? DEFAULT_MM_PENALTY_TYPE_IGNORE_QUALS :
280 	                                  DEFAULT_MM_PENALTY_TYPE;
281 	penMmcMax         = DEFAULT_MM_PENALTY_MAX;
282 	penMmcMin         = DEFAULT_MM_PENALTY_MIN;
283 	penNType          = DEFAULT_N_PENALTY_TYPE;
284 	penN              = DEFAULT_N_PENALTY;
285 
286     penScMax          = DEFAULT_SC_PENALTY_MAX;
287     penScMin          = DEFAULT_SC_PENALTY_MIN;
288 
289 	const double DMAX = std::numeric_limits<double>::max();
290     costMin.init(
291 		local ? SIMPLE_FUNC_LOG : SIMPLE_FUNC_LINEAR,
292 		local ? DEFAULT_MIN_CONST_LOCAL  : 0.0f,
293 		local ? DEFAULT_MIN_LINEAR_LOCAL : -0.2f);
294 	nCeil.init(
295 		SIMPLE_FUNC_LINEAR, 0.0f, DMAX,
296 		DEFAULT_N_CEIL_CONST, DEFAULT_N_CEIL_LINEAR);
297 	multiseedIval.init(
298 		DEFAULT_IVAL, 1.0f, DMAX,
299 		DEFAULT_IVAL_B, DEFAULT_IVAL_A);
300 	nCatPair          = DEFAULT_N_CAT_PAIR;
301 
302 	if(!noisyHpolymer) {
303 		penRdExConst  = DEFAULT_READ_GAP_CONST;
304 		penRdExLinear = DEFAULT_READ_GAP_LINEAR;
305 		penRfExConst  = DEFAULT_REF_GAP_CONST;
306 		penRfExLinear = DEFAULT_REF_GAP_LINEAR;
307 	} else {
308 		penRdExConst  = DEFAULT_READ_GAP_CONST_BADHPOLY;
309 		penRdExLinear = DEFAULT_READ_GAP_LINEAR_BADHPOLY;
310 		penRfExConst  = DEFAULT_REF_GAP_CONST_BADHPOLY;
311 		penRfExLinear = DEFAULT_REF_GAP_LINEAR_BADHPOLY;
312 	}
313 
314 	multiseedMms      = DEFAULT_SEEDMMS;
315 	multiseedLen      = DEFAULT_SEEDLEN;
316 
317 	EList<string> toks(MISC_CAT);
318 	string tok;
319 	istringstream ss(s);
320 	int setting = 0;
321 	// Get each ;-separated token
322 	while(getline(ss, tok, ';')) {
323 		setting++;
324 		EList<string> etoks(MISC_CAT);
325 		string etok;
326 		// Divide into tokens on either side of =
327 		istringstream ess(tok);
328 		while(getline(ess, etok, '=')) {
329 			etoks.push_back(etok);
330 		}
331 		// Must be exactly 1 =
332 		if(etoks.size() != 2) {
333 			cerr << "Error parsing alignment policy setting " << setting
334 			     << "; must be bisected by = sign" << endl
335 				 << "Policy: " << s.c_str() << endl;
336 			assert(false); throw 1;
337 		}
338 		// LHS is tag, RHS value
339 		string tag = etoks[0], val = etoks[1];
340 		// Separate value into comma-separated tokens
341 		EList<string> ctoks(MISC_CAT);
342 		string ctok;
343 		istringstream css(val);
344 		while(getline(css, ctok, ',')) {
345 			ctoks.push_back(ctok);
346 		}
347 		if(ctoks.size() == 0) {
348 			cerr << "Error parsing alignment policy setting " << setting
349 			     << "; RHS must have at least 1 token" << endl
350 				 << "Policy: " << s.c_str() << endl;
351 			assert(false); throw 1;
352 		}
353 		for(size_t i = 0; i < ctoks.size(); i++) {
354 			if(ctoks[i].length() == 0) {
355 				cerr << "Error parsing alignment policy setting " << setting
356 				     << "; token " << i+1 << " on RHS had length=0" << endl
357 					 << "Policy: " << s.c_str() << endl;
358 				assert(false); throw 1;
359 			}
360 		}
361 		// Bonus for a match
362 		// MA=xx (default: MA=0, or MA=10 if --local is set)
363 		if(tag == "MA") {
364 			if(ctoks.size() != 1) {
365 				cerr << "Error parsing alignment policy setting " << setting
366 				     << "; RHS must have 1 token" << endl
367 					 << "Policy: " << s.c_str() << endl;
368 				assert(false); throw 1;
369 			}
370 			string tmp = ctoks[0];
371 			istringstream tmpss(tmp);
372 			tmpss >> bonusMatch;
373 		}
374 		// Scoring for mismatches
375 		// MMP={Cxx|Q|RQ}
376 		//        Cxx = constant, where constant is integer xx
377 		//        Qxx = equal to quality, scaled
378 		//        R   = equal to maq-rounded quality value (rounded to nearest
379 		//              10, can't be greater than 30)
380 		else if(tag == "MMP") {
381 			if(ctoks.size() > 3) {
382 				cerr << "Error parsing alignment policy setting "
383 				     << "'" << tag.c_str() << "'"
384 				     << "; RHS must have at most 3 tokens" << endl
385 					 << "Policy: '" << s.c_str() << "'" << endl;
386 				assert(false); throw 1;
387 			}
388 			if(ctoks[0][0] == 'C') {
389 				string tmp = ctoks[0].substr(1);
390 				// Parse constant penalty
391 				istringstream tmpss(tmp);
392 				tmpss >> penMmcMax;
393 				penMmcMin = penMmcMax;
394 				// Parse constant penalty
395 				penMmcType = COST_MODEL_CONSTANT;
396 			} else if(ctoks[0][0] == 'Q') {
397 				if(ctoks.size() >= 2) {
398 					string tmp = ctoks[1];
399 					istringstream tmpss(tmp);
400 					tmpss >> penMmcMax;
401 				} else {
402 					penMmcMax = DEFAULT_MM_PENALTY_MAX;
403 				}
404 				if(ctoks.size() >= 3) {
405 					string tmp = ctoks[2];
406 					istringstream tmpss(tmp);
407 					tmpss >> penMmcMin;
408 				} else {
409 					penMmcMin = DEFAULT_MM_PENALTY_MIN;
410 				}
411 				if(penMmcMin > penMmcMax) {
412 					cerr << "Error: Maximum mismatch penalty (" << penMmcMax
413 					     << ") is less than minimum penalty (" << penMmcMin
414 						 << endl;
415 					throw 1;
416 				}
417 				// Set type to =quality
418 				penMmcType = COST_MODEL_QUAL;
419 			} else if(ctoks[0][0] == 'R') {
420 				// Set type to=Maq-quality
421 				penMmcType = COST_MODEL_ROUNDED_QUAL;
422 			} else {
423 				cerr << "Error parsing alignment policy setting "
424 				     << "'" << tag.c_str() << "'"
425 				     << "; RHS must start with C, Q or R" << endl
426 					 << "Policy: '" << s.c_str() << "'" << endl;
427 				assert(false); throw 1;
428 			}
429 		}
430         else if(tag == "SCP") {
431             if(ctoks.size() > 3) {
432                 cerr << "Error parsing alignment policy setting "
433                 << "'" << tag.c_str() << "'"
434                 << "; SCP must have at most 3 tokens" << endl
435                 << "Policy: '" << s.c_str() << "'" << endl;
436                 assert(false); throw 1;
437             }
438             istringstream tmpMax(ctoks[1]);
439             tmpMax >> penScMax;
440             istringstream tmpMin(ctoks[1]);
441             tmpMin >> penScMin;
442             if(penScMin > penScMax) {
443                 cerr << "max (" << penScMax << ") should be >= min (" << penScMin << ")" << endl;
444                 assert(false); throw 1;
445             }
446             if(penScMin < 1) {
447                 cerr << "min (" << penScMin << ") should be greater than 0" << endl;
448                 assert(false); throw 1;
449             }
450         }
451 		// Scoring for mismatches where read char=N
452 		// NP={Cxx|Q|RQ}
453 		//        Cxx = constant, where constant is integer xx
454 		//        Q   = equal to quality
455 		//        R   = equal to maq-rounded quality value (rounded to nearest
456 		//              10, can't be greater than 30)
457 		else if(tag == "NP") {
458 			if(ctoks.size() != 1) {
459 				cerr << "Error parsing alignment policy setting "
460 				     << "'" << tag.c_str() << "'"
461 				     << "; RHS must have 1 token" << endl
462 					 << "Policy: '" << s.c_str() << "'" << endl;
463 				assert(false); throw 1;
464 			}
465 			if(ctoks[0][0] == 'C') {
466 				string tmp = ctoks[0].substr(1);
467 				// Parse constant penalty
468 				istringstream tmpss(tmp);
469 				tmpss >> penN;
470 				// Parse constant penalty
471 				penNType = COST_MODEL_CONSTANT;
472 			} else if(ctoks[0][0] == 'Q') {
473 				// Set type to =quality
474 				penNType = COST_MODEL_QUAL;
475 			} else if(ctoks[0][0] == 'R') {
476 				// Set type to=Maq-quality
477 				penNType = COST_MODEL_ROUNDED_QUAL;
478 			} else {
479 				cerr << "Error parsing alignment policy setting "
480 				     << "'" << tag.c_str() << "'"
481 				     << "; RHS must start with C, Q or R" << endl
482 					 << "Policy: '" << s.c_str() << "'" << endl;
483 				assert(false); throw 1;
484 			}
485 		}
486 		// Scoring for read gaps
487 		// RDG=xx,yy,zz
488 		//        xx = read gap open penalty
489 		//        yy = read gap extension penalty constant coefficient
490 		//             (defaults to open penalty)
491 		//        zz = read gap extension penalty linear coefficient
492 		//             (defaults to 0)
493 		else if(tag == "RDG") {
494 			if(ctoks.size() >= 1) {
495 				istringstream tmpss(ctoks[0]);
496 				tmpss >> penRdExConst;
497 			} else {
498 				penRdExConst = noisyHpolymer ?
499 					DEFAULT_READ_GAP_CONST_BADHPOLY :
500 					DEFAULT_READ_GAP_CONST;
501 			}
502 			if(ctoks.size() >= 2) {
503 				istringstream tmpss(ctoks[1]);
504 				tmpss >> penRdExLinear;
505 			} else {
506 				penRdExLinear = noisyHpolymer ?
507 					DEFAULT_READ_GAP_LINEAR_BADHPOLY :
508 					DEFAULT_READ_GAP_LINEAR;
509 			}
510 		}
511 		// Scoring for reference gaps
512 		// RFG=xx,yy,zz
513 		//        xx = ref gap open penalty
514 		//        yy = ref gap extension penalty constant coefficient
515 		//             (defaults to open penalty)
516 		//        zz = ref gap extension penalty linear coefficient
517 		//             (defaults to 0)
518 		else if(tag == "RFG") {
519 			if(ctoks.size() >= 1) {
520 				istringstream tmpss(ctoks[0]);
521 				tmpss >> penRfExConst;
522 			} else {
523 				penRfExConst = noisyHpolymer ?
524 					DEFAULT_REF_GAP_CONST_BADHPOLY :
525 					DEFAULT_REF_GAP_CONST;
526 			}
527 			if(ctoks.size() >= 2) {
528 				istringstream tmpss(ctoks[1]);
529 				tmpss >> penRfExLinear;
530 			} else {
531 				penRfExLinear = noisyHpolymer ?
532 					DEFAULT_REF_GAP_LINEAR_BADHPOLY :
533 					DEFAULT_REF_GAP_LINEAR;
534 			}
535 		}
536 		// Minimum score as a function of read length
537 		// MIN=xx,yy
538 		//        xx = constant coefficient
539 		//        yy = linear coefficient
540 		else if(tag == "MIN") {
541 			PARSE_FUNC(costMin);
542 		}
543 		// Per-read N ceiling as a function of read length
544 		// NCEIL=xx,yy
545 		//        xx = N ceiling constant coefficient
546 		//        yy = N ceiling linear coefficient (set to 0 if unspecified)
547 		else if(tag == "NCEIL") {
548 			PARSE_FUNC(nCeil);
549 		}
550 		/*
551 		 * Seeds
552 		 * -----
553 		 *
554 		 * SEED=mm,len,ival (default: SEED=0,22)
555 		 *
556 		 *   mm   = Maximum number of mismatches allowed within a seed.
557 		 *          Must be >= 0 and <= 2.  Note that 2-mismatch mode is
558 		 *          not fully sensitive; i.e. some 2-mismatch seed
559 		 *          alignments may be missed.
560 		 *   len  = Length of seed.
561 		 *   ival = Interval between seeds.  If not specified, seed
562 		 *          interval is determined by IVAL.
563 		 */
564 		else if(tag == "SEED") {
565 			if(ctoks.size() > 2) {
566 				cerr << "Error parsing alignment policy setting "
567 				     << "'" << tag.c_str() << "'; RHS must have 1 or 2 tokens, "
568 					 << "had " << ctoks.size() << ".  "
569 					 << "Policy: '" << s.c_str() << "'" << endl;
570 				assert(false); throw 1;
571 			}
572 			if(ctoks.size() >= 1) {
573 				istringstream tmpss(ctoks[0]);
574 				tmpss >> multiseedMms;
575 				if(multiseedMms > 1) {
576 					cerr << "Error: -N was set to " << multiseedMms << ", but cannot be set greater than 1" << endl;
577 					throw 1;
578 				}
579 				if(multiseedMms < 0) {
580 					cerr << "Error: -N was set to a number less than 0 (" << multiseedMms << ")" << endl;
581 					throw 1;
582 				}
583 			}
584 			if(ctoks.size() >= 2) {
585 				istringstream tmpss(ctoks[1]);
586 				tmpss >> multiseedLen;
587 			} else {
588 				multiseedLen = DEFAULT_SEEDLEN;
589 			}
590 		}
591 		else if(tag == "SEEDLEN") {
592 			if(ctoks.size() > 1) {
593 				cerr << "Error parsing alignment policy setting "
594 				     << "'" << tag.c_str() << "'; RHS must have 1 token, "
595 					 << "had " << ctoks.size() << ".  "
596 					 << "Policy: '" << s.c_str() << "'" << endl;
597 				assert(false); throw 1;
598 			}
599 			if(ctoks.size() >= 1) {
600 				istringstream tmpss(ctoks[0]);
601 				tmpss >> multiseedLen;
602 			}
603 		}
604 		else if(tag == "DPS") {
605 			if(ctoks.size() > 1) {
606 				cerr << "Error parsing alignment policy setting "
607 				     << "'" << tag.c_str() << "'; RHS must have 1 token, "
608 					 << "had " << ctoks.size() << ".  "
609 					 << "Policy: '" << s.c_str() << "'" << endl;
610 				assert(false); throw 1;
611 			}
612 			if(ctoks.size() >= 1) {
613 				istringstream tmpss(ctoks[0]);
614 				tmpss >> failStreak;
615 			}
616 		}
617 		else if(tag == "ROUNDS") {
618 			if(ctoks.size() > 1) {
619 				cerr << "Error parsing alignment policy setting "
620 				     << "'" << tag.c_str() << "'; RHS must have 1 token, "
621 					 << "had " << ctoks.size() << ".  "
622 					 << "Policy: '" << s.c_str() << "'" << endl;
623 				assert(false); throw 1;
624 			}
625 			if(ctoks.size() >= 1) {
626 				istringstream tmpss(ctoks[0]);
627 				tmpss >> seedRounds;
628 			}
629 		}
630 		/*
631 		 * Seed interval
632 		 * -------------
633 		 *
634 		 * IVAL={L|S|C},a,b (default: IVAL=S,1.0,0.0)
635 		 *
636 		 *   L  = let interval between seeds be a linear function of the
637 		 *        read length.  xx and yy are the constant and linear
638 		 *        coefficients respectively.  In other words, the interval
639 		 *        equals a * len + b, where len is the read length.
640 		 *        Intervals less than 1 are rounded up to 1.
641 		 *   S  = let interval between seeds be a function of the sqaure
642 		 *        root of the  read length.  xx and yy are the
643 		 *        coefficients.  In other words, the interval equals
644 		 *        a * sqrt(len) + b, where len is the read length.
645 		 *        Intervals less than 1 are rounded up to 1.
646 		 *   C  = Like S but uses cube root of length instead of square
647 		 *        root.
648 		 */
649 		else if(tag == "IVAL") {
650 			PARSE_FUNC(multiseedIval);
651 		}
652         else if(tag == "CANINTRONLEN") {
653             assert(penCanIntronLen != NULL);
654 			PARSE_FUNC((*penCanIntronLen));
655 		}
656         else if(tag == "NONCANINTRONLEN") {
657             assert(penNoncanIntronLen != NULL);
658             PARSE_FUNC((*penNoncanIntronLen));
659         }
660 		else {
661 			// Unknown tag
662 			cerr << "Unexpected alignment policy setting "
663 				 << "'" << tag.c_str() << "'" << endl
664 				 << "Policy: '" << s.c_str() << "'" << endl;
665 			assert(false); throw 1;
666 		}
667 	}
668 }
669 
670 #ifdef ALIGNER_SEED_POLICY_MAIN
main()671 int main() {
672 
673 	int bonusMatchType;
674 	int bonusMatch;
675 	int penMmcType;
676 	int penMmc;
677     int penScMax;
678     int penScMin;
679 	int penNType;
680 	int penN;
681 	int penRdExConst;
682 	int penRfExConst;
683 	int penRdExLinear;
684 	int penRfExLinear;
685 	SimpleFunc costMin;
686 	SimpleFunc costFloor;
687 	SimpleFunc nCeil;
688 	bool nCatPair;
689 	int multiseedMms;
690 	int multiseedLen;
691 	SimpleFunc msIval;
692 	SimpleFunc posfrac;
693 	SimpleFunc rowmult;
694 	uint32_t mhits;
695 
696 	{
697 		cout << "Case 1: Defaults 1 ... ";
698 		const char *pol = "";
699 		SeedAlignmentPolicy::parseString(
700 			string(pol),
701 			false,              // --local?
702 			false,              // noisy homopolymers a la 454?
703 			false,              // ignore qualities?
704 			bonusMatchType,
705 			bonusMatch,
706 			penMmcType,
707 			penMmc,
708             penScMax,
709             penScMin,
710 			penNType,
711 			penN,
712 			penRdExConst,
713 			penRfExConst,
714 			penRdExLinear,
715 			penRfExLinear,
716 			costMin,
717 			costFloor,
718 			nCeil,
719 			nCatPair,
720 			multiseedMms,
721 			multiseedLen,
722 			msIval,
723 			mhits);
724 
725 		assert_eq(DEFAULT_MATCH_BONUS_TYPE,   bonusMatchType);
726 		assert_eq(DEFAULT_MATCH_BONUS,        bonusMatch);
727 		assert_eq(DEFAULT_MM_PENALTY_TYPE,    penMmcType);
728 		assert_eq(DEFAULT_MM_PENALTY_MAX,     penMmcMax);
729 		assert_eq(DEFAULT_MM_PENALTY_MIN,     penMmcMin);
730 		assert_eq(DEFAULT_N_PENALTY_TYPE,     penNType);
731 		assert_eq(DEFAULT_N_PENALTY,          penN);
732 		assert_eq(DEFAULT_MIN_CONST,          costMin.getConst());
733 		assert_eq(DEFAULT_MIN_LINEAR,         costMin.getCoeff());
734 		assert_eq(DEFAULT_FLOOR_CONST,        costFloor.getConst());
735 		assert_eq(DEFAULT_FLOOR_LINEAR,       costFloor.getCoeff());
736 		assert_eq(DEFAULT_N_CEIL_CONST,       nCeil.getConst());
737 		assert_eq(DEFAULT_N_CAT_PAIR,         nCatPair);
738 
739 		assert_eq(DEFAULT_READ_GAP_CONST,     penRdExConst);
740 		assert_eq(DEFAULT_READ_GAP_LINEAR,    penRdExLinear);
741 		assert_eq(DEFAULT_REF_GAP_CONST,      penRfExConst);
742 		assert_eq(DEFAULT_REF_GAP_LINEAR,     penRfExLinear);
743 		assert_eq(DEFAULT_SEEDMMS,            multiseedMms);
744 		assert_eq(DEFAULT_SEEDLEN,            multiseedLen);
745 		assert_eq(DEFAULT_IVAL,               msIval.getType());
746 		assert_eq(DEFAULT_IVAL_A,             msIval.getCoeff());
747 		assert_eq(DEFAULT_IVAL_B,             msIval.getConst());
748 
749 		cout << "PASSED" << endl;
750 	}
751 
752 	{
753 		cout << "Case 2: Defaults 2 ... ";
754 		const char *pol = "";
755 		SeedAlignmentPolicy::parseString(
756 			string(pol),
757 			false,              // --local?
758 			true,               // noisy homopolymers a la 454?
759 			false,              // ignore qualities?
760 			bonusMatchType,
761 			bonusMatch,
762 			penMmcType,
763 			penMmc,
764 
765 			penNType,
766 			penN,
767 			penRdExConst,
768 			penRfExConst,
769 			penRdExLinear,
770 			penRfExLinear,
771 			costMin,
772 			costFloor,
773 			nCeil,
774 			nCatPair,
775 			multiseedMms,
776 			multiseedLen,
777 			msIval,
778 			mhits);
779 
780 		assert_eq(DEFAULT_MATCH_BONUS_TYPE,   bonusMatchType);
781 		assert_eq(DEFAULT_MATCH_BONUS,        bonusMatch);
782 		assert_eq(DEFAULT_MM_PENALTY_TYPE,    penMmcType);
783 		assert_eq(DEFAULT_MM_PENALTY_MAX,     penMmc);
784 		assert_eq(DEFAULT_MM_PENALTY_MIN,     penMmc);
785 		assert_eq(DEFAULT_N_PENALTY_TYPE,     penNType);
786 		assert_eq(DEFAULT_N_PENALTY,          penN);
787 		assert_eq(DEFAULT_MIN_CONST,          costMin.getConst());
788 		assert_eq(DEFAULT_MIN_LINEAR,         costMin.getCoeff());
789 		assert_eq(DEFAULT_FLOOR_CONST,        costFloor.getConst());
790 		assert_eq(DEFAULT_FLOOR_LINEAR,       costFloor.getCoeff());
791 		assert_eq(DEFAULT_N_CEIL_CONST,       nCeil.getConst());
792 		assert_eq(DEFAULT_N_CAT_PAIR,         nCatPair);
793 
794 		assert_eq(DEFAULT_READ_GAP_CONST_BADHPOLY,  penRdExConst);
795 		assert_eq(DEFAULT_READ_GAP_LINEAR_BADHPOLY, penRdExLinear);
796 		assert_eq(DEFAULT_REF_GAP_CONST_BADHPOLY,   penRfExConst);
797 		assert_eq(DEFAULT_REF_GAP_LINEAR_BADHPOLY,  penRfExLinear);
798 		assert_eq(DEFAULT_SEEDMMS,            multiseedMms);
799 		assert_eq(DEFAULT_SEEDLEN,            multiseedLen);
800 		assert_eq(DEFAULT_IVAL,               msIval.getType());
801 		assert_eq(DEFAULT_IVAL_A,             msIval.getCoeff());
802 		assert_eq(DEFAULT_IVAL_B,             msIval.getConst());
803 
804 		cout << "PASSED" << endl;
805 	}
806 
807 	{
808 		cout << "Case 3: Defaults 3 ... ";
809 		const char *pol = "";
810 		SeedAlignmentPolicy::parseString(
811 			string(pol),
812 			true,               // --local?
813 			false,              // noisy homopolymers a la 454?
814 			false,              // ignore qualities?
815 			bonusMatchType,
816 			bonusMatch,
817 			penMmcType,
818 			penMmc,
819 			penNType,
820 			penN,
821 			penRdExConst,
822 			penRfExConst,
823 			penRdExLinear,
824 			penRfExLinear,
825 			costMin,
826 			costFloor,
827 			nCeil,
828 			nCatPair,
829 			multiseedMms,
830 			multiseedLen,
831 			msIval,
832 			mhits);
833 
834 		assert_eq(DEFAULT_MATCH_BONUS_TYPE_LOCAL,   bonusMatchType);
835 		assert_eq(DEFAULT_MATCH_BONUS_LOCAL,        bonusMatch);
836 		assert_eq(DEFAULT_MM_PENALTY_TYPE,    penMmcType);
837 		assert_eq(DEFAULT_MM_PENALTY_MAX,     penMmcMax);
838 		assert_eq(DEFAULT_MM_PENALTY_MIN,     penMmcMin);
839 		assert_eq(DEFAULT_N_PENALTY_TYPE,     penNType);
840 		assert_eq(DEFAULT_N_PENALTY,          penN);
841 		assert_eq(DEFAULT_MIN_CONST_LOCAL,    costMin.getConst());
842 		assert_eq(DEFAULT_MIN_LINEAR_LOCAL,   costMin.getCoeff());
843 		assert_eq(DEFAULT_FLOOR_CONST_LOCAL,  costFloor.getConst());
844 		assert_eq(DEFAULT_FLOOR_LINEAR_LOCAL, costFloor.getCoeff());
845 		assert_eq(DEFAULT_N_CEIL_CONST,       nCeil.getConst());
846 		assert_eq(DEFAULT_N_CEIL_LINEAR,      nCeil.getCoeff());
847 		assert_eq(DEFAULT_N_CAT_PAIR,         nCatPair);
848 
849 		assert_eq(DEFAULT_READ_GAP_CONST,     penRdExConst);
850 		assert_eq(DEFAULT_READ_GAP_LINEAR,    penRdExLinear);
851 		assert_eq(DEFAULT_REF_GAP_CONST,      penRfExConst);
852 		assert_eq(DEFAULT_REF_GAP_LINEAR,     penRfExLinear);
853 		assert_eq(DEFAULT_SEEDMMS,            multiseedMms);
854 		assert_eq(DEFAULT_SEEDLEN,            multiseedLen);
855 		assert_eq(DEFAULT_IVAL,               msIval.getType());
856 		assert_eq(DEFAULT_IVAL_A,             msIval.getCoeff());
857 		assert_eq(DEFAULT_IVAL_B,             msIval.getConst());
858 
859 		cout << "PASSED" << endl;
860 	}
861 
862 	{
863 		cout << "Case 4: Simple string 1 ... ";
864 		const char *pol = "MMP=C44;MA=4;RFG=24,12;FL=C,8;RDG=2;NP=C4;MIN=C,7";
865 		SeedAlignmentPolicy::parseString(
866 			string(pol),
867 			true,               // --local?
868 			false,              // noisy homopolymers a la 454?
869 			false,              // ignore qualities?
870 			bonusMatchType,
871 			bonusMatch,
872 			penMmcType,
873 			penMmc,
874 			penNType,
875 			penN,
876 			penRdExConst,
877 			penRfExConst,
878 			penRdExLinear,
879 			penRfExLinear,
880 			costMin,
881 			costFloor,
882 			nCeil,
883 			nCatPair,
884 			multiseedMms,
885 			multiseedLen,
886 			msIval,
887 			mhits);
888 
889 		assert_eq(COST_MODEL_CONSTANT,        bonusMatchType);
890 		assert_eq(4,                          bonusMatch);
891 		assert_eq(COST_MODEL_CONSTANT,        penMmcType);
892 		assert_eq(44,                         penMmc);
893 		assert_eq(COST_MODEL_CONSTANT,        penNType);
894 		assert_eq(4.0f,                       penN);
895 		assert_eq(7,                          costMin.getConst());
896 		assert_eq(DEFAULT_MIN_LINEAR_LOCAL,   costMin.getCoeff());
897 		assert_eq(8,                          costFloor.getConst());
898 		assert_eq(DEFAULT_FLOOR_LINEAR_LOCAL, costFloor.getCoeff());
899 		assert_eq(DEFAULT_N_CEIL_CONST,       nCeil.getConst());
900 		assert_eq(DEFAULT_N_CEIL_LINEAR,      nCeil.getCoeff());
901 		assert_eq(DEFAULT_N_CAT_PAIR,         nCatPair);
902 
903 		assert_eq(2.0f,                       penRdExConst);
904 		assert_eq(DEFAULT_READ_GAP_LINEAR,    penRdExLinear);
905 		assert_eq(24.0f,                      penRfExConst);
906 		assert_eq(12.0f,                      penRfExLinear);
907 		assert_eq(DEFAULT_SEEDMMS,            multiseedMms);
908 		assert_eq(DEFAULT_SEEDLEN,            multiseedLen);
909 		assert_eq(DEFAULT_IVAL,               msIval.getType());
910 		assert_eq(DEFAULT_IVAL_A,             msIval.getCoeff());
911 		assert_eq(DEFAULT_IVAL_B,             msIval.getConst());
912 
913 		cout << "PASSED" << endl;
914 	}
915 }
916 #endif /*def ALIGNER_SEED_POLICY_MAIN*/
917