1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 /*
21  * unique.h
22  *
23  * Encapsulates objects and routines for determining whether and to
24  * what extend the best alignment for a read is "unique."  In the
25  * simplest scenario, uniqueness is determined by whether we found only
26  * one alignment.  More complex scenarios might assign a uniqueness
27  * score based that is a function of (of a summarized version of): all
28  * the alignments found and their scores.
29  *
30  * Since mapping quality is related to uniqueness, objects and
31  * routings for calculating mapping quality are also included here.
32  */
33 
34 #ifndef UNIQUE_H_
35 #define UNIQUE_H_
36 
37 #include <algorithm>
38 #include <string>
39 #include "aligner_result.h"
40 #include "simple_func.h"
41 #include "util.h"
42 #include "scoring.h"
43 
44 typedef int64_t TMapq;
45 
46 /**
47  * Class that returns yes-or-no answers to the question of whether a
48  */
49 class Uniqueness {
50 public:
51 
52 	/**
53 	 * Given an AlnSetSumm, determine if the best alignment is "unique"
54 	 * according to some definition.
55 	 */
bestIsUnique(const AlnSetSumm & s,const AlnFlags & flags,bool mate1,size_t rdlen,size_t ordlen,char * inps)56 	static bool bestIsUnique(
57 		const AlnSetSumm& s,
58 		const AlnFlags& flags,
59 		bool mate1,
60 		size_t rdlen,
61 		size_t ordlen,
62 		char *inps)
63 	{
64 		assert(!s.empty());
65 		return !VALID_AL_SCORE(s.bestUnchosenScore(mate1));
66 	}
67 };
68 
69 /**
70  * Collection of routines for calculating mapping quality.
71  */
72 class Mapq {
73 
74 public:
75 
~Mapq()76 	virtual ~Mapq() { }
77 
78 	virtual TMapq mapq(
79 		const AlnSetSumm& s,
80 		const AlnFlags& flags,
81 		bool mate1,
82 		size_t rdlen,
83 		size_t ordlen,
84 		char *inps) const = 0;
85 };
86 
87 extern const TMapq unp_nosec_perf;
88 extern const TMapq unp_nosec[11];
89 extern const TMapq unp_sec_perf[11];
90 extern const TMapq unp_sec[11][11];
91 extern const TMapq pair_nosec_perf;
92 
93 /**
94  * V3 of the MAPQ calculator
95  */
96 class BowtieMapq3 : public Mapq {
97 
98 public:
99 
BowtieMapq3(const SimpleFunc & scoreMin,const Scoring & sc)100 	BowtieMapq3(
101 		const SimpleFunc& scoreMin,
102 		const Scoring& sc) :
103 		scoreMin_(scoreMin),
104 		sc_(sc)
105 	{ }
106 
~BowtieMapq3()107 	virtual ~BowtieMapq3() { }
108 
109 	/**
110 	 * Given an AlnSetSumm, return a mapping quality calculated.
111 	 */
mapq(const AlnSetSumm & s,const AlnFlags & flags,bool mate1,size_t rdlen,size_t ordlen,char * inps)112 	virtual TMapq mapq(
113 		const AlnSetSumm& s,
114 		const AlnFlags& flags,
115 		bool mate1,
116 		size_t rdlen,
117 		size_t ordlen,
118 		char *inps)     // put string representation of inputs here
119 		const
120 	{
121 		if(s.paired()) {
122 			return pair_nosec_perf;
123 		} else {
124 			bool hasSecbest = VALID_AL_SCORE(s.bestUnchosenScore(mate1));
125 			if(!flags.canMax() && !s.exhausted(mate1) && !hasSecbest) {
126 				return 255;
127 			}
128 			TAlScore scMax = (TAlScore)sc_.perfectScore(rdlen);
129 			TAlScore scMin = scoreMin_.f<TAlScore>((float)rdlen);
130 			assert_geq(scMax, scMin);
131 			TAlScore best  = scMax - s.bestScore(mate1).score(); // best score (lower=better)
132 			size_t best_bin = (size_t)((double)best * (10.0 / (double)(scMax - scMin)) + 0.5);
133 			assert_geq(best_bin, 0);
134 			assert_lt(best_bin, 11);
135 			if(hasSecbest) {
136 				assert_geq(s.bestScore(mate1).score(), s.bestUnchosenScore(mate1).score());
137 				size_t diff = s.bestScore(mate1).score() - s.bestUnchosenScore(mate1).score();
138 				size_t diff_bin = (size_t)((double)diff * (10.0 / (double)(scMax - scMin)) + 0.5);
139 				assert_geq(diff_bin, 0);
140 				assert_lt(diff_bin, 11);
141 				// A valid second-best alignment was found
142 				if(best == scMax) {
143 					// Best alignment has perfect score
144 					return unp_sec_perf[best_bin];
145 				} else {
146 					// Best alignment has less than perfect score
147 					return unp_sec[diff_bin][best_bin];
148 				}
149 			} else {
150 				// No valid second-best alignment was found
151 				if(best == scMax) {
152 					// Best alignment has perfect score
153 					return unp_nosec_perf;
154 				} else {
155 					// Best alignment has less than perfect score
156 					return unp_nosec[best_bin];
157 				}
158 			}
159 		}
160 	}
161 
162 protected:
163 
164 	SimpleFunc      scoreMin_;
165 	const Scoring&  sc_;
166 };
167 
168 /**
169  * V2 of the MAPQ calculator
170  */
171 class BowtieMapq2 : public Mapq {
172 
173 public:
174 
BowtieMapq2(const SimpleFunc & scoreMin,const Scoring & sc)175 	BowtieMapq2(
176 		const SimpleFunc& scoreMin,
177 		const Scoring& sc) :
178 		scoreMin_(scoreMin),
179 		sc_(sc)
180 	{ }
181 
~BowtieMapq2()182 	virtual ~BowtieMapq2() { }
183 
184 	/**
185 	 * Given an AlnSetSumm, return a mapping quality calculated.
186 	 */
mapq(const AlnSetSumm & s,const AlnFlags & flags,bool mate1,size_t rdlen,size_t ordlen,char * inps)187 	virtual TMapq mapq(
188 		const AlnSetSumm& s,
189 		const AlnFlags&   flags,
190 		bool              mate1,
191 		size_t            rdlen,
192 		size_t            ordlen,
193 		char             *inps)   // put string representation of inputs here
194 		const
195 	{
196 		// Did the read have a second-best alignment?
197 		bool hasSecbest = s.paired() ?
198 			VALID_AL_SCORE(s.bestUnchosenCScore()) :
199 			VALID_AL_SCORE(s.bestUnchosenScore(mate1));
200 		// This corresponds to a scenario where we found one and only one
201 		// alignment but didn't really look for a second one
202 		if(!flags.isPrimary() ||
203 		   (!flags.canMax() && !s.exhausted(mate1) && !hasSecbest)) {
204 			return 255;
205 		}
206 		// scPer = score of a perfect match
207 		TAlScore scPer = (TAlScore)sc_.perfectScore(rdlen);
208 		if(s.paired()) {
209 			scPer += (TAlScore)sc_.perfectScore(ordlen);
210 		}
211 		// scMin = score of a just barely valid match
212 		TAlScore scMin = scoreMin_.f<TAlScore>((float)rdlen);
213 		if(s.paired()) {
214 			scMin += scoreMin_.f<TAlScore>((float)ordlen);
215 		}
216 		TAlScore secbest = scMin-1;
217                 TAlScore diff = std::max<TAlScore>(1, scPer - scMin); // scores can vary by up to this much
218                 TMapq ret = 0;
219 		TAlScore best = s.paired() ?
220 			s.bestCScore().score() : s.bestScore(mate1).score();
221 		// best score but normalized so that 0 = worst valid score
222 		TAlScore bestOver = best - scMin;
223 		if(sc_.monotone) {
224 			// End-to-end alignment
225 			if(!hasSecbest) {
226 				if     (bestOver >= diff * (double)0.8f) ret = 42;
227 				else if(bestOver >= diff * (double)0.7f) ret = 40;
228 				else if(bestOver >= diff * (double)0.6f) ret = 24;
229 				else if(bestOver >= diff * (double)0.5f) ret = 23;
230 				else if(bestOver >= diff * (double)0.4f) ret = 8;
231 				else if(bestOver >= diff * (double)0.3f) ret = 3;
232 				else                                     ret = 0;
233 			} else {
234 				secbest = s.paired() ?
235 					s.bestUnchosenCScore().score() : s.bestUnchosenScore(mate1).score();
236 				TAlScore bestdiff = abs(abs(static_cast<long>(best))-abs(static_cast<long>(secbest)));
237 				if(bestdiff >= diff * (double)0.9f) {
238 					if(bestOver == diff) {
239 						ret = 39;
240 					} else {
241 						ret = 33;
242 					}
243 				} else if(bestdiff >= diff * (double)0.8f) {
244 					if(bestOver == diff) {
245 						ret = 38;
246 					} else {
247 						ret = 27;
248 					}
249 				} else if(bestdiff >= diff * (double)0.7f) {
250 					if(bestOver == diff) {
251 						ret = 37;
252 					} else {
253 						ret = 26;
254 					}
255 				} else if(bestdiff >= diff * (double)0.6f) {
256 					if(bestOver == diff) {
257 						ret = 36;
258 					} else {
259 						ret = 22;
260 					}
261 				} else if(bestdiff >= diff * (double)0.5f) {
262 					// Top third is still pretty good
263 					if       (bestOver == diff) {
264 						ret = 35;
265 					} else if(bestOver >= diff * (double)0.84f) {
266 						ret = 25;
267 					} else if(bestOver >= diff * (double)0.68f) {
268 						ret = 16;
269 					} else {
270 						ret = 5;
271 					}
272 				} else if(bestdiff >= diff * (double)0.4f) {
273 					// Top third is still pretty good
274 					if       (bestOver == diff) {
275 						ret = 34;
276 					} else if(bestOver >= diff * (double)0.84f) {
277 						ret = 21;
278 					} else if(bestOver >= diff * (double)0.68f) {
279 						ret = 14;
280 					} else {
281 						ret = 4;
282 					}
283 				} else if(bestdiff >= diff * (double)0.3f) {
284 					// Top third is still pretty good
285 					if       (bestOver == diff) {
286 						ret = 32;
287 					} else if(bestOver >= diff * (double)0.88f) {
288 						ret = 18;
289 					} else if(bestOver >= diff * (double)0.67f) {
290 						ret = 15;
291 					} else {
292 						ret = 3;
293 					}
294 				} else if(bestdiff >= diff * (double)0.2f) {
295 					// Top third is still pretty good
296 					if       (bestOver == diff) {
297 						ret = 31;
298 					} else if(bestOver >= diff * (double)0.88f) {
299 						ret = 17;
300 					} else if(bestOver >= diff * (double)0.67f) {
301 						ret = 11;
302 					} else {
303 						ret = 0;
304 					}
305 				} else if(bestdiff >= diff * (double)0.1f) {
306 					// Top third is still pretty good
307 					if       (bestOver == diff) {
308 						ret = 30;
309 					} else if(bestOver >= diff * (double)0.88f) {
310 						ret = 12;
311 					} else if(bestOver >= diff * (double)0.67f) {
312 						ret = 7;
313 					} else {
314 						ret = 0;
315 					}
316 				} else if(bestdiff > 0) {
317 					// Top third is still pretty good
318 					if(bestOver >= diff * (double)0.67f) {
319 						ret = 6;
320 					} else {
321 						ret = 2;
322 					}
323 				} else {
324 					assert_eq(bestdiff, 0);
325 					// Top third is still pretty good
326 					if(bestOver >= diff * (double)0.67f) {
327 						ret = 1;
328 					} else {
329 						ret = 0;
330 					}
331 				}
332 			}
333 		} else {
334 			// Local alignment
335 			if(!hasSecbest) {
336 				if     (bestOver >= diff * (double)0.8f) ret = 44;
337 				else if(bestOver >= diff * (double)0.7f) ret = 42;
338 				else if(bestOver >= diff * (double)0.6f) ret = 41;
339 				else if(bestOver >= diff * (double)0.5f) ret = 36;
340 				else if(bestOver >= diff * (double)0.4f) ret = 28;
341 				else if(bestOver >= diff * (double)0.3f) ret = 24;
342 				else                                     ret = 22;
343 			} else {
344 				secbest = s.paired() ?
345 					s.bestUnchosenCScore().score() : s.bestUnchosenScore(mate1).score();
346 				TAlScore bestdiff = abs(abs(static_cast<long>(best))-abs(static_cast<long>(secbest)));
347 				if     (bestdiff >= diff * (double)0.9f) ret = 40;
348 				else if(bestdiff >= diff * (double)0.8f) ret = 39;
349 				else if(bestdiff >= diff * (double)0.7f) ret = 38;
350 				else if(bestdiff >= diff * (double)0.6f) ret = 37;
351 				else if(bestdiff >= diff * (double)0.5f) {
352 					if     (bestOver == diff)                 ret = 35;
353 					else if(bestOver >= diff * (double)0.50f) ret = 25;
354 					else                                      ret = 20;
355 				} else if(bestdiff >= diff * (double)0.4f) {
356 					if     (bestOver == diff)                 ret = 34;
357 					else if(bestOver >= diff * (double)0.50f) ret = 21;
358 					else                                      ret = 19;
359 				} else if(bestdiff >= diff * (double)0.3f) {
360 					if     (bestOver == diff)                 ret = 33;
361 					else if(bestOver >= diff * (double)0.5f)  ret = 18;
362 					else                                      ret = 16;
363 				} else if(bestdiff >= diff * (double)0.2f) {
364 					if     (bestOver == diff)                 ret = 32;
365 					else if(bestOver >= diff * (double)0.5f)  ret = 17;
366 					else                                      ret = 12;
367 				} else if(bestdiff >= diff * (double)0.1f) {
368 					if     (bestOver == diff)                 ret = 31;
369 					else if(bestOver >= diff * (double)0.5f)  ret = 14;
370 					else                                      ret = 9;
371 				} else if(bestdiff > 0) {
372 					if(bestOver >= diff * (double)0.5f)       ret = 11;
373 					else                                      ret = 2;
374 				} else {
375 					assert_eq(bestdiff, 0);
376 					if(bestOver >= diff * (double)0.5f)       ret = 1;
377 					else                                      ret = 0;
378 				}
379 			}
380 		}
381 		// Note: modifications to inps must be synchronized
382 		//if(inps != NULL) {
383 		//	inps = itoa10<TAlScore>(best, inps);
384 		//	*inps++ = ',';
385 		//	inps = itoa10<TAlScore>(secbest, inps);
386 		//	*inps++ = ',';
387 		//	inps = itoa10<TMapq>(ret, inps);
388 		//}
389 		return ret;
390 	}
391 
392 protected:
393 
394 	SimpleFunc      scoreMin_;
395 	const Scoring&  sc_;
396 };
397 
398 /**
399  * TODO: Do BowtieMapq on a per-thread basis prior to the mutex'ed output
400  * function.
401  *
402  * topCoeff :: top_coeff
403  * botCoeff :: bot_coeff
404  * mx :: mapqMax
405  * horiz :: mapqHorizon (sort of)
406  *
407  * sc1 <- tab$sc1
408  * sc2 <- tab$sc2
409  * mapq <- rep(mx, length(sc1))
410  * diff_top <- ifelse(sc1 != best & sc2 != best, abs(best - abs(pmax(sc1, sc2))), 0)
411  * mapq <- mapq - diff_top * top_coeff
412  * diff_bot <- ifelse(sc2 != horiz, abs(abs(sc2) - abs(horiz)), 0)
413  * mapq <- mapq - diff_bot * bot_coeff
414  * mapq <- round(pmax(0, pmin(mx, mapq)))
415  * tab$mapq <- mapq
416  */
417 class BowtieMapq : public Mapq {
418 
419 public:
420 
BowtieMapq(const SimpleFunc & scoreMin,const Scoring & sc)421 	BowtieMapq(
422 		const SimpleFunc& scoreMin,
423 		const Scoring& sc) :
424 		scoreMin_(scoreMin),
425 		sc_(sc)
426 	{ }
427 
~BowtieMapq()428 	virtual ~BowtieMapq() { }
429 
430 	/**
431 	 * Given an AlnSetSumm, return a mapping quality calculated.
432 	 */
mapq(const AlnSetSumm & s,const AlnFlags & flags,bool mate1,size_t rdlen,size_t ordlen,char * inps)433 	virtual TMapq mapq(
434 		const AlnSetSumm& s,
435 		const AlnFlags& flags,
436 		bool mate1,
437 		size_t rdlen,
438 		size_t ordlen,
439 		char *inps)     // put string representation of inputs here
440 		const
441 	{
442 		bool hasSecbest = VALID_AL_SCORE(s.bestUnchosenScore(mate1));
443 		if(!flags.canMax() && !s.exhausted(mate1) && !hasSecbest) {
444 			return 255;
445 		}
446 		TAlScore scPer = (TAlScore)sc_.perfectScore(rdlen);
447 		TAlScore scMin = scoreMin_.f<TAlScore>((float)rdlen);
448 		TAlScore secbest = scMin-1;
449 		TAlScore diff = (scPer - scMin);
450 		float sixth_2 = (float)(scPer - diff * (double)0.1666f * 2);
451 		float sixth_3 = (float)(scPer - diff * (double)0.1666f * 3);
452 		TMapq ret = 0;
453 		TAlScore best = s.bestScore(mate1).score();
454 		if(!hasSecbest) {
455 			// Top third?
456 			if(best >= sixth_2) {
457 				ret = 37;
458 			}
459 			// Otherwise in top half?
460 			else if(best >= sixth_3) {
461 				ret = 25;
462 			}
463 			// Otherwise has no second-best?
464 			else {
465 				ret = 10;
466 			}
467 		} else {
468 			secbest = s.bestUnchosenScore(mate1).score();
469 			TAlScore bestdiff = abs(abs(static_cast<long>(best))-abs(static_cast<long>(secbest)));
470 			if(bestdiff >= diff * 0.1666 * 5) {
471 				ret = 6;
472 			} else if(bestdiff >= diff * 0.1666 * 4) {
473 				ret = 5;
474 			} else if(bestdiff >= diff * 0.1666 * 3) {
475 				ret = 4;
476 			} else if(bestdiff >= diff * 0.1666 * 2) {
477 				ret = 3;
478 			} else if(bestdiff >= diff * 0.1666 * 1) {
479 				ret = 2;
480 			} else {
481 				ret = 1;
482 			}
483 		}
484 		// Note: modifications to inps must be synchronized
485 		//if(inps != NULL) {
486 		//	inps = itoa10<TAlScore>(best, inps);
487 		//	*inps++ = ',';
488 		//	inps = itoa10<TAlScore>(secbest, inps);
489 		//	*inps++ = ',';
490 		//	inps = itoa10<TMapq>(ret, inps);
491 		//}
492 		return ret;
493 	}
494 
495 protected:
496 
497 	SimpleFunc      scoreMin_;
498 	const Scoring&  sc_;
499 };
500 
501 /**
502  * Create and return new MAPQ calculating object.
503  */
new_mapq(int version,const SimpleFunc & scoreMin,const Scoring & sc)504 static inline Mapq *new_mapq(
505 	int version,
506 	const SimpleFunc& scoreMin,
507 	const Scoring& sc)
508 {
509 	if(version == 3) {
510 		return new BowtieMapq3(scoreMin, sc);
511 	} else if(version == 2) {
512 		return new BowtieMapq2(scoreMin, sc);
513 	} else {
514 		return new BowtieMapq(scoreMin, sc);
515 	}
516 }
517 
518 #endif /*ndef UNIQUE_H_*/
519