1 /* $Id: best_placement.cpp 594857 2019-10-10 15:16:17Z astashya $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author : Alex Astashyn
27 */
28
29 #include <ncbi_pch.hpp>
30 #include <algo/align/util/best_placement.hpp>
31 //#include "best_placement.hpp"
32
33 #include <objects/seqalign/Seq_align.hpp>
34 #include <objects/seqalign/Seq_align_set.hpp>
35 #include <objects/seqalign/Dense_seg.hpp>
36 #include <objects/seqalign/Spliced_seg.hpp>
37 #include <objects/seqalign/Spliced_exon.hpp>
38 #include <objects/seqalign/Spliced_exon_chunk.hpp>
39 #include <objects/seqalign/Splice_site.hpp>
40 #include <objects/seqalign/Product_pos.hpp>
41 #include <objects/seqloc/Seq_id.hpp>
42
43 #include <cmath>
44
45 USING_SCOPE(ncbi);
46 USING_SCOPE(objects);
47
48 /*
49 For high-identity alignments mismatches carry more signal, i.e.
50 short high-identity partial is better than a long full-coverage
51 placement that is likely a paralog.
52
53 However, very short 100% identity is worse than high-identity
54 full-coverage alignment.
55
56 For low-identity alignments (e.g. cross-species) mismatches matter less,
57 and other attributes, e.g. coverage, matter more.
58
59 For high-identity alignments indels matter more than mismatches,
60 but in gappy PacBio alignments indels don't carry a lot of signal.
61
62 Spread-out defects matter more than clustered, since the latter is
63 more likely due to low-qual sequencing or alignment artifact
64 rather than true SNPs.
65
66 -----------------------------------------------------------------------
67
68 Odds are multiplicative; log-odds are additive.
69
70 gaps and mismatches are in separate categories such that
71 if an alignment is a gappy crud from pacbio gaps don't
72 overshadow everything else.
73
74 score = log(matches+polya) - log(mismatches)
75 + log(ug_aln_len) - log(gaps + 4*gapopens + 8*frameshifting_gapopens)
76 + log(ug_aln_len) - log(unaligned_len)
77 + log(cons_spl) - log(ncons_spl)
78 + log(good_exons) - log(partial_or_short_or_downstreamof_long_intron)
79 + log(L) - log(1) // L is longest matchrun capped at ungapped_len/3
80 */
81
82 ///////////////////////////////////////////////////////////////////////////
83
84 struct SAlignmentScoringModel
85 {
operator ()SAlignmentScoringModel86 int operator() (const CSeq_align& aln) const
87 {
88 return (int)round(100 * GetScore(aln));
89 }
90
91 ///////////////////////////////////////////////////////////////////////
92
93 private:
94
95 // pair of sum of positives and sum of negatives for
96 // some score-category (e.g matches vs. mismatches)
97 struct odds
98 {
99 double pos, neg; // sum of absolute-values
100
oddsSAlignmentScoringModel::odds101 odds(double pos_ = 0.0, double neg_ = 0.0) : pos(pos_), neg(neg_) {}
102
operator +=SAlignmentScoringModel::odds103 void operator += (double x) { (x > 0 ? pos : neg) += fabs(x); }
operator +=SAlignmentScoringModel::odds104 void operator += (const odds o) { pos += o.pos; neg += o.neg; }
105
logoddsSAlignmentScoringModel::odds106 double logodds(int a = 2) const
107 {
108 if(pos < 0 || neg < 0) {
109 NCBI_USER_THROW("Invalid state of odds: " + this->AsString());
110 }
111 return log( (a+pos)/(a+neg) );
112 }
113
AsStringSAlignmentScoringModel::odds114 string AsString() const
115 {
116 return "(+" + std::to_string(pos)
117 + ", -" + std::to_string(neg) + ")";
118 }
119 };
120
as_logoddsSAlignmentScoringModel121 static double as_logodds(double ratio)
122 {
123 _ASSERT(0 < ratio && ratio < 1);
124 return log( ratio / (1 - ratio) );
125 }
126
127 ///////////////////////////////////////////////////////////////////////
128
clampSAlignmentScoringModel129 static double clamp(double x, double lo, double hi)
130 {
131 return x < lo ? lo
132 : x > hi ? hi
133 : x;
134 }
135
136
GetScoreSAlignmentScoringModel137 double GetScore(const CSeq_align& aln) const
138 {
139 const bool is_prot = isProtSS(aln);
140
141 const auto gapped_len = aln.GetAlignLength(true);
142 const auto ungapped_len = aln.GetAlignLength(false);
143 const auto num_gaps = gapped_len - ungapped_len;
144 const auto num_gapopens = aln.GetNumGapOpenings();
145 const auto num_frameshifts = !is_prot ? 0 : aln.GetNumFrameshifts();
146 const auto polya_len = GetPolyA_length(aln);
147
148 const auto num_gapopens_between_exons =
149 GetNumGapopensBetweenExons(aln);
150
151 const auto weighted_gaps =
152 num_gaps
153 + 2 * num_gapopens
154 + 4 * num_gapopens_between_exons
155 + 4 * num_frameshifts;
156
157 const odds gaps_odds(ungapped_len, weighted_gaps);
158
159 auto coverage = GetCoverage(aln, ungapped_len);
160 auto splices_odds = GetSplicesOdds(aln);
161 auto exons_odds = GetExonsOdds(aln);
162 auto matches_odds = GetIdentOdds(aln);
163 matches_odds += polya_len;
164
165 if(polya_len > 5) {
166 // If we have alignments in both orientations,
167 // we may have a polyA in one orinetation and
168 // a corresponding false-positive exon in opposite
169 // orientation. We want the former, so will weigh
170 // existence of polyA_len as slightly more than
171 // a consensus splice.
172 splices_odds.pos += 1.1;
173 }
174
175 const bool defect_free = matches_odds.neg == 0
176 && gaps_odds.neg == 0
177 && splices_odds.neg == 0;
178
179 // Given two alignments with equivalent identity, we prefer the
180 // one where defects (mismatches and indels) are clustered.
181 //
182 // To ascertain that, we'll define identity-like score based
183 // on a longest defect-free matchrun (or diag-run in case of proteins),
184 // L/(L+1) with corresponding odds L/1.
185 //
186 // We clamp L from below at 1, such that
187 // if it wasn't computed (not a spliced-seg),
188 // the odds will be 1/1 (contributing logodds = 0)
189 //
190 // We clamp L from above to a fraction of an alignment length because
191 // we don't want this to have any effect for high-identity alignments.
192 // (e.g. if we have a single-mismatch alignment, then
193 // L will vary between [aln_len/2, aln_len), but we don't want
194 // the score to be affected.
195 const size_t longest_matchrun = GetLongestMatchrunLen(aln);
196 const odds matchrun_odds(
197 clamp(longest_matchrun, 1, ungapped_len/4.0),
198 1);
199
200
201 auto ret =
202 0.5 * matches_odds.logodds() // taking average of these two,
203 + 0.5 * matchrun_odds.logodds() // since they are highly correlated
204 + gaps_odds.logodds()
205 + splices_odds.logodds()
206 + exons_odds.logodds()
207 + defect_free * 0.1
208 + as_logodds( coverage == 0 ? 0.5 // logodds==0
209 : clamp(coverage, 0.05, 0.95))
210 ;
211
212 #if 0
213 cerr << "ret:" << ret
214 << " ma" << matches_odds.AsString()
215 << " mr" << matchrun_odds.AsString()
216 << " g" << gaps_odds.AsString()
217 << " s" << splices_odds.AsString()
218 << " e" << exons_odds.AsString()
219 << " " << (defect_free ? "perfect" : "not-perfect")
220 << (int)round(100 * ret)
221 << "\n";
222 #endif
223
224 return ret;
225 }
226
227 ///////////////////////////////////////////////////////////////////////
228
GetSpliceConsensusScoreSAlignmentScoringModel229 double GetSpliceConsensusScore(
230 const string& donor, // dinucleotides
231 const string& acptr) const
232 {
233 const bool have_donor = !(donor == "" || donor == "NN");
234 const bool have_acptr = !(acptr == "" || acptr == "NN");
235
236 // note: these values are approximate log-odds
237 // and are based on distribution of splice-sites
238 // across many taxa.
239 //
240 // GT-AG occurs in about 97.5% of the time, so
241 // it is the only one with the positive log-odds.
242
243 const double x =
244 (have_donor && have_acptr) ?
245 ( donor == "GT" && acptr == "AG" ? 3.75
246 : donor == "AC" && acptr == "AG" ? -4.5
247 : donor == "AT" && acptr == "AC" ? -6.5
248 : donor == "GT" || acptr == "AG" ? -7.0
249 : -10.0)
250
251 : ( !have_donor && !have_acptr ? 0.0
252 : !have_donor ? (acptr == "AG" ? 2.0 : -7.0)
253 : !have_acptr ? (donor == "GT" ? 2.0 : -7.0)
254 : 0.0);
255
256 return x / 3.75;
257 };
258
259 ///////////////////////////////////////////////////////////////////////
260 // return 0 if can't compute
GetLongestMatchrunLenSAlignmentScoringModel261 static size_t GetLongestMatchrunLen(const CSeq_align& aln)
262 {
263 if(!aln.GetSegs().IsSpliced()) {
264 return 0;
265 }
266
267 size_t max_len = 0;
268 size_t curr_len = 0;
269 for(const auto& exon : aln.GetSegs().GetSpliced().GetExons()) {
270 for(const auto& chunk : exon->GetParts()) {
271
272 if(chunk->IsMatch()) {
273 curr_len += chunk->GetMatch();
274 } else if(chunk->IsDiag()) {
275 curr_len += chunk->GetDiag();
276 } else {
277 max_len = max(max_len, curr_len);
278 curr_len = 0;
279 }
280 }
281 }
282
283 return max(curr_len, max_len);
284 }
285
286 ///////////////////////////////////////////////////////////////////////
287
GetGapLengthOnRowSAlignmentScoringModel288 static Int8 GetGapLengthOnRow(
289 const CSpliced_exon& exon1,
290 const CSpliced_exon& exon2,
291 CSeq_align::TDim row)
292 {
293 const auto r1 = exon1.GetRowSeq_range(row, true);
294 const auto r2 = exon2.GetRowSeq_range(row, true);
295
296 return r1.CombinationWith(r2).GetLength()
297 - r1.GetLength()
298 - r2.GetLength();
299 }
300
301 ///////////////////////////////////////////////////////////////////////
302
303 #pragma push_macro("GET_SCORE")
304 #define GET_SCORE(score_name) \
305 double score_name = 0.0; \
306 bool have_##score_name = aln.GetNamedScore(#score_name, score_name); \
307 (void)score_name; (void)have_##score_name
308 // (void)variable is to quiet unused-variable warnings
309
310 ///////////////////////////////////////////////////////////////////////
311
isProtSSSAlignmentScoringModel312 static bool isProtSS(const CSeq_align& aln)
313 {
314 return aln.GetSegs().IsSpliced()
315 && aln.GetSegs().GetSpliced().GetProduct_type() ==
316 CSpliced_seg::eProduct_type_protein;
317 }
318
319 ///////////////////////////////////////////////////////////////////////
320
s_GetIdentOdds_protSSSAlignmentScoringModel321 static odds s_GetIdentOdds_protSS(const CSeq_align& aln)
322 {
323 _ASSERT(isProtSS(aln));
324
325 GET_SCORE(num_ident);
326 GET_SCORE(num_mismatch);
327
328 GET_SCORE(num_positives);
329 GET_SCORE(num_negatives);
330
331 // Note: for ProSplign spliced-segs these are in terms of NAs, not AAs,
332 // JIRA:GP-27664
333
334 const size_t ungapped_len = aln.GetAlignLength(false); // in NAs, not AAs.
335
336 if(!have_num_ident && have_num_mismatch) {
337 have_num_ident = true;
338 num_ident = ungapped_len - num_mismatch;
339 }
340
341 if(!have_num_positives && have_num_negatives) {
342 have_num_positives = true;
343 num_positives = ungapped_len - num_negatives;
344 }
345
346 // GP-9599 - disregarding num_positives if have num_ident.
347 // (used to take weighted sum thereof when both are present).
348
349 double identity = have_num_ident ? num_ident / ungapped_len
350 : have_num_positives ? num_positives / ungapped_len
351 : 0.5; // log-odds=0.
352
353 if(identity < 0 || identity > 1.0) {
354 std::cerr << MSerial_AsnText << aln;
355 NCBI_USER_THROW("Identity is outside of [0..1] range - problem with alignment scores.");
356 }
357
358 return odds(ungapped_len * identity,
359 ungapped_len * (1 - identity));
360 }
361
362 ///////////////////////////////////////////////////////////////////////
363
s_GetIdentOdds_nucSSSAlignmentScoringModel364 static odds s_GetIdentOdds_nucSS(const CSeq_align& aln)
365 {
366 _ASSERT(aln.GetSegs().IsSpliced() && !isProtSS(aln));
367
368 odds res;
369 for(const auto& exon : aln.GetSegs().GetSpliced().GetExons()) {
370 for(const auto& chunk : exon->GetParts()) {
371 if(chunk->IsMatch()) {
372 res.pos += chunk->GetMatch();
373 } else if(chunk->IsMismatch()) {
374 res.neg += chunk->GetMismatch();
375 }
376 }
377 }
378 return res;
379 }
380
381 ///////////////////////////////////////////////////////////////////////
382
s_GetIdentOdds_discSAlignmentScoringModel383 static odds s_GetIdentOdds_disc(const CSeq_align& aln)
384 {
385 odds res;
386
387 for(const auto subaln : aln.GetSegs().GetDisc().Get()) {
388 res += GetIdentOdds(*subaln);
389 }
390 return res;
391 }
392
393 ///////////////////////////////////////////////////////////////////////
394
GetIdentOddsSAlignmentScoringModel395 static odds GetIdentOdds(const CSeq_align& aln)
396 {
397 if(isProtSS(aln)) {
398 return s_GetIdentOdds_protSS(aln);
399 }
400
401 GET_SCORE(num_ident);
402 GET_SCORE(num_mismatch);
403
404 // Note: if alignment is from tblastn,
405 // num_ident/num_mismatch/num_positives/num_negatives are in AAs,
406 // and CSeq_align::GetAlignLength is in AAs.
407 //
408 // if the alignment is from ProSplign, those are in NAs.
409
410 if(have_num_ident && have_num_mismatch) {
411 return odds(num_ident,
412 num_mismatch);
413 }
414
415 const auto ungapped_len = aln.GetAlignLength(false);
416
417 if(have_num_ident) {
418 return odds(num_ident,
419 ungapped_len - num_ident);
420 }
421
422 if(have_num_mismatch) {
423 return odds(ungapped_len - num_mismatch,
424 num_mismatch);
425 }
426
427 GET_SCORE(pct_identity_ungap);
428
429 if(have_pct_identity_ungap) {
430 num_ident = pct_identity_ungap * 0.01 * ungapped_len;
431 return odds(num_ident,
432 ungapped_len - num_ident);
433 }
434
435 GET_SCORE(pct_identity_gap);
436
437 if(have_pct_identity_gap) {
438 const auto gapped_len = aln.GetAlignLength(true);
439 num_ident = pct_identity_gap * 0.01 * gapped_len;
440 return odds(num_ident,
441 ungapped_len - num_ident);
442 }
443
444 if(aln.GetSegs().IsSpliced() && !isProtSS(aln)) {
445 return s_GetIdentOdds_nucSS(aln);
446 } else if(aln.GetSegs().IsDisc()) {
447 return s_GetIdentOdds_disc(aln);
448 }
449
450 cerr << MSerial_AsnText << aln;
451 NCBI_USER_THROW("Can't get ident/mismatch count for aln");
452 }
453
454 ///////////////////////////////////////////////////////////////////////
455
GetSplicesOddsSAlignmentScoringModel456 odds GetSplicesOdds(const CSeq_align& aln) const
457 {
458 odds res;
459 if(!aln.GetSegs().IsSpliced()) {
460 return res;
461 }
462
463 const CSpliced_seg& ss = aln.GetSegs().GetSpliced();
464
465 CConstRef<CSpliced_exon> prev;
466 for(const auto& exon : ss.GetExons()) {
467 if(!prev) {
468 prev = exon;
469 continue;
470 }
471
472 // not checking for abutting, since
473 // the scoring function can score a
474 // single splice
475
476 res += GetSpliceConsensusScore(
477 prev->IsSetDonor_after_exon()
478 ? prev->GetDonor_after_exon().GetBases() : "",
479
480 exon->IsSetAcceptor_before_exon()
481 ? exon->GetAcceptor_before_exon().GetBases() : "");
482
483 prev = exon;
484 }
485
486 return res;
487 }
488
489 ///////////////////////////////////////////////////////////////////////
490 /// pos + neg = count of exons.
491 /// neg = count of "bad" exons
492 /// (partial or short or downstream of a long intron)
GetExonsOddsSAlignmentScoringModel493 odds GetExonsOdds(const CSeq_align& aln) const
494 {
495 odds res;
496 if(!aln.GetSegs().IsSpliced()) {
497 return res;
498 }
499
500 const CSpliced_seg& ss = aln.GetSegs().GetSpliced();
501
502 CConstRef<CSpliced_exon> prev;
503 for(const auto& exon : ss.GetExons()) {
504
505 const bool is_terminal = exon == ss.GetExons().front()
506 || exon == ss.GetExons().back();
507
508 const size_t len = exon->GetRowSeq_range(1, true).GetLength();
509
510 const bool is_poor =
511 (exon->IsSetPartial() && exon->GetPartial())
512 || (len <= (is_terminal ? 15 : 5))
513 || (prev && GetGapLengthOnRow(*prev, *exon, 1) > 65000);
514
515 res += is_poor ? -1 : 1;
516
517 prev = exon;
518 }
519
520 return res;
521 }
522
523 ///////////////////////////////////////////////////////////////////////
524
GetNumGapopensBetweenExonsSAlignmentScoringModel525 static size_t GetNumGapopensBetweenExons(const CSeq_align& aln)
526 {
527 if(!aln.GetSegs().IsSpliced()) {
528 return 0;
529 }
530
531 CConstRef<CSpliced_exon> prev;
532 size_t n = 0;
533 for(const auto& exon : aln.GetSegs().GetSpliced().GetExons()) {
534 n += prev && GetGapLengthOnRow(*prev, *exon, 0) != 0;
535 prev = exon;
536 }
537 return n;
538 }
539
540 ///////////////////////////////////////////////////////////////////////
541
GetPolyA_lengthSAlignmentScoringModel542 static size_t GetPolyA_length(const CSeq_align& aln)
543 {
544 if(!aln.GetSegs().IsSpliced()) {
545 return 0;
546 }
547
548 const CSpliced_seg& ss = aln.GetSegs().GetSpliced();
549
550 return !ss.IsSetPoly_a() ? 0
551 : !ss.IsSetProduct_length() ? 0
552
553 : aln.GetSeqStrand(0) == eNa_strand_minus
554 ? ss.GetPoly_a()
555
556 : aln.GetSeqStrand(0) == eNa_strand_plus
557 ? ss.GetProduct_length() - ss.GetPoly_a()
558
559 : 0;
560 }
561
562 ///////////////////////////////////////////////////////////////////////
563
564 // return 0 if can't compute
GetCoverageSAlignmentScoringModel565 static double GetCoverage(const CSeq_align& aln, size_t ung_aln_len)
566 {
567 GET_SCORE(pct_coverage_hiqual);
568 if(have_pct_coverage_hiqual) {
569 return pct_coverage_hiqual / 100;
570 }
571
572 GET_SCORE(pct_coverage);
573 if(have_pct_coverage) {
574 return pct_coverage / 100;
575 }
576
577 const bool is_prot = isProtSS(aln);
578
579 if( aln.GetSegs().IsSpliced()
580 && aln.GetSegs().GetSpliced().IsSetProduct_length())
581 {
582 auto len = aln.GetSegs().GetSpliced().GetProduct_length()
583 * (is_prot ? 3 : 1);
584
585 return ung_aln_len * 1.0 / len;
586 }
587
588 return 0.0;
589 }
590
591 ///////////////////////////////////////////////////////////////////////
592 //
593
594
595 #pragma pop_macro("GET_SCORE")
596 };
597
598 /////////////////////////////////////////////////////////////////////////////
599
GetScore(const CSeq_align & aln)600 int NBestPlacement::GetScore(const CSeq_align& aln)
601 {
602 static SAlignmentScoringModel get_score;
603 return get_score(aln);
604 }
605
606 /////////////////////////////////////////////////////////////////////////////
607
Rank(CSeq_align_set & sas,score_fn_t score_fn)608 void NBestPlacement::Rank(
609 CSeq_align_set& sas,
610 score_fn_t score_fn)
611 {
612 if(sas.Get().empty()) {
613 return;
614 }
615
616 using pair_t = pair<int, CRef<CSeq_align> >;
617 vector<pair_t> v;
618
619 for(const auto aln : sas.Set()) {
620
621 _ASSERT( aln->GetSeq_id(0).Equals(
622 sas.Get().front()->GetSeq_id(0)));
623
624 v.emplace_back(score_fn(*aln), aln);
625 }
626
627 stable_sort(
628 v.begin(), v.end(),
629 [](const pair_t a, const pair_t b) {
630 return b.first < a.first; // by score, descending
631 });
632
633 const auto rank1_count = std::count_if(
634 v.begin(), v.end(),
635 [&v](pair_t p) { return p.first == v.front().first; });
636
637 sas.Set().clear();
638
639 int rank = 1;
640 int rank1_index = 1;
641 for(const auto p : v) {
642 auto aln = p.second;
643
644 if(rank1_count > 1 && p.first == v.front().first) {
645 aln->SetNamedScore("rank", 1);
646 aln->SetNamedScore("rank1_index", rank1_index++);
647 aln->SetNamedScore("rank1_count", (int)rank1_count);
648 } else {
649 aln->SetNamedScore("rank", rank);
650 }
651
652 rank++;
653
654 aln->SetNamedScore("best_placement_score", p.first);
655
656 sas.Set().emplace_back(aln);
657 }
658 }
659