1 /*  $Id: best_placement.cpp 594857 2019-10-10 15:16:17Z astashya $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  *  Author : Alex Astashyn
27  */
28 
29 #include <ncbi_pch.hpp>
30 #include <algo/align/util/best_placement.hpp>
31 //#include "best_placement.hpp"
32 
33 #include <objects/seqalign/Seq_align.hpp>
34 #include <objects/seqalign/Seq_align_set.hpp>
35 #include <objects/seqalign/Dense_seg.hpp>
36 #include <objects/seqalign/Spliced_seg.hpp>
37 #include <objects/seqalign/Spliced_exon.hpp>
38 #include <objects/seqalign/Spliced_exon_chunk.hpp>
39 #include <objects/seqalign/Splice_site.hpp>
40 #include <objects/seqalign/Product_pos.hpp>
41 #include <objects/seqloc/Seq_id.hpp>
42 
43 #include <cmath>
44 
45 USING_SCOPE(ncbi);
46 USING_SCOPE(objects);
47 
48 /*
49     For high-identity alignments mismatches carry more signal, i.e.
50     short high-identity partial is better than a long full-coverage
51     placement that is likely a paralog.
52 
53     However, very short 100% identity is worse than high-identity
54     full-coverage alignment.
55 
56     For low-identity alignments (e.g. cross-species) mismatches matter less,
57     and other attributes, e.g. coverage, matter more.
58 
59     For high-identity alignments indels matter more than mismatches,
60     but in gappy PacBio alignments indels don't carry a lot of signal.
61 
62     Spread-out defects matter more than clustered, since the latter is
63     more likely due to low-qual sequencing or alignment artifact
64     rather than true SNPs.
65 
66     -----------------------------------------------------------------------
67 
68     Odds are multiplicative; log-odds are additive.
69 
70     gaps and mismatches are in separate categories such that
71     if an alignment is a gappy crud from pacbio gaps don't
72     overshadow everything else.
73 
74     score = log(matches+polya)   - log(mismatches)
75           + log(ug_aln_len)      - log(gaps + 4*gapopens + 8*frameshifting_gapopens)
76           + log(ug_aln_len)      - log(unaligned_len)
77           + log(cons_spl)        - log(ncons_spl)
78           + log(good_exons)      - log(partial_or_short_or_downstreamof_long_intron)
79           + log(L)               - log(1) // L is longest matchrun capped at ungapped_len/3
80 */
81 
82 ///////////////////////////////////////////////////////////////////////////
83 
84 struct SAlignmentScoringModel
85 {
operator ()SAlignmentScoringModel86     int operator() (const CSeq_align& aln) const
87     {
88         return (int)round(100 * GetScore(aln));
89     }
90 
91     ///////////////////////////////////////////////////////////////////////
92 
93 private:
94 
95     // pair of sum of positives and sum of negatives for
96     // some score-category (e.g matches vs. mismatches)
97     struct odds
98     {
99         double pos, neg; // sum of absolute-values
100 
oddsSAlignmentScoringModel::odds101         odds(double pos_ = 0.0, double neg_ = 0.0) : pos(pos_), neg(neg_) {}
102 
operator +=SAlignmentScoringModel::odds103         void operator += (double x)     { (x > 0 ? pos : neg) += fabs(x);  }
operator +=SAlignmentScoringModel::odds104         void operator += (const odds o) { pos += o.pos;  neg  += o.neg;    }
105 
logoddsSAlignmentScoringModel::odds106         double logodds(int a = 2) const
107         {
108             if(pos < 0 || neg < 0) {
109                 NCBI_USER_THROW("Invalid state of odds: " + this->AsString());
110             }
111             return log( (a+pos)/(a+neg) );
112         }
113 
AsStringSAlignmentScoringModel::odds114         string AsString() const
115         {
116             return "(+" + std::to_string(pos)
117                 + ", -" + std::to_string(neg) + ")";
118         }
119     };
120 
as_logoddsSAlignmentScoringModel121     static double as_logodds(double ratio)
122     {
123         _ASSERT(0 < ratio && ratio < 1);
124         return log( ratio / (1 - ratio) );
125     }
126 
127     ///////////////////////////////////////////////////////////////////////
128 
clampSAlignmentScoringModel129     static double clamp(double x, double lo, double hi)
130     {
131         return x < lo ? lo
132              : x > hi ? hi
133              :          x;
134     }
135 
136 
GetScoreSAlignmentScoringModel137     double GetScore(const CSeq_align& aln) const
138     {
139         const bool is_prot = isProtSS(aln);
140 
141         const auto      gapped_len = aln.GetAlignLength(true);
142         const auto    ungapped_len = aln.GetAlignLength(false);
143         const auto        num_gaps = gapped_len - ungapped_len;
144         const auto    num_gapopens = aln.GetNumGapOpenings();
145         const auto num_frameshifts = !is_prot ? 0 : aln.GetNumFrameshifts();
146         const auto       polya_len = GetPolyA_length(aln);
147 
148         const auto num_gapopens_between_exons =
149             GetNumGapopensBetweenExons(aln);
150 
151         const auto weighted_gaps =
152                    num_gaps
153              + 2 * num_gapopens
154              + 4 * num_gapopens_between_exons
155              + 4 * num_frameshifts;
156 
157         const odds gaps_odds(ungapped_len, weighted_gaps);
158 
159         auto     coverage = GetCoverage(aln, ungapped_len);
160         auto splices_odds = GetSplicesOdds(aln);
161         auto   exons_odds = GetExonsOdds(aln);
162         auto matches_odds = GetIdentOdds(aln);
163              matches_odds += polya_len;
164 
165         if(polya_len > 5) {
166             // If we have alignments in both orientations,
167             // we may have a polyA in one orinetation and
168             // a corresponding false-positive exon in opposite
169             // orientation. We want the former, so will weigh
170             // existence of polyA_len as slightly more than
171             // a consensus splice.
172             splices_odds.pos += 1.1;
173         }
174 
175         const bool defect_free = matches_odds.neg == 0
176                               &&    gaps_odds.neg == 0
177                               && splices_odds.neg == 0;
178 
179         // Given two alignments with equivalent identity, we prefer the
180         // one where defects (mismatches and indels) are clustered.
181         //
182         // To ascertain that, we'll define identity-like score based
183         // on a longest defect-free matchrun (or diag-run in case of proteins),
184         // L/(L+1) with corresponding odds L/1.
185         //
186         // We clamp L from below at 1, such that
187         // if it wasn't computed (not a spliced-seg),
188         // the odds will be 1/1 (contributing logodds = 0)
189         //
190         // We clamp L from above to a fraction of an alignment length because
191         // we don't want this to have any effect for high-identity alignments.
192         // (e.g. if we have a single-mismatch alignment, then
193         // L will vary between [aln_len/2, aln_len), but we don't want
194         // the score to be affected.
195         const size_t longest_matchrun = GetLongestMatchrunLen(aln);
196         const odds matchrun_odds(
197                 clamp(longest_matchrun, 1, ungapped_len/4.0),
198                 1);
199 
200 
201         auto ret =
202                0.5 * matches_odds.logodds()  // taking average of these two,
203             +  0.5 * matchrun_odds.logodds() // since they are highly correlated
204             +  gaps_odds.logodds()
205             +  splices_odds.logodds()
206             +    exons_odds.logodds()
207             +  defect_free * 0.1
208             +  as_logodds( coverage == 0 ? 0.5 // logodds==0
209                          : clamp(coverage, 0.05, 0.95))
210             ;
211 
212 #if 0
213         cerr << "ret:" << ret
214              << " ma" << matches_odds.AsString()
215              << " mr" << matchrun_odds.AsString()
216              << " g" << gaps_odds.AsString()
217              << " s" << splices_odds.AsString()
218              << " e" << exons_odds.AsString()
219              << " " << (defect_free ? "perfect" : "not-perfect")
220              << (int)round(100 * ret)
221              << "\n";
222 #endif
223 
224         return ret;
225     }
226 
227     ///////////////////////////////////////////////////////////////////////
228 
GetSpliceConsensusScoreSAlignmentScoringModel229     double GetSpliceConsensusScore(
230             const string& donor, // dinucleotides
231             const string& acptr)  const
232     {
233         const bool have_donor = !(donor == "" || donor == "NN");
234         const bool have_acptr = !(acptr == "" || acptr == "NN");
235 
236         // note: these values are approximate log-odds
237         // and are based on distribution of splice-sites
238         // across many taxa.
239         //
240         // GT-AG occurs in about 97.5% of the time, so
241         // it is the only one with the positive log-odds.
242 
243         const double x =
244             (have_donor && have_acptr) ?
245                  ( donor == "GT" && acptr == "AG" ? 3.75
246                  : donor == "AC" && acptr == "AG" ? -4.5
247                  : donor == "AT" && acptr == "AC" ? -6.5
248                  : donor == "GT" || acptr == "AG" ? -7.0
249                  :                                  -10.0)
250 
251             :    ( !have_donor && !have_acptr     ? 0.0
252                  : !have_donor  ? (acptr == "AG"  ? 2.0 : -7.0)
253                  : !have_acptr  ? (donor == "GT"  ? 2.0 : -7.0)
254                  :                                  0.0);
255 
256         return x / 3.75;
257     };
258 
259     ///////////////////////////////////////////////////////////////////////
260     // return 0 if can't compute
GetLongestMatchrunLenSAlignmentScoringModel261     static size_t GetLongestMatchrunLen(const CSeq_align& aln)
262     {
263         if(!aln.GetSegs().IsSpliced()) {
264             return 0;
265         }
266 
267         size_t max_len = 0;
268         size_t curr_len = 0;
269         for(const auto& exon : aln.GetSegs().GetSpliced().GetExons()) {
270             for(const auto& chunk : exon->GetParts()) {
271 
272                 if(chunk->IsMatch()) {
273                     curr_len += chunk->GetMatch();
274                 } else if(chunk->IsDiag()) {
275                     curr_len += chunk->GetDiag();
276                 } else {
277                     max_len = max(max_len, curr_len);
278                     curr_len = 0;
279                 }
280             }
281         }
282 
283         return max(curr_len, max_len);
284     }
285 
286     ///////////////////////////////////////////////////////////////////////
287 
GetGapLengthOnRowSAlignmentScoringModel288     static Int8 GetGapLengthOnRow(
289             const CSpliced_exon& exon1,
290             const CSpliced_exon& exon2,
291             CSeq_align::TDim row)
292     {
293         const auto r1 = exon1.GetRowSeq_range(row, true);
294         const auto r2 = exon2.GetRowSeq_range(row, true);
295 
296         return r1.CombinationWith(r2).GetLength()
297               - r1.GetLength()
298               - r2.GetLength();
299     }
300 
301     ///////////////////////////////////////////////////////////////////////
302 
303 #pragma push_macro("GET_SCORE")
304 #define GET_SCORE(score_name) \
305         double score_name = 0.0; \
306         bool have_##score_name = aln.GetNamedScore(#score_name, score_name); \
307         (void)score_name; (void)have_##score_name
308         // (void)variable is to quiet unused-variable warnings
309 
310     ///////////////////////////////////////////////////////////////////////
311 
isProtSSSAlignmentScoringModel312     static bool isProtSS(const CSeq_align& aln)
313     {
314         return aln.GetSegs().IsSpliced()
315             && aln.GetSegs().GetSpliced().GetProduct_type() ==
316                CSpliced_seg::eProduct_type_protein;
317     }
318 
319     ///////////////////////////////////////////////////////////////////////
320 
s_GetIdentOdds_protSSSAlignmentScoringModel321     static odds s_GetIdentOdds_protSS(const CSeq_align& aln)
322     {
323         _ASSERT(isProtSS(aln));
324 
325         GET_SCORE(num_ident);
326         GET_SCORE(num_mismatch);
327 
328         GET_SCORE(num_positives);
329         GET_SCORE(num_negatives);
330 
331         // Note: for ProSplign spliced-segs these are in terms of NAs, not AAs,
332         // JIRA:GP-27664
333 
334         const size_t ungapped_len = aln.GetAlignLength(false); // in NAs, not AAs.
335 
336         if(!have_num_ident && have_num_mismatch) {
337             have_num_ident = true;
338             num_ident = ungapped_len - num_mismatch;
339         }
340 
341         if(!have_num_positives && have_num_negatives) {
342             have_num_positives = true;
343             num_positives = ungapped_len - num_negatives;
344         }
345 
346         // GP-9599 - disregarding num_positives if have num_ident.
347         // (used to take weighted sum thereof when both are present).
348 
349         double identity = have_num_ident     ?     num_ident / ungapped_len
350                         : have_num_positives ? num_positives / ungapped_len
351                         : 0.5; // log-odds=0.
352 
353         if(identity < 0 || identity > 1.0) {
354             std::cerr << MSerial_AsnText << aln;
355             NCBI_USER_THROW("Identity is outside of [0..1] range - problem with alignment scores.");
356         }
357 
358         return odds(ungapped_len * identity,
359                     ungapped_len * (1 - identity));
360     }
361 
362     ///////////////////////////////////////////////////////////////////////
363 
s_GetIdentOdds_nucSSSAlignmentScoringModel364     static odds s_GetIdentOdds_nucSS(const CSeq_align& aln)
365     {
366         _ASSERT(aln.GetSegs().IsSpliced() && !isProtSS(aln));
367 
368         odds res;
369         for(const auto& exon : aln.GetSegs().GetSpliced().GetExons()) {
370             for(const auto& chunk : exon->GetParts()) {
371                 if(chunk->IsMatch()) {
372                     res.pos += chunk->GetMatch();
373                 } else if(chunk->IsMismatch()) {
374                     res.neg += chunk->GetMismatch();
375                 }
376             }
377         }
378         return res;
379     }
380 
381     ///////////////////////////////////////////////////////////////////////
382 
s_GetIdentOdds_discSAlignmentScoringModel383     static odds s_GetIdentOdds_disc(const CSeq_align& aln)
384     {
385         odds res;
386 
387         for(const auto subaln : aln.GetSegs().GetDisc().Get()) {
388             res += GetIdentOdds(*subaln);
389         }
390         return res;
391     }
392 
393     ///////////////////////////////////////////////////////////////////////
394 
GetIdentOddsSAlignmentScoringModel395     static odds GetIdentOdds(const CSeq_align& aln)
396     {
397         if(isProtSS(aln)) {
398             return s_GetIdentOdds_protSS(aln);
399         }
400 
401         GET_SCORE(num_ident);
402         GET_SCORE(num_mismatch);
403 
404         // Note: if alignment is from tblastn,
405         // num_ident/num_mismatch/num_positives/num_negatives are in AAs,
406         // and CSeq_align::GetAlignLength is in AAs.
407         //
408         // if the alignment is from ProSplign, those are in NAs.
409 
410         if(have_num_ident && have_num_mismatch) {
411             return odds(num_ident,
412                         num_mismatch);
413         }
414 
415         const auto ungapped_len = aln.GetAlignLength(false);
416 
417         if(have_num_ident) {
418             return odds(num_ident,
419                         ungapped_len - num_ident);
420         }
421 
422         if(have_num_mismatch) {
423             return odds(ungapped_len - num_mismatch,
424                         num_mismatch);
425         }
426 
427         GET_SCORE(pct_identity_ungap);
428 
429         if(have_pct_identity_ungap) {
430             num_ident = pct_identity_ungap * 0.01 * ungapped_len;
431             return odds(num_ident,
432                         ungapped_len - num_ident);
433         }
434 
435         GET_SCORE(pct_identity_gap);
436 
437         if(have_pct_identity_gap) {
438             const auto gapped_len = aln.GetAlignLength(true);
439             num_ident = pct_identity_gap * 0.01 * gapped_len;
440             return odds(num_ident,
441                         ungapped_len - num_ident);
442         }
443 
444         if(aln.GetSegs().IsSpliced() && !isProtSS(aln)) {
445             return s_GetIdentOdds_nucSS(aln);
446         } else if(aln.GetSegs().IsDisc()) {
447             return s_GetIdentOdds_disc(aln);
448         }
449 
450         cerr << MSerial_AsnText << aln;
451         NCBI_USER_THROW("Can't get ident/mismatch count for aln");
452     }
453 
454     ///////////////////////////////////////////////////////////////////////
455 
GetSplicesOddsSAlignmentScoringModel456     odds GetSplicesOdds(const CSeq_align& aln) const
457     {
458         odds res;
459         if(!aln.GetSegs().IsSpliced()) {
460             return res;
461         }
462 
463         const CSpliced_seg& ss = aln.GetSegs().GetSpliced();
464 
465         CConstRef<CSpliced_exon> prev;
466         for(const auto& exon : ss.GetExons()) {
467             if(!prev) {
468                 prev = exon;
469                 continue;
470             }
471 
472             // not checking for abutting, since
473             // the scoring function can score a
474             // single splice
475 
476             res += GetSpliceConsensusScore(
477                 prev->IsSetDonor_after_exon()
478               ? prev->GetDonor_after_exon().GetBases() : "",
479 
480                 exon->IsSetAcceptor_before_exon()
481               ? exon->GetAcceptor_before_exon().GetBases() : "");
482 
483             prev = exon;
484         }
485 
486         return res;
487     }
488 
489     ///////////////////////////////////////////////////////////////////////
490     /// pos + neg = count of exons.
491     /// neg = count of "bad" exons
492     ///          (partial or short or downstream of a long intron)
GetExonsOddsSAlignmentScoringModel493     odds GetExonsOdds(const CSeq_align& aln) const
494     {
495         odds res;
496         if(!aln.GetSegs().IsSpliced()) {
497             return res;
498         }
499 
500         const CSpliced_seg& ss = aln.GetSegs().GetSpliced();
501 
502         CConstRef<CSpliced_exon> prev;
503         for(const auto& exon : ss.GetExons()) {
504 
505             const bool is_terminal = exon == ss.GetExons().front()
506                                   || exon == ss.GetExons().back();
507 
508             const size_t len = exon->GetRowSeq_range(1, true).GetLength();
509 
510             const bool is_poor =
511                 (exon->IsSetPartial() && exon->GetPartial())
512              || (len <= (is_terminal ? 15 : 5))
513              || (prev && GetGapLengthOnRow(*prev, *exon, 1) > 65000);
514 
515             res += is_poor ? -1 : 1;
516 
517             prev = exon;
518         }
519 
520         return res;
521     }
522 
523     ///////////////////////////////////////////////////////////////////////
524 
GetNumGapopensBetweenExonsSAlignmentScoringModel525     static size_t GetNumGapopensBetweenExons(const CSeq_align& aln)
526     {
527         if(!aln.GetSegs().IsSpliced()) {
528             return 0;
529         }
530 
531         CConstRef<CSpliced_exon> prev;
532         size_t n = 0;
533         for(const auto& exon : aln.GetSegs().GetSpliced().GetExons()) {
534             n += prev && GetGapLengthOnRow(*prev, *exon, 0) != 0;
535             prev = exon;
536         }
537         return n;
538     }
539 
540     ///////////////////////////////////////////////////////////////////////
541 
GetPolyA_lengthSAlignmentScoringModel542     static size_t GetPolyA_length(const CSeq_align& aln)
543     {
544         if(!aln.GetSegs().IsSpliced()) {
545             return 0;
546         }
547 
548         const CSpliced_seg& ss = aln.GetSegs().GetSpliced();
549 
550         return !ss.IsSetPoly_a()         ? 0
551              : !ss.IsSetProduct_length() ? 0
552 
553              : aln.GetSeqStrand(0) == eNa_strand_minus
554                     ? ss.GetPoly_a()
555 
556              : aln.GetSeqStrand(0) == eNa_strand_plus
557                     ? ss.GetProduct_length() - ss.GetPoly_a()
558 
559              : 0;
560     }
561 
562     ///////////////////////////////////////////////////////////////////////
563 
564     // return 0 if can't compute
GetCoverageSAlignmentScoringModel565     static double GetCoverage(const CSeq_align& aln, size_t ung_aln_len)
566     {
567         GET_SCORE(pct_coverage_hiqual);
568         if(have_pct_coverage_hiqual) {
569             return pct_coverage_hiqual / 100;
570         }
571 
572         GET_SCORE(pct_coverage);
573         if(have_pct_coverage) {
574             return pct_coverage / 100;
575         }
576 
577         const bool is_prot = isProtSS(aln);
578 
579         if(   aln.GetSegs().IsSpliced()
580            && aln.GetSegs().GetSpliced().IsSetProduct_length())
581         {
582             auto len = aln.GetSegs().GetSpliced().GetProduct_length()
583                      * (is_prot ? 3 : 1);
584 
585             return ung_aln_len * 1.0 / len;
586         }
587 
588         return 0.0;
589     }
590 
591     ///////////////////////////////////////////////////////////////////////
592     //
593 
594 
595 #pragma pop_macro("GET_SCORE")
596 };
597 
598 /////////////////////////////////////////////////////////////////////////////
599 
GetScore(const CSeq_align & aln)600 int NBestPlacement::GetScore(const CSeq_align& aln)
601 {
602     static SAlignmentScoringModel get_score;
603     return get_score(aln);
604 }
605 
606 /////////////////////////////////////////////////////////////////////////////
607 
Rank(CSeq_align_set & sas,score_fn_t score_fn)608 void NBestPlacement::Rank(
609            CSeq_align_set& sas,
610                 score_fn_t score_fn)
611 {
612     if(sas.Get().empty()) {
613         return;
614     }
615 
616     using pair_t = pair<int, CRef<CSeq_align> >;
617     vector<pair_t> v;
618 
619     for(const auto aln : sas.Set()) {
620 
621         _ASSERT(          aln->GetSeq_id(0).Equals(
622             sas.Get().front()->GetSeq_id(0)));
623 
624         v.emplace_back(score_fn(*aln), aln);
625     }
626 
627     stable_sort(
628            v.begin(), v.end(),
629            [](const pair_t a, const pair_t b) {
630                 return b.first < a.first; // by score, descending
631            });
632 
633     const auto rank1_count = std::count_if(
634             v.begin(), v.end(),
635             [&v](pair_t p) { return p.first == v.front().first; });
636 
637     sas.Set().clear();
638 
639     int rank = 1;
640     int rank1_index = 1;
641     for(const auto p : v) {
642         auto aln = p.second;
643 
644         if(rank1_count > 1 && p.first == v.front().first) {
645             aln->SetNamedScore("rank", 1);
646             aln->SetNamedScore("rank1_index", rank1_index++);
647             aln->SetNamedScore("rank1_count", (int)rank1_count);
648         } else {
649             aln->SetNamedScore("rank", rank);
650         }
651 
652         rank++;
653 
654         aln->SetNamedScore("best_placement_score", p.first);
655 
656         sas.Set().emplace_back(aln);
657     }
658 }
659