1 /*  $Id: alnmatch.cpp 355293 2012-03-05 15:17:16Z vasilche $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Kamen Todorov, NCBI
27 *
28 * File Description:
29 *   Alignment matches
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <objtools/alnmgr/alnmatch.hpp>
36 #include <objtools/alnmgr/alnexception.hpp>
37 #include <objtools/alnmgr/alnmap.hpp>
38 
39 #include <objects/seq/Bioseq.hpp>
40 #include <objects/seqloc/Seq_id.hpp>
41 
42 #include <algorithm>
43 
44 
45 BEGIN_NCBI_SCOPE
46 BEGIN_objects_SCOPE // namespace ncbi::objects::
47 
48 
CAlnMixMatches(CRef<CAlnMixSequences> & sequences,TCalcScoreMethod calc_score)49 CAlnMixMatches::CAlnMixMatches(CRef<CAlnMixSequences>& sequences,
50                                TCalcScoreMethod calc_score)
51     : m_DsCnt(0),
52       m_AlnMixSequences(sequences),
53       m_Seqs(m_AlnMixSequences->m_Seqs),
54       x_CalculateScore(calc_score),
55       m_ContainsAA(m_AlnMixSequences->m_ContainsAA),
56       m_ContainsNA(m_AlnMixSequences->m_ContainsNA)
57 {
58 }
59 
60 
61 inline
62 bool
x_CompareScores(const CRef<CAlnMixMatch> & match1,const CRef<CAlnMixMatch> & match2)63 CAlnMixMatches::x_CompareScores(const CRef<CAlnMixMatch>& match1,
64                                 const CRef<CAlnMixMatch>& match2)
65 {
66     return match1->m_Score > match2->m_Score;
67 }
68 
69 
70 inline
71 bool
x_CompareChainScores(const CRef<CAlnMixMatch> & match1,const CRef<CAlnMixMatch> & match2)72 CAlnMixMatches::x_CompareChainScores(const CRef<CAlnMixMatch>& match1,
73                                      const CRef<CAlnMixMatch>& match2)
74 {
75     return
76         match1->m_ChainScore == match2->m_ChainScore  &&
77         match1->m_Score > match2->m_Score  ||
78         match1->m_ChainScore > match2->m_ChainScore;
79 }
80 
81 
82 void
SortByScore()83 CAlnMixMatches::SortByScore()
84 {
85     stable_sort(m_Matches.begin(), m_Matches.end(), x_CompareScores);
86 }
87 
88 
89 void
SortByChainScore()90 CAlnMixMatches::SortByChainScore()
91 {
92     stable_sort(m_Matches.begin(), m_Matches.end(), x_CompareChainScores);
93 }
94 
95 
96 void
Add(const CDense_seg & ds,TAddFlags flags)97 CAlnMixMatches::Add(const CDense_seg& ds, TAddFlags flags)
98 {
99     m_DsCnt++;
100 
101     m_AddFlags = flags;
102 
103     int              seg_off = 0;
104     TSignedSeqPos    start1, start2;
105     TSeqPos          len;
106     bool             single_chunk;
107     CAlnMap::TNumrow first_non_gapped_row_found;
108     bool             strands_exist =
109         ds.GetStrands().size() == (size_t)ds.GetNumseg() * ds.GetDim();
110     int              total_aln_score = 0;
111 
112     vector<CRef<CAlnMixSeq> >& ds_seq = m_AlnMixSequences->m_DsSeq[&ds];
113 
114     size_t prev_matches_size = m_Matches.size();
115 
116     for (CAlnMap::TNumseg seg =0;  seg < ds.GetNumseg();  seg++) {
117         len = ds.GetLens()[seg];
118         single_chunk = true;
119 
120         for (CAlnMap::TNumrow row1 = 0;  row1 < ds.GetDim();  row1++) {
121             if ((start1 = ds.GetStarts()[seg_off + row1]) >= 0) {
122                 //search for a match for the piece of seq on row1
123 
124                 CAlnMixSeq* aln_seq1 = ds_seq[row1].GetNonNullPointer();
125 
126                 for (CAlnMap::TNumrow row2 = row1+1;
127                      row2 < ds.GetDim();  row2++) {
128                     if ((start2 = ds.GetStarts()[seg_off + row2]) >= 0) {
129                         //match found
130                         if (single_chunk) {
131                             single_chunk = false;
132                             first_non_gapped_row_found = row1;
133                         }
134 
135 
136                         //add only pairs with the first_non_gapped_row_found
137                         //still, calc the score to be added to the seqs' scores
138 
139                         int score = 0;
140 
141                         CAlnMixSeq* aln_seq2 = ds_seq[row2].GetNonNullPointer();
142 
143 
144 
145                         // determine the strand
146                         ENa_strand strand1 = eNa_strand_plus;
147                         ENa_strand strand2 = eNa_strand_plus;
148                         if (strands_exist) {
149                             if (ds.GetStrands()[seg_off + row1]
150                                 == eNa_strand_minus) {
151                                 strand1 = eNa_strand_minus;
152                             }
153                             if (ds.GetStrands()[seg_off + row2]
154                                 == eNa_strand_minus) {
155                                 strand2 = eNa_strand_minus;
156                             }
157                         }
158 
159 
160                         //Determine the score
161                         if (flags & fCalcScore  &&  x_CalculateScore) {
162                             // calc the score by seq comp
163                             string s1, s2;
164                             aln_seq1->GetSeqString(s1,
165                                                    start1,
166                                                    len * aln_seq1->m_Width,
167                                                    strand1 != eNa_strand_minus);
168                             aln_seq2->GetSeqString(s2,
169                                                    start2,
170                                                    len * aln_seq2->m_Width,
171                                                    strand2 != eNa_strand_minus);
172 
173                             score = x_CalculateScore(s1,
174                                                               s2,
175                                                               aln_seq1->m_IsAA,
176                                                               aln_seq2->m_IsAA,
177                                                               1,
178                                                               1);
179                         } else {
180                             score = len;
181                         }
182                         total_aln_score += score;
183 
184                         // add to the sequences' scores
185                         aln_seq1->m_Score += score;
186                         aln_seq2->m_Score += score;
187 
188                         // in case of fForceTranslation,
189                         // check if strands are not mixed by
190                         // comparing current strand to the prevailing one
191                         if (flags & fForceTranslation  &&
192                             (aln_seq1->m_StrandScore > 0  &&
193                              strand1 == eNa_strand_minus ||
194                              aln_seq1->m_StrandScore < 0  &&
195                              strand1 != eNa_strand_minus ||
196                              aln_seq2->m_StrandScore > 0  &&
197                              strand2 == eNa_strand_minus ||
198                              aln_seq2->m_StrandScore < 0  &&
199                              strand2 != eNa_strand_minus)) {
200                             NCBI_THROW(CAlnException, eMergeFailure,
201                                        "CAlnMixMatches::Add(): "
202                                        "Unable to mix strands when "
203                                        "forcing translation!");
204                         }
205 
206                         // add to the prevailing strand
207                         aln_seq1->m_StrandScore += (strand1 == eNa_strand_minus ?
208                                                     - score : score);
209                         aln_seq2->m_StrandScore += (strand2 == eNa_strand_minus ?
210                                                     - score : score);
211 
212                         if (row1 == first_non_gapped_row_found) {
213                             CRef<CAlnMixMatch> match(new CAlnMixMatch);
214                             match->m_AlnSeq1 = ds_seq[row1];
215                             match->m_MatchIter1 = match->m_AlnSeq1->m_MatchList.end();
216                             match->m_Start1 = start1;
217                             match->m_AlnSeq2 = ds_seq[row2];
218                             match->m_MatchIter2 = match->m_AlnSeq2->m_MatchList.end();
219                             match->m_Start2 = start2;
220                             match->m_Len = len;
221                             match->m_DsIdx = m_DsCnt;
222                             match->m_StrandsDiffer = false;
223                             if (strands_exist) {
224                                 if ((strand1 == eNa_strand_minus  &&
225                                      strand2 != eNa_strand_minus)  ||
226                                     (strand1 != eNa_strand_minus  &&
227                                      strand2 == eNa_strand_minus)) {
228 
229                                     match->m_StrandsDiffer = true;
230                                 }
231                             }
232                             match->m_Score = score;
233                             _ASSERT(match->IsGood());
234                             m_Matches.push_back(match);
235                             _ASSERT(match->IsGood());
236                         }
237                     }
238                 }
239                 if (single_chunk) {
240                     //record it
241                     CRef<CAlnMixMatch> match(new CAlnMixMatch);
242                     match->m_Score = 0;
243                     match->m_AlnSeq1 = ds_seq[row1];
244                     match->m_MatchIter1 = match->m_AlnSeq1->m_MatchList.end();
245                     match->m_Start1 = start1;
246                     match->m_AlnSeq2 = 0;
247                     match->m_Start2 = 0;
248                     match->m_Len = len;
249                     match->m_StrandsDiffer = false;
250                     match->m_DsIdx = m_DsCnt;
251                     _ASSERT(match->IsGood());
252                     m_Matches.push_back(match);
253                 }
254             }
255         }
256         seg_off += ds.GetDim();
257     }
258 
259 
260     // Update chain scores
261     {{
262         // iterate through the newly added matches to set the m_ChainScore
263         size_t new_maches_size = m_Matches.size() - prev_matches_size;
264         NON_CONST_REVERSE_ITERATE(TMatches, match_i, m_Matches) {
265             _ASSERT((*match_i)->IsGood());
266             (*match_i)-> m_ChainScore = total_aln_score;
267             if ( !(--new_maches_size) ) {
268                 break;
269             }
270         }
271 
272         // update m_ChainScore in the participating sequences
273         for (size_t row = 0;  row < ds_seq.size();  row++) {
274             ds_seq[row]->m_ChainScore += total_aln_score;
275         }
276     }}
277 }
278 
279 
280 END_objects_SCOPE // namespace ncbi::objects::
281 END_NCBI_SCOPE
282