1 /* $Id: alnmatch.cpp 355293 2012-03-05 15:17:16Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kamen Todorov, NCBI
27 *
28 * File Description:
29 * Alignment matches
30 *
31 * ===========================================================================
32 */
33
34 #include <ncbi_pch.hpp>
35 #include <objtools/alnmgr/alnmatch.hpp>
36 #include <objtools/alnmgr/alnexception.hpp>
37 #include <objtools/alnmgr/alnmap.hpp>
38
39 #include <objects/seq/Bioseq.hpp>
40 #include <objects/seqloc/Seq_id.hpp>
41
42 #include <algorithm>
43
44
45 BEGIN_NCBI_SCOPE
46 BEGIN_objects_SCOPE // namespace ncbi::objects::
47
48
CAlnMixMatches(CRef<CAlnMixSequences> & sequences,TCalcScoreMethod calc_score)49 CAlnMixMatches::CAlnMixMatches(CRef<CAlnMixSequences>& sequences,
50 TCalcScoreMethod calc_score)
51 : m_DsCnt(0),
52 m_AlnMixSequences(sequences),
53 m_Seqs(m_AlnMixSequences->m_Seqs),
54 x_CalculateScore(calc_score),
55 m_ContainsAA(m_AlnMixSequences->m_ContainsAA),
56 m_ContainsNA(m_AlnMixSequences->m_ContainsNA)
57 {
58 }
59
60
61 inline
62 bool
x_CompareScores(const CRef<CAlnMixMatch> & match1,const CRef<CAlnMixMatch> & match2)63 CAlnMixMatches::x_CompareScores(const CRef<CAlnMixMatch>& match1,
64 const CRef<CAlnMixMatch>& match2)
65 {
66 return match1->m_Score > match2->m_Score;
67 }
68
69
70 inline
71 bool
x_CompareChainScores(const CRef<CAlnMixMatch> & match1,const CRef<CAlnMixMatch> & match2)72 CAlnMixMatches::x_CompareChainScores(const CRef<CAlnMixMatch>& match1,
73 const CRef<CAlnMixMatch>& match2)
74 {
75 return
76 match1->m_ChainScore == match2->m_ChainScore &&
77 match1->m_Score > match2->m_Score ||
78 match1->m_ChainScore > match2->m_ChainScore;
79 }
80
81
82 void
SortByScore()83 CAlnMixMatches::SortByScore()
84 {
85 stable_sort(m_Matches.begin(), m_Matches.end(), x_CompareScores);
86 }
87
88
89 void
SortByChainScore()90 CAlnMixMatches::SortByChainScore()
91 {
92 stable_sort(m_Matches.begin(), m_Matches.end(), x_CompareChainScores);
93 }
94
95
96 void
Add(const CDense_seg & ds,TAddFlags flags)97 CAlnMixMatches::Add(const CDense_seg& ds, TAddFlags flags)
98 {
99 m_DsCnt++;
100
101 m_AddFlags = flags;
102
103 int seg_off = 0;
104 TSignedSeqPos start1, start2;
105 TSeqPos len;
106 bool single_chunk;
107 CAlnMap::TNumrow first_non_gapped_row_found;
108 bool strands_exist =
109 ds.GetStrands().size() == (size_t)ds.GetNumseg() * ds.GetDim();
110 int total_aln_score = 0;
111
112 vector<CRef<CAlnMixSeq> >& ds_seq = m_AlnMixSequences->m_DsSeq[&ds];
113
114 size_t prev_matches_size = m_Matches.size();
115
116 for (CAlnMap::TNumseg seg =0; seg < ds.GetNumseg(); seg++) {
117 len = ds.GetLens()[seg];
118 single_chunk = true;
119
120 for (CAlnMap::TNumrow row1 = 0; row1 < ds.GetDim(); row1++) {
121 if ((start1 = ds.GetStarts()[seg_off + row1]) >= 0) {
122 //search for a match for the piece of seq on row1
123
124 CAlnMixSeq* aln_seq1 = ds_seq[row1].GetNonNullPointer();
125
126 for (CAlnMap::TNumrow row2 = row1+1;
127 row2 < ds.GetDim(); row2++) {
128 if ((start2 = ds.GetStarts()[seg_off + row2]) >= 0) {
129 //match found
130 if (single_chunk) {
131 single_chunk = false;
132 first_non_gapped_row_found = row1;
133 }
134
135
136 //add only pairs with the first_non_gapped_row_found
137 //still, calc the score to be added to the seqs' scores
138
139 int score = 0;
140
141 CAlnMixSeq* aln_seq2 = ds_seq[row2].GetNonNullPointer();
142
143
144
145 // determine the strand
146 ENa_strand strand1 = eNa_strand_plus;
147 ENa_strand strand2 = eNa_strand_plus;
148 if (strands_exist) {
149 if (ds.GetStrands()[seg_off + row1]
150 == eNa_strand_minus) {
151 strand1 = eNa_strand_minus;
152 }
153 if (ds.GetStrands()[seg_off + row2]
154 == eNa_strand_minus) {
155 strand2 = eNa_strand_minus;
156 }
157 }
158
159
160 //Determine the score
161 if (flags & fCalcScore && x_CalculateScore) {
162 // calc the score by seq comp
163 string s1, s2;
164 aln_seq1->GetSeqString(s1,
165 start1,
166 len * aln_seq1->m_Width,
167 strand1 != eNa_strand_minus);
168 aln_seq2->GetSeqString(s2,
169 start2,
170 len * aln_seq2->m_Width,
171 strand2 != eNa_strand_minus);
172
173 score = x_CalculateScore(s1,
174 s2,
175 aln_seq1->m_IsAA,
176 aln_seq2->m_IsAA,
177 1,
178 1);
179 } else {
180 score = len;
181 }
182 total_aln_score += score;
183
184 // add to the sequences' scores
185 aln_seq1->m_Score += score;
186 aln_seq2->m_Score += score;
187
188 // in case of fForceTranslation,
189 // check if strands are not mixed by
190 // comparing current strand to the prevailing one
191 if (flags & fForceTranslation &&
192 (aln_seq1->m_StrandScore > 0 &&
193 strand1 == eNa_strand_minus ||
194 aln_seq1->m_StrandScore < 0 &&
195 strand1 != eNa_strand_minus ||
196 aln_seq2->m_StrandScore > 0 &&
197 strand2 == eNa_strand_minus ||
198 aln_seq2->m_StrandScore < 0 &&
199 strand2 != eNa_strand_minus)) {
200 NCBI_THROW(CAlnException, eMergeFailure,
201 "CAlnMixMatches::Add(): "
202 "Unable to mix strands when "
203 "forcing translation!");
204 }
205
206 // add to the prevailing strand
207 aln_seq1->m_StrandScore += (strand1 == eNa_strand_minus ?
208 - score : score);
209 aln_seq2->m_StrandScore += (strand2 == eNa_strand_minus ?
210 - score : score);
211
212 if (row1 == first_non_gapped_row_found) {
213 CRef<CAlnMixMatch> match(new CAlnMixMatch);
214 match->m_AlnSeq1 = ds_seq[row1];
215 match->m_MatchIter1 = match->m_AlnSeq1->m_MatchList.end();
216 match->m_Start1 = start1;
217 match->m_AlnSeq2 = ds_seq[row2];
218 match->m_MatchIter2 = match->m_AlnSeq2->m_MatchList.end();
219 match->m_Start2 = start2;
220 match->m_Len = len;
221 match->m_DsIdx = m_DsCnt;
222 match->m_StrandsDiffer = false;
223 if (strands_exist) {
224 if ((strand1 == eNa_strand_minus &&
225 strand2 != eNa_strand_minus) ||
226 (strand1 != eNa_strand_minus &&
227 strand2 == eNa_strand_minus)) {
228
229 match->m_StrandsDiffer = true;
230 }
231 }
232 match->m_Score = score;
233 _ASSERT(match->IsGood());
234 m_Matches.push_back(match);
235 _ASSERT(match->IsGood());
236 }
237 }
238 }
239 if (single_chunk) {
240 //record it
241 CRef<CAlnMixMatch> match(new CAlnMixMatch);
242 match->m_Score = 0;
243 match->m_AlnSeq1 = ds_seq[row1];
244 match->m_MatchIter1 = match->m_AlnSeq1->m_MatchList.end();
245 match->m_Start1 = start1;
246 match->m_AlnSeq2 = 0;
247 match->m_Start2 = 0;
248 match->m_Len = len;
249 match->m_StrandsDiffer = false;
250 match->m_DsIdx = m_DsCnt;
251 _ASSERT(match->IsGood());
252 m_Matches.push_back(match);
253 }
254 }
255 }
256 seg_off += ds.GetDim();
257 }
258
259
260 // Update chain scores
261 {{
262 // iterate through the newly added matches to set the m_ChainScore
263 size_t new_maches_size = m_Matches.size() - prev_matches_size;
264 NON_CONST_REVERSE_ITERATE(TMatches, match_i, m_Matches) {
265 _ASSERT((*match_i)->IsGood());
266 (*match_i)-> m_ChainScore = total_aln_score;
267 if ( !(--new_maches_size) ) {
268 break;
269 }
270 }
271
272 // update m_ChainScore in the participating sequences
273 for (size_t row = 0; row < ds_seq.size(); row++) {
274 ds_seq[row]->m_ChainScore += total_aln_score;
275 }
276 }}
277 }
278
279
280 END_objects_SCOPE // namespace ncbi::objects::
281 END_NCBI_SCOPE
282