1 /* $Id: alnseq.cpp 601341 2020-02-05 20:27:33Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kamen Todorov, NCBI
27 *
28 * File Description:
29 * Alignment sequences
30 *
31 * ===========================================================================
32 */
33
34 #include <ncbi_pch.hpp>
35 #include <objtools/alnmgr/alnmap.hpp>
36 #include <objtools/alnmgr/alnseq.hpp>
37 #include <objtools/alnmgr/alnsegments.hpp>
38
39 #include <objects/seq/Bioseq.hpp>
40 #include <objects/seqloc/Seq_id.hpp>
41
42 #include <objmgr/scope.hpp>
43 #include <objmgr/bioseq_handle.hpp>
44
45 #include <algorithm>
46
47
48 BEGIN_NCBI_SCOPE
49 BEGIN_objects_SCOPE // namespace ncbi::objects::
50
51
CAlnMixSequences(void)52 CAlnMixSequences::CAlnMixSequences(void)
53 : m_DsCnt(0),
54 m_ContainsAA(false),
55 m_ContainsNA(false)
56 {
57 }
58
59
CAlnMixSequences(CScope & scope)60 CAlnMixSequences::CAlnMixSequences(CScope& scope)
61 : m_DsCnt(0),
62 m_Scope(&scope),
63 m_ContainsAA(false),
64 m_ContainsNA(false)
65 {
66 }
67
68
69 inline
70 bool
x_CompareScores(const CRef<CAlnMixSeq> & seq1,const CRef<CAlnMixSeq> & seq2)71 CAlnMixSequences::x_CompareScores(const CRef<CAlnMixSeq>& seq1,
72 const CRef<CAlnMixSeq>& seq2)
73 {
74 return seq1->m_Score >seq2->m_Score;
75 }
76
77
78 inline
79 bool
x_CompareChainScores(const CRef<CAlnMixSeq> & seq1,const CRef<CAlnMixSeq> & seq2)80 CAlnMixSequences::x_CompareChainScores(const CRef<CAlnMixSeq>& seq1,
81 const CRef<CAlnMixSeq>& seq2)
82 {
83 return
84 (seq1->m_ChainScore == seq2->m_ChainScore &&
85 seq1->m_Score > seq2->m_Score) ||
86 seq1->m_ChainScore > seq2->m_ChainScore;
87 }
88
89
90 void
SortByScore()91 CAlnMixSequences::SortByScore()
92 {
93 stable_sort(m_Seqs.begin(), m_Seqs.end(), x_CompareScores);
94 }
95
96
97 void
SortByChainScore()98 CAlnMixSequences::SortByChainScore()
99 {
100 stable_sort(m_Seqs.begin(), m_Seqs.end(), x_CompareChainScores);
101 }
102
103
104 void
Add(const CDense_seg & ds,TAddFlags flags)105 CAlnMixSequences::Add(const CDense_seg& ds, TAddFlags flags)
106 {
107 m_DsCnt++;
108
109 vector<CRef<CAlnMixSeq> >& ds_seq = m_DsSeq[&ds];
110
111 for (CAlnMap::TNumrow row = 0; row < ds.GetDim(); row++) {
112
113 CRef<CAlnMixSeq> aln_seq;
114
115 if ( !m_Scope ) {
116 // identify sequences by their seq ids as provided by
117 // the dense seg (not as reliable as with OM, but faster)
118 CRef<CSeq_id> seq_id(new CSeq_id);
119 seq_id->Assign(*ds.GetIds()[row]);
120
121 TSeqIdMap::iterator it = m_SeqIds.find(seq_id);
122 if (it == m_SeqIds.end()) {
123 // add this seq id
124 aln_seq = new CAlnMixSeq();
125 m_SeqIds[seq_id] = aln_seq;
126 aln_seq->m_SeqId = seq_id;
127 aln_seq->m_DsCnt = 0;
128
129 // add this sequence
130 m_Seqs.push_back(aln_seq);
131
132 // AA or NA?
133 if (ds.IsSetWidths()) {
134 if (ds.GetWidths()[row] == 1) {
135 aln_seq->m_IsAA = true;
136 m_ContainsAA = true;
137 } else {
138 aln_seq->m_IsAA = false;
139 m_ContainsNA = true;
140 }
141 }
142
143 } else {
144 aln_seq = it->second;
145 }
146
147 } else {
148 // uniquely identify the bioseq
149 x_IdentifyAlnMixSeq(aln_seq, *(ds.GetIds())[row]);
150 #if _DEBUG
151 // Verify the widths (if exist)
152 if (ds.IsSetWidths()) {
153 const int& width = ds.GetWidths()[row];
154 if (width == 1 && aln_seq->m_IsAA != true ||
155 width == 3 && aln_seq->m_IsAA != false) {
156 string errstr = string("CAlnMix::Add(): ")
157 + "Incorrect width("
158 + NStr::IntToString(width) +
159 ") or molecule type(" +
160 (aln_seq->m_IsAA ? "AA" : "NA") +
161 ").";
162 NCBI_THROW(CAlnException, eInvalidSegment,
163 errstr);
164 }
165 }
166 #endif
167 }
168 // Set the width
169 if (ds.IsSetWidths()) {
170 aln_seq->m_Width = ds.GetWidths()[row];
171 }
172
173
174 // Preserve the row of the the original sequences if requested.
175 // This is mostly used to allow a sequence to itself.
176 // Create an additional sequence, pointed by m_AntoherRow,
177 // if the row index differs.
178 if (flags & fPreserveRows) {
179 int row_index = aln_seq->m_RowIdx;
180 if (row_index == -1) {
181 // initialization
182 aln_seq->m_RowIdx = row;
183 } else while (row_index != row) {
184 if (aln_seq->m_AnotherRow) {
185 aln_seq = aln_seq->m_AnotherRow;
186 row_index = aln_seq->m_RowIdx;
187 } else {
188 CRef<CAlnMixSeq> another_row (new CAlnMixSeq);
189
190 another_row->m_BioseqHandle = aln_seq->m_BioseqHandle;
191 another_row->m_SeqId = aln_seq->m_SeqId;
192 another_row->m_Width = aln_seq->m_Width;
193 another_row->m_SeqIdx = aln_seq->m_SeqIdx;
194 another_row->m_ChildIdx = aln_seq->m_ChildIdx + 1;
195 another_row->m_DsIdx = m_DsCnt;
196 another_row->m_RowIdx = row;
197
198 m_Seqs.push_back(another_row);
199
200 aln_seq = aln_seq->m_AnotherRow = another_row;
201
202 break;
203 }
204 }
205 }
206
207 aln_seq->m_DsCnt++;
208 ds_seq.push_back(aln_seq);
209 }
210 }
211
212
213 void
x_IdentifyAlnMixSeq(CRef<CAlnMixSeq> & aln_seq,const CSeq_id & seq_id)214 CAlnMixSequences::x_IdentifyAlnMixSeq(CRef<CAlnMixSeq>& aln_seq, const CSeq_id& seq_id)
215 {
216 if ( !m_Scope ) {
217 string errstr = string("CAlnMix::x_IdentifyAlnMixSeq(): ")
218 + "In order to use this functionality "
219 "scope should be provided in CAlnMix constructor.";
220 NCBI_THROW(CAlnException, eInvalidRequest, errstr);
221 }
222
223 CBioseq_Handle bioseq_handle =
224 m_Scope->GetBioseqHandle(seq_id);
225
226 if ( !bioseq_handle ) {
227 string errstr = string("CAlnMix::x_IdentifyAlnMixSeq(): ")
228 + "Seq-id cannot be resolved: "
229 + (seq_id.AsFastaString());
230
231 NCBI_THROW(CAlnException, eInvalidSeqId, errstr);
232 }
233
234
235 TBioseqHandleMap::iterator it = m_BioseqHandles.find(bioseq_handle);
236 if (it == m_BioseqHandles.end()) {
237 // add this bioseq handle
238 aln_seq = new CAlnMixSeq();
239 m_BioseqHandles[bioseq_handle] = aln_seq;
240 aln_seq->m_BioseqHandle =
241 &m_BioseqHandles.find(bioseq_handle)->first;
242
243 CRef<CSeq_id> seq_id(new CSeq_id);
244 seq_id->Assign(*aln_seq->m_BioseqHandle->GetSeqId());
245 aln_seq->m_SeqId = seq_id;
246 aln_seq->m_DsCnt = 0;
247
248 // add this sequence
249 m_Seqs.push_back(aln_seq);
250
251 // AA or NA?
252 if (aln_seq->m_BioseqHandle->IsProtein()) {
253 aln_seq->m_IsAA = true;
254 m_ContainsAA = true;
255 } else {
256 aln_seq->m_IsAA = false;
257 m_ContainsNA = true;
258 }
259 } else {
260 aln_seq = it->second;
261 }
262 }
263
264
265 void
BuildRows()266 CAlnMixSequences::BuildRows()
267 {
268 m_Rows.clear();
269
270 int count = 0;
271 NON_CONST_ITERATE (TSeqs, i, m_Seqs) {
272 CRef<CAlnMixSeq>& seq = *i;
273
274 if ( !seq->GetStarts().empty() ) {
275 m_Rows.push_back(seq);
276 seq->m_RowIdx = count++;
277 while ((seq = seq->m_ExtraRow) != NULL ) {
278 seq->m_RowIdx = count++;
279 m_Rows.push_back(seq);
280 }
281 }
282 }
283 }
284
285
286 void
InitRowsStartIts()287 CAlnMixSequences::InitRowsStartIts()
288 {
289 NON_CONST_ITERATE (TSeqs, row_i, m_Rows) {
290 CAlnMixSeq * row = *row_i;
291 if ( !row->GetStarts().empty() ) {
292 if (row->m_PositiveStrand) {
293 row->SetStarts().current = row->GetStarts().begin();
294 } else {
295 row->SetStarts().current = row->GetStarts().end();
296 row->SetStarts().current--;
297 }
298 } else {
299 row->SetStarts().current = row->GetStarts().end();
300 #if _DEBUG
301 string errstr =
302 string("CAlnMixSequences::InitRowsStartIts():") +
303 " Internal error: no starts for row " +
304 NStr::IntToString(row->m_RowIdx) +
305 " (seq " +
306 NStr::IntToString(row->m_SeqIdx) +
307 " child " +
308 NStr::IntToString(row->m_ChildIdx) +
309 ").";
310 NCBI_THROW(CAlnException, eMergeFailure, errstr);
311 #endif
312 }
313 }
314 }
315
316
317 void
InitExtraRowsStartIts()318 CAlnMixSequences::InitExtraRowsStartIts()
319 {
320 NON_CONST_ITERATE (list<CRef<CAlnMixSeq> >, row_i, m_ExtraRows) {
321 CAlnMixSeq * row = *row_i;
322 if ( !row->GetStarts().empty() ) {
323 if (row->m_PositiveStrand) {
324 row->SetStarts().current = row->GetStarts().begin();
325 } else {
326 row->SetStarts().current = row->GetStarts().end();
327 row->SetStarts().current--;
328 }
329 } else {
330 row->SetStarts().current = row->GetStarts().end();
331 #if _DEBUG
332 string errstr =
333 string("CAlnMixSequences::InitExtraRowStartIts():") +
334 " Internal error: no starts for row " +
335 NStr::IntToString(row->m_RowIdx) + ".";
336 NCBI_THROW(CAlnException, eMergeFailure, errstr);
337 #endif
338 }
339 }
340 }
341
342
343 void
RowsStartItsContsistencyCheck(size_t match_idx)344 CAlnMixSequences::RowsStartItsContsistencyCheck(size_t match_idx)
345 {
346 NON_CONST_ITERATE (TSeqs, row_i, m_Rows) {
347 ITERATE (CAlnMixStarts, st_i, (*row_i)->GetStarts()) {
348 st_i->second->
349 StartItsConsistencyCheck(**row_i,
350 st_i->first,
351 match_idx);
352 }
353 }
354 }
355
356
CAlnMixSeq(void)357 CAlnMixSeq::CAlnMixSeq(void)
358 : m_DsCnt(0),
359 m_BioseqHandle(0),
360 m_Score(0),
361 m_ChainScore(0),
362 m_StrandScore(0),
363 m_IsAA(false),
364 m_Width(1),
365 m_Frame(-1),
366 m_PositiveStrand(true),
367 m_RefBy(0),
368 m_ExtraRow(0),
369 m_ExtraRowIdx(0),
370 m_AnotherRow(0),
371 m_DsIdx(0),
372 m_SeqIdx(-1),
373 m_ChildIdx(0),
374 m_RowIdx(-1),
375 m_Starts(new CAlnMixStarts())
376 {
377 }
378
379
~CAlnMixSeq()380 CAlnMixSeq::~CAlnMixSeq()
381 {
382 }
383
384 END_objects_SCOPE // namespace ncbi::objects::
385 END_NCBI_SCOPE
386