1 /*  $Id: alnseq.cpp 601341 2020-02-05 20:27:33Z vasilche $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Kamen Todorov, NCBI
27 *
28 * File Description:
29 *   Alignment sequences
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <objtools/alnmgr/alnmap.hpp>
36 #include <objtools/alnmgr/alnseq.hpp>
37 #include <objtools/alnmgr/alnsegments.hpp>
38 
39 #include <objects/seq/Bioseq.hpp>
40 #include <objects/seqloc/Seq_id.hpp>
41 
42 #include <objmgr/scope.hpp>
43 #include <objmgr/bioseq_handle.hpp>
44 
45 #include <algorithm>
46 
47 
48 BEGIN_NCBI_SCOPE
49 BEGIN_objects_SCOPE // namespace ncbi::objects::
50 
51 
CAlnMixSequences(void)52 CAlnMixSequences::CAlnMixSequences(void)
53     : m_DsCnt(0),
54       m_ContainsAA(false),
55       m_ContainsNA(false)
56 {
57 }
58 
59 
CAlnMixSequences(CScope & scope)60 CAlnMixSequences::CAlnMixSequences(CScope& scope)
61     : m_DsCnt(0),
62       m_Scope(&scope),
63       m_ContainsAA(false),
64       m_ContainsNA(false)
65 {
66 }
67 
68 
69 inline
70 bool
x_CompareScores(const CRef<CAlnMixSeq> & seq1,const CRef<CAlnMixSeq> & seq2)71 CAlnMixSequences::x_CompareScores(const CRef<CAlnMixSeq>& seq1,
72                                   const CRef<CAlnMixSeq>& seq2)
73 {
74     return seq1->m_Score >seq2->m_Score;
75 }
76 
77 
78 inline
79 bool
x_CompareChainScores(const CRef<CAlnMixSeq> & seq1,const CRef<CAlnMixSeq> & seq2)80 CAlnMixSequences::x_CompareChainScores(const CRef<CAlnMixSeq>& seq1,
81                                        const CRef<CAlnMixSeq>& seq2)
82 {
83     return
84         (seq1->m_ChainScore == seq2->m_ChainScore  &&
85          seq1->m_Score > seq2->m_Score)  ||
86         seq1->m_ChainScore > seq2->m_ChainScore;
87 }
88 
89 
90 void
SortByScore()91 CAlnMixSequences::SortByScore()
92 {
93     stable_sort(m_Seqs.begin(), m_Seqs.end(), x_CompareScores);
94 }
95 
96 
97 void
SortByChainScore()98 CAlnMixSequences::SortByChainScore()
99 {
100     stable_sort(m_Seqs.begin(), m_Seqs.end(), x_CompareChainScores);
101 }
102 
103 
104 void
Add(const CDense_seg & ds,TAddFlags flags)105 CAlnMixSequences::Add(const CDense_seg& ds, TAddFlags flags)
106 {
107     m_DsCnt++;
108 
109     vector<CRef<CAlnMixSeq> >& ds_seq = m_DsSeq[&ds];
110 
111     for (CAlnMap::TNumrow row = 0;  row < ds.GetDim();  row++) {
112 
113         CRef<CAlnMixSeq> aln_seq;
114 
115         if ( !m_Scope ) {
116             // identify sequences by their seq ids as provided by
117             // the dense seg (not as reliable as with OM, but faster)
118             CRef<CSeq_id> seq_id(new CSeq_id);
119             seq_id->Assign(*ds.GetIds()[row]);
120 
121             TSeqIdMap::iterator it = m_SeqIds.find(seq_id);
122             if (it == m_SeqIds.end()) {
123                 // add this seq id
124                 aln_seq = new CAlnMixSeq();
125                 m_SeqIds[seq_id] = aln_seq;
126                 aln_seq->m_SeqId = seq_id;
127                 aln_seq->m_DsCnt = 0;
128 
129                 // add this sequence
130                 m_Seqs.push_back(aln_seq);
131 
132                 // AA or NA?
133                 if (ds.IsSetWidths()) {
134                     if (ds.GetWidths()[row] == 1) {
135                         aln_seq->m_IsAA = true;
136                         m_ContainsAA = true;
137                     } else {
138                         aln_seq->m_IsAA = false;
139                         m_ContainsNA = true;
140                     }
141                 }
142 
143             } else {
144                 aln_seq = it->second;
145             }
146 
147         } else {
148             // uniquely identify the bioseq
149             x_IdentifyAlnMixSeq(aln_seq, *(ds.GetIds())[row]);
150 #if _DEBUG
151             // Verify the widths (if exist)
152             if (ds.IsSetWidths()) {
153                 const int& width = ds.GetWidths()[row];
154                 if (width == 1  &&  aln_seq->m_IsAA != true  ||
155                     width == 3  &&  aln_seq->m_IsAA != false) {
156                     string errstr = string("CAlnMix::Add(): ")
157                         + "Incorrect width("
158                         + NStr::IntToString(width) +
159                         ") or molecule type(" +
160                         (aln_seq->m_IsAA ? "AA" : "NA") +
161                         ").";
162                     NCBI_THROW(CAlnException, eInvalidSegment,
163                                errstr);
164                 }
165             }
166 #endif
167         }
168         // Set the width
169         if (ds.IsSetWidths()) {
170             aln_seq->m_Width = ds.GetWidths()[row];
171         }
172 
173 
174         // Preserve the row of the the original sequences if requested.
175         // This is mostly used to allow a sequence to itself.
176         // Create an additional sequence, pointed by m_AntoherRow,
177         // if the row index differs.
178         if (flags & fPreserveRows) {
179             int row_index = aln_seq->m_RowIdx;
180             if (row_index == -1) {
181                 // initialization
182                 aln_seq->m_RowIdx = row;
183             } else while (row_index != row) {
184                 if (aln_seq->m_AnotherRow) {
185                     aln_seq   = aln_seq->m_AnotherRow;
186                     row_index = aln_seq->m_RowIdx;
187                 } else {
188                     CRef<CAlnMixSeq> another_row (new CAlnMixSeq);
189 
190                     another_row->m_BioseqHandle = aln_seq->m_BioseqHandle;
191                     another_row->m_SeqId        = aln_seq->m_SeqId;
192                     another_row->m_Width        = aln_seq->m_Width;
193                     another_row->m_SeqIdx       = aln_seq->m_SeqIdx;
194                     another_row->m_ChildIdx     = aln_seq->m_ChildIdx + 1;
195                     another_row->m_DsIdx        = m_DsCnt;
196                     another_row->m_RowIdx       = row;
197 
198                     m_Seqs.push_back(another_row);
199 
200                     aln_seq = aln_seq->m_AnotherRow = another_row;
201 
202                     break;
203                 }
204             }
205         }
206 
207         aln_seq->m_DsCnt++;
208         ds_seq.push_back(aln_seq);
209     }
210 }
211 
212 
213 void
x_IdentifyAlnMixSeq(CRef<CAlnMixSeq> & aln_seq,const CSeq_id & seq_id)214 CAlnMixSequences::x_IdentifyAlnMixSeq(CRef<CAlnMixSeq>& aln_seq, const CSeq_id& seq_id)
215 {
216     if ( !m_Scope ) {
217         string errstr = string("CAlnMix::x_IdentifyAlnMixSeq(): ")
218             + "In order to use this functionality "
219             "scope should be provided in CAlnMix constructor.";
220         NCBI_THROW(CAlnException, eInvalidRequest, errstr);
221     }
222 
223     CBioseq_Handle bioseq_handle =
224         m_Scope->GetBioseqHandle(seq_id);
225 
226     if ( !bioseq_handle ) {
227         string errstr = string("CAlnMix::x_IdentifyAlnMixSeq(): ")
228             + "Seq-id cannot be resolved: "
229             + (seq_id.AsFastaString());
230 
231         NCBI_THROW(CAlnException, eInvalidSeqId, errstr);
232     }
233 
234 
235     TBioseqHandleMap::iterator it = m_BioseqHandles.find(bioseq_handle);
236     if (it == m_BioseqHandles.end()) {
237         // add this bioseq handle
238         aln_seq = new CAlnMixSeq();
239         m_BioseqHandles[bioseq_handle] = aln_seq;
240         aln_seq->m_BioseqHandle =
241             &m_BioseqHandles.find(bioseq_handle)->first;
242 
243         CRef<CSeq_id> seq_id(new CSeq_id);
244         seq_id->Assign(*aln_seq->m_BioseqHandle->GetSeqId());
245         aln_seq->m_SeqId = seq_id;
246         aln_seq->m_DsCnt = 0;
247 
248         // add this sequence
249         m_Seqs.push_back(aln_seq);
250 
251         // AA or NA?
252         if (aln_seq->m_BioseqHandle->IsProtein()) {
253             aln_seq->m_IsAA = true;
254             m_ContainsAA = true;
255         } else {
256             aln_seq->m_IsAA = false;
257             m_ContainsNA = true;
258         }
259     } else {
260         aln_seq = it->second;
261     }
262 }
263 
264 
265 void
BuildRows()266 CAlnMixSequences::BuildRows()
267 {
268     m_Rows.clear();
269 
270     int count = 0;
271     NON_CONST_ITERATE (TSeqs, i, m_Seqs) {
272         CRef<CAlnMixSeq>& seq = *i;
273 
274         if ( !seq->GetStarts().empty() ) {
275             m_Rows.push_back(seq);
276             seq->m_RowIdx = count++;
277             while ((seq = seq->m_ExtraRow) != NULL ) {
278                 seq->m_RowIdx = count++;
279                 m_Rows.push_back(seq);
280             }
281         }
282     }
283 }
284 
285 
286 void
InitRowsStartIts()287 CAlnMixSequences::InitRowsStartIts()
288 {
289     NON_CONST_ITERATE (TSeqs, row_i, m_Rows) {
290         CAlnMixSeq * row = *row_i;
291         if ( !row->GetStarts().empty() ) {
292             if (row->m_PositiveStrand) {
293                 row->SetStarts().current = row->GetStarts().begin();
294             } else {
295                 row->SetStarts().current = row->GetStarts().end();
296                 row->SetStarts().current--;
297             }
298         } else {
299             row->SetStarts().current = row->GetStarts().end();
300 #if _DEBUG
301             string errstr =
302                 string("CAlnMixSequences::InitRowsStartIts():") +
303                 " Internal error: no starts for row " +
304                 NStr::IntToString(row->m_RowIdx) +
305                 " (seq " +
306                 NStr::IntToString(row->m_SeqIdx) +
307                 " child " +
308                 NStr::IntToString(row->m_ChildIdx) +
309                 ").";
310             NCBI_THROW(CAlnException, eMergeFailure, errstr);
311 #endif
312         }
313     }
314 }
315 
316 
317 void
InitExtraRowsStartIts()318 CAlnMixSequences::InitExtraRowsStartIts()
319 {
320     NON_CONST_ITERATE (list<CRef<CAlnMixSeq> >, row_i, m_ExtraRows) {
321         CAlnMixSeq * row = *row_i;
322         if ( !row->GetStarts().empty() ) {
323             if (row->m_PositiveStrand) {
324                 row->SetStarts().current = row->GetStarts().begin();
325             } else {
326                 row->SetStarts().current = row->GetStarts().end();
327                 row->SetStarts().current--;
328             }
329         } else {
330             row->SetStarts().current = row->GetStarts().end();
331 #if _DEBUG
332             string errstr =
333                 string("CAlnMixSequences::InitExtraRowStartIts():") +
334                 " Internal error: no starts for row " +
335                 NStr::IntToString(row->m_RowIdx) + ".";
336             NCBI_THROW(CAlnException, eMergeFailure, errstr);
337 #endif
338         }
339     }
340 }
341 
342 
343 void
RowsStartItsContsistencyCheck(size_t match_idx)344 CAlnMixSequences::RowsStartItsContsistencyCheck(size_t match_idx)
345 {
346     NON_CONST_ITERATE (TSeqs, row_i, m_Rows) {
347         ITERATE (CAlnMixStarts, st_i, (*row_i)->GetStarts()) {
348             st_i->second->
349                 StartItsConsistencyCheck(**row_i,
350                                          st_i->first,
351                                          match_idx);
352         }
353     }
354 }
355 
356 
CAlnMixSeq(void)357 CAlnMixSeq::CAlnMixSeq(void)
358     : m_DsCnt(0),
359       m_BioseqHandle(0),
360       m_Score(0),
361       m_ChainScore(0),
362       m_StrandScore(0),
363       m_IsAA(false),
364       m_Width(1),
365       m_Frame(-1),
366       m_PositiveStrand(true),
367       m_RefBy(0),
368       m_ExtraRow(0),
369       m_ExtraRowIdx(0),
370       m_AnotherRow(0),
371       m_DsIdx(0),
372       m_SeqIdx(-1),
373       m_ChildIdx(0),
374       m_RowIdx(-1),
375       m_Starts(new CAlnMixStarts())
376 {
377 }
378 
379 
~CAlnMixSeq()380 CAlnMixSeq::~CAlnMixSeq()
381 {
382 }
383 
384 END_objects_SCOPE // namespace ncbi::objects::
385 END_NCBI_SCOPE
386