1 /*  $Id: sparse_aln.cpp 601341 2020-02-05 20:27:33Z vasilche $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Andrey Yazhuk
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <objtools/alnmgr/sparse_aln.hpp>
35 #include <objtools/alnmgr/sparse_ci.hpp>
36 #include <objtools/alnmgr/alnexception.hpp>
37 #include <objtools/error_codes.hpp>
38 #include <objmgr/util/sequence.hpp>
39 
40 #include <objects/seqalign/Sparse_align.hpp>
41 #include <objects/seqfeat/Genetic_code_table.hpp>
42 #include <objects/seqfeat/Org_ref.hpp>
43 
44 
45 #define NCBI_USE_ERRCODE_X   Objtools_Aln_Sparse
46 
47 
48 BEGIN_NCBI_SCOPE
49 USING_SCOPE(ncbi::objects);
50 
51 
CSparseAln(const CAnchoredAln & anchored_aln,objects::CScope & scope)52 CSparseAln::CSparseAln(const CAnchoredAln& anchored_aln,
53                        objects::CScope& scope)
54     : m_Scope(&scope),
55       m_GapChar('-'),
56       m_NaCoding(CSeq_data::e_not_set),
57       m_AaCoding(CSeq_data::e_not_set),
58       m_AnchorDirect(true)
59 {
60     x_Build(anchored_aln);
61 }
62 
63 
~CSparseAln()64 CSparseAln::~CSparseAln()
65 {
66 }
67 
68 
GetDim() const69 CSparseAln::TDim CSparseAln::GetDim() const {
70     return m_Aln->GetDim();
71 }
72 
73 
74 struct SGapRange
75 {
76     TSignedSeqPos from; // original position of the gap on the anchor
77     TSignedSeqPos second_from; // position on the sequence
78     TSignedSeqPos len;  // length of the gap
79     bool          direct;
80     int           row;  // Row, containing the gap.
81     //size_t        idx;  // Index of the gap in the original vector.
82     // This gap's 'from' must be shifted by 'shift'.
83     // Segments at or after 'from' position must be offset by 'shift + len'.
84     TSignedSeqPos shift;
85 
operator <SGapRange86     bool operator<(const SGapRange& rg) const
87     {
88         if (from != rg.from) return from < rg.from; // sort by pos
89         return row < rg.row; // lower rows go first.
90         // Don't check the length. Stable sort will preserve the order
91         // of consecutive gaps on the same row.
92     }
93 };
94 
95 
96 typedef vector<SGapRange> TGapRanges;
97 
98 
x_Build(const CAnchoredAln & src_align)99 void CSparseAln::x_Build(const CAnchoredAln& src_align)
100 {
101     const TDim& dim = src_align.GetDim();
102 
103     m_BioseqHandles.clear();
104     m_BioseqHandles.resize(dim);
105 
106     m_SeqVectors.clear();
107     m_SeqVectors.resize(dim);
108 
109     // Collect all gaps on all rows. They need to be inserted into the
110     // new anchor as normal segments.
111     TGapRanges gaps;
112     for (TDim row = 0; row < dim; ++row) {
113         const CPairwiseAln& pw = *src_align.GetPairwiseAlns()[row];
114         const CPairwiseAln::TInsertions& ins_vec = pw.GetInsertions();
115         gaps.reserve(gaps.size() + ins_vec.size());
116         for (auto i = ins_vec.begin(); i != ins_vec.end(); ++i) {
117             SGapRange gap;
118             gap.from = i->GetFirstFrom();
119             gap.second_from = i->GetSecondFrom();
120             gap.len = i->GetLength();
121             gap.direct = i->IsDirect();
122             gap.row = row;
123             gap.shift = 0;
124             //gap.idx = i;
125             gaps.push_back(gap);
126         }
127     }
128 
129     // We need to preserve the order of consecutive gaps at the same
130     // position on the same row. Use stable_sort.
131     stable_sort(gaps.begin(), gaps.end());
132     // Set shift for all gaps.
133     TSignedSeqPos shift = 0;
134     NON_CONST_ITERATE(TGapRanges, gap_it, gaps) {
135         gap_it->shift = shift;
136         shift += gap_it->len;
137     }
138 
139     m_Aln.Reset(new CAnchoredAln());
140     m_Aln->SetDim(dim);
141     m_Aln->SetAnchorRow(src_align.GetAnchorRow());
142     m_Aln->SetScore(src_align.GetScore());
143 
144     // Now for each row convert insertions to normal segments and change
145     // the old anchor coordinates to alignment coordinates.
146     for (TDim row = 0; row < dim; ++row) {
147         const CPairwiseAln& pw = *src_align.GetPairwiseAlns()[row];
148         CPairwiseAln::const_iterator seg_it = pw.begin();
149         TGapRanges::const_iterator gap = gaps.begin();
150         TSignedSeqPos shift = 0;
151         CRef<CPairwiseAln> dst(new CPairwiseAln(
152             pw.GetFirstId(), pw.GetSecondId(), pw.GetFlags()));
153         TSeqPos last_to = 0;
154         bool first_direct = true;
155         bool second_direct = true;
156         while (seg_it != pw.end()) {
157             CPairwiseAln::TAlignRange rg = *seg_it;
158             first_direct = rg.IsFirstDirect();
159             second_direct = rg.IsDirect();
160             ++seg_it;
161             // Check if there are gaps before the new segment's end.
162             while (gap != gaps.end()  &&
163                 gap->from < rg.GetFirstToOpen()) {
164                 if (gap->row == row) {
165                     // Insertion in this row - align to the anchor.
166                     CPairwiseAln::TAlignRange ins(gap->from + shift,
167                         gap->second_from, gap->len, gap->direct);
168                     // Reset flags to match row orientation.
169                     ins.SetFirstDirect(first_direct);
170                     ins.SetDirect(second_direct);
171                     dst->push_back(ins);
172                 }
173                 else if (gap->from > rg.GetFirstFrom()) {
174                     // Split the range if there are insertions in other rows.
175                     CPairwiseAln::TAlignRange sub = rg;
176                     sub.SetLength(gap->from - rg.GetFirstFrom());
177                     sub.SetFirstFrom(sub.GetFirstFrom() + shift);
178                     if (rg.IsDirect()) {
179                         rg.SetSecondFrom(rg.GetSecondFrom() + sub.GetLength());
180                     }
181                     else {
182                         sub.SetSecondFrom(rg.GetSecondToOpen() - sub.GetLength());
183                     }
184                     rg.SetFirstFrom(rg.GetFirstFrom() + sub.GetLength());
185                     rg.SetLength(rg.GetLength() - sub.GetLength());
186                     dst->push_back(sub);
187                 }
188                 shift = gap->shift + gap->len;
189                 ++gap;
190             }
191             rg.SetFirstFrom(rg.GetFirstFrom() + shift);
192             dst->push_back(rg);
193             last_to = rg.GetSecondToOpen();
194         }
195         // Still have gaps in some rows?
196         while (gap != gaps.end()) {
197             if (gap->row == row) {
198                 CPairwiseAln::TAlignRange ins(gap->from + shift,
199                     last_to, gap->len, gap->direct);
200                 ins.SetFirstDirect(first_direct);
201                 ins.SetDirect(second_direct);
202                 dst->push_back(ins);
203             }
204             shift = gap->shift + gap->len;
205             ++gap;
206         }
207         m_Aln->SetPairwiseAlns()[row] = dst;
208     }
209     // Now the new anchored alignment should contain all segments and gaps
210     // aligned to the new anchor coordinates.
211 
212     m_SecondRanges.resize(dim);
213     for (TDim row = 0;  row < dim;  ++row) {
214 
215         /// Check collections flags
216         _ASSERT( !m_Aln->GetPairwiseAlns()[row]->IsSet(TAlnRngColl::fInvalid) );
217         _ASSERT( !m_Aln->GetPairwiseAlns()[row]->IsSet(TAlnRngColl::fUnsorted) );
218         _ASSERT( !m_Aln->GetPairwiseAlns()[row]->IsSet(TAlnRngColl::fOverlap) );
219 
220         /// Determine m_FirstRange
221         if (row == 0) {
222             m_FirstRange = m_Aln->GetPairwiseAlns()[row]->GetFirstRange();
223         } else {
224             m_FirstRange.CombineWith(m_Aln->GetPairwiseAlns()[row]->GetFirstRange());
225         }
226 
227         /// Determine m_SecondRanges
228         CAlignRangeCollExtender<TAlnRngColl> ext(*m_Aln->GetPairwiseAlns()[row]);
229         ext.UpdateIndex();
230         m_SecondRanges[row] = ext.GetSecondRange();
231     }
232 
233     const CPairwiseAln& anch_pw = *m_Aln->GetPairwiseAlns()[m_Aln->GetAnchorRow()];
234     m_AnchorDirect = anch_pw.empty() || anch_pw.begin()->IsFirstDirect();
235 }
236 
237 
GetAlnRange() const238 CSparseAln::TRng CSparseAln::GetAlnRange() const
239 {
240     return m_FirstRange;
241 }
242 
243 
SetGapChar(TResidue gap_char)244 void CSparseAln::SetGapChar(TResidue gap_char)
245 {
246     m_GapChar = gap_char;
247 }
248 
249 
GetScope() const250 CRef<CScope> CSparseAln::GetScope() const
251 {
252     return m_Scope;
253 }
254 
255 
256 const CSparseAln::TAlnRngColl&
GetAlignCollection(TNumrow row)257 CSparseAln::GetAlignCollection(TNumrow row)
258 {
259     _ASSERT(row >= 0  &&  row < GetDim());
260     return *m_Aln->GetPairwiseAlns()[row];
261 }
262 
263 
GetSeqId(TNumrow row) const264 const CSeq_id& CSparseAln::GetSeqId(TNumrow row) const
265 {
266     _ASSERT(row >= 0  &&  row < GetDim());
267     return m_Aln->GetPairwiseAlns()[row]->GetSecondId()->GetSeqId();
268 }
269 
270 
GetSeqAlnStart(TNumrow row) const271 TSignedSeqPos   CSparseAln::GetSeqAlnStart(TNumrow row) const
272 {
273     _ASSERT(row >= 0  &&  row < GetDim());
274     return m_Aln->GetPairwiseAlns()[row]->GetFirstFrom();
275 }
276 
277 
GetSeqAlnStop(TNumrow row) const278 TSignedSeqPos CSparseAln::GetSeqAlnStop(TNumrow row) const
279 {
280     _ASSERT(row >= 0  &&  row < GetDim());
281     return m_Aln->GetPairwiseAlns()[row]->GetFirstTo();
282 }
283 
284 
GetSeqAlnRange(TNumrow row) const285 CSparseAln::TSignedRange CSparseAln::GetSeqAlnRange(TNumrow row) const
286 {
287     _ASSERT(row >= 0  &&  row < GetDim());
288     return TSignedRange(GetSeqAlnStart(row), GetSeqAlnStop(row));
289 }
290 
291 
GetSeqStart(TNumrow row) const292 TSeqPos CSparseAln::GetSeqStart(TNumrow row) const
293 {
294     _ASSERT(row >= 0  &&  row < GetDim());
295     return m_SecondRanges[row].GetFrom();
296 }
297 
298 
GetSeqStop(TNumrow row) const299 TSeqPos CSparseAln::GetSeqStop(TNumrow row) const
300 {
301     _ASSERT(row >= 0  &&  row < GetDim());
302     return m_SecondRanges[row].GetTo();
303 }
304 
305 
GetSeqRange(TNumrow row) const306 CSparseAln::TRange CSparseAln::GetSeqRange(TNumrow row) const
307 {
308     _ASSERT(row >= 0  &&  row < GetDim());
309     return TRange(GetSeqStart(row), GetSeqStop(row));
310 }
311 
312 
IsPositiveStrand(TNumrow row) const313 bool CSparseAln::IsPositiveStrand(TNumrow row) const
314 {
315     _ASSERT(row >= 0  &&  row < GetDim());
316     _ASSERT( !m_Aln->GetPairwiseAlns()[row]->IsSet(CPairwiseAln::fMixedDir) );
317     return m_Aln->GetPairwiseAlns()[row]->IsSet(CPairwiseAln::fDirect) == m_AnchorDirect;
318 }
319 
320 
IsNegativeStrand(TNumrow row) const321 bool CSparseAln::IsNegativeStrand(TNumrow row) const
322 {
323     _ASSERT(row >= 0  &&  row < GetDim());
324     _ASSERT( !m_Aln->GetPairwiseAlns()[row]->IsSet(CPairwiseAln::fMixedDir) );
325     return m_Aln->GetPairwiseAlns()[row]->IsSet(CPairwiseAln::fReversed) == m_AnchorDirect;
326 }
327 
328 
IsTranslated() const329 bool CSparseAln::IsTranslated() const {
330     /// TODO Does BaseWidth of 1 always mean nucleotide?  Should we
331     /// have an enum (with an invalid (unasigned) value?
332     const int k_unasigned_base_width = 0;
333     int base_width = k_unasigned_base_width;
334     for (TDim row = 0;  row < GetDim();  ++row) {
335         if (base_width == k_unasigned_base_width) {
336             base_width = m_Aln->GetPairwiseAlns()[row]->GetFirstBaseWidth();
337         }
338         if (base_width != m_Aln->GetPairwiseAlns()[row]->GetFirstBaseWidth()  ||
339             base_width != m_Aln->GetPairwiseAlns()[row]->GetSecondBaseWidth()) {
340             return true; //< there *at least one* base diff base width
341         }
342         /// TODO or should this check be stronger:
343         if (base_width != 1) {
344             return true;
345         }
346     }
347     return false;
348 }
349 
350 
351 inline CSparseAln::TAlnRngColl::ESearchDirection
GetCollectionSearchDirection(CSparseAln::ESearchDirection dir)352 GetCollectionSearchDirection(CSparseAln::ESearchDirection dir)
353 {
354     typedef CSparseAln::TAlnRngColl   T;
355     switch(dir) {
356     case CSparseAln::eNone:
357         return T::eNone;
358     case CSparseAln::eLeft:
359         return T::eLeft;
360     case CSparseAln::eRight:
361         return T::eRight;
362     case CSparseAln::eForward:
363         return T::eForward;
364     case CSparseAln::eBackwards:
365         return T::eBackwards;
366     }
367     _ASSERT(false); // invalid
368     return T::eNone;
369 }
370 
371 
372 TSignedSeqPos
GetAlnPosFromSeqPos(TNumrow row,TSeqPos seq_pos,ESearchDirection dir,bool try_reverse_dir) const373 CSparseAln::GetAlnPosFromSeqPos(TNumrow row,
374                                 TSeqPos seq_pos,
375                                 ESearchDirection dir,
376                                 bool try_reverse_dir) const
377 {
378     _ASSERT(row >= 0  &&  row < GetDim());
379 
380     TAlnRngColl::ESearchDirection c_dir = GetCollectionSearchDirection(dir);
381     return m_Aln->GetPairwiseAlns()[row]->GetFirstPosBySecondPos(seq_pos, c_dir);
382 }
383 
384 
GetSeqPosFromAlnPos(TNumrow row,TSeqPos aln_pos,ESearchDirection dir,bool try_reverse_dir) const385 TSignedSeqPos CSparseAln::GetSeqPosFromAlnPos(TNumrow row, TSeqPos aln_pos,
386                                               ESearchDirection dir,
387                                               bool try_reverse_dir) const
388 {
389     _ASSERT(row >= 0  &&  row < GetDim());
390 
391     TAlnRngColl::ESearchDirection c_dir = GetCollectionSearchDirection(dir);
392     return m_Aln->GetPairwiseAlns()[row]->GetSecondPosByFirstPos(aln_pos, c_dir);
393 }
394 
395 
GetBioseqHandle(TNumrow row) const396 const CBioseq_Handle&  CSparseAln::GetBioseqHandle(TNumrow row) const
397 {
398     _ASSERT(row >= 0  &&  row < GetDim());
399 
400     if ( !m_BioseqHandles[row] ) {
401         if ( !(m_BioseqHandles[row] = m_Scope->GetBioseqHandle(GetSeqId(row))) ) {
402             string errstr = "Invalid bioseq handle.  Seq id \"" +
403                 GetSeqId(row).AsFastaString() + "\" not in scope?";
404             NCBI_THROW(CAlnException, eInvalidRequest, errstr);
405         }
406     }
407     return m_BioseqHandles[row];
408 }
409 
410 
x_GetSeqVector(TNumrow row) const411 CSeqVector& CSparseAln::x_GetSeqVector(TNumrow row) const
412 {
413     _ASSERT(row >= 0  &&  row < GetDim());
414 
415     if ( !m_SeqVectors[row] ) {
416         CSeqVector vec = GetBioseqHandle(row).GetSeqVector
417             (CBioseq_Handle::eCoding_Iupac,
418              IsPositiveStrand(row) ?
419              CBioseq_Handle::eStrand_Plus :
420              CBioseq_Handle::eStrand_Minus);
421         m_SeqVectors[row].Reset(new CSeqVector(vec));
422     }
423 
424     CSeqVector& seq_vec = *m_SeqVectors[row];
425     if ( seq_vec.IsNucleotide() ) {
426         if (m_NaCoding != CSeq_data::e_not_set) {
427             seq_vec.SetCoding(m_NaCoding);
428         }
429         else {
430             seq_vec.SetIupacCoding();
431         }
432     }
433     else if ( seq_vec.IsProtein() ) {
434         if (m_AaCoding != CSeq_data::e_not_set) {
435             seq_vec.SetCoding(m_AaCoding);
436         }
437         else {
438             seq_vec.SetIupacCoding();
439         }
440     }
441 
442     return seq_vec;
443 }
444 
445 
x_GetGenCode(TNumrow row) const446 int CSparseAln::x_GetGenCode(TNumrow row) const
447 {
448     CBioseq_Handle h = GetBioseqHandle(row);
449     if ( !h ) return kDefaultGenCode;
450 
451     // Try to use bio-source first.
452     CConstRef<CBioSource> src(sequence::GetBioSource(h));
453     if ( src ) return src->GetGenCode();
454 
455     // Fallback to org-ref.
456     CConstRef<COrg_ref> ref(sequence::GetOrg_refOrNull(h));
457     if ( ref ) return ref->GetGcode();
458 
459     return kDefaultGenCode;
460 }
461 
462 
TranslateNAToAA(const string & na,string & aa,int gencode)463 void CSparseAln::TranslateNAToAA(const string& na,
464                                  string& aa,
465                                  int gencode)
466 {
467     const CTrans_table& tbl = CGen_code_table::GetTransTable(gencode);
468 
469     size_t na_remainder = na.size() % 3;
470     size_t na_size = na.size() - na_remainder;
471 
472     if (&aa != &na) {
473         aa.resize(na_size / 3 + (na_remainder ? 1 : 0));
474     }
475 
476     if ( na.empty() ) return;
477 
478     size_t aa_i = 0;
479     int state = 0;
480     for (size_t na_i = 0;  na_i < na_size; ) {
481         for (size_t i = 0;  i < 3;  ++i, ++na_i) {
482             state = tbl.NextCodonState(state, na[na_i]);
483         }
484         aa[aa_i++] = tbl.GetCodonResidue(state);
485     }
486     if (na_remainder) {
487         aa[aa_i++] = '\\';
488     }
489 
490     if (&aa == &na) {
491         aa.resize(aa_i);
492     }
493 }
494 
495 
GetSeqString(TNumrow row,string & buffer,TSeqPos seq_from,TSeqPos seq_to,bool force_translation) const496 string& CSparseAln::GetSeqString(TNumrow row,
497                                  string &buffer,
498                                  TSeqPos seq_from, TSeqPos seq_to,
499                                  bool force_translation) const
500 {
501     return GetSeqString(row, buffer, TRange(seq_from, seq_to), force_translation);
502 }
503 
504 
GetSeqString(TNumrow row,string & buffer,const TRange & rq_seq_range,bool force_translation) const505 string& CSparseAln::GetSeqString(TNumrow row,
506                                  string &buffer,
507                                  const TRange &rq_seq_range,
508                                  bool force_translation) const
509 {
510     _ASSERT(row >= 0  &&  row < GetDim());
511 
512     TRange seq_range = rq_seq_range;
513     if ( seq_range.IsWhole() ) {
514         seq_range = GetSeqRange(row);
515     }
516 
517     buffer.erase();
518     int width = m_Aln->GetPairwiseAlns()[row]->GetSecondBaseWidth();
519     TSeqPos tr_from = seq_range.GetFrom();
520     TSeqPos tr_to = seq_range.GetToOpen();
521     if (width > 1) {
522         // It's a protein sequence. Protein coordinates were multiplied by 3,
523         // any fractions (incomplete AAs) must be discarded.
524         tr_from /= 3;
525         if (seq_range.GetFrom() % 3 > 0) {
526             tr_from++; // skip incomplete AA
527         }
528         tr_to /= 3;
529         force_translation = false; // do not translate AAs.
530     }
531     if (tr_to > tr_from) {
532         CSeqVector& seq_vector = x_GetSeqVector(row);
533 
534         TSeqPos size = tr_to - tr_from;
535         buffer.resize(size, m_GapChar);
536 
537         if (IsPositiveStrand(row)) {
538             seq_vector.GetSeqData(tr_from, tr_to, buffer);
539         } else {
540             TSeqPos vec_size = seq_vector.size();
541             seq_vector.GetSeqData(vec_size - tr_to, vec_size - tr_from, buffer);
542         }
543 
544         if ( force_translation ) {
545             TranslateNAToAA(buffer, buffer, x_GetGenCode(row));
546         }
547     }
548     return buffer;
549 }
550 
551 
GetAlnSeqString(TNumrow row,string & buffer,const TSignedRange & rq_aln_range,bool force_translation) const552 string& CSparseAln::GetAlnSeqString(TNumrow row,
553                                     string &buffer,
554                                     const TSignedRange &rq_aln_range,
555                                     bool force_translation) const
556 {
557     _ASSERT(row >= 0  &&  row < GetDim());
558 
559     TSignedRange aln_range(rq_aln_range);
560     if ( aln_range.IsWhole() ) {
561         aln_range = GetSeqAlnRange(row);
562     }
563 
564     buffer.erase();
565     if (aln_range.GetLength() <= 0) {
566         return buffer;
567     }
568 
569     const CPairwiseAln& pairwise_aln = *m_Aln->GetPairwiseAlns()[row];
570     if (pairwise_aln.empty()) {
571         string errstr = "Invalid (empty) row (" + NStr::IntToString(row) + ").  Seq id \"" +
572             GetSeqId(row).AsFastaString() + "\".";
573         NCBI_THROW(CAlnException, eInvalidRequest, errstr);
574     }
575 
576     CSeqVector& seq_vector = x_GetSeqVector(row);
577     TSeqPos vec_size = seq_vector.size();
578 
579     const int base_width = pairwise_aln.GetSecondBaseWidth();
580     int gencode = 0;
581     bool translate = force_translation  ||  pairwise_aln.GetSecondId()->IsProtein();
582 
583     // buffer holds sequence for "aln_range", 0 index corresonds to aln_range.GetFrom()
584     size_t buf_size = aln_range.GetLength();
585     if (translate) {
586         gencode = x_GetGenCode(row);
587         buf_size /= 3;
588     }
589     buffer.resize(buf_size, m_GapChar);
590 
591     string s; // current segment sequence
592     CSparse_CI it(*this, row, IAlnSegmentIterator::eSkipInserts, aln_range);
593 
594     // Store last incomplete codon's position and length. If the next
595     // segment includes the rest of the same codon, it should be included
596     // in the results. Otherwise incomplete codons are removed from protein
597     // sequences.
598     TSeqPos split_codon_pos = kInvalidSeqPos;
599     // When trimming one or more codons, future offsets must be adjusted.
600     bool direct = IsPositiveStrand(row);
601     bool is_first_seg = true;
602     size_t trim_from = 0;
603     size_t trim_to = 0;
604     while ( it )   {
605         trim_to = 0;
606         const IAlnSegment::TSignedRange& aln_r = it->GetAlnRange(); // in alignment
607         const IAlnSegment::TSignedRange& row_r = it->GetRange(); // on sequence
608         if ( row_r.Empty() ) {
609             ++it;
610             is_first_seg = false;
611             continue;
612         }
613 
614         size_t off = aln_r.GetFrom() - aln_range.GetFrom();
615         if (base_width == 1) {
616             // TODO performance issue - waiting for better API
617             if ( direct ) {
618                 seq_vector.GetSeqData(row_r.GetFrom(), row_r.GetToOpen(), s);
619             } else {
620                 seq_vector.GetSeqData(vec_size - row_r.GetToOpen(),
621                     vec_size - row_r.GetFrom(), s);
622             }
623             if ( translate ) {
624                 TranslateNAToAA(s, s, gencode);
625                 off /= 3;
626             }
627         }
628         else {
629             _ASSERT(base_width == 3);
630             TSeqPos tr_from = row_r.GetFrom();
631             TSeqPos tr_to = row_r.GetToOpen();
632             if ( direct ) {
633                 if (tr_from % 3 > 0) {
634                     if (tr_from == split_codon_pos) {
635                         // Include incomplete codon from the last range.
636                         _ASSERT(off > 0);
637                         if ( is_first_seg ) trim_from = tr_from % 3;
638                         off -= tr_from % 3;
639                         tr_from -= tr_from % 3;
640                     }
641                     else {
642                         // Trim incomplete codon at range start.
643                         off += 3 - (tr_from % 3);
644                         tr_from += 3 - (tr_from % 3);
645                     }
646                 }
647                 if (tr_to % 3 > 0) {
648                     split_codon_pos = tr_to;
649                     trim_to = tr_to % 3;
650                     tr_to -= tr_to % 3;
651                 }
652             }
653             else /* reverse */ {
654                 if (tr_to % 3 > 0) {
655                     if (tr_to == split_codon_pos) {
656                         // Include incomplete codon from the last range.
657                         _ASSERT(off > 0);
658                         if ( is_first_seg ) trim_from = 3 - (tr_to % 3);
659                         off -= 3 - (tr_to % 3);
660                         tr_to += 3 - (tr_to % 3);
661                     }
662                     else {
663                         off += tr_to % 3;
664                         tr_to -= tr_to % 3;
665                     }
666                 }
667                 if (tr_from % 3 > 0) {
668                     split_codon_pos = tr_from;
669                     trim_to = 3 - (tr_from % 3);
670                     tr_from += 3 - (tr_from % 3);
671                 }
672             }
673             off /= 3;
674             _ASSERT(tr_from % 3 == 0);
675             _ASSERT(tr_to % 3 == 0);
676             IAlnSegment::TSignedRange prot_r;
677             prot_r.SetOpen(tr_from / 3, tr_to / 3);
678             if ( direct ) {
679                 seq_vector.GetSeqData(prot_r.GetFrom(),
680                                         prot_r.GetToOpen(), s);
681             }
682             else {
683                 seq_vector.GetSeqData(vec_size - prot_r.GetToOpen(),
684                                         vec_size - prot_r.GetFrom(), s);
685             }
686         }
687 
688         size_t len = min(buf_size - off, s.size());
689         _ASSERT(off + len <= buf_size);
690 
691         if (len > 0) {
692             if ( m_AnchorDirect ) {
693                 buffer.replace(off, len, s, 0, len);
694             }
695             else {
696                 buffer.replace(buf_size - off - len, len, s, 0, len);
697             }
698         }
699         ++it;
700         is_first_seg = false;
701     }
702     if (translate  &&  aln_range.GetLength() >= trim_from + trim_to) {
703         buffer.resize((aln_range.GetLength() - trim_from - trim_to) / 3);
704     }
705     return buffer;
706 }
707 
708 
709 IAlnSegmentIterator*
CreateSegmentIterator(TNumrow row,const TSignedRange & range,IAlnSegmentIterator::EFlags flag) const710 CSparseAln::CreateSegmentIterator(TNumrow row,
711                                   const TSignedRange& range,
712                                   IAlnSegmentIterator::EFlags flag) const
713 {
714     _ASSERT(row >= 0  &&  row < GetDim());
715     _ASSERT( !m_Aln->GetPairwiseAlns()[row]->empty() );
716     if (m_Aln->GetPairwiseAlns()[row]->empty()) {
717         string errstr = "Invalid (empty) row (" + NStr::IntToString(row) + ").  Seq id \"" +
718             GetSeqId(row).AsFastaString() + "\".";
719         NCBI_THROW(CAlnException, eInvalidRequest, errstr);
720     }
721     return new CSparse_CI(*this, row, flag, range);
722 }
723 
724 
725 END_NCBI_SCOPE
726