1 /* $Id: sparse_aln.cpp 601341 2020-02-05 20:27:33Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Andrey Yazhuk
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33
34 #include <objtools/alnmgr/sparse_aln.hpp>
35 #include <objtools/alnmgr/sparse_ci.hpp>
36 #include <objtools/alnmgr/alnexception.hpp>
37 #include <objtools/error_codes.hpp>
38 #include <objmgr/util/sequence.hpp>
39
40 #include <objects/seqalign/Sparse_align.hpp>
41 #include <objects/seqfeat/Genetic_code_table.hpp>
42 #include <objects/seqfeat/Org_ref.hpp>
43
44
45 #define NCBI_USE_ERRCODE_X Objtools_Aln_Sparse
46
47
48 BEGIN_NCBI_SCOPE
49 USING_SCOPE(ncbi::objects);
50
51
CSparseAln(const CAnchoredAln & anchored_aln,objects::CScope & scope)52 CSparseAln::CSparseAln(const CAnchoredAln& anchored_aln,
53 objects::CScope& scope)
54 : m_Scope(&scope),
55 m_GapChar('-'),
56 m_NaCoding(CSeq_data::e_not_set),
57 m_AaCoding(CSeq_data::e_not_set),
58 m_AnchorDirect(true)
59 {
60 x_Build(anchored_aln);
61 }
62
63
~CSparseAln()64 CSparseAln::~CSparseAln()
65 {
66 }
67
68
GetDim() const69 CSparseAln::TDim CSparseAln::GetDim() const {
70 return m_Aln->GetDim();
71 }
72
73
74 struct SGapRange
75 {
76 TSignedSeqPos from; // original position of the gap on the anchor
77 TSignedSeqPos second_from; // position on the sequence
78 TSignedSeqPos len; // length of the gap
79 bool direct;
80 int row; // Row, containing the gap.
81 //size_t idx; // Index of the gap in the original vector.
82 // This gap's 'from' must be shifted by 'shift'.
83 // Segments at or after 'from' position must be offset by 'shift + len'.
84 TSignedSeqPos shift;
85
operator <SGapRange86 bool operator<(const SGapRange& rg) const
87 {
88 if (from != rg.from) return from < rg.from; // sort by pos
89 return row < rg.row; // lower rows go first.
90 // Don't check the length. Stable sort will preserve the order
91 // of consecutive gaps on the same row.
92 }
93 };
94
95
96 typedef vector<SGapRange> TGapRanges;
97
98
x_Build(const CAnchoredAln & src_align)99 void CSparseAln::x_Build(const CAnchoredAln& src_align)
100 {
101 const TDim& dim = src_align.GetDim();
102
103 m_BioseqHandles.clear();
104 m_BioseqHandles.resize(dim);
105
106 m_SeqVectors.clear();
107 m_SeqVectors.resize(dim);
108
109 // Collect all gaps on all rows. They need to be inserted into the
110 // new anchor as normal segments.
111 TGapRanges gaps;
112 for (TDim row = 0; row < dim; ++row) {
113 const CPairwiseAln& pw = *src_align.GetPairwiseAlns()[row];
114 const CPairwiseAln::TInsertions& ins_vec = pw.GetInsertions();
115 gaps.reserve(gaps.size() + ins_vec.size());
116 for (auto i = ins_vec.begin(); i != ins_vec.end(); ++i) {
117 SGapRange gap;
118 gap.from = i->GetFirstFrom();
119 gap.second_from = i->GetSecondFrom();
120 gap.len = i->GetLength();
121 gap.direct = i->IsDirect();
122 gap.row = row;
123 gap.shift = 0;
124 //gap.idx = i;
125 gaps.push_back(gap);
126 }
127 }
128
129 // We need to preserve the order of consecutive gaps at the same
130 // position on the same row. Use stable_sort.
131 stable_sort(gaps.begin(), gaps.end());
132 // Set shift for all gaps.
133 TSignedSeqPos shift = 0;
134 NON_CONST_ITERATE(TGapRanges, gap_it, gaps) {
135 gap_it->shift = shift;
136 shift += gap_it->len;
137 }
138
139 m_Aln.Reset(new CAnchoredAln());
140 m_Aln->SetDim(dim);
141 m_Aln->SetAnchorRow(src_align.GetAnchorRow());
142 m_Aln->SetScore(src_align.GetScore());
143
144 // Now for each row convert insertions to normal segments and change
145 // the old anchor coordinates to alignment coordinates.
146 for (TDim row = 0; row < dim; ++row) {
147 const CPairwiseAln& pw = *src_align.GetPairwiseAlns()[row];
148 CPairwiseAln::const_iterator seg_it = pw.begin();
149 TGapRanges::const_iterator gap = gaps.begin();
150 TSignedSeqPos shift = 0;
151 CRef<CPairwiseAln> dst(new CPairwiseAln(
152 pw.GetFirstId(), pw.GetSecondId(), pw.GetFlags()));
153 TSeqPos last_to = 0;
154 bool first_direct = true;
155 bool second_direct = true;
156 while (seg_it != pw.end()) {
157 CPairwiseAln::TAlignRange rg = *seg_it;
158 first_direct = rg.IsFirstDirect();
159 second_direct = rg.IsDirect();
160 ++seg_it;
161 // Check if there are gaps before the new segment's end.
162 while (gap != gaps.end() &&
163 gap->from < rg.GetFirstToOpen()) {
164 if (gap->row == row) {
165 // Insertion in this row - align to the anchor.
166 CPairwiseAln::TAlignRange ins(gap->from + shift,
167 gap->second_from, gap->len, gap->direct);
168 // Reset flags to match row orientation.
169 ins.SetFirstDirect(first_direct);
170 ins.SetDirect(second_direct);
171 dst->push_back(ins);
172 }
173 else if (gap->from > rg.GetFirstFrom()) {
174 // Split the range if there are insertions in other rows.
175 CPairwiseAln::TAlignRange sub = rg;
176 sub.SetLength(gap->from - rg.GetFirstFrom());
177 sub.SetFirstFrom(sub.GetFirstFrom() + shift);
178 if (rg.IsDirect()) {
179 rg.SetSecondFrom(rg.GetSecondFrom() + sub.GetLength());
180 }
181 else {
182 sub.SetSecondFrom(rg.GetSecondToOpen() - sub.GetLength());
183 }
184 rg.SetFirstFrom(rg.GetFirstFrom() + sub.GetLength());
185 rg.SetLength(rg.GetLength() - sub.GetLength());
186 dst->push_back(sub);
187 }
188 shift = gap->shift + gap->len;
189 ++gap;
190 }
191 rg.SetFirstFrom(rg.GetFirstFrom() + shift);
192 dst->push_back(rg);
193 last_to = rg.GetSecondToOpen();
194 }
195 // Still have gaps in some rows?
196 while (gap != gaps.end()) {
197 if (gap->row == row) {
198 CPairwiseAln::TAlignRange ins(gap->from + shift,
199 last_to, gap->len, gap->direct);
200 ins.SetFirstDirect(first_direct);
201 ins.SetDirect(second_direct);
202 dst->push_back(ins);
203 }
204 shift = gap->shift + gap->len;
205 ++gap;
206 }
207 m_Aln->SetPairwiseAlns()[row] = dst;
208 }
209 // Now the new anchored alignment should contain all segments and gaps
210 // aligned to the new anchor coordinates.
211
212 m_SecondRanges.resize(dim);
213 for (TDim row = 0; row < dim; ++row) {
214
215 /// Check collections flags
216 _ASSERT( !m_Aln->GetPairwiseAlns()[row]->IsSet(TAlnRngColl::fInvalid) );
217 _ASSERT( !m_Aln->GetPairwiseAlns()[row]->IsSet(TAlnRngColl::fUnsorted) );
218 _ASSERT( !m_Aln->GetPairwiseAlns()[row]->IsSet(TAlnRngColl::fOverlap) );
219
220 /// Determine m_FirstRange
221 if (row == 0) {
222 m_FirstRange = m_Aln->GetPairwiseAlns()[row]->GetFirstRange();
223 } else {
224 m_FirstRange.CombineWith(m_Aln->GetPairwiseAlns()[row]->GetFirstRange());
225 }
226
227 /// Determine m_SecondRanges
228 CAlignRangeCollExtender<TAlnRngColl> ext(*m_Aln->GetPairwiseAlns()[row]);
229 ext.UpdateIndex();
230 m_SecondRanges[row] = ext.GetSecondRange();
231 }
232
233 const CPairwiseAln& anch_pw = *m_Aln->GetPairwiseAlns()[m_Aln->GetAnchorRow()];
234 m_AnchorDirect = anch_pw.empty() || anch_pw.begin()->IsFirstDirect();
235 }
236
237
GetAlnRange() const238 CSparseAln::TRng CSparseAln::GetAlnRange() const
239 {
240 return m_FirstRange;
241 }
242
243
SetGapChar(TResidue gap_char)244 void CSparseAln::SetGapChar(TResidue gap_char)
245 {
246 m_GapChar = gap_char;
247 }
248
249
GetScope() const250 CRef<CScope> CSparseAln::GetScope() const
251 {
252 return m_Scope;
253 }
254
255
256 const CSparseAln::TAlnRngColl&
GetAlignCollection(TNumrow row)257 CSparseAln::GetAlignCollection(TNumrow row)
258 {
259 _ASSERT(row >= 0 && row < GetDim());
260 return *m_Aln->GetPairwiseAlns()[row];
261 }
262
263
GetSeqId(TNumrow row) const264 const CSeq_id& CSparseAln::GetSeqId(TNumrow row) const
265 {
266 _ASSERT(row >= 0 && row < GetDim());
267 return m_Aln->GetPairwiseAlns()[row]->GetSecondId()->GetSeqId();
268 }
269
270
GetSeqAlnStart(TNumrow row) const271 TSignedSeqPos CSparseAln::GetSeqAlnStart(TNumrow row) const
272 {
273 _ASSERT(row >= 0 && row < GetDim());
274 return m_Aln->GetPairwiseAlns()[row]->GetFirstFrom();
275 }
276
277
GetSeqAlnStop(TNumrow row) const278 TSignedSeqPos CSparseAln::GetSeqAlnStop(TNumrow row) const
279 {
280 _ASSERT(row >= 0 && row < GetDim());
281 return m_Aln->GetPairwiseAlns()[row]->GetFirstTo();
282 }
283
284
GetSeqAlnRange(TNumrow row) const285 CSparseAln::TSignedRange CSparseAln::GetSeqAlnRange(TNumrow row) const
286 {
287 _ASSERT(row >= 0 && row < GetDim());
288 return TSignedRange(GetSeqAlnStart(row), GetSeqAlnStop(row));
289 }
290
291
GetSeqStart(TNumrow row) const292 TSeqPos CSparseAln::GetSeqStart(TNumrow row) const
293 {
294 _ASSERT(row >= 0 && row < GetDim());
295 return m_SecondRanges[row].GetFrom();
296 }
297
298
GetSeqStop(TNumrow row) const299 TSeqPos CSparseAln::GetSeqStop(TNumrow row) const
300 {
301 _ASSERT(row >= 0 && row < GetDim());
302 return m_SecondRanges[row].GetTo();
303 }
304
305
GetSeqRange(TNumrow row) const306 CSparseAln::TRange CSparseAln::GetSeqRange(TNumrow row) const
307 {
308 _ASSERT(row >= 0 && row < GetDim());
309 return TRange(GetSeqStart(row), GetSeqStop(row));
310 }
311
312
IsPositiveStrand(TNumrow row) const313 bool CSparseAln::IsPositiveStrand(TNumrow row) const
314 {
315 _ASSERT(row >= 0 && row < GetDim());
316 _ASSERT( !m_Aln->GetPairwiseAlns()[row]->IsSet(CPairwiseAln::fMixedDir) );
317 return m_Aln->GetPairwiseAlns()[row]->IsSet(CPairwiseAln::fDirect) == m_AnchorDirect;
318 }
319
320
IsNegativeStrand(TNumrow row) const321 bool CSparseAln::IsNegativeStrand(TNumrow row) const
322 {
323 _ASSERT(row >= 0 && row < GetDim());
324 _ASSERT( !m_Aln->GetPairwiseAlns()[row]->IsSet(CPairwiseAln::fMixedDir) );
325 return m_Aln->GetPairwiseAlns()[row]->IsSet(CPairwiseAln::fReversed) == m_AnchorDirect;
326 }
327
328
IsTranslated() const329 bool CSparseAln::IsTranslated() const {
330 /// TODO Does BaseWidth of 1 always mean nucleotide? Should we
331 /// have an enum (with an invalid (unasigned) value?
332 const int k_unasigned_base_width = 0;
333 int base_width = k_unasigned_base_width;
334 for (TDim row = 0; row < GetDim(); ++row) {
335 if (base_width == k_unasigned_base_width) {
336 base_width = m_Aln->GetPairwiseAlns()[row]->GetFirstBaseWidth();
337 }
338 if (base_width != m_Aln->GetPairwiseAlns()[row]->GetFirstBaseWidth() ||
339 base_width != m_Aln->GetPairwiseAlns()[row]->GetSecondBaseWidth()) {
340 return true; //< there *at least one* base diff base width
341 }
342 /// TODO or should this check be stronger:
343 if (base_width != 1) {
344 return true;
345 }
346 }
347 return false;
348 }
349
350
351 inline CSparseAln::TAlnRngColl::ESearchDirection
GetCollectionSearchDirection(CSparseAln::ESearchDirection dir)352 GetCollectionSearchDirection(CSparseAln::ESearchDirection dir)
353 {
354 typedef CSparseAln::TAlnRngColl T;
355 switch(dir) {
356 case CSparseAln::eNone:
357 return T::eNone;
358 case CSparseAln::eLeft:
359 return T::eLeft;
360 case CSparseAln::eRight:
361 return T::eRight;
362 case CSparseAln::eForward:
363 return T::eForward;
364 case CSparseAln::eBackwards:
365 return T::eBackwards;
366 }
367 _ASSERT(false); // invalid
368 return T::eNone;
369 }
370
371
372 TSignedSeqPos
GetAlnPosFromSeqPos(TNumrow row,TSeqPos seq_pos,ESearchDirection dir,bool try_reverse_dir) const373 CSparseAln::GetAlnPosFromSeqPos(TNumrow row,
374 TSeqPos seq_pos,
375 ESearchDirection dir,
376 bool try_reverse_dir) const
377 {
378 _ASSERT(row >= 0 && row < GetDim());
379
380 TAlnRngColl::ESearchDirection c_dir = GetCollectionSearchDirection(dir);
381 return m_Aln->GetPairwiseAlns()[row]->GetFirstPosBySecondPos(seq_pos, c_dir);
382 }
383
384
GetSeqPosFromAlnPos(TNumrow row,TSeqPos aln_pos,ESearchDirection dir,bool try_reverse_dir) const385 TSignedSeqPos CSparseAln::GetSeqPosFromAlnPos(TNumrow row, TSeqPos aln_pos,
386 ESearchDirection dir,
387 bool try_reverse_dir) const
388 {
389 _ASSERT(row >= 0 && row < GetDim());
390
391 TAlnRngColl::ESearchDirection c_dir = GetCollectionSearchDirection(dir);
392 return m_Aln->GetPairwiseAlns()[row]->GetSecondPosByFirstPos(aln_pos, c_dir);
393 }
394
395
GetBioseqHandle(TNumrow row) const396 const CBioseq_Handle& CSparseAln::GetBioseqHandle(TNumrow row) const
397 {
398 _ASSERT(row >= 0 && row < GetDim());
399
400 if ( !m_BioseqHandles[row] ) {
401 if ( !(m_BioseqHandles[row] = m_Scope->GetBioseqHandle(GetSeqId(row))) ) {
402 string errstr = "Invalid bioseq handle. Seq id \"" +
403 GetSeqId(row).AsFastaString() + "\" not in scope?";
404 NCBI_THROW(CAlnException, eInvalidRequest, errstr);
405 }
406 }
407 return m_BioseqHandles[row];
408 }
409
410
x_GetSeqVector(TNumrow row) const411 CSeqVector& CSparseAln::x_GetSeqVector(TNumrow row) const
412 {
413 _ASSERT(row >= 0 && row < GetDim());
414
415 if ( !m_SeqVectors[row] ) {
416 CSeqVector vec = GetBioseqHandle(row).GetSeqVector
417 (CBioseq_Handle::eCoding_Iupac,
418 IsPositiveStrand(row) ?
419 CBioseq_Handle::eStrand_Plus :
420 CBioseq_Handle::eStrand_Minus);
421 m_SeqVectors[row].Reset(new CSeqVector(vec));
422 }
423
424 CSeqVector& seq_vec = *m_SeqVectors[row];
425 if ( seq_vec.IsNucleotide() ) {
426 if (m_NaCoding != CSeq_data::e_not_set) {
427 seq_vec.SetCoding(m_NaCoding);
428 }
429 else {
430 seq_vec.SetIupacCoding();
431 }
432 }
433 else if ( seq_vec.IsProtein() ) {
434 if (m_AaCoding != CSeq_data::e_not_set) {
435 seq_vec.SetCoding(m_AaCoding);
436 }
437 else {
438 seq_vec.SetIupacCoding();
439 }
440 }
441
442 return seq_vec;
443 }
444
445
x_GetGenCode(TNumrow row) const446 int CSparseAln::x_GetGenCode(TNumrow row) const
447 {
448 CBioseq_Handle h = GetBioseqHandle(row);
449 if ( !h ) return kDefaultGenCode;
450
451 // Try to use bio-source first.
452 CConstRef<CBioSource> src(sequence::GetBioSource(h));
453 if ( src ) return src->GetGenCode();
454
455 // Fallback to org-ref.
456 CConstRef<COrg_ref> ref(sequence::GetOrg_refOrNull(h));
457 if ( ref ) return ref->GetGcode();
458
459 return kDefaultGenCode;
460 }
461
462
TranslateNAToAA(const string & na,string & aa,int gencode)463 void CSparseAln::TranslateNAToAA(const string& na,
464 string& aa,
465 int gencode)
466 {
467 const CTrans_table& tbl = CGen_code_table::GetTransTable(gencode);
468
469 size_t na_remainder = na.size() % 3;
470 size_t na_size = na.size() - na_remainder;
471
472 if (&aa != &na) {
473 aa.resize(na_size / 3 + (na_remainder ? 1 : 0));
474 }
475
476 if ( na.empty() ) return;
477
478 size_t aa_i = 0;
479 int state = 0;
480 for (size_t na_i = 0; na_i < na_size; ) {
481 for (size_t i = 0; i < 3; ++i, ++na_i) {
482 state = tbl.NextCodonState(state, na[na_i]);
483 }
484 aa[aa_i++] = tbl.GetCodonResidue(state);
485 }
486 if (na_remainder) {
487 aa[aa_i++] = '\\';
488 }
489
490 if (&aa == &na) {
491 aa.resize(aa_i);
492 }
493 }
494
495
GetSeqString(TNumrow row,string & buffer,TSeqPos seq_from,TSeqPos seq_to,bool force_translation) const496 string& CSparseAln::GetSeqString(TNumrow row,
497 string &buffer,
498 TSeqPos seq_from, TSeqPos seq_to,
499 bool force_translation) const
500 {
501 return GetSeqString(row, buffer, TRange(seq_from, seq_to), force_translation);
502 }
503
504
GetSeqString(TNumrow row,string & buffer,const TRange & rq_seq_range,bool force_translation) const505 string& CSparseAln::GetSeqString(TNumrow row,
506 string &buffer,
507 const TRange &rq_seq_range,
508 bool force_translation) const
509 {
510 _ASSERT(row >= 0 && row < GetDim());
511
512 TRange seq_range = rq_seq_range;
513 if ( seq_range.IsWhole() ) {
514 seq_range = GetSeqRange(row);
515 }
516
517 buffer.erase();
518 int width = m_Aln->GetPairwiseAlns()[row]->GetSecondBaseWidth();
519 TSeqPos tr_from = seq_range.GetFrom();
520 TSeqPos tr_to = seq_range.GetToOpen();
521 if (width > 1) {
522 // It's a protein sequence. Protein coordinates were multiplied by 3,
523 // any fractions (incomplete AAs) must be discarded.
524 tr_from /= 3;
525 if (seq_range.GetFrom() % 3 > 0) {
526 tr_from++; // skip incomplete AA
527 }
528 tr_to /= 3;
529 force_translation = false; // do not translate AAs.
530 }
531 if (tr_to > tr_from) {
532 CSeqVector& seq_vector = x_GetSeqVector(row);
533
534 TSeqPos size = tr_to - tr_from;
535 buffer.resize(size, m_GapChar);
536
537 if (IsPositiveStrand(row)) {
538 seq_vector.GetSeqData(tr_from, tr_to, buffer);
539 } else {
540 TSeqPos vec_size = seq_vector.size();
541 seq_vector.GetSeqData(vec_size - tr_to, vec_size - tr_from, buffer);
542 }
543
544 if ( force_translation ) {
545 TranslateNAToAA(buffer, buffer, x_GetGenCode(row));
546 }
547 }
548 return buffer;
549 }
550
551
GetAlnSeqString(TNumrow row,string & buffer,const TSignedRange & rq_aln_range,bool force_translation) const552 string& CSparseAln::GetAlnSeqString(TNumrow row,
553 string &buffer,
554 const TSignedRange &rq_aln_range,
555 bool force_translation) const
556 {
557 _ASSERT(row >= 0 && row < GetDim());
558
559 TSignedRange aln_range(rq_aln_range);
560 if ( aln_range.IsWhole() ) {
561 aln_range = GetSeqAlnRange(row);
562 }
563
564 buffer.erase();
565 if (aln_range.GetLength() <= 0) {
566 return buffer;
567 }
568
569 const CPairwiseAln& pairwise_aln = *m_Aln->GetPairwiseAlns()[row];
570 if (pairwise_aln.empty()) {
571 string errstr = "Invalid (empty) row (" + NStr::IntToString(row) + "). Seq id \"" +
572 GetSeqId(row).AsFastaString() + "\".";
573 NCBI_THROW(CAlnException, eInvalidRequest, errstr);
574 }
575
576 CSeqVector& seq_vector = x_GetSeqVector(row);
577 TSeqPos vec_size = seq_vector.size();
578
579 const int base_width = pairwise_aln.GetSecondBaseWidth();
580 int gencode = 0;
581 bool translate = force_translation || pairwise_aln.GetSecondId()->IsProtein();
582
583 // buffer holds sequence for "aln_range", 0 index corresonds to aln_range.GetFrom()
584 size_t buf_size = aln_range.GetLength();
585 if (translate) {
586 gencode = x_GetGenCode(row);
587 buf_size /= 3;
588 }
589 buffer.resize(buf_size, m_GapChar);
590
591 string s; // current segment sequence
592 CSparse_CI it(*this, row, IAlnSegmentIterator::eSkipInserts, aln_range);
593
594 // Store last incomplete codon's position and length. If the next
595 // segment includes the rest of the same codon, it should be included
596 // in the results. Otherwise incomplete codons are removed from protein
597 // sequences.
598 TSeqPos split_codon_pos = kInvalidSeqPos;
599 // When trimming one or more codons, future offsets must be adjusted.
600 bool direct = IsPositiveStrand(row);
601 bool is_first_seg = true;
602 size_t trim_from = 0;
603 size_t trim_to = 0;
604 while ( it ) {
605 trim_to = 0;
606 const IAlnSegment::TSignedRange& aln_r = it->GetAlnRange(); // in alignment
607 const IAlnSegment::TSignedRange& row_r = it->GetRange(); // on sequence
608 if ( row_r.Empty() ) {
609 ++it;
610 is_first_seg = false;
611 continue;
612 }
613
614 size_t off = aln_r.GetFrom() - aln_range.GetFrom();
615 if (base_width == 1) {
616 // TODO performance issue - waiting for better API
617 if ( direct ) {
618 seq_vector.GetSeqData(row_r.GetFrom(), row_r.GetToOpen(), s);
619 } else {
620 seq_vector.GetSeqData(vec_size - row_r.GetToOpen(),
621 vec_size - row_r.GetFrom(), s);
622 }
623 if ( translate ) {
624 TranslateNAToAA(s, s, gencode);
625 off /= 3;
626 }
627 }
628 else {
629 _ASSERT(base_width == 3);
630 TSeqPos tr_from = row_r.GetFrom();
631 TSeqPos tr_to = row_r.GetToOpen();
632 if ( direct ) {
633 if (tr_from % 3 > 0) {
634 if (tr_from == split_codon_pos) {
635 // Include incomplete codon from the last range.
636 _ASSERT(off > 0);
637 if ( is_first_seg ) trim_from = tr_from % 3;
638 off -= tr_from % 3;
639 tr_from -= tr_from % 3;
640 }
641 else {
642 // Trim incomplete codon at range start.
643 off += 3 - (tr_from % 3);
644 tr_from += 3 - (tr_from % 3);
645 }
646 }
647 if (tr_to % 3 > 0) {
648 split_codon_pos = tr_to;
649 trim_to = tr_to % 3;
650 tr_to -= tr_to % 3;
651 }
652 }
653 else /* reverse */ {
654 if (tr_to % 3 > 0) {
655 if (tr_to == split_codon_pos) {
656 // Include incomplete codon from the last range.
657 _ASSERT(off > 0);
658 if ( is_first_seg ) trim_from = 3 - (tr_to % 3);
659 off -= 3 - (tr_to % 3);
660 tr_to += 3 - (tr_to % 3);
661 }
662 else {
663 off += tr_to % 3;
664 tr_to -= tr_to % 3;
665 }
666 }
667 if (tr_from % 3 > 0) {
668 split_codon_pos = tr_from;
669 trim_to = 3 - (tr_from % 3);
670 tr_from += 3 - (tr_from % 3);
671 }
672 }
673 off /= 3;
674 _ASSERT(tr_from % 3 == 0);
675 _ASSERT(tr_to % 3 == 0);
676 IAlnSegment::TSignedRange prot_r;
677 prot_r.SetOpen(tr_from / 3, tr_to / 3);
678 if ( direct ) {
679 seq_vector.GetSeqData(prot_r.GetFrom(),
680 prot_r.GetToOpen(), s);
681 }
682 else {
683 seq_vector.GetSeqData(vec_size - prot_r.GetToOpen(),
684 vec_size - prot_r.GetFrom(), s);
685 }
686 }
687
688 size_t len = min(buf_size - off, s.size());
689 _ASSERT(off + len <= buf_size);
690
691 if (len > 0) {
692 if ( m_AnchorDirect ) {
693 buffer.replace(off, len, s, 0, len);
694 }
695 else {
696 buffer.replace(buf_size - off - len, len, s, 0, len);
697 }
698 }
699 ++it;
700 is_first_seg = false;
701 }
702 if (translate && aln_range.GetLength() >= trim_from + trim_to) {
703 buffer.resize((aln_range.GetLength() - trim_from - trim_to) / 3);
704 }
705 return buffer;
706 }
707
708
709 IAlnSegmentIterator*
CreateSegmentIterator(TNumrow row,const TSignedRange & range,IAlnSegmentIterator::EFlags flag) const710 CSparseAln::CreateSegmentIterator(TNumrow row,
711 const TSignedRange& range,
712 IAlnSegmentIterator::EFlags flag) const
713 {
714 _ASSERT(row >= 0 && row < GetDim());
715 _ASSERT( !m_Aln->GetPairwiseAlns()[row]->empty() );
716 if (m_Aln->GetPairwiseAlns()[row]->empty()) {
717 string errstr = "Invalid (empty) row (" + NStr::IntToString(row) + "). Seq id \"" +
718 GetSeqId(row).AsFastaString() + "\".";
719 NCBI_THROW(CAlnException, eInvalidRequest, errstr);
720 }
721 return new CSparse_CI(*this, row, flag, range);
722 }
723
724
725 END_NCBI_SCOPE
726