1 /*  $Id: transform_align.cpp 624079 2021-01-25 20:00:40Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Vyacheslav Chetvernin
27  *
28  * File Description: Alignment transformations
29  *
30  */
31 #include <ncbi_pch.hpp>
32 #include <algo/sequence/gene_model.hpp>
33 #include <objects/seqalign/seqalign__.hpp>
34 #include <objects/seqloc/Na_strand.hpp>
35 #include <objects/general/User_object.hpp>
36 #include <objects/general/Object_id.hpp>
37 
38 #include <objmgr/bioseq_handle.hpp>
39 #include <objmgr/scope.hpp>
40 #include <objmgr/feat_ci.hpp>
41 #include <objmgr/util/sequence.hpp>
42 
43 #include <objtools/alnmgr/score_builder_base.hpp>
44 
45 #include "feature_generator.hpp"
46 
47 BEGIN_NCBI_SCOPE
48 
49 USING_SCOPE(objects);
50 
51 
52 namespace {
53 
GetSplicedStrands(const CSpliced_seg & spliced_seg)54 pair <ENa_strand, ENa_strand> GetSplicedStrands(const CSpliced_seg& spliced_seg)
55 {
56     ENa_strand product_strand =
57         spliced_seg.IsSetProduct_strand() ?
58         spliced_seg.GetProduct_strand() :
59         (spliced_seg.GetExons().front()->IsSetProduct_strand() ?
60          spliced_seg.GetExons().front()->GetProduct_strand() :
61          eNa_strand_unknown);
62     ENa_strand genomic_strand =
63         spliced_seg.IsSetGenomic_strand() ?
64         spliced_seg.GetGenomic_strand() :
65         (spliced_seg.GetExons().front()->IsSetGenomic_strand()?
66          spliced_seg.GetExons().front()->GetGenomic_strand():
67          eNa_strand_unknown);
68 
69     return make_pair(product_strand, genomic_strand);
70 }
71 
SetProtpos(CProduct_pos & pos,int value)72 void SetProtpos(CProduct_pos &pos, int value)
73 {
74     pos.SetProtpos().SetAmin(value/3);
75     pos.SetProtpos().SetFrame((value % 3) +1);
76 }
77 
78 }
79 
80 
GetExonStructure(const CSpliced_seg & spliced_seg,vector<SExon> & exons,CScope * scope)81 void CFeatureGenerator::SImplementation::GetExonStructure(const CSpliced_seg& spliced_seg, vector<SExon>& exons, CScope* scope)
82 {
83     pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
84     ENa_strand product_strand = strands.first;
85     ENa_strand genomic_strand = strands.second;
86 
87     exons.resize(spliced_seg.GetExons().size());
88     int i = 0;
89     TSignedSeqPos prev_genomic_pos = 0;
90     TSignedSeqPos offset = 0;
91     ITERATE(CSpliced_seg::TExons, it, spliced_seg.GetExons()) {
92         const CSpliced_exon& exon = **it;
93         SExon& exon_struct = exons[i++];
94 
95         const CProduct_pos& prod_from = exon.GetProduct_start();
96         const CProduct_pos& prod_to = exon.GetProduct_end();
97 
98         exon_struct.prod_from = prod_from.AsSeqPos();
99         exon_struct.prod_to = prod_to.AsSeqPos();
100         if (product_strand == eNa_strand_minus) {
101             swap(exon_struct.prod_from, exon_struct.prod_to);
102             exon_struct.prod_from = -exon_struct.prod_from;
103             exon_struct.prod_to = -exon_struct.prod_to;
104         }
105 
106         exon_struct.genomic_from = exon.GetGenomic_start();
107         exon_struct.genomic_to = exon.GetGenomic_end();
108 
109         bool cross_the_origin = i > 1 && (
110             genomic_strand != eNa_strand_minus
111             ? (exon_struct.genomic_from < prev_genomic_pos)
112             : (exon_struct.genomic_from > prev_genomic_pos));
113 
114         if (cross_the_origin && scope) {
115             offset = scope->GetSequenceLength(spliced_seg.GetGenomic_id());
116         }
117 
118         prev_genomic_pos = exon_struct.genomic_from;
119 
120         if (genomic_strand == eNa_strand_minus) {
121             swap(exon_struct.genomic_from, exon_struct.genomic_to);
122             exon_struct.genomic_from = -exon_struct.genomic_from;
123             exon_struct.genomic_to = -exon_struct.genomic_to;
124         }
125 
126         if (offset) {
127             exon_struct.genomic_from += offset;
128             exon_struct.genomic_to += offset;
129         }
130 
131     }
132 
133     _ASSERT( exons.size() == spliced_seg.GetExons().size() );
134 }
135 
136 
StitchSmallHoles(CSeq_align & align)137 void CFeatureGenerator::SImplementation::StitchSmallHoles(CSeq_align& align)
138 {
139     CSpliced_seg& spliced_seg = align.SetSegs().SetSpliced();
140 
141     if (!spliced_seg.CanGetExons() || spliced_seg.GetExons().size() < 2)
142         return;
143 
144     vector<SExon> exons;
145     GetExonStructure(spliced_seg, exons, m_scope);
146 
147     bool is_protein = (spliced_seg.GetProduct_type()==CSpliced_seg::eProduct_type_protein);
148 
149     pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
150     ENa_strand product_strand = strands.first;
151     ENa_strand genomic_strand = strands.second;
152 
153     int product_min_pos;
154     int product_max_pos;
155     if (product_strand != eNa_strand_minus) {
156         product_min_pos = 0;
157         if (spliced_seg.IsSetPoly_a()) {
158             product_max_pos = spliced_seg.GetPoly_a()-1;
159         } else if (spliced_seg.IsSetProduct_length()) {
160             product_max_pos = spliced_seg.GetProduct_length()-1;
161             if (is_protein)
162                 product_max_pos = product_max_pos*3+2;
163         } else {
164             product_max_pos = exons.back().prod_to;
165         }
166     } else {
167         if (spliced_seg.IsSetProduct_length()) {
168             product_min_pos = -int(spliced_seg.GetProduct_length())+1;
169             if (is_protein)
170                 product_min_pos = product_min_pos*3-2;
171         } else {
172             product_min_pos = exons[0].prod_from;
173         }
174         if (spliced_seg.IsSetPoly_a()) {
175             product_max_pos = -int(spliced_seg.GetPoly_a())+1;
176         } else {
177             product_max_pos = 0;
178         }
179     }
180 
181     CSpliced_seg::TExons::iterator it = spliced_seg.SetExons().begin();
182     CRef<CSpliced_exon> prev_exon = *it;
183     size_t i = 1;
184     for (++it; it != spliced_seg.SetExons().end();  ++i, prev_exon = *it++) {
185         CSpliced_exon& exon = **it;
186 
187         bool donor_set = prev_exon->IsSetDonor_after_exon() || (genomic_strand ==eNa_strand_minus && prev_exon->GetGenomic_start()==0);
188         bool acceptor_set = exon.IsSetAcceptor_before_exon() || (genomic_strand ==eNa_strand_minus && prev_exon->GetGenomic_start()==0);
189 
190         if(donor_set && acceptor_set && exons[i-1].prod_to + 1 == exons[i].prod_from) {
191             continue;
192         }
193 
194         _ASSERT( exons[i].prod_from > exons[i-1].prod_to );
195         int prod_hole_len = exons[i].prod_from - exons[i-1].prod_to -1;
196         _ASSERT( exons[i].genomic_from > exons[i-1].genomic_to );
197         int genomic_hole_len = exons[i].genomic_from - exons[i-1].genomic_to -1;
198 
199         if (prod_hole_len >= (int)m_min_intron || genomic_hole_len >= (int)m_min_intron)
200             continue;
201 
202         if (!prev_exon->IsSetParts() || prev_exon->GetParts().empty()) {
203             CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
204             part->SetMatch(exons[i-1].prod_to-exons[i-1].prod_from+1);
205             prev_exon->SetParts().push_back(part);
206         }
207         if (!exon.IsSetParts() || exon.GetParts().empty()) {
208             CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
209             part->SetMatch(exons[i].prod_to-exons[i].prod_from+1);
210             exon.SetParts().push_back(part);
211         }
212 
213         int max_hole_len = max(prod_hole_len, genomic_hole_len);
214         int min_hole_len = min(prod_hole_len, genomic_hole_len);
215         int left_mismatch_len = 0;
216         int right_mismatch_len = min_hole_len;
217         if (prod_hole_len != genomic_hole_len) {
218             // does not matter for transcripts, but for proteins ensures insersions at codon boundary
219             int bases_needed_to_complete_codon = 2 - (exons[i-1].prod_to % 3);
220 
221             if (right_mismatch_len >= bases_needed_to_complete_codon) {
222                 left_mismatch_len = bases_needed_to_complete_codon + ((right_mismatch_len-bases_needed_to_complete_codon)/2/3)*3;
223                 right_mismatch_len -= left_mismatch_len;
224             }
225         }
226 
227         bool no_acceptor_before = i > 1 && !prev_exon->IsSetAcceptor_before_exon();
228         bool no_donor_after = i < exons.size()-1 && !exon.IsSetDonor_after_exon();
229 
230 
231         bool cross_the_origin =
232             genomic_strand != eNa_strand_minus
233             ? (prev_exon->GetGenomic_start() > exon.GetGenomic_start())
234             : (prev_exon->GetGenomic_start() < exon.GetGenomic_start());
235 
236         if (cross_the_origin) {
237             int genomic_size = m_scope->GetSequenceLength(spliced_seg.GetGenomic_id());
238 
239             prev_exon->SetPartial(product_min_pos < exons[i-1].prod_from  &&
240                                   no_acceptor_before);
241 
242             exon.SetPartial(exons[i].prod_to < product_max_pos &&
243                             no_donor_after);
244 
245             if (genomic_strand != eNa_strand_minus) {
246                 prev_exon->SetGenomic_end(genomic_size-1);
247                 exon.SetGenomic_start(0);
248             } else {
249                 prev_exon->SetGenomic_start(0);
250                 exon.SetGenomic_end(genomic_size-1);
251             }
252 
253             int origin = genomic_strand != eNa_strand_minus ? genomic_size : 1;
254             int to_origin = origin - exons[i-1].genomic_to -1;
255             if (prod_hole_len == genomic_hole_len) {
256                 left_mismatch_len = to_origin;
257                 right_mismatch_len -= left_mismatch_len;
258             }
259 
260             if (left_mismatch_len > 0 && to_origin > 0) {
261                 int mismatch_len = min(left_mismatch_len, to_origin);
262                 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
263                 part->SetMismatch(mismatch_len);
264                 prev_exon->SetParts().push_back(part);
265                 prod_hole_len -= mismatch_len;
266                 genomic_hole_len -= mismatch_len;
267                 to_origin -= mismatch_len;
268                 exons[i-1].genomic_to += mismatch_len;
269                 exons[i-1].prod_to += mismatch_len;
270                 left_mismatch_len -= mismatch_len;
271             }
272 
273             if (to_origin > 0) {
274                 _ASSERT(left_mismatch_len == 0);
275                 _ASSERT(prod_hole_len != genomic_hole_len);
276                 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
277                 if (prod_hole_len < genomic_hole_len) {
278                     int genomic_ins = min(genomic_hole_len-prod_hole_len, to_origin);
279                     part->SetGenomic_ins(genomic_ins);
280                     genomic_hole_len -= genomic_ins;
281                     to_origin -= genomic_ins;
282                     exons[i-1].genomic_to += genomic_ins;
283                 } else {
284                     part->SetProduct_ins(prod_hole_len-genomic_hole_len);
285                     exons[i-1].prod_to += prod_hole_len-genomic_hole_len;
286                     prod_hole_len = genomic_hole_len;
287                 }
288                 prev_exon->SetParts().push_back(part);
289             }
290             if (to_origin > 0) {
291                 _ASSERT(prod_hole_len == genomic_hole_len);
292                 _ASSERT(right_mismatch_len >= to_origin);
293                 int mismatch_len = to_origin;
294                 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
295                 part->SetMismatch(mismatch_len);
296                 prev_exon->SetParts().push_back(part);
297                 prod_hole_len -= mismatch_len;
298                 genomic_hole_len -= mismatch_len;
299                 to_origin = 0;
300                 exons[i-1].genomic_to += mismatch_len;
301                 exons[i-1].prod_to += mismatch_len;
302                 right_mismatch_len -= mismatch_len;
303             }
304 
305             _ASSERT(to_origin == 0);
306             _ASSERT(exons[i-1].genomic_to == origin-1);
307 
308             exons[i].prod_from = exons[i-1].prod_to+1;
309             exons[i].genomic_from = exons[i-1].genomic_to+1;
310 
311             if (is_protein) {
312                 prev_exon->SetProduct_end().SetProtpos().SetAmin() = exons[i-1].prod_to/3;
313                 prev_exon->SetProduct_end().SetProtpos().SetFrame() = (exons[i-1].prod_to %3) +1;
314                 exon.SetProduct_start().SetProtpos().SetAmin() = exons[i].prod_from/3;
315                 exon.SetProduct_start().SetProtpos().SetFrame() = (exons[i].prod_from %3) +1;
316             } else if (product_strand != eNa_strand_minus) {
317                 prev_exon->SetProduct_end().SetNucpos( exons[i-1].prod_to );
318                 exon.SetProduct_start().SetNucpos( exons[i].prod_from );
319             } else {
320                 prev_exon->SetProduct_start().SetNucpos( -exons[i-1].prod_to );
321                 exon.SetProduct_end().SetNucpos( -exons[i].prod_from );
322             }
323 
324             list <CRef< CSpliced_exon_chunk > >::iterator insertion_point = exon.SetParts().begin();
325 
326             if (left_mismatch_len > 0) {
327                 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
328                 part->SetMismatch(left_mismatch_len);
329                 insertion_point = exon.SetParts().insert(insertion_point, part);
330                 ++insertion_point;
331             }
332             if (prod_hole_len != genomic_hole_len) {
333                 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
334                 if (prod_hole_len < genomic_hole_len) {
335                     part->SetGenomic_ins(genomic_hole_len - prod_hole_len);
336                 } else {
337                     part->SetProduct_ins(prod_hole_len - genomic_hole_len);
338                 }
339                 insertion_point = exon.SetParts().insert(insertion_point, part);
340                 ++insertion_point;
341             }
342             if (right_mismatch_len > 0) {
343                 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
344                 part->SetMismatch(right_mismatch_len);
345                 exon.SetParts().insert(insertion_point, part);
346 
347             }
348 
349         } else {
350 
351             if (is_protein || product_strand != eNa_strand_minus) {
352                 prev_exon->SetProduct_end().Assign( exon.GetProduct_end() );
353             } else {
354                 prev_exon->SetProduct_start().Assign( exon.GetProduct_start() );
355             }
356 
357             if (genomic_strand != eNa_strand_minus) {
358                 prev_exon->SetGenomic_end() = exon.GetGenomic_end();
359             } else {
360                 prev_exon->SetGenomic_start() = exon.GetGenomic_start();
361             }
362 
363             if (left_mismatch_len > 0) {
364                 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
365                 part->SetMismatch(left_mismatch_len);
366                 prev_exon->SetParts().push_back(part);
367             }
368             if (prod_hole_len != genomic_hole_len) {
369                CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
370                 if (prod_hole_len < genomic_hole_len) {
371                     part->SetGenomic_ins(max_hole_len - min_hole_len);
372                 } else {
373                     part->SetProduct_ins(max_hole_len - min_hole_len);
374                 }
375                 prev_exon->SetParts().push_back(part);
376             }
377             if (right_mismatch_len > 0) {
378                 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
379                 part->SetMismatch(right_mismatch_len);
380                 prev_exon->SetParts().push_back(part);
381 
382             }
383             prev_exon->SetParts().splice(prev_exon->SetParts().end(), exon.SetParts());
384 
385             if (exon.IsSetDonor_after_exon()) {
386                 prev_exon->SetDonor_after_exon().Assign( exon.GetDonor_after_exon() );
387             } else {
388                 prev_exon->ResetDonor_after_exon();
389             }
390 
391             exons[i].prod_from = exons[i-1].prod_from;
392             exons[i].genomic_from = exons[i-1].genomic_from;
393 
394             prev_exon->SetPartial(
395                                   (product_min_pos < exons[i-1].prod_from  && no_acceptor_before) ||
396                                   (exons[i].prod_to < product_max_pos  && no_donor_after));
397 
398             if (exon.IsSetExt()) {
399                 prev_exon->SetExt().splice(prev_exon->SetExt().end(), exon.SetExt());
400             }
401 
402             CSpliced_seg::TExons::iterator save_it = it;
403             --save_it;
404             spliced_seg.SetExons().erase(it);
405             it = save_it;
406         }
407     }
408 }
409 
410 vector<CFeatureGenerator::SImplementation::SExon> CFeatureGenerator::SImplementation::
GetExons(const CSeq_align & align)411 GetExons(const CSeq_align &align)
412 {
413     vector<SExon> exons;
414     GetExonStructure(align.GetSegs().GetSpliced(), exons, NULL);
415     return exons;
416 }
417 
418 CSeq_align::EScoreType s_ScoresToRecalculate[] =
419 { CSeq_align::eScore_IdentityCount,
420   CSeq_align::eScore_MismatchCount,
421   CSeq_align::eScore_GapCount,
422   CSeq_align::eScore_PercentIdentity_Gapped,
423   CSeq_align::eScore_PercentIdentity_Ungapped,
424   CSeq_align::eScore_PercentCoverage,
425   CSeq_align::eScore_HighQualityPercentCoverage,
426   (CSeq_align::EScoreType)0
427 };
428 
429 void CFeatureGenerator::SImplementation::
ClearScores(CSeq_align & align)430 ClearScores(CSeq_align &align)
431 {
432     NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it,
433                        align.SetSegs().SetSpliced().SetExons())
434     {
435         (*exon_it)->ResetScores();
436     }
437     if (align.IsSetScore()) {
438         CScoreBuilderBase score_builder;
439         for (CSeq_align::EScoreType *score = s_ScoresToRecalculate;
440              *score; ++score)
441         {
442             align.ResetNamedScore(*score);
443         }
444         align.ResetNamedScore("weighted_identity");
445 
446         if (align.SetScore().empty()) {
447             align.ResetScore();
448         }
449     }
450 }
451 
452 
453 void CFeatureGenerator::SImplementation::
RecalculateScores(CSeq_align & align)454 RecalculateScores(CSeq_align &align)
455 {
456     NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it,
457                        align.SetSegs().SetSpliced().SetExons())
458     {
459 	RecalculateExonIdty(**exon_it);
460     }
461 
462     if (align.IsSetScore()) {
463         CScoreBuilderBase score_builder;
464         for (CSeq_align::EScoreType *score = s_ScoresToRecalculate;
465              *score; ++score)
466         {
467             int sink;
468             if (align.GetNamedScore(*score, sink)) {
469                 align.ResetNamedScore(*score);
470                 score_builder.AddScore(*m_scope, align, *score);
471             }
472         }
473         if (align.GetSegs().GetSpliced().GetProduct_type() ==
474             CSpliced_seg::eProduct_type_transcript)
475         {
476             score_builder.AddSplignScores(align);
477         }
478         align.ResetNamedScore("weighted_identity");
479     }
480 }
481 
482 void CFeatureGenerator::SImplementation::
RecalculateExonIdty(CSpliced_exon & exon)483 RecalculateExonIdty(CSpliced_exon &exon)
484 {
485     if (!exon.IsSetScores())
486         return;
487 
488     Int8 idty = -1;
489     if (exon.IsSetParts()) {
490         int matches = 0;
491         int total = 0;
492         ITERATE (CSpliced_exon::TParts, part_it, exon.GetParts()) {
493             switch ((*part_it)->Which()) {
494             case CSpliced_exon_chunk::e_Match:
495                matches += (*part_it)->GetMatch();
496                total += (*part_it)->GetMatch();
497                break;
498 
499             case CSpliced_exon_chunk::e_Mismatch:
500                total += (*part_it)->GetMismatch();
501                break;
502 
503             case CSpliced_exon_chunk::e_Product_ins:
504                total += (*part_it)->GetProduct_ins();
505                break;
506 
507             case CSpliced_exon_chunk::e_Genomic_ins:
508                total += (*part_it)->GetGenomic_ins();
509                break;
510 
511             default:
512                 matches = INT_MIN; // to ensure negative identity
513                 total += 1;        // to prevent division by zero
514                 break;
515             }
516         }
517         if (total) {
518             idty = matches * NCBI_CONST_INT8(10000000000) / total;
519         }
520         else {
521             idty = 0;
522         }
523     }
524 
525     CScore_set::Tdata& exon_scores = exon.SetScores().Set();
526     ERASE_ITERATE (CScore_set::Tdata, score_it, exon_scores) {
527         if (idty >= 0 && (*score_it)->IsSetId() && (*score_it)->GetId().IsStr() &&
528             (*score_it)->GetId().GetStr() == "idty") {
529             (*score_it)->SetValue().SetReal(idty / 10000000000.);
530         } else {
531             exon_scores.erase(score_it);
532         }
533     }
534 }
535 
TrimHolesToCodons(CSeq_align & align)536 void CFeatureGenerator::SImplementation::TrimHolesToCodons(CSeq_align& align)
537 {
538     CSpliced_seg& spliced_seg = align.SetSegs().SetSpliced();
539 
540     if (!spliced_seg.CanGetExons())
541         return;
542 
543     bool is_protein = (spliced_seg.GetProduct_type()==CSpliced_seg::eProduct_type_protein);
544 
545     pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
546     ENa_strand product_strand = strands.first;
547     ENa_strand genomic_strand = strands.second;
548 
549     TSignedSeqRange cds;
550     if (is_protein) {
551         cds = TSignedSeqRange(0, spliced_seg.GetProduct_length()*3 - 1);
552     } else {
553         if (!spliced_seg.CanGetProduct_id())
554             return;
555         cds = GetCds(spliced_seg.GetProduct_id());
556         if (cds.Empty())
557             return;
558         if (product_strand == eNa_strand_minus) {
559             NCBI_THROW(CException, eUnknown,
560                        "TrimHolesToCodons(): "
561                        "Reversed mRNA with CDS");
562         }
563     }
564 
565     vector<SExon> exons;
566     GetExonStructure(spliced_seg, exons, m_scope);
567 
568     int frame_offset = (exons.back().prod_to/3+1)*3+cds.GetFrom(); // to make modulo operands always positive
569 
570     vector<SExon>::iterator right_exon_it = exons.begin();
571     CSpliced_seg::TExons::iterator right_spl_exon_it = spliced_seg.SetExons().begin();
572 
573     for(;;++right_exon_it, ++right_spl_exon_it) {
574 
575         vector<SExon>::reverse_iterator left_exon_it(right_exon_it);
576         CSpliced_seg::TExons::reverse_iterator left_spl_exon_it(right_spl_exon_it);
577 
578         if (right_exon_it != exons.begin() && right_exon_it != exons.end()) {
579             bool donor_set = left_spl_exon_it != spliced_seg.SetExons().rend() && (*left_spl_exon_it)->IsSetDonor_after_exon();
580             bool acceptor_set = right_spl_exon_it != spliced_seg.SetExons().end() && (*right_spl_exon_it)->IsSetAcceptor_before_exon();
581 
582             if(((donor_set && acceptor_set) || left_exon_it->genomic_to + 1 == right_exon_it->genomic_from) && left_exon_it->prod_to + 1 == right_exon_it->prod_from) {
583                 continue;
584             }
585         }
586 
587         if (right_exon_it != exons.begin() && (right_exon_it != exons.end() || (m_flags & fTrimEnds))) {
588             while (exons.rend() != left_exon_it &&
589                    cds.GetFrom() < left_exon_it->prod_to && left_exon_it->prod_to < cds.GetTo() &&
590                    (left_exon_it->prod_to - cds.GetFrom() + 1) % 3 > 0
591                 ) {
592                 TrimLeftExon(min(left_exon_it->prod_to - left_exon_it->prod_from + 1,
593                                  (left_exon_it->prod_to - cds.GetFrom() + 1) % 3),
594                              eTrimProduct,
595                              exons.rend(), left_exon_it, left_spl_exon_it,
596                              product_strand, genomic_strand);
597             }
598         }
599 
600         if (right_exon_it != exons.end() && (right_exon_it != exons.begin() || (m_flags & fTrimEnds))) {
601             while (right_exon_it != exons.end() &&
602                    cds.GetFrom() < right_exon_it->prod_from && right_exon_it->prod_from < cds.GetTo() &&
603                    (frame_offset-right_exon_it->prod_from) % 3 > 0
604                 ) {
605                 TrimRightExon(min(right_exon_it->prod_to - right_exon_it->prod_from + 1,
606                                   (frame_offset-right_exon_it->prod_from) % 3),
607                               eTrimProduct,
608                               right_exon_it, exons.end(), right_spl_exon_it,
609                               product_strand, genomic_strand);
610             }
611         }
612 
613         if (left_exon_it.base() != right_exon_it) {
614             right_exon_it = exons.erase(left_exon_it.base(), right_exon_it);
615             right_spl_exon_it = spliced_seg.SetExons().erase(left_spl_exon_it.base(), right_spl_exon_it);
616         }
617 
618         if (right_exon_it == exons.end())
619             break;
620     }
621     _ASSERT(right_exon_it == exons.end() && right_spl_exon_it == spliced_seg.SetExons().end());
622 }
623 
MaximizeTranslation(CSeq_align & align)624 void CFeatureGenerator::SImplementation::MaximizeTranslation(CSeq_align& align)
625 {
626     CSpliced_seg& spliced_seg = align.SetSegs().SetSpliced();
627     bool is_protein_align =
628         spliced_seg.GetProduct_type() == CSpliced_seg::eProduct_type_protein;
629 
630     int aa_offset = 0;
631 
632     CSpliced_seg::TExons::iterator prev_exon_it = spliced_seg.SetExons().end();
633 
634     NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
635         CSpliced_exon& exon = **exon_it;
636 
637         if (exon_it == spliced_seg.SetExons().begin()) {
638             if (is_protein_align)
639                 aa_offset = - int(exon.GetProduct_start().GetProtpos().GetAmin());
640             else
641                 aa_offset = - int(exon.GetProduct_start().GetNucpos())/3;
642         }
643 
644         if (aa_offset) {
645             if (is_protein_align)
646                 exon.SetProduct_start().SetProtpos().SetAmin() += aa_offset;
647             else
648                 exon.SetProduct_start().SetNucpos() += aa_offset*3;
649         }
650         if (exon.IsSetParts()) {
651             int part_index = 0;
652             ERASE_ITERATE (CSpliced_exon::TParts, part_it, exon.SetParts()) {
653                 CSpliced_exon_chunk& chunk = **part_it;
654                 switch (chunk.Which()) {
655                 case CSpliced_exon_chunk::e_Genomic_ins: {
656                     int len = chunk.GetGenomic_ins();
657                     if (len % 3 == 0) {
658                         chunk.SetDiag(len);
659                     } else {
660                         if (part_index == 0 && prev_exon_it != spliced_seg.SetExons().end() &&
661                             (*prev_exon_it)->IsSetParts()) {
662                             CSpliced_exon_chunk& prev_chunk = **(*prev_exon_it)->SetParts().rbegin();
663                             if (prev_chunk.Which()==CSpliced_exon_chunk::e_Genomic_ins) {
664                                 int prev_len = prev_chunk.GetGenomic_ins();
665                                 if (prev_len + len >= 3) {
666 
667                                     prev_chunk.SetDiag(prev_len);
668 
669                                     if (is_protein_align) {
670                                         TSeqPos product_end = (*prev_exon_it)->GetProduct_end().AsSeqPos();
671                                         product_end += prev_len;
672                                         (*prev_exon_it)->SetProduct_end().SetProtpos().SetAmin (product_end/3);
673                                         (*prev_exon_it)->SetProduct_end().SetProtpos().SetFrame(product_end%3+1);
674 
675                                         TSeqPos product_start = exon.GetProduct_start().AsSeqPos();
676                                         product_start += prev_len;
677                                         exon.SetProduct_start().SetProtpos().SetAmin (product_start/3);
678                                         exon.SetProduct_start().SetProtpos().SetFrame(product_start%3+1);
679                                     }  else {
680                                         (*prev_exon_it)->SetProduct_end().SetNucpos() += prev_len;
681                                         exon.SetProduct_start().SetNucpos() += prev_len;
682                                     }
683 
684                                     if (len > 3-prev_len) {
685                                         CRef<CSpliced_exon_chunk> new_chunk(new CSpliced_exon_chunk);
686                                         new_chunk->SetDiag(3-prev_len);
687                                         exon.SetParts().insert(part_it, new_chunk);
688                                         chunk.SetGenomic_ins(len - (3-prev_len));
689                                     } else {
690                                         chunk.SetDiag(len);
691                                     }
692                                     aa_offset += 1;
693                                     len -= 3-prev_len;
694                                 }
695                             }
696                         }
697                         if (len > 3) {
698                             CRef<CSpliced_exon_chunk> new_chunk(new CSpliced_exon_chunk);
699                             new_chunk->SetDiag((len/3)*3);
700                             exon.SetParts().insert(part_it, new_chunk);
701                             chunk.SetGenomic_ins(len % 3);
702                         }
703                     }
704                     aa_offset += len/3;
705                 }
706                     break;
707                 case CSpliced_exon_chunk::e_Product_ins: {
708                     int len = chunk.GetProduct_ins();
709                     if (len % 3 == 0) {
710                         exon.SetParts().erase(part_it);
711                     } else {
712                         chunk.SetProduct_ins(len % 3);
713                     }
714                     aa_offset -= len/3;
715                 }
716                     break;
717                 default:
718                     break;
719                 }
720                 ++part_index;
721             }
722         }
723         if (aa_offset) {
724             if (is_protein_align)
725                 exon.SetProduct_end().SetProtpos().SetAmin() += aa_offset;
726             else
727                 exon.SetProduct_end().SetNucpos() += aa_offset*3;
728         }
729         prev_exon_it = exon_it;
730     }
731     spliced_seg.SetProduct_length() = is_protein_align
732         ? (*prev_exon_it)->GetProduct_end().GetProtpos().GetAmin()+1
733         : (*prev_exon_it)->GetProduct_end().GetNucpos()+1;
734 }
735 
AdjustAlignment(const CSeq_align & align_in,TSeqRange range,EProductPositionsMode mode)736 CConstRef<CSeq_align> CFeatureGenerator::AdjustAlignment(const CSeq_align& align_in, TSeqRange range, EProductPositionsMode mode)
737 {
738     return m_impl->AdjustAlignment(align_in, range, mode);
739 }
740 
AdjustAlignment(const CSeq_align & align_in,TSeqRange range,EProductPositionsMode mode)741 CConstRef<CSeq_align> CFeatureGenerator::SImplementation::AdjustAlignment(const CSeq_align& align_in, TSeqRange range, EProductPositionsMode mode)
742 {
743     if (!align_in.CanGetSegs() || !align_in.GetSegs().IsSpliced())
744         return CConstRef<CSeq_align>(&align_in);
745 
746     CRef<CSeq_align> align(new CSeq_align);
747     align->Assign(align_in);
748 
749     vector<SExon> orig_exons = GetExons(*align);
750 
751     CSpliced_seg& spliced_seg = align->SetSegs().SetSpliced();
752 
753     pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
754     ENa_strand product_strand = strands.first;
755     ENa_strand genomic_strand = strands.second;
756 
757     if (product_strand == eNa_strand_minus) {
758         NCBI_THROW(CException, eUnknown,
759                    "AdjustAlignment(): "
760                    "product minus strand not supported");
761 
762     }
763 
764     bool plus_strand = !(genomic_strand == eNa_strand_minus);
765 
766     TSeqRange align_range;
767     if (plus_strand) {
768         align_range = TSeqRange(spliced_seg.GetExons().front()->GetGenomic_start(),
769                                 spliced_seg.GetExons().back()->GetGenomic_end());
770     } else {
771         align_range = TSeqRange(spliced_seg.GetExons().back()->GetGenomic_start(),
772                                 spliced_seg.GetExons().front()->GetGenomic_end());
773     }
774     bool cross_the_origin = range.GetFrom() > range.GetTo() || align_range.GetFrom() > align_range.GetTo();
775     TSeqPos genomic_size = 0;
776     if (cross_the_origin) {
777         genomic_size = m_scope->GetSequenceLength(spliced_seg.GetGenomic_id());
778 
779 
780         if (range.GetFrom() > range.GetTo()) {
781             range.SetTo(range.GetTo() + genomic_size);
782         }
783         if (align_range.GetFrom() > align_range.GetTo()) {
784             align_range.SetTo(align_range.GetTo() + genomic_size);
785         }
786 
787         if (range.GetTo() < align_range.GetFrom()) {
788             range.SetFrom(range.GetFrom() + genomic_size);
789             range.SetTo(range.GetTo() + genomic_size);
790         }
791         if (align_range.GetTo() < range.GetFrom()) {
792             align_range.SetFrom(align_range.GetFrom() + genomic_size);
793             align_range.SetTo(align_range.GetTo() + genomic_size);
794         }
795 
796         TSeqPos outside_point = min(range.GetFrom(), align_range.GetFrom());
797         NON_CONST_ITERATE(CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
798             CSpliced_exon& exon = **exon_it;
799             if (exon.GetGenomic_start() < outside_point) {
800                 exon.SetGenomic_start() += genomic_size;
801                 exon.SetGenomic_end() += genomic_size;
802             }
803         }
804     }
805 
806     if (!(range.GetFrom() <= range.GetTo()) ||
807         !(align_range.GetFrom() <= align_range.GetTo())) {
808         NCBI_USER_THROW("no inverted range assertion failed");
809     }
810     if (range.GetTo() < align_range.GetFrom() ||
811         align_range.GetTo() < range.GetFrom()) {
812         NCBI_USER_THROW("alignmentrange and requested range don't overlap");
813     }
814 
815     vector<SExon> exons;
816     GetExonStructure(spliced_seg, exons, m_scope);
817 
818     bool is_protein_align =
819         spliced_seg.GetProduct_type() == CSpliced_seg::eProduct_type_protein;
820 
821     vector<SExon>::iterator right_exon_it = exons.begin();
822     CSpliced_seg::TExons::iterator right_spl_exon_it = spliced_seg.SetExons().begin();
823 
824     int range_left = plus_strand ? int(range.GetFrom()) : -int(range.GetTo());
825     int range_right = plus_strand ? int(range.GetTo()) : -int(range.GetFrom());
826 
827     for(;;++right_exon_it, ++right_spl_exon_it) {
828 
829         vector<SExon>::reverse_iterator left_exon_it(right_exon_it);
830         CSpliced_seg::TExons::reverse_iterator left_spl_exon_it(right_spl_exon_it);
831 
832         if (right_exon_it == exons.end() &&
833             left_exon_it->genomic_to > range_right
834             )
835             CFeatureGenerator::SImplementation::TrimLeftExon(left_exon_it->genomic_to - range_right, eTrimGenomic,
836                          exons.rend(), left_exon_it, left_spl_exon_it,
837                          product_strand, genomic_strand);
838 
839         if (right_exon_it == exons.begin() &&
840             right_exon_it->genomic_from < range_left
841             )
842             CFeatureGenerator::SImplementation::TrimRightExon(range_left - right_exon_it->genomic_from, eTrimGenomic,
843                           right_exon_it, exons.end(), right_spl_exon_it,
844                           product_strand, genomic_strand);
845 
846         if (left_exon_it.base() != right_exon_it) {
847             right_exon_it = exons.erase(left_exon_it.base(), right_exon_it);
848             right_spl_exon_it = spliced_seg.SetExons().erase(left_spl_exon_it.base(), right_spl_exon_it);
849         }
850 
851         if (right_exon_it == exons.end())
852             break;
853     }
854 
855     CSpliced_exon& first_exon = *spliced_seg.SetExons().front();
856     CSpliced_exon& last_exon = *spliced_seg.SetExons().back();
857 
858     int first_exon_extension = 0;
859     int last_exon_extension = 0;
860 
861     if (plus_strand) {
862 
863         first_exon_extension =
864             first_exon.GetGenomic_start()
865             - ((range.GetFrom() < genomic_size && genomic_size <= first_exon.GetGenomic_start())
866                ? genomic_size
867                : range.GetFrom());
868 
869         if (first_exon_extension > 0) {
870             first_exon.SetGenomic_start() -= first_exon_extension;
871             if (first_exon.IsSetParts()) {
872                 CRef<CSpliced_exon_chunk> chunk(new CSpliced_exon_chunk);
873                 chunk->SetDiag(first_exon_extension);
874                 first_exon.SetParts().insert(first_exon.SetParts().begin(), chunk);
875             }
876         }
877 
878         last_exon_extension =
879             ((last_exon.GetGenomic_end() <= genomic_size-1 && genomic_size-1 < range.GetTo())
880              ? genomic_size-1
881              : range.GetTo())
882             - last_exon.GetGenomic_end();
883 
884         if (last_exon_extension > 0) {
885             last_exon.SetGenomic_end() += last_exon_extension;
886             if (last_exon.IsSetParts()) {
887                 CRef<CSpliced_exon_chunk> chunk(new CSpliced_exon_chunk);
888                 chunk->SetDiag(last_exon_extension);
889                 last_exon.SetParts().push_back(chunk);
890             }
891         }
892     } else {
893         last_exon_extension =
894             last_exon.GetGenomic_start()
895             - ((range.GetFrom() < genomic_size && genomic_size <= last_exon.GetGenomic_start())
896                ? genomic_size
897                : range.GetFrom());
898 
899         if (last_exon_extension > 0) {
900             last_exon.SetGenomic_start() -= last_exon_extension;
901             if (last_exon.IsSetParts()) {
902                 CRef<CSpliced_exon_chunk> chunk(new CSpliced_exon_chunk);
903                 chunk->SetDiag(last_exon_extension);
904                 last_exon.SetParts().push_back(chunk);
905             }
906         }
907 
908         first_exon_extension =
909             ((first_exon.GetGenomic_end() <= genomic_size-1 && genomic_size-1 < range.GetTo())
910              ? genomic_size-1
911              : range.GetTo())
912             - first_exon.GetGenomic_end();
913         if (first_exon_extension > 0) {
914             first_exon.SetGenomic_end() += first_exon_extension;
915             if (first_exon.IsSetParts()) {
916                 CRef<CSpliced_exon_chunk> chunk(new CSpliced_exon_chunk);
917                 chunk->SetDiag(first_exon_extension);
918                 first_exon.SetParts().insert(first_exon.SetParts().begin(), chunk);
919             }
920         }
921     }
922 
923     exons.front().prod_from -= first_exon_extension;
924     exons.front().genomic_from -= first_exon_extension;
925     exons.back().prod_to += last_exon_extension;
926     exons.back().genomic_to += last_exon_extension;
927 
928 
929     if (plus_strand) {
930         first_exon_extension = first_exon.GetGenomic_start() - range.GetFrom();
931 
932         if (first_exon_extension > 0) {
933             CRef<CSpliced_exon> exon(new CSpliced_exon);
934             exon->SetGenomic_start() = range.GetFrom();
935             exon->SetGenomic_end() = genomic_size-1;
936             spliced_seg.SetExons().push_front(exon);
937 
938             SExon exon_struct;
939             exon_struct.prod_from = exons.front().prod_from - first_exon_extension;
940             exon_struct.prod_to = exons.front().prod_from - 1;
941             exon_struct.genomic_from = exons.front().genomic_from - first_exon_extension;
942             exon_struct.genomic_to = exons.front().genomic_from - 1;
943 
944             exons.insert(exons.begin(), exon_struct);
945         }
946 
947         last_exon_extension = range.GetTo() - last_exon.GetGenomic_end();
948 
949         if (last_exon_extension > 0) {
950             CRef<CSpliced_exon> exon(new CSpliced_exon);
951             exon->SetGenomic_start() = 0;
952             exon->SetGenomic_end() = last_exon_extension - 1;
953             spliced_seg.SetExons().push_back(exon);
954 
955             SExon exon_struct;
956             exon_struct.prod_from = exons.back().prod_to + 1;
957             exon_struct.prod_to = exons.back().prod_to + last_exon_extension;
958             exon_struct.genomic_from = exons.back().genomic_to +1;
959             exon_struct.genomic_to = exons.back().genomic_to + last_exon_extension;
960 
961             exons.push_back(exon_struct);
962         }
963     } else {
964         last_exon_extension = last_exon.GetGenomic_start() - range.GetFrom();
965 
966         if (last_exon_extension > 0) {
967             CRef<CSpliced_exon> exon(new CSpliced_exon);
968             exon->SetGenomic_start() = range.GetFrom();
969             exon->SetGenomic_end() = genomic_size-1;
970             spliced_seg.SetExons().push_back(exon);
971 
972             SExon exon_struct;
973             exon_struct.prod_from = exons.back().prod_to + 1;
974             exon_struct.prod_to = exons.back().prod_to + last_exon_extension;
975             exon_struct.genomic_from = exons.back().genomic_to +1;
976             exon_struct.genomic_to = exons.back().genomic_to + last_exon_extension;
977 
978             exons.push_back(exon_struct);
979         }
980 
981         first_exon_extension = range.GetTo() - first_exon.GetGenomic_end();
982 
983         if (first_exon_extension > 0) {
984             CRef<CSpliced_exon> exon(new CSpliced_exon);
985             exon->SetGenomic_start() = 0;
986             exon->SetGenomic_end() = first_exon_extension - 1;
987             spliced_seg.SetExons().push_front(exon);
988 
989             SExon exon_struct;
990             exon_struct.prod_from = exons.front().prod_from - first_exon_extension;
991             exon_struct.prod_to = exons.front().prod_from - 1;
992             exon_struct.genomic_from = exons.front().genomic_from - first_exon_extension;
993             exon_struct.genomic_to = exons.front().genomic_from - 1;
994 
995             exons.insert(exons.begin(), exon_struct);
996         }
997     }
998 
999     if (range_left != exons.front().genomic_from || range_right != exons.back().genomic_to) {
1000         NCBI_THROW(CException, eUnknown,
1001                    "AdjustAlignment(): "
1002                    "result's ends do not match the range. This is a bug in AdjustAlignment implementation");
1003     }
1004 
1005     int offset = is_protein_align ? int(exons.front().prod_from/3)*3 : exons.front().prod_from;
1006     if (offset > exons.front().prod_from) // negative division rounds toward zero
1007         offset -= 3;
1008 
1009     if (mode == eTryToPreserveProductPositions && offset > 0) {
1010         offset = 0; // do not shift product position unnecessarily
1011     }
1012 
1013     vector<SExon>::iterator exon_struct_it = exons.begin();
1014 
1015     int putative_prod_length = 0;
1016     if (is_protein_align) {
1017         NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
1018             CSpliced_exon& exon = **exon_it;
1019             SetProtpos(exon.SetProduct_start(), exon_struct_it->prod_from - offset);
1020             SetProtpos(exon.SetProduct_end(), exon_struct_it->prod_to - offset);
1021             ++exon_struct_it;
1022         }
1023         putative_prod_length = (exons.back().prod_to - offset + 3)/3;
1024     } else {
1025         NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
1026             CSpliced_exon& exon = **exon_it;
1027             exon.SetProduct_start().SetNucpos() = exon_struct_it->prod_from - offset;
1028             exon.SetProduct_end().SetNucpos() = exon_struct_it->prod_to - offset;
1029             ++exon_struct_it;
1030         }
1031         putative_prod_length = exons.back().prod_to - offset + 1;
1032     }
1033     if (mode == eForceProductFrom0 || (int)spliced_seg.GetProduct_length() < putative_prod_length) {
1034         spliced_seg.SetProduct_length(putative_prod_length);
1035     }
1036 
1037     if (cross_the_origin) {
1038         NON_CONST_ITERATE(CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
1039             CSpliced_exon& exon = **exon_it;
1040             if (exon.GetGenomic_start() >= genomic_size)
1041                 exon.SetGenomic_start() -= genomic_size;
1042             if (exon.GetGenomic_end() >= genomic_size)
1043                 exon.SetGenomic_end() -= genomic_size;
1044         }
1045     }
1046 
1047     if (GetExons(*align) != orig_exons) {
1048         ClearScores(*align);
1049     }
1050 
1051     return align;
1052 }
1053 
GetCdsOnMrna(const objects::CSeq_id & rna_id,CScope & scope)1054 CMappedFeat GetCdsOnMrna(const objects::CSeq_id& rna_id, CScope& scope)
1055 {
1056     CMappedFeat cdregion_feat;
1057     CBioseq_Handle handle = scope.GetBioseqHandle(rna_id);
1058     if (handle) {
1059         CFeat_CI feat_iter(handle, CSeqFeatData::eSubtype_cdregion);
1060         if (feat_iter  &&  feat_iter.GetSize()) {
1061             cdregion_feat = *feat_iter;
1062             const CSeq_loc& cds_loc = cdregion_feat.GetLocation();
1063             const CSeq_id* cds_loc_seq_id  = cds_loc.GetId();
1064             if (cds_loc_seq_id == NULL || !sequence::IsSameBioseq(*cds_loc_seq_id, rna_id, &scope)) {
1065                 cdregion_feat = CMappedFeat();
1066             }
1067         }
1068     }
1069     return cdregion_feat;
1070 }
1071 
GetCds(const objects::CSeq_id & rna_id)1072 TSignedSeqRange CFeatureGenerator::SImplementation::GetCds(const objects::CSeq_id& rna_id)
1073 {
1074     CMappedFeat cdregion = GetCdsOnMrna(rna_id, *m_scope);
1075     if (!cdregion) {
1076         return TSignedSeqRange();
1077     }
1078 
1079     TSeqRange cds = cdregion.GetLocation().GetTotalRange();
1080 
1081     return TSignedSeqRange(cds.GetFrom(), cds.GetTo());
1082 }
1083 
TrimLeftExon(int trim_amount,ETrimSide side,vector<SExon>::reverse_iterator left_edge,vector<SExon>::reverse_iterator & exon_it,CSpliced_seg::TExons::reverse_iterator & spl_exon_it,ENa_strand product_strand,ENa_strand genomic_strand)1084 void CFeatureGenerator::SImplementation::TrimLeftExon(int trim_amount, ETrimSide side,
1085                                                       vector<SExon>::reverse_iterator left_edge,
1086                                                       vector<SExon>::reverse_iterator& exon_it,
1087                                                       CSpliced_seg::TExons::reverse_iterator& spl_exon_it,
1088                                                       ENa_strand product_strand,
1089                                                       ENa_strand genomic_strand)
1090 {
1091     _ASSERT( trim_amount < 3 || side!=eTrimProduct );
1092     bool is_protein = (*spl_exon_it)->GetProduct_start().IsProtpos();
1093 
1094     while (trim_amount > 0) {
1095         int exon_len = side==eTrimProduct
1096             ? (exon_it->prod_to - exon_it->prod_from + 1)
1097             : (exon_it->genomic_to - exon_it->genomic_from + 1);
1098         if (exon_len <= trim_amount) {
1099             int next_from = exon_it->genomic_from;
1100             ++exon_it;
1101             ++spl_exon_it;
1102             trim_amount -= exon_len;
1103             _ASSERT( trim_amount==0 || side!=eTrimProduct );
1104             if (exon_it == left_edge)
1105                 break;
1106             if (trim_amount > 0) { // eTrimGenomic, account for distance between exons
1107                 trim_amount -= next_from - exon_it->genomic_to -1;
1108             }
1109         } else {
1110             (*spl_exon_it)->SetPartial(true);
1111             (*spl_exon_it)->ResetDonor_after_exon();
1112 
1113             int genomic_trim_amount = 0;
1114             int product_trim_amount = 0;
1115 
1116             if ((*spl_exon_it)->CanGetParts() && !(*spl_exon_it)->GetParts().empty()) {
1117                 CSpliced_exon::TParts& parts = (*spl_exon_it)->SetParts();
1118                 CSpliced_exon_Base::TParts::iterator chunk = parts.end();
1119                 while (--chunk, (trim_amount>0 ||
1120                                  (side==eTrimProduct
1121                                   ? (*chunk)->IsGenomic_ins()
1122                                   : (*chunk)->IsProduct_ins()))) {
1123                     int product_chunk_len = 0;
1124                     int genomic_chunk_len = 0;
1125                     switch((*chunk)->Which()) {
1126                     case CSpliced_exon_chunk::e_Match:
1127                         product_chunk_len = (*chunk)->GetMatch();
1128                         genomic_chunk_len = product_chunk_len;
1129                         if (product_chunk_len > trim_amount) {
1130                             (*chunk)->SetMatch(product_chunk_len - trim_amount);
1131                         }
1132                         break;
1133                     case CSpliced_exon_chunk::e_Mismatch:
1134                         product_chunk_len = (*chunk)->GetMismatch();
1135                         genomic_chunk_len = product_chunk_len;
1136                         if (product_chunk_len > trim_amount) {
1137                             (*chunk)->SetMismatch(product_chunk_len - trim_amount);
1138                         }
1139                         break;
1140                     case CSpliced_exon_chunk::e_Diag:
1141                         product_chunk_len = (*chunk)->GetDiag();
1142                         genomic_chunk_len = product_chunk_len;
1143                         if (product_chunk_len > trim_amount) {
1144                             (*chunk)->SetDiag(product_chunk_len - trim_amount);
1145                         }
1146                         break;
1147 
1148                     case CSpliced_exon_chunk::e_Product_ins:
1149                         product_chunk_len = (*chunk)->GetProduct_ins();
1150                         if (side==eTrimProduct && product_chunk_len > trim_amount) {
1151                             (*chunk)->SetProduct_ins(product_chunk_len - trim_amount);
1152                         }
1153                         break;
1154                     case CSpliced_exon_chunk::e_Genomic_ins:
1155                         genomic_chunk_len = (*chunk)->GetGenomic_ins();
1156                         if (side==eTrimGenomic && genomic_chunk_len > trim_amount) {
1157                             (*chunk)->SetGenomic_ins(genomic_chunk_len - trim_amount);
1158                         }
1159                         break;
1160                     default:
1161                         _ASSERT(false);
1162                         break;
1163                     }
1164 
1165                     if (side==eTrimProduct && product_chunk_len <= trim_amount) {
1166                         genomic_trim_amount += genomic_chunk_len;
1167                         product_trim_amount += product_chunk_len;
1168                         trim_amount -= product_chunk_len;
1169                     } else if (side==eTrimGenomic && genomic_chunk_len <= trim_amount) {
1170                         genomic_trim_amount += genomic_chunk_len;
1171                         product_trim_amount += product_chunk_len;
1172                         trim_amount -= genomic_chunk_len;
1173                     } else {
1174                         genomic_trim_amount += min(trim_amount, genomic_chunk_len);
1175                         product_trim_amount += min(trim_amount, product_chunk_len);
1176                         trim_amount = 0;
1177                         break;
1178                     }
1179                     chunk = parts.erase(chunk);
1180                 }
1181 
1182             } else {
1183                 genomic_trim_amount += trim_amount;
1184                 product_trim_amount += trim_amount;
1185                 trim_amount = 0;
1186             }
1187 
1188             exon_it->prod_to -= product_trim_amount;
1189             exon_it->genomic_to -= genomic_trim_amount;
1190 
1191             if (is_protein) {
1192                 CProduct_pos& prot_pos = (*spl_exon_it)->SetProduct_end();
1193                 SetProtpos(prot_pos, exon_it->prod_to);
1194             } else {
1195                 if (product_strand != eNa_strand_minus) {
1196                     (*spl_exon_it)->SetProduct_end().SetNucpos() -= product_trim_amount;
1197                 } else {
1198                     (*spl_exon_it)->SetProduct_start().SetNucpos() += product_trim_amount;
1199                 }
1200             }
1201 
1202             if (genomic_strand != eNa_strand_minus) {
1203                 (*spl_exon_it)->SetGenomic_end() -= genomic_trim_amount;
1204             } else {
1205                 (*spl_exon_it)->SetGenomic_start() += genomic_trim_amount;
1206             }
1207         }
1208     }
1209 }
TrimRightExon(int trim_amount,ETrimSide side,vector<SExon>::iterator & exon_it,vector<SExon>::iterator right_edge,CSpliced_seg::TExons::iterator & spl_exon_it,ENa_strand product_strand,ENa_strand genomic_strand)1210 void CFeatureGenerator::SImplementation::TrimRightExon(int trim_amount, ETrimSide side,
1211                                                        vector<SExon>::iterator& exon_it,
1212                                                        vector<SExon>::iterator right_edge,
1213                                                        CSpliced_seg::TExons::iterator& spl_exon_it,
1214                                                        ENa_strand product_strand,
1215                                                        ENa_strand genomic_strand)
1216 {
1217     _ASSERT( trim_amount < 3 || side!=eTrimProduct );
1218     bool is_protein = (*spl_exon_it)->GetProduct_start().IsProtpos();
1219 
1220     while (trim_amount > 0) {
1221         int exon_len = side==eTrimProduct
1222             ? (exon_it->prod_to - exon_it->prod_from + 1)
1223             : (exon_it->genomic_to - exon_it->genomic_from + 1);
1224         if (exon_len <= trim_amount) {
1225             int prev_to = exon_it->genomic_to;
1226             ++exon_it;
1227             ++spl_exon_it;
1228             trim_amount -= exon_len;
1229             _ASSERT( trim_amount==0 || side!=eTrimProduct );
1230             if (exon_it == right_edge)
1231                 break;
1232             if (trim_amount > 0) { // eTrimGenomic, account for distance between exons
1233                 trim_amount -= exon_it->genomic_from - prev_to -1;
1234             }
1235         } else {
1236             (*spl_exon_it)->SetPartial(true);
1237             (*spl_exon_it)->ResetAcceptor_before_exon();
1238 
1239             int genomic_trim_amount = 0;
1240             int product_trim_amount = 0;
1241 
1242             if ((*spl_exon_it)->CanGetParts() && !(*spl_exon_it)->GetParts().empty()) {
1243                 CSpliced_exon::TParts& parts = (*spl_exon_it)->SetParts();
1244                 CSpliced_exon_Base::TParts::iterator chunk = parts.begin();
1245                 for (; trim_amount>0 ||
1246                          (side==eTrimProduct
1247                           ? (*chunk)->IsGenomic_ins()
1248                           : (*chunk)->IsProduct_ins());
1249                      ) {
1250                     int product_chunk_len = 0;
1251                     int genomic_chunk_len = 0;
1252                     switch((*chunk)->Which()) {
1253                     case CSpliced_exon_chunk::e_Match:
1254                         product_chunk_len = (*chunk)->GetMatch();
1255                         genomic_chunk_len = product_chunk_len;
1256                         if (product_chunk_len > trim_amount) {
1257                             (*chunk)->SetMatch(product_chunk_len - trim_amount);
1258                         }
1259                         break;
1260                     case CSpliced_exon_chunk::e_Mismatch:
1261                         product_chunk_len = (*chunk)->GetMismatch();
1262                         genomic_chunk_len = product_chunk_len;
1263                         if (product_chunk_len > trim_amount) {
1264                             (*chunk)->SetMismatch(product_chunk_len - trim_amount);
1265                         }
1266                         break;
1267                     case CSpliced_exon_chunk::e_Diag:
1268                         product_chunk_len = (*chunk)->GetDiag();
1269                         genomic_chunk_len = product_chunk_len;
1270                         if (product_chunk_len > trim_amount) {
1271                             (*chunk)->SetDiag(product_chunk_len - trim_amount);
1272                         }
1273                         break;
1274 
1275                     case CSpliced_exon_chunk::e_Product_ins:
1276                         product_chunk_len = (*chunk)->GetProduct_ins();
1277                         if (side==eTrimProduct && product_chunk_len > trim_amount) {
1278                             (*chunk)->SetProduct_ins(product_chunk_len - trim_amount);
1279                         }
1280                         break;
1281                     case CSpliced_exon_chunk::e_Genomic_ins:
1282                         genomic_chunk_len = (*chunk)->GetGenomic_ins();
1283                         if (side==eTrimGenomic && genomic_chunk_len > trim_amount) {
1284                             (*chunk)->SetGenomic_ins(genomic_chunk_len - trim_amount);
1285                         }
1286                         break;
1287                     default:
1288                         _ASSERT(false);
1289                         break;
1290                     }
1291 
1292                     if (side==eTrimProduct && product_chunk_len <= trim_amount) {
1293                         genomic_trim_amount += genomic_chunk_len;
1294                         product_trim_amount += product_chunk_len;
1295                         trim_amount -= product_chunk_len;
1296                     } else if (side==eTrimGenomic && genomic_chunk_len <= trim_amount) {
1297                         genomic_trim_amount += genomic_chunk_len;
1298                         product_trim_amount += product_chunk_len;
1299                         trim_amount -= genomic_chunk_len;
1300                     } else {
1301                         genomic_trim_amount += min(trim_amount, genomic_chunk_len);
1302                         product_trim_amount += min(trim_amount, product_chunk_len);
1303                         trim_amount = 0;
1304                         break;
1305                     }
1306                     chunk = parts.erase(chunk);
1307                 }
1308 
1309             } else {
1310                 genomic_trim_amount += trim_amount;
1311                 product_trim_amount += trim_amount;
1312                 trim_amount = 0;
1313             }
1314 
1315             exon_it->prod_from += product_trim_amount;
1316             exon_it->genomic_from += genomic_trim_amount;
1317 
1318             if (is_protein) {
1319                 CProduct_pos& prot_pos = (*spl_exon_it)->SetProduct_start();
1320                 SetProtpos(prot_pos, exon_it->prod_from);
1321             } else {
1322                 if (product_strand != eNa_strand_minus) {
1323                     (*spl_exon_it)->SetProduct_start().SetNucpos() += product_trim_amount;
1324                 } else {
1325                     (*spl_exon_it)->SetProduct_end().SetNucpos() -= product_trim_amount;
1326                 }
1327             }
1328 
1329             if (genomic_strand != eNa_strand_minus) {
1330                 (*spl_exon_it)->SetGenomic_start() += genomic_trim_amount;
1331             } else {
1332                 (*spl_exon_it)->SetGenomic_end() -= genomic_trim_amount;
1333             }
1334         }
1335     }
1336 }
1337 
1338 namespace fg {
GetGeneticCode(const CBioseq_Handle & bsh)1339 int GetGeneticCode(const CBioseq_Handle& bsh)
1340 {
1341     int gcode = 1;
1342 
1343     auto source = sequence::GetBioSource(bsh);
1344     if (source != nullptr) {
1345         gcode = source->GetGenCode(gcode);
1346     }
1347 
1348     return gcode;
1349 }
1350 }
1351 
1352 END_NCBI_SCOPE
1353