1 /* $Id: transform_align.cpp 624079 2021-01-25 20:00:40Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Vyacheslav Chetvernin
27 *
28 * File Description: Alignment transformations
29 *
30 */
31 #include <ncbi_pch.hpp>
32 #include <algo/sequence/gene_model.hpp>
33 #include <objects/seqalign/seqalign__.hpp>
34 #include <objects/seqloc/Na_strand.hpp>
35 #include <objects/general/User_object.hpp>
36 #include <objects/general/Object_id.hpp>
37
38 #include <objmgr/bioseq_handle.hpp>
39 #include <objmgr/scope.hpp>
40 #include <objmgr/feat_ci.hpp>
41 #include <objmgr/util/sequence.hpp>
42
43 #include <objtools/alnmgr/score_builder_base.hpp>
44
45 #include "feature_generator.hpp"
46
47 BEGIN_NCBI_SCOPE
48
49 USING_SCOPE(objects);
50
51
52 namespace {
53
GetSplicedStrands(const CSpliced_seg & spliced_seg)54 pair <ENa_strand, ENa_strand> GetSplicedStrands(const CSpliced_seg& spliced_seg)
55 {
56 ENa_strand product_strand =
57 spliced_seg.IsSetProduct_strand() ?
58 spliced_seg.GetProduct_strand() :
59 (spliced_seg.GetExons().front()->IsSetProduct_strand() ?
60 spliced_seg.GetExons().front()->GetProduct_strand() :
61 eNa_strand_unknown);
62 ENa_strand genomic_strand =
63 spliced_seg.IsSetGenomic_strand() ?
64 spliced_seg.GetGenomic_strand() :
65 (spliced_seg.GetExons().front()->IsSetGenomic_strand()?
66 spliced_seg.GetExons().front()->GetGenomic_strand():
67 eNa_strand_unknown);
68
69 return make_pair(product_strand, genomic_strand);
70 }
71
SetProtpos(CProduct_pos & pos,int value)72 void SetProtpos(CProduct_pos &pos, int value)
73 {
74 pos.SetProtpos().SetAmin(value/3);
75 pos.SetProtpos().SetFrame((value % 3) +1);
76 }
77
78 }
79
80
GetExonStructure(const CSpliced_seg & spliced_seg,vector<SExon> & exons,CScope * scope)81 void CFeatureGenerator::SImplementation::GetExonStructure(const CSpliced_seg& spliced_seg, vector<SExon>& exons, CScope* scope)
82 {
83 pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
84 ENa_strand product_strand = strands.first;
85 ENa_strand genomic_strand = strands.second;
86
87 exons.resize(spliced_seg.GetExons().size());
88 int i = 0;
89 TSignedSeqPos prev_genomic_pos = 0;
90 TSignedSeqPos offset = 0;
91 ITERATE(CSpliced_seg::TExons, it, spliced_seg.GetExons()) {
92 const CSpliced_exon& exon = **it;
93 SExon& exon_struct = exons[i++];
94
95 const CProduct_pos& prod_from = exon.GetProduct_start();
96 const CProduct_pos& prod_to = exon.GetProduct_end();
97
98 exon_struct.prod_from = prod_from.AsSeqPos();
99 exon_struct.prod_to = prod_to.AsSeqPos();
100 if (product_strand == eNa_strand_minus) {
101 swap(exon_struct.prod_from, exon_struct.prod_to);
102 exon_struct.prod_from = -exon_struct.prod_from;
103 exon_struct.prod_to = -exon_struct.prod_to;
104 }
105
106 exon_struct.genomic_from = exon.GetGenomic_start();
107 exon_struct.genomic_to = exon.GetGenomic_end();
108
109 bool cross_the_origin = i > 1 && (
110 genomic_strand != eNa_strand_minus
111 ? (exon_struct.genomic_from < prev_genomic_pos)
112 : (exon_struct.genomic_from > prev_genomic_pos));
113
114 if (cross_the_origin && scope) {
115 offset = scope->GetSequenceLength(spliced_seg.GetGenomic_id());
116 }
117
118 prev_genomic_pos = exon_struct.genomic_from;
119
120 if (genomic_strand == eNa_strand_minus) {
121 swap(exon_struct.genomic_from, exon_struct.genomic_to);
122 exon_struct.genomic_from = -exon_struct.genomic_from;
123 exon_struct.genomic_to = -exon_struct.genomic_to;
124 }
125
126 if (offset) {
127 exon_struct.genomic_from += offset;
128 exon_struct.genomic_to += offset;
129 }
130
131 }
132
133 _ASSERT( exons.size() == spliced_seg.GetExons().size() );
134 }
135
136
StitchSmallHoles(CSeq_align & align)137 void CFeatureGenerator::SImplementation::StitchSmallHoles(CSeq_align& align)
138 {
139 CSpliced_seg& spliced_seg = align.SetSegs().SetSpliced();
140
141 if (!spliced_seg.CanGetExons() || spliced_seg.GetExons().size() < 2)
142 return;
143
144 vector<SExon> exons;
145 GetExonStructure(spliced_seg, exons, m_scope);
146
147 bool is_protein = (spliced_seg.GetProduct_type()==CSpliced_seg::eProduct_type_protein);
148
149 pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
150 ENa_strand product_strand = strands.first;
151 ENa_strand genomic_strand = strands.second;
152
153 int product_min_pos;
154 int product_max_pos;
155 if (product_strand != eNa_strand_minus) {
156 product_min_pos = 0;
157 if (spliced_seg.IsSetPoly_a()) {
158 product_max_pos = spliced_seg.GetPoly_a()-1;
159 } else if (spliced_seg.IsSetProduct_length()) {
160 product_max_pos = spliced_seg.GetProduct_length()-1;
161 if (is_protein)
162 product_max_pos = product_max_pos*3+2;
163 } else {
164 product_max_pos = exons.back().prod_to;
165 }
166 } else {
167 if (spliced_seg.IsSetProduct_length()) {
168 product_min_pos = -int(spliced_seg.GetProduct_length())+1;
169 if (is_protein)
170 product_min_pos = product_min_pos*3-2;
171 } else {
172 product_min_pos = exons[0].prod_from;
173 }
174 if (spliced_seg.IsSetPoly_a()) {
175 product_max_pos = -int(spliced_seg.GetPoly_a())+1;
176 } else {
177 product_max_pos = 0;
178 }
179 }
180
181 CSpliced_seg::TExons::iterator it = spliced_seg.SetExons().begin();
182 CRef<CSpliced_exon> prev_exon = *it;
183 size_t i = 1;
184 for (++it; it != spliced_seg.SetExons().end(); ++i, prev_exon = *it++) {
185 CSpliced_exon& exon = **it;
186
187 bool donor_set = prev_exon->IsSetDonor_after_exon() || (genomic_strand ==eNa_strand_minus && prev_exon->GetGenomic_start()==0);
188 bool acceptor_set = exon.IsSetAcceptor_before_exon() || (genomic_strand ==eNa_strand_minus && prev_exon->GetGenomic_start()==0);
189
190 if(donor_set && acceptor_set && exons[i-1].prod_to + 1 == exons[i].prod_from) {
191 continue;
192 }
193
194 _ASSERT( exons[i].prod_from > exons[i-1].prod_to );
195 int prod_hole_len = exons[i].prod_from - exons[i-1].prod_to -1;
196 _ASSERT( exons[i].genomic_from > exons[i-1].genomic_to );
197 int genomic_hole_len = exons[i].genomic_from - exons[i-1].genomic_to -1;
198
199 if (prod_hole_len >= (int)m_min_intron || genomic_hole_len >= (int)m_min_intron)
200 continue;
201
202 if (!prev_exon->IsSetParts() || prev_exon->GetParts().empty()) {
203 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
204 part->SetMatch(exons[i-1].prod_to-exons[i-1].prod_from+1);
205 prev_exon->SetParts().push_back(part);
206 }
207 if (!exon.IsSetParts() || exon.GetParts().empty()) {
208 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
209 part->SetMatch(exons[i].prod_to-exons[i].prod_from+1);
210 exon.SetParts().push_back(part);
211 }
212
213 int max_hole_len = max(prod_hole_len, genomic_hole_len);
214 int min_hole_len = min(prod_hole_len, genomic_hole_len);
215 int left_mismatch_len = 0;
216 int right_mismatch_len = min_hole_len;
217 if (prod_hole_len != genomic_hole_len) {
218 // does not matter for transcripts, but for proteins ensures insersions at codon boundary
219 int bases_needed_to_complete_codon = 2 - (exons[i-1].prod_to % 3);
220
221 if (right_mismatch_len >= bases_needed_to_complete_codon) {
222 left_mismatch_len = bases_needed_to_complete_codon + ((right_mismatch_len-bases_needed_to_complete_codon)/2/3)*3;
223 right_mismatch_len -= left_mismatch_len;
224 }
225 }
226
227 bool no_acceptor_before = i > 1 && !prev_exon->IsSetAcceptor_before_exon();
228 bool no_donor_after = i < exons.size()-1 && !exon.IsSetDonor_after_exon();
229
230
231 bool cross_the_origin =
232 genomic_strand != eNa_strand_minus
233 ? (prev_exon->GetGenomic_start() > exon.GetGenomic_start())
234 : (prev_exon->GetGenomic_start() < exon.GetGenomic_start());
235
236 if (cross_the_origin) {
237 int genomic_size = m_scope->GetSequenceLength(spliced_seg.GetGenomic_id());
238
239 prev_exon->SetPartial(product_min_pos < exons[i-1].prod_from &&
240 no_acceptor_before);
241
242 exon.SetPartial(exons[i].prod_to < product_max_pos &&
243 no_donor_after);
244
245 if (genomic_strand != eNa_strand_minus) {
246 prev_exon->SetGenomic_end(genomic_size-1);
247 exon.SetGenomic_start(0);
248 } else {
249 prev_exon->SetGenomic_start(0);
250 exon.SetGenomic_end(genomic_size-1);
251 }
252
253 int origin = genomic_strand != eNa_strand_minus ? genomic_size : 1;
254 int to_origin = origin - exons[i-1].genomic_to -1;
255 if (prod_hole_len == genomic_hole_len) {
256 left_mismatch_len = to_origin;
257 right_mismatch_len -= left_mismatch_len;
258 }
259
260 if (left_mismatch_len > 0 && to_origin > 0) {
261 int mismatch_len = min(left_mismatch_len, to_origin);
262 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
263 part->SetMismatch(mismatch_len);
264 prev_exon->SetParts().push_back(part);
265 prod_hole_len -= mismatch_len;
266 genomic_hole_len -= mismatch_len;
267 to_origin -= mismatch_len;
268 exons[i-1].genomic_to += mismatch_len;
269 exons[i-1].prod_to += mismatch_len;
270 left_mismatch_len -= mismatch_len;
271 }
272
273 if (to_origin > 0) {
274 _ASSERT(left_mismatch_len == 0);
275 _ASSERT(prod_hole_len != genomic_hole_len);
276 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
277 if (prod_hole_len < genomic_hole_len) {
278 int genomic_ins = min(genomic_hole_len-prod_hole_len, to_origin);
279 part->SetGenomic_ins(genomic_ins);
280 genomic_hole_len -= genomic_ins;
281 to_origin -= genomic_ins;
282 exons[i-1].genomic_to += genomic_ins;
283 } else {
284 part->SetProduct_ins(prod_hole_len-genomic_hole_len);
285 exons[i-1].prod_to += prod_hole_len-genomic_hole_len;
286 prod_hole_len = genomic_hole_len;
287 }
288 prev_exon->SetParts().push_back(part);
289 }
290 if (to_origin > 0) {
291 _ASSERT(prod_hole_len == genomic_hole_len);
292 _ASSERT(right_mismatch_len >= to_origin);
293 int mismatch_len = to_origin;
294 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
295 part->SetMismatch(mismatch_len);
296 prev_exon->SetParts().push_back(part);
297 prod_hole_len -= mismatch_len;
298 genomic_hole_len -= mismatch_len;
299 to_origin = 0;
300 exons[i-1].genomic_to += mismatch_len;
301 exons[i-1].prod_to += mismatch_len;
302 right_mismatch_len -= mismatch_len;
303 }
304
305 _ASSERT(to_origin == 0);
306 _ASSERT(exons[i-1].genomic_to == origin-1);
307
308 exons[i].prod_from = exons[i-1].prod_to+1;
309 exons[i].genomic_from = exons[i-1].genomic_to+1;
310
311 if (is_protein) {
312 prev_exon->SetProduct_end().SetProtpos().SetAmin() = exons[i-1].prod_to/3;
313 prev_exon->SetProduct_end().SetProtpos().SetFrame() = (exons[i-1].prod_to %3) +1;
314 exon.SetProduct_start().SetProtpos().SetAmin() = exons[i].prod_from/3;
315 exon.SetProduct_start().SetProtpos().SetFrame() = (exons[i].prod_from %3) +1;
316 } else if (product_strand != eNa_strand_minus) {
317 prev_exon->SetProduct_end().SetNucpos( exons[i-1].prod_to );
318 exon.SetProduct_start().SetNucpos( exons[i].prod_from );
319 } else {
320 prev_exon->SetProduct_start().SetNucpos( -exons[i-1].prod_to );
321 exon.SetProduct_end().SetNucpos( -exons[i].prod_from );
322 }
323
324 list <CRef< CSpliced_exon_chunk > >::iterator insertion_point = exon.SetParts().begin();
325
326 if (left_mismatch_len > 0) {
327 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
328 part->SetMismatch(left_mismatch_len);
329 insertion_point = exon.SetParts().insert(insertion_point, part);
330 ++insertion_point;
331 }
332 if (prod_hole_len != genomic_hole_len) {
333 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
334 if (prod_hole_len < genomic_hole_len) {
335 part->SetGenomic_ins(genomic_hole_len - prod_hole_len);
336 } else {
337 part->SetProduct_ins(prod_hole_len - genomic_hole_len);
338 }
339 insertion_point = exon.SetParts().insert(insertion_point, part);
340 ++insertion_point;
341 }
342 if (right_mismatch_len > 0) {
343 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
344 part->SetMismatch(right_mismatch_len);
345 exon.SetParts().insert(insertion_point, part);
346
347 }
348
349 } else {
350
351 if (is_protein || product_strand != eNa_strand_minus) {
352 prev_exon->SetProduct_end().Assign( exon.GetProduct_end() );
353 } else {
354 prev_exon->SetProduct_start().Assign( exon.GetProduct_start() );
355 }
356
357 if (genomic_strand != eNa_strand_minus) {
358 prev_exon->SetGenomic_end() = exon.GetGenomic_end();
359 } else {
360 prev_exon->SetGenomic_start() = exon.GetGenomic_start();
361 }
362
363 if (left_mismatch_len > 0) {
364 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
365 part->SetMismatch(left_mismatch_len);
366 prev_exon->SetParts().push_back(part);
367 }
368 if (prod_hole_len != genomic_hole_len) {
369 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
370 if (prod_hole_len < genomic_hole_len) {
371 part->SetGenomic_ins(max_hole_len - min_hole_len);
372 } else {
373 part->SetProduct_ins(max_hole_len - min_hole_len);
374 }
375 prev_exon->SetParts().push_back(part);
376 }
377 if (right_mismatch_len > 0) {
378 CRef< CSpliced_exon_chunk > part(new CSpliced_exon_chunk);
379 part->SetMismatch(right_mismatch_len);
380 prev_exon->SetParts().push_back(part);
381
382 }
383 prev_exon->SetParts().splice(prev_exon->SetParts().end(), exon.SetParts());
384
385 if (exon.IsSetDonor_after_exon()) {
386 prev_exon->SetDonor_after_exon().Assign( exon.GetDonor_after_exon() );
387 } else {
388 prev_exon->ResetDonor_after_exon();
389 }
390
391 exons[i].prod_from = exons[i-1].prod_from;
392 exons[i].genomic_from = exons[i-1].genomic_from;
393
394 prev_exon->SetPartial(
395 (product_min_pos < exons[i-1].prod_from && no_acceptor_before) ||
396 (exons[i].prod_to < product_max_pos && no_donor_after));
397
398 if (exon.IsSetExt()) {
399 prev_exon->SetExt().splice(prev_exon->SetExt().end(), exon.SetExt());
400 }
401
402 CSpliced_seg::TExons::iterator save_it = it;
403 --save_it;
404 spliced_seg.SetExons().erase(it);
405 it = save_it;
406 }
407 }
408 }
409
410 vector<CFeatureGenerator::SImplementation::SExon> CFeatureGenerator::SImplementation::
GetExons(const CSeq_align & align)411 GetExons(const CSeq_align &align)
412 {
413 vector<SExon> exons;
414 GetExonStructure(align.GetSegs().GetSpliced(), exons, NULL);
415 return exons;
416 }
417
418 CSeq_align::EScoreType s_ScoresToRecalculate[] =
419 { CSeq_align::eScore_IdentityCount,
420 CSeq_align::eScore_MismatchCount,
421 CSeq_align::eScore_GapCount,
422 CSeq_align::eScore_PercentIdentity_Gapped,
423 CSeq_align::eScore_PercentIdentity_Ungapped,
424 CSeq_align::eScore_PercentCoverage,
425 CSeq_align::eScore_HighQualityPercentCoverage,
426 (CSeq_align::EScoreType)0
427 };
428
429 void CFeatureGenerator::SImplementation::
ClearScores(CSeq_align & align)430 ClearScores(CSeq_align &align)
431 {
432 NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it,
433 align.SetSegs().SetSpliced().SetExons())
434 {
435 (*exon_it)->ResetScores();
436 }
437 if (align.IsSetScore()) {
438 CScoreBuilderBase score_builder;
439 for (CSeq_align::EScoreType *score = s_ScoresToRecalculate;
440 *score; ++score)
441 {
442 align.ResetNamedScore(*score);
443 }
444 align.ResetNamedScore("weighted_identity");
445
446 if (align.SetScore().empty()) {
447 align.ResetScore();
448 }
449 }
450 }
451
452
453 void CFeatureGenerator::SImplementation::
RecalculateScores(CSeq_align & align)454 RecalculateScores(CSeq_align &align)
455 {
456 NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it,
457 align.SetSegs().SetSpliced().SetExons())
458 {
459 RecalculateExonIdty(**exon_it);
460 }
461
462 if (align.IsSetScore()) {
463 CScoreBuilderBase score_builder;
464 for (CSeq_align::EScoreType *score = s_ScoresToRecalculate;
465 *score; ++score)
466 {
467 int sink;
468 if (align.GetNamedScore(*score, sink)) {
469 align.ResetNamedScore(*score);
470 score_builder.AddScore(*m_scope, align, *score);
471 }
472 }
473 if (align.GetSegs().GetSpliced().GetProduct_type() ==
474 CSpliced_seg::eProduct_type_transcript)
475 {
476 score_builder.AddSplignScores(align);
477 }
478 align.ResetNamedScore("weighted_identity");
479 }
480 }
481
482 void CFeatureGenerator::SImplementation::
RecalculateExonIdty(CSpliced_exon & exon)483 RecalculateExonIdty(CSpliced_exon &exon)
484 {
485 if (!exon.IsSetScores())
486 return;
487
488 Int8 idty = -1;
489 if (exon.IsSetParts()) {
490 int matches = 0;
491 int total = 0;
492 ITERATE (CSpliced_exon::TParts, part_it, exon.GetParts()) {
493 switch ((*part_it)->Which()) {
494 case CSpliced_exon_chunk::e_Match:
495 matches += (*part_it)->GetMatch();
496 total += (*part_it)->GetMatch();
497 break;
498
499 case CSpliced_exon_chunk::e_Mismatch:
500 total += (*part_it)->GetMismatch();
501 break;
502
503 case CSpliced_exon_chunk::e_Product_ins:
504 total += (*part_it)->GetProduct_ins();
505 break;
506
507 case CSpliced_exon_chunk::e_Genomic_ins:
508 total += (*part_it)->GetGenomic_ins();
509 break;
510
511 default:
512 matches = INT_MIN; // to ensure negative identity
513 total += 1; // to prevent division by zero
514 break;
515 }
516 }
517 if (total) {
518 idty = matches * NCBI_CONST_INT8(10000000000) / total;
519 }
520 else {
521 idty = 0;
522 }
523 }
524
525 CScore_set::Tdata& exon_scores = exon.SetScores().Set();
526 ERASE_ITERATE (CScore_set::Tdata, score_it, exon_scores) {
527 if (idty >= 0 && (*score_it)->IsSetId() && (*score_it)->GetId().IsStr() &&
528 (*score_it)->GetId().GetStr() == "idty") {
529 (*score_it)->SetValue().SetReal(idty / 10000000000.);
530 } else {
531 exon_scores.erase(score_it);
532 }
533 }
534 }
535
TrimHolesToCodons(CSeq_align & align)536 void CFeatureGenerator::SImplementation::TrimHolesToCodons(CSeq_align& align)
537 {
538 CSpliced_seg& spliced_seg = align.SetSegs().SetSpliced();
539
540 if (!spliced_seg.CanGetExons())
541 return;
542
543 bool is_protein = (spliced_seg.GetProduct_type()==CSpliced_seg::eProduct_type_protein);
544
545 pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
546 ENa_strand product_strand = strands.first;
547 ENa_strand genomic_strand = strands.second;
548
549 TSignedSeqRange cds;
550 if (is_protein) {
551 cds = TSignedSeqRange(0, spliced_seg.GetProduct_length()*3 - 1);
552 } else {
553 if (!spliced_seg.CanGetProduct_id())
554 return;
555 cds = GetCds(spliced_seg.GetProduct_id());
556 if (cds.Empty())
557 return;
558 if (product_strand == eNa_strand_minus) {
559 NCBI_THROW(CException, eUnknown,
560 "TrimHolesToCodons(): "
561 "Reversed mRNA with CDS");
562 }
563 }
564
565 vector<SExon> exons;
566 GetExonStructure(spliced_seg, exons, m_scope);
567
568 int frame_offset = (exons.back().prod_to/3+1)*3+cds.GetFrom(); // to make modulo operands always positive
569
570 vector<SExon>::iterator right_exon_it = exons.begin();
571 CSpliced_seg::TExons::iterator right_spl_exon_it = spliced_seg.SetExons().begin();
572
573 for(;;++right_exon_it, ++right_spl_exon_it) {
574
575 vector<SExon>::reverse_iterator left_exon_it(right_exon_it);
576 CSpliced_seg::TExons::reverse_iterator left_spl_exon_it(right_spl_exon_it);
577
578 if (right_exon_it != exons.begin() && right_exon_it != exons.end()) {
579 bool donor_set = left_spl_exon_it != spliced_seg.SetExons().rend() && (*left_spl_exon_it)->IsSetDonor_after_exon();
580 bool acceptor_set = right_spl_exon_it != spliced_seg.SetExons().end() && (*right_spl_exon_it)->IsSetAcceptor_before_exon();
581
582 if(((donor_set && acceptor_set) || left_exon_it->genomic_to + 1 == right_exon_it->genomic_from) && left_exon_it->prod_to + 1 == right_exon_it->prod_from) {
583 continue;
584 }
585 }
586
587 if (right_exon_it != exons.begin() && (right_exon_it != exons.end() || (m_flags & fTrimEnds))) {
588 while (exons.rend() != left_exon_it &&
589 cds.GetFrom() < left_exon_it->prod_to && left_exon_it->prod_to < cds.GetTo() &&
590 (left_exon_it->prod_to - cds.GetFrom() + 1) % 3 > 0
591 ) {
592 TrimLeftExon(min(left_exon_it->prod_to - left_exon_it->prod_from + 1,
593 (left_exon_it->prod_to - cds.GetFrom() + 1) % 3),
594 eTrimProduct,
595 exons.rend(), left_exon_it, left_spl_exon_it,
596 product_strand, genomic_strand);
597 }
598 }
599
600 if (right_exon_it != exons.end() && (right_exon_it != exons.begin() || (m_flags & fTrimEnds))) {
601 while (right_exon_it != exons.end() &&
602 cds.GetFrom() < right_exon_it->prod_from && right_exon_it->prod_from < cds.GetTo() &&
603 (frame_offset-right_exon_it->prod_from) % 3 > 0
604 ) {
605 TrimRightExon(min(right_exon_it->prod_to - right_exon_it->prod_from + 1,
606 (frame_offset-right_exon_it->prod_from) % 3),
607 eTrimProduct,
608 right_exon_it, exons.end(), right_spl_exon_it,
609 product_strand, genomic_strand);
610 }
611 }
612
613 if (left_exon_it.base() != right_exon_it) {
614 right_exon_it = exons.erase(left_exon_it.base(), right_exon_it);
615 right_spl_exon_it = spliced_seg.SetExons().erase(left_spl_exon_it.base(), right_spl_exon_it);
616 }
617
618 if (right_exon_it == exons.end())
619 break;
620 }
621 _ASSERT(right_exon_it == exons.end() && right_spl_exon_it == spliced_seg.SetExons().end());
622 }
623
MaximizeTranslation(CSeq_align & align)624 void CFeatureGenerator::SImplementation::MaximizeTranslation(CSeq_align& align)
625 {
626 CSpliced_seg& spliced_seg = align.SetSegs().SetSpliced();
627 bool is_protein_align =
628 spliced_seg.GetProduct_type() == CSpliced_seg::eProduct_type_protein;
629
630 int aa_offset = 0;
631
632 CSpliced_seg::TExons::iterator prev_exon_it = spliced_seg.SetExons().end();
633
634 NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
635 CSpliced_exon& exon = **exon_it;
636
637 if (exon_it == spliced_seg.SetExons().begin()) {
638 if (is_protein_align)
639 aa_offset = - int(exon.GetProduct_start().GetProtpos().GetAmin());
640 else
641 aa_offset = - int(exon.GetProduct_start().GetNucpos())/3;
642 }
643
644 if (aa_offset) {
645 if (is_protein_align)
646 exon.SetProduct_start().SetProtpos().SetAmin() += aa_offset;
647 else
648 exon.SetProduct_start().SetNucpos() += aa_offset*3;
649 }
650 if (exon.IsSetParts()) {
651 int part_index = 0;
652 ERASE_ITERATE (CSpliced_exon::TParts, part_it, exon.SetParts()) {
653 CSpliced_exon_chunk& chunk = **part_it;
654 switch (chunk.Which()) {
655 case CSpliced_exon_chunk::e_Genomic_ins: {
656 int len = chunk.GetGenomic_ins();
657 if (len % 3 == 0) {
658 chunk.SetDiag(len);
659 } else {
660 if (part_index == 0 && prev_exon_it != spliced_seg.SetExons().end() &&
661 (*prev_exon_it)->IsSetParts()) {
662 CSpliced_exon_chunk& prev_chunk = **(*prev_exon_it)->SetParts().rbegin();
663 if (prev_chunk.Which()==CSpliced_exon_chunk::e_Genomic_ins) {
664 int prev_len = prev_chunk.GetGenomic_ins();
665 if (prev_len + len >= 3) {
666
667 prev_chunk.SetDiag(prev_len);
668
669 if (is_protein_align) {
670 TSeqPos product_end = (*prev_exon_it)->GetProduct_end().AsSeqPos();
671 product_end += prev_len;
672 (*prev_exon_it)->SetProduct_end().SetProtpos().SetAmin (product_end/3);
673 (*prev_exon_it)->SetProduct_end().SetProtpos().SetFrame(product_end%3+1);
674
675 TSeqPos product_start = exon.GetProduct_start().AsSeqPos();
676 product_start += prev_len;
677 exon.SetProduct_start().SetProtpos().SetAmin (product_start/3);
678 exon.SetProduct_start().SetProtpos().SetFrame(product_start%3+1);
679 } else {
680 (*prev_exon_it)->SetProduct_end().SetNucpos() += prev_len;
681 exon.SetProduct_start().SetNucpos() += prev_len;
682 }
683
684 if (len > 3-prev_len) {
685 CRef<CSpliced_exon_chunk> new_chunk(new CSpliced_exon_chunk);
686 new_chunk->SetDiag(3-prev_len);
687 exon.SetParts().insert(part_it, new_chunk);
688 chunk.SetGenomic_ins(len - (3-prev_len));
689 } else {
690 chunk.SetDiag(len);
691 }
692 aa_offset += 1;
693 len -= 3-prev_len;
694 }
695 }
696 }
697 if (len > 3) {
698 CRef<CSpliced_exon_chunk> new_chunk(new CSpliced_exon_chunk);
699 new_chunk->SetDiag((len/3)*3);
700 exon.SetParts().insert(part_it, new_chunk);
701 chunk.SetGenomic_ins(len % 3);
702 }
703 }
704 aa_offset += len/3;
705 }
706 break;
707 case CSpliced_exon_chunk::e_Product_ins: {
708 int len = chunk.GetProduct_ins();
709 if (len % 3 == 0) {
710 exon.SetParts().erase(part_it);
711 } else {
712 chunk.SetProduct_ins(len % 3);
713 }
714 aa_offset -= len/3;
715 }
716 break;
717 default:
718 break;
719 }
720 ++part_index;
721 }
722 }
723 if (aa_offset) {
724 if (is_protein_align)
725 exon.SetProduct_end().SetProtpos().SetAmin() += aa_offset;
726 else
727 exon.SetProduct_end().SetNucpos() += aa_offset*3;
728 }
729 prev_exon_it = exon_it;
730 }
731 spliced_seg.SetProduct_length() = is_protein_align
732 ? (*prev_exon_it)->GetProduct_end().GetProtpos().GetAmin()+1
733 : (*prev_exon_it)->GetProduct_end().GetNucpos()+1;
734 }
735
AdjustAlignment(const CSeq_align & align_in,TSeqRange range,EProductPositionsMode mode)736 CConstRef<CSeq_align> CFeatureGenerator::AdjustAlignment(const CSeq_align& align_in, TSeqRange range, EProductPositionsMode mode)
737 {
738 return m_impl->AdjustAlignment(align_in, range, mode);
739 }
740
AdjustAlignment(const CSeq_align & align_in,TSeqRange range,EProductPositionsMode mode)741 CConstRef<CSeq_align> CFeatureGenerator::SImplementation::AdjustAlignment(const CSeq_align& align_in, TSeqRange range, EProductPositionsMode mode)
742 {
743 if (!align_in.CanGetSegs() || !align_in.GetSegs().IsSpliced())
744 return CConstRef<CSeq_align>(&align_in);
745
746 CRef<CSeq_align> align(new CSeq_align);
747 align->Assign(align_in);
748
749 vector<SExon> orig_exons = GetExons(*align);
750
751 CSpliced_seg& spliced_seg = align->SetSegs().SetSpliced();
752
753 pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
754 ENa_strand product_strand = strands.first;
755 ENa_strand genomic_strand = strands.second;
756
757 if (product_strand == eNa_strand_minus) {
758 NCBI_THROW(CException, eUnknown,
759 "AdjustAlignment(): "
760 "product minus strand not supported");
761
762 }
763
764 bool plus_strand = !(genomic_strand == eNa_strand_minus);
765
766 TSeqRange align_range;
767 if (plus_strand) {
768 align_range = TSeqRange(spliced_seg.GetExons().front()->GetGenomic_start(),
769 spliced_seg.GetExons().back()->GetGenomic_end());
770 } else {
771 align_range = TSeqRange(spliced_seg.GetExons().back()->GetGenomic_start(),
772 spliced_seg.GetExons().front()->GetGenomic_end());
773 }
774 bool cross_the_origin = range.GetFrom() > range.GetTo() || align_range.GetFrom() > align_range.GetTo();
775 TSeqPos genomic_size = 0;
776 if (cross_the_origin) {
777 genomic_size = m_scope->GetSequenceLength(spliced_seg.GetGenomic_id());
778
779
780 if (range.GetFrom() > range.GetTo()) {
781 range.SetTo(range.GetTo() + genomic_size);
782 }
783 if (align_range.GetFrom() > align_range.GetTo()) {
784 align_range.SetTo(align_range.GetTo() + genomic_size);
785 }
786
787 if (range.GetTo() < align_range.GetFrom()) {
788 range.SetFrom(range.GetFrom() + genomic_size);
789 range.SetTo(range.GetTo() + genomic_size);
790 }
791 if (align_range.GetTo() < range.GetFrom()) {
792 align_range.SetFrom(align_range.GetFrom() + genomic_size);
793 align_range.SetTo(align_range.GetTo() + genomic_size);
794 }
795
796 TSeqPos outside_point = min(range.GetFrom(), align_range.GetFrom());
797 NON_CONST_ITERATE(CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
798 CSpliced_exon& exon = **exon_it;
799 if (exon.GetGenomic_start() < outside_point) {
800 exon.SetGenomic_start() += genomic_size;
801 exon.SetGenomic_end() += genomic_size;
802 }
803 }
804 }
805
806 if (!(range.GetFrom() <= range.GetTo()) ||
807 !(align_range.GetFrom() <= align_range.GetTo())) {
808 NCBI_USER_THROW("no inverted range assertion failed");
809 }
810 if (range.GetTo() < align_range.GetFrom() ||
811 align_range.GetTo() < range.GetFrom()) {
812 NCBI_USER_THROW("alignmentrange and requested range don't overlap");
813 }
814
815 vector<SExon> exons;
816 GetExonStructure(spliced_seg, exons, m_scope);
817
818 bool is_protein_align =
819 spliced_seg.GetProduct_type() == CSpliced_seg::eProduct_type_protein;
820
821 vector<SExon>::iterator right_exon_it = exons.begin();
822 CSpliced_seg::TExons::iterator right_spl_exon_it = spliced_seg.SetExons().begin();
823
824 int range_left = plus_strand ? int(range.GetFrom()) : -int(range.GetTo());
825 int range_right = plus_strand ? int(range.GetTo()) : -int(range.GetFrom());
826
827 for(;;++right_exon_it, ++right_spl_exon_it) {
828
829 vector<SExon>::reverse_iterator left_exon_it(right_exon_it);
830 CSpliced_seg::TExons::reverse_iterator left_spl_exon_it(right_spl_exon_it);
831
832 if (right_exon_it == exons.end() &&
833 left_exon_it->genomic_to > range_right
834 )
835 CFeatureGenerator::SImplementation::TrimLeftExon(left_exon_it->genomic_to - range_right, eTrimGenomic,
836 exons.rend(), left_exon_it, left_spl_exon_it,
837 product_strand, genomic_strand);
838
839 if (right_exon_it == exons.begin() &&
840 right_exon_it->genomic_from < range_left
841 )
842 CFeatureGenerator::SImplementation::TrimRightExon(range_left - right_exon_it->genomic_from, eTrimGenomic,
843 right_exon_it, exons.end(), right_spl_exon_it,
844 product_strand, genomic_strand);
845
846 if (left_exon_it.base() != right_exon_it) {
847 right_exon_it = exons.erase(left_exon_it.base(), right_exon_it);
848 right_spl_exon_it = spliced_seg.SetExons().erase(left_spl_exon_it.base(), right_spl_exon_it);
849 }
850
851 if (right_exon_it == exons.end())
852 break;
853 }
854
855 CSpliced_exon& first_exon = *spliced_seg.SetExons().front();
856 CSpliced_exon& last_exon = *spliced_seg.SetExons().back();
857
858 int first_exon_extension = 0;
859 int last_exon_extension = 0;
860
861 if (plus_strand) {
862
863 first_exon_extension =
864 first_exon.GetGenomic_start()
865 - ((range.GetFrom() < genomic_size && genomic_size <= first_exon.GetGenomic_start())
866 ? genomic_size
867 : range.GetFrom());
868
869 if (first_exon_extension > 0) {
870 first_exon.SetGenomic_start() -= first_exon_extension;
871 if (first_exon.IsSetParts()) {
872 CRef<CSpliced_exon_chunk> chunk(new CSpliced_exon_chunk);
873 chunk->SetDiag(first_exon_extension);
874 first_exon.SetParts().insert(first_exon.SetParts().begin(), chunk);
875 }
876 }
877
878 last_exon_extension =
879 ((last_exon.GetGenomic_end() <= genomic_size-1 && genomic_size-1 < range.GetTo())
880 ? genomic_size-1
881 : range.GetTo())
882 - last_exon.GetGenomic_end();
883
884 if (last_exon_extension > 0) {
885 last_exon.SetGenomic_end() += last_exon_extension;
886 if (last_exon.IsSetParts()) {
887 CRef<CSpliced_exon_chunk> chunk(new CSpliced_exon_chunk);
888 chunk->SetDiag(last_exon_extension);
889 last_exon.SetParts().push_back(chunk);
890 }
891 }
892 } else {
893 last_exon_extension =
894 last_exon.GetGenomic_start()
895 - ((range.GetFrom() < genomic_size && genomic_size <= last_exon.GetGenomic_start())
896 ? genomic_size
897 : range.GetFrom());
898
899 if (last_exon_extension > 0) {
900 last_exon.SetGenomic_start() -= last_exon_extension;
901 if (last_exon.IsSetParts()) {
902 CRef<CSpliced_exon_chunk> chunk(new CSpliced_exon_chunk);
903 chunk->SetDiag(last_exon_extension);
904 last_exon.SetParts().push_back(chunk);
905 }
906 }
907
908 first_exon_extension =
909 ((first_exon.GetGenomic_end() <= genomic_size-1 && genomic_size-1 < range.GetTo())
910 ? genomic_size-1
911 : range.GetTo())
912 - first_exon.GetGenomic_end();
913 if (first_exon_extension > 0) {
914 first_exon.SetGenomic_end() += first_exon_extension;
915 if (first_exon.IsSetParts()) {
916 CRef<CSpliced_exon_chunk> chunk(new CSpliced_exon_chunk);
917 chunk->SetDiag(first_exon_extension);
918 first_exon.SetParts().insert(first_exon.SetParts().begin(), chunk);
919 }
920 }
921 }
922
923 exons.front().prod_from -= first_exon_extension;
924 exons.front().genomic_from -= first_exon_extension;
925 exons.back().prod_to += last_exon_extension;
926 exons.back().genomic_to += last_exon_extension;
927
928
929 if (plus_strand) {
930 first_exon_extension = first_exon.GetGenomic_start() - range.GetFrom();
931
932 if (first_exon_extension > 0) {
933 CRef<CSpliced_exon> exon(new CSpliced_exon);
934 exon->SetGenomic_start() = range.GetFrom();
935 exon->SetGenomic_end() = genomic_size-1;
936 spliced_seg.SetExons().push_front(exon);
937
938 SExon exon_struct;
939 exon_struct.prod_from = exons.front().prod_from - first_exon_extension;
940 exon_struct.prod_to = exons.front().prod_from - 1;
941 exon_struct.genomic_from = exons.front().genomic_from - first_exon_extension;
942 exon_struct.genomic_to = exons.front().genomic_from - 1;
943
944 exons.insert(exons.begin(), exon_struct);
945 }
946
947 last_exon_extension = range.GetTo() - last_exon.GetGenomic_end();
948
949 if (last_exon_extension > 0) {
950 CRef<CSpliced_exon> exon(new CSpliced_exon);
951 exon->SetGenomic_start() = 0;
952 exon->SetGenomic_end() = last_exon_extension - 1;
953 spliced_seg.SetExons().push_back(exon);
954
955 SExon exon_struct;
956 exon_struct.prod_from = exons.back().prod_to + 1;
957 exon_struct.prod_to = exons.back().prod_to + last_exon_extension;
958 exon_struct.genomic_from = exons.back().genomic_to +1;
959 exon_struct.genomic_to = exons.back().genomic_to + last_exon_extension;
960
961 exons.push_back(exon_struct);
962 }
963 } else {
964 last_exon_extension = last_exon.GetGenomic_start() - range.GetFrom();
965
966 if (last_exon_extension > 0) {
967 CRef<CSpliced_exon> exon(new CSpliced_exon);
968 exon->SetGenomic_start() = range.GetFrom();
969 exon->SetGenomic_end() = genomic_size-1;
970 spliced_seg.SetExons().push_back(exon);
971
972 SExon exon_struct;
973 exon_struct.prod_from = exons.back().prod_to + 1;
974 exon_struct.prod_to = exons.back().prod_to + last_exon_extension;
975 exon_struct.genomic_from = exons.back().genomic_to +1;
976 exon_struct.genomic_to = exons.back().genomic_to + last_exon_extension;
977
978 exons.push_back(exon_struct);
979 }
980
981 first_exon_extension = range.GetTo() - first_exon.GetGenomic_end();
982
983 if (first_exon_extension > 0) {
984 CRef<CSpliced_exon> exon(new CSpliced_exon);
985 exon->SetGenomic_start() = 0;
986 exon->SetGenomic_end() = first_exon_extension - 1;
987 spliced_seg.SetExons().push_front(exon);
988
989 SExon exon_struct;
990 exon_struct.prod_from = exons.front().prod_from - first_exon_extension;
991 exon_struct.prod_to = exons.front().prod_from - 1;
992 exon_struct.genomic_from = exons.front().genomic_from - first_exon_extension;
993 exon_struct.genomic_to = exons.front().genomic_from - 1;
994
995 exons.insert(exons.begin(), exon_struct);
996 }
997 }
998
999 if (range_left != exons.front().genomic_from || range_right != exons.back().genomic_to) {
1000 NCBI_THROW(CException, eUnknown,
1001 "AdjustAlignment(): "
1002 "result's ends do not match the range. This is a bug in AdjustAlignment implementation");
1003 }
1004
1005 int offset = is_protein_align ? int(exons.front().prod_from/3)*3 : exons.front().prod_from;
1006 if (offset > exons.front().prod_from) // negative division rounds toward zero
1007 offset -= 3;
1008
1009 if (mode == eTryToPreserveProductPositions && offset > 0) {
1010 offset = 0; // do not shift product position unnecessarily
1011 }
1012
1013 vector<SExon>::iterator exon_struct_it = exons.begin();
1014
1015 int putative_prod_length = 0;
1016 if (is_protein_align) {
1017 NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
1018 CSpliced_exon& exon = **exon_it;
1019 SetProtpos(exon.SetProduct_start(), exon_struct_it->prod_from - offset);
1020 SetProtpos(exon.SetProduct_end(), exon_struct_it->prod_to - offset);
1021 ++exon_struct_it;
1022 }
1023 putative_prod_length = (exons.back().prod_to - offset + 3)/3;
1024 } else {
1025 NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
1026 CSpliced_exon& exon = **exon_it;
1027 exon.SetProduct_start().SetNucpos() = exon_struct_it->prod_from - offset;
1028 exon.SetProduct_end().SetNucpos() = exon_struct_it->prod_to - offset;
1029 ++exon_struct_it;
1030 }
1031 putative_prod_length = exons.back().prod_to - offset + 1;
1032 }
1033 if (mode == eForceProductFrom0 || (int)spliced_seg.GetProduct_length() < putative_prod_length) {
1034 spliced_seg.SetProduct_length(putative_prod_length);
1035 }
1036
1037 if (cross_the_origin) {
1038 NON_CONST_ITERATE(CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
1039 CSpliced_exon& exon = **exon_it;
1040 if (exon.GetGenomic_start() >= genomic_size)
1041 exon.SetGenomic_start() -= genomic_size;
1042 if (exon.GetGenomic_end() >= genomic_size)
1043 exon.SetGenomic_end() -= genomic_size;
1044 }
1045 }
1046
1047 if (GetExons(*align) != orig_exons) {
1048 ClearScores(*align);
1049 }
1050
1051 return align;
1052 }
1053
GetCdsOnMrna(const objects::CSeq_id & rna_id,CScope & scope)1054 CMappedFeat GetCdsOnMrna(const objects::CSeq_id& rna_id, CScope& scope)
1055 {
1056 CMappedFeat cdregion_feat;
1057 CBioseq_Handle handle = scope.GetBioseqHandle(rna_id);
1058 if (handle) {
1059 CFeat_CI feat_iter(handle, CSeqFeatData::eSubtype_cdregion);
1060 if (feat_iter && feat_iter.GetSize()) {
1061 cdregion_feat = *feat_iter;
1062 const CSeq_loc& cds_loc = cdregion_feat.GetLocation();
1063 const CSeq_id* cds_loc_seq_id = cds_loc.GetId();
1064 if (cds_loc_seq_id == NULL || !sequence::IsSameBioseq(*cds_loc_seq_id, rna_id, &scope)) {
1065 cdregion_feat = CMappedFeat();
1066 }
1067 }
1068 }
1069 return cdregion_feat;
1070 }
1071
GetCds(const objects::CSeq_id & rna_id)1072 TSignedSeqRange CFeatureGenerator::SImplementation::GetCds(const objects::CSeq_id& rna_id)
1073 {
1074 CMappedFeat cdregion = GetCdsOnMrna(rna_id, *m_scope);
1075 if (!cdregion) {
1076 return TSignedSeqRange();
1077 }
1078
1079 TSeqRange cds = cdregion.GetLocation().GetTotalRange();
1080
1081 return TSignedSeqRange(cds.GetFrom(), cds.GetTo());
1082 }
1083
TrimLeftExon(int trim_amount,ETrimSide side,vector<SExon>::reverse_iterator left_edge,vector<SExon>::reverse_iterator & exon_it,CSpliced_seg::TExons::reverse_iterator & spl_exon_it,ENa_strand product_strand,ENa_strand genomic_strand)1084 void CFeatureGenerator::SImplementation::TrimLeftExon(int trim_amount, ETrimSide side,
1085 vector<SExon>::reverse_iterator left_edge,
1086 vector<SExon>::reverse_iterator& exon_it,
1087 CSpliced_seg::TExons::reverse_iterator& spl_exon_it,
1088 ENa_strand product_strand,
1089 ENa_strand genomic_strand)
1090 {
1091 _ASSERT( trim_amount < 3 || side!=eTrimProduct );
1092 bool is_protein = (*spl_exon_it)->GetProduct_start().IsProtpos();
1093
1094 while (trim_amount > 0) {
1095 int exon_len = side==eTrimProduct
1096 ? (exon_it->prod_to - exon_it->prod_from + 1)
1097 : (exon_it->genomic_to - exon_it->genomic_from + 1);
1098 if (exon_len <= trim_amount) {
1099 int next_from = exon_it->genomic_from;
1100 ++exon_it;
1101 ++spl_exon_it;
1102 trim_amount -= exon_len;
1103 _ASSERT( trim_amount==0 || side!=eTrimProduct );
1104 if (exon_it == left_edge)
1105 break;
1106 if (trim_amount > 0) { // eTrimGenomic, account for distance between exons
1107 trim_amount -= next_from - exon_it->genomic_to -1;
1108 }
1109 } else {
1110 (*spl_exon_it)->SetPartial(true);
1111 (*spl_exon_it)->ResetDonor_after_exon();
1112
1113 int genomic_trim_amount = 0;
1114 int product_trim_amount = 0;
1115
1116 if ((*spl_exon_it)->CanGetParts() && !(*spl_exon_it)->GetParts().empty()) {
1117 CSpliced_exon::TParts& parts = (*spl_exon_it)->SetParts();
1118 CSpliced_exon_Base::TParts::iterator chunk = parts.end();
1119 while (--chunk, (trim_amount>0 ||
1120 (side==eTrimProduct
1121 ? (*chunk)->IsGenomic_ins()
1122 : (*chunk)->IsProduct_ins()))) {
1123 int product_chunk_len = 0;
1124 int genomic_chunk_len = 0;
1125 switch((*chunk)->Which()) {
1126 case CSpliced_exon_chunk::e_Match:
1127 product_chunk_len = (*chunk)->GetMatch();
1128 genomic_chunk_len = product_chunk_len;
1129 if (product_chunk_len > trim_amount) {
1130 (*chunk)->SetMatch(product_chunk_len - trim_amount);
1131 }
1132 break;
1133 case CSpliced_exon_chunk::e_Mismatch:
1134 product_chunk_len = (*chunk)->GetMismatch();
1135 genomic_chunk_len = product_chunk_len;
1136 if (product_chunk_len > trim_amount) {
1137 (*chunk)->SetMismatch(product_chunk_len - trim_amount);
1138 }
1139 break;
1140 case CSpliced_exon_chunk::e_Diag:
1141 product_chunk_len = (*chunk)->GetDiag();
1142 genomic_chunk_len = product_chunk_len;
1143 if (product_chunk_len > trim_amount) {
1144 (*chunk)->SetDiag(product_chunk_len - trim_amount);
1145 }
1146 break;
1147
1148 case CSpliced_exon_chunk::e_Product_ins:
1149 product_chunk_len = (*chunk)->GetProduct_ins();
1150 if (side==eTrimProduct && product_chunk_len > trim_amount) {
1151 (*chunk)->SetProduct_ins(product_chunk_len - trim_amount);
1152 }
1153 break;
1154 case CSpliced_exon_chunk::e_Genomic_ins:
1155 genomic_chunk_len = (*chunk)->GetGenomic_ins();
1156 if (side==eTrimGenomic && genomic_chunk_len > trim_amount) {
1157 (*chunk)->SetGenomic_ins(genomic_chunk_len - trim_amount);
1158 }
1159 break;
1160 default:
1161 _ASSERT(false);
1162 break;
1163 }
1164
1165 if (side==eTrimProduct && product_chunk_len <= trim_amount) {
1166 genomic_trim_amount += genomic_chunk_len;
1167 product_trim_amount += product_chunk_len;
1168 trim_amount -= product_chunk_len;
1169 } else if (side==eTrimGenomic && genomic_chunk_len <= trim_amount) {
1170 genomic_trim_amount += genomic_chunk_len;
1171 product_trim_amount += product_chunk_len;
1172 trim_amount -= genomic_chunk_len;
1173 } else {
1174 genomic_trim_amount += min(trim_amount, genomic_chunk_len);
1175 product_trim_amount += min(trim_amount, product_chunk_len);
1176 trim_amount = 0;
1177 break;
1178 }
1179 chunk = parts.erase(chunk);
1180 }
1181
1182 } else {
1183 genomic_trim_amount += trim_amount;
1184 product_trim_amount += trim_amount;
1185 trim_amount = 0;
1186 }
1187
1188 exon_it->prod_to -= product_trim_amount;
1189 exon_it->genomic_to -= genomic_trim_amount;
1190
1191 if (is_protein) {
1192 CProduct_pos& prot_pos = (*spl_exon_it)->SetProduct_end();
1193 SetProtpos(prot_pos, exon_it->prod_to);
1194 } else {
1195 if (product_strand != eNa_strand_minus) {
1196 (*spl_exon_it)->SetProduct_end().SetNucpos() -= product_trim_amount;
1197 } else {
1198 (*spl_exon_it)->SetProduct_start().SetNucpos() += product_trim_amount;
1199 }
1200 }
1201
1202 if (genomic_strand != eNa_strand_minus) {
1203 (*spl_exon_it)->SetGenomic_end() -= genomic_trim_amount;
1204 } else {
1205 (*spl_exon_it)->SetGenomic_start() += genomic_trim_amount;
1206 }
1207 }
1208 }
1209 }
TrimRightExon(int trim_amount,ETrimSide side,vector<SExon>::iterator & exon_it,vector<SExon>::iterator right_edge,CSpliced_seg::TExons::iterator & spl_exon_it,ENa_strand product_strand,ENa_strand genomic_strand)1210 void CFeatureGenerator::SImplementation::TrimRightExon(int trim_amount, ETrimSide side,
1211 vector<SExon>::iterator& exon_it,
1212 vector<SExon>::iterator right_edge,
1213 CSpliced_seg::TExons::iterator& spl_exon_it,
1214 ENa_strand product_strand,
1215 ENa_strand genomic_strand)
1216 {
1217 _ASSERT( trim_amount < 3 || side!=eTrimProduct );
1218 bool is_protein = (*spl_exon_it)->GetProduct_start().IsProtpos();
1219
1220 while (trim_amount > 0) {
1221 int exon_len = side==eTrimProduct
1222 ? (exon_it->prod_to - exon_it->prod_from + 1)
1223 : (exon_it->genomic_to - exon_it->genomic_from + 1);
1224 if (exon_len <= trim_amount) {
1225 int prev_to = exon_it->genomic_to;
1226 ++exon_it;
1227 ++spl_exon_it;
1228 trim_amount -= exon_len;
1229 _ASSERT( trim_amount==0 || side!=eTrimProduct );
1230 if (exon_it == right_edge)
1231 break;
1232 if (trim_amount > 0) { // eTrimGenomic, account for distance between exons
1233 trim_amount -= exon_it->genomic_from - prev_to -1;
1234 }
1235 } else {
1236 (*spl_exon_it)->SetPartial(true);
1237 (*spl_exon_it)->ResetAcceptor_before_exon();
1238
1239 int genomic_trim_amount = 0;
1240 int product_trim_amount = 0;
1241
1242 if ((*spl_exon_it)->CanGetParts() && !(*spl_exon_it)->GetParts().empty()) {
1243 CSpliced_exon::TParts& parts = (*spl_exon_it)->SetParts();
1244 CSpliced_exon_Base::TParts::iterator chunk = parts.begin();
1245 for (; trim_amount>0 ||
1246 (side==eTrimProduct
1247 ? (*chunk)->IsGenomic_ins()
1248 : (*chunk)->IsProduct_ins());
1249 ) {
1250 int product_chunk_len = 0;
1251 int genomic_chunk_len = 0;
1252 switch((*chunk)->Which()) {
1253 case CSpliced_exon_chunk::e_Match:
1254 product_chunk_len = (*chunk)->GetMatch();
1255 genomic_chunk_len = product_chunk_len;
1256 if (product_chunk_len > trim_amount) {
1257 (*chunk)->SetMatch(product_chunk_len - trim_amount);
1258 }
1259 break;
1260 case CSpliced_exon_chunk::e_Mismatch:
1261 product_chunk_len = (*chunk)->GetMismatch();
1262 genomic_chunk_len = product_chunk_len;
1263 if (product_chunk_len > trim_amount) {
1264 (*chunk)->SetMismatch(product_chunk_len - trim_amount);
1265 }
1266 break;
1267 case CSpliced_exon_chunk::e_Diag:
1268 product_chunk_len = (*chunk)->GetDiag();
1269 genomic_chunk_len = product_chunk_len;
1270 if (product_chunk_len > trim_amount) {
1271 (*chunk)->SetDiag(product_chunk_len - trim_amount);
1272 }
1273 break;
1274
1275 case CSpliced_exon_chunk::e_Product_ins:
1276 product_chunk_len = (*chunk)->GetProduct_ins();
1277 if (side==eTrimProduct && product_chunk_len > trim_amount) {
1278 (*chunk)->SetProduct_ins(product_chunk_len - trim_amount);
1279 }
1280 break;
1281 case CSpliced_exon_chunk::e_Genomic_ins:
1282 genomic_chunk_len = (*chunk)->GetGenomic_ins();
1283 if (side==eTrimGenomic && genomic_chunk_len > trim_amount) {
1284 (*chunk)->SetGenomic_ins(genomic_chunk_len - trim_amount);
1285 }
1286 break;
1287 default:
1288 _ASSERT(false);
1289 break;
1290 }
1291
1292 if (side==eTrimProduct && product_chunk_len <= trim_amount) {
1293 genomic_trim_amount += genomic_chunk_len;
1294 product_trim_amount += product_chunk_len;
1295 trim_amount -= product_chunk_len;
1296 } else if (side==eTrimGenomic && genomic_chunk_len <= trim_amount) {
1297 genomic_trim_amount += genomic_chunk_len;
1298 product_trim_amount += product_chunk_len;
1299 trim_amount -= genomic_chunk_len;
1300 } else {
1301 genomic_trim_amount += min(trim_amount, genomic_chunk_len);
1302 product_trim_amount += min(trim_amount, product_chunk_len);
1303 trim_amount = 0;
1304 break;
1305 }
1306 chunk = parts.erase(chunk);
1307 }
1308
1309 } else {
1310 genomic_trim_amount += trim_amount;
1311 product_trim_amount += trim_amount;
1312 trim_amount = 0;
1313 }
1314
1315 exon_it->prod_from += product_trim_amount;
1316 exon_it->genomic_from += genomic_trim_amount;
1317
1318 if (is_protein) {
1319 CProduct_pos& prot_pos = (*spl_exon_it)->SetProduct_start();
1320 SetProtpos(prot_pos, exon_it->prod_from);
1321 } else {
1322 if (product_strand != eNa_strand_minus) {
1323 (*spl_exon_it)->SetProduct_start().SetNucpos() += product_trim_amount;
1324 } else {
1325 (*spl_exon_it)->SetProduct_end().SetNucpos() -= product_trim_amount;
1326 }
1327 }
1328
1329 if (genomic_strand != eNa_strand_minus) {
1330 (*spl_exon_it)->SetGenomic_start() += genomic_trim_amount;
1331 } else {
1332 (*spl_exon_it)->SetGenomic_end() -= genomic_trim_amount;
1333 }
1334 }
1335 }
1336 }
1337
1338 namespace fg {
GetGeneticCode(const CBioseq_Handle & bsh)1339 int GetGeneticCode(const CBioseq_Handle& bsh)
1340 {
1341 int gcode = 1;
1342
1343 auto source = sequence::GetBioSource(bsh);
1344 if (source != nullptr) {
1345 gcode = source->GetGenCode(gcode);
1346 }
1347
1348 return gcode;
1349 }
1350 }
1351
1352 END_NCBI_SCOPE
1353