1 /* $Id: feature_propagate.cpp 632623 2021-06-03 17:38:11Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin, based on work done by Frank Ludwig
27 *
28 * File Description:
29 * functions for propagating a feature using an alignment
30 */
31 #include <ncbi_pch.hpp>
32 #include <corelib/ncbistd.hpp>
33
34 #include <objects/seqfeat/Cdregion.hpp>
35 #include <objects/seqfeat/RNA_ref.hpp>
36 #include <objects/seqfeat/Trna_ext.hpp>
37 #include <objects/seqfeat/Code_break.hpp>
38 #include <objects/seqfeat/Genetic_code_table.hpp>
39 #include <objects/seqfeat/Feat_id.hpp>
40 #include <util/range_coll.hpp>
41 #include <objects/seqalign/Spliced_seg.hpp>
42
43 #include <objmgr/util/seq_loc_util.hpp>
44 #include <objmgr/seq_vector.hpp>
45 #include <objmgr/seq_vector_ci.hpp>
46
47 #include <objtools/edit/cds_fix.hpp>
48 #include <objtools/edit/feature_propagate.hpp>
49
50 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)51 BEGIN_SCOPE(objects)
52 BEGIN_SCOPE(edit)
53
54
55 CFeaturePropagator::CFeaturePropagator
56 (CBioseq_Handle src, CBioseq_Handle target,
57 const CSeq_align& align,
58 bool stop_at_stop, bool cleanup_partials, bool merge_abutting,
59 CMessageListener_Basic* pMessageListener, CObject_id::TId* feat_id)
60 : m_Src(src), m_Target(target),
61 m_Scope(m_Target.GetScope()),
62 m_CdsStopAtStopCodon(stop_at_stop),
63 m_CdsCleanupPartials(cleanup_partials),
64 m_MessageListener(pMessageListener),
65 m_MaxFeatId(feat_id),
66 m_MergeAbutting(merge_abutting),
67 m_ExpandOverGaps(true),
68 m_synonym_mapper(this)
69 {
70 if (align.GetSegs().IsDenseg()) {
71 m_Alignment.Reset(&align);
72 } else if (align.GetSegs().IsDisc()) {
73 m_Alignment.Reset(&*align.CreateDensegFromDisc());
74 } else if (align.GetSegs().IsStd()) {
75 m_Alignment.Reset(&*align.CreateDensegFromStdseg());
76 } else if (align.GetSegs().IsSpliced()) {
77 CRef<CSeq_align> disc_seg = align.GetSegs().GetSpliced().AsDiscSeg();
78 m_Alignment.Reset(&*disc_seg->CreateDensegFromDisc());
79 // } else if (align.GetSegs().IsDendiag()) {
80 // m_Alignment.Reset(&*align_format::CAlignFormatUtil::CreateDensegFromDendiag(align));
81 } else {
82 if (m_MessageListener) {
83 m_MessageListener->PostMessage(
84 CMessage_Basic("Unsupported alignment type",
85 eDiag_Error,
86 eFeaturePropagationProblem_FeatureLocation));
87 }
88 m_Alignment.Reset();
89 }
90 }
91
CFeaturePropagator(CBioseq_Handle src,CBioseq_Handle target,const CSeq_align & align,bool stop_at_stop,bool cleanup_partials,bool merge_abutting,bool expand_over_gaps,CMessageListener_Basic * pMessageListener,CObject_id::TId * feat_id)92 CFeaturePropagator::CFeaturePropagator
93 (CBioseq_Handle src, CBioseq_Handle target,
94 const CSeq_align& align,
95 bool stop_at_stop, bool cleanup_partials, bool merge_abutting, bool expand_over_gaps,
96 CMessageListener_Basic* pMessageListener, CObject_id::TId* feat_id)
97 : m_Src(src), m_Target(target),
98 m_Scope(m_Target.GetScope()),
99 m_CdsStopAtStopCodon(stop_at_stop),
100 m_CdsCleanupPartials(cleanup_partials),
101 m_MessageListener(pMessageListener),
102 m_MaxFeatId(feat_id),
103 m_MergeAbutting(merge_abutting),
104 m_ExpandOverGaps(expand_over_gaps),
105 m_synonym_mapper(this)
106 {
107 if (align.GetSegs().IsDenseg()) {
108 m_Alignment.Reset(&align);
109 } else if (align.GetSegs().IsDisc()) {
110 m_Alignment.Reset(&*align.CreateDensegFromDisc());
111 } else if (align.GetSegs().IsStd()) {
112 m_Alignment.Reset(&*align.CreateDensegFromStdseg());
113 } else if (align.GetSegs().IsSpliced()) {
114 CRef<CSeq_align> disc_seg = align.GetSegs().GetSpliced().AsDiscSeg();
115 m_Alignment.Reset(&*disc_seg->CreateDensegFromDisc());
116 // } else if (align.GetSegs().IsDendiag()) {
117 // m_Alignment.Reset(&*align_format::CAlignFormatUtil::CreateDensegFromDendiag(align));
118 } else {
119 if (m_MessageListener) {
120 m_MessageListener->PostMessage(
121 CMessage_Basic("Unsupported alignment type",
122 eDiag_Error,
123 eFeaturePropagationProblem_FeatureLocation));
124 }
125 m_Alignment.Reset();
126 }
127 }
128
Propagate(const CSeq_feat & orig_feat)129 CRef<CSeq_feat> CFeaturePropagator::Propagate(const CSeq_feat& orig_feat)
130 {
131 CRef<CSeq_feat> rval(new CSeq_feat());
132 rval->Assign(orig_feat);
133
134 if (rval->IsSetId() && (m_MaxFeatId && *m_MaxFeatId > 0)) {
135 rval->SetId().SetLocal().SetId(++(*m_MaxFeatId));
136 m_FeatIdMap.emplace(orig_feat.GetId().GetLocal().GetId(), rval->GetId().GetLocal().GetId());
137 }
138
139 // propagate feature location
140 CSeq_id_Handle best_idh = sequence::GetId(m_Target, sequence::eGetId_Best);
141 CConstRef<CSeq_id> pTargetId = best_idh.GetSeqId();
142 CRef<CSeq_loc> new_loc = x_MapLocation(orig_feat.GetLocation(), *pTargetId);
143 if (!new_loc) {
144 if (m_MessageListener) {
145 string loc_label;
146 orig_feat.GetLocation().GetLabel(&loc_label);
147 string target_label;
148 pTargetId->GetLabel(&target_label);
149 m_MessageListener->PostMessage(
150 CMessage_Basic("Unable to propagate location of feature " + loc_label + " to " + target_label,
151 eDiag_Error,
152 eFeaturePropagationProblem_FeatureLocation));
153 }
154 rval.Reset(nullptr);
155 return rval;
156 }
157
158 rval->SetLocation().Assign(*new_loc);
159 rval->ResetPartial();
160 rval->ResetProduct();
161 if (new_loc->IsPartialStart(eExtreme_Biological) || new_loc->IsPartialStop(eExtreme_Biological)) {
162 rval->SetPartial(true);
163 }
164
165 const CSeq_loc& location = orig_feat.GetLocation();
166 bool origIsPartialStart = location.IsPartialStart(eExtreme_Biological);
167
168 // depending on feature type, propagate locations in data
169 switch(orig_feat.GetData().GetSubtype()) {
170 case CSeqFeatData::eSubtype_cdregion:
171 x_PropagateCds(*rval, *pTargetId, origIsPartialStart);
172 break;
173 case CSeqFeatData::eSubtype_tRNA:
174 x_PropagatetRNA(*rval, *pTargetId);
175 break;
176 default:
177 break;
178 }
179
180 return rval;
181 }
182
183
PropagateAll()184 vector<CRef<CSeq_feat> > CFeaturePropagator::PropagateAll()
185 {
186 vector<CRef<CSeq_feat> > rval;
187 CFeat_CI fi(m_Src);
188 while (fi) {
189 CRef<CSeq_feat> new_feat = Propagate(*(fi->GetOriginalSeq_feat()));
190 if (new_feat) {
191 rval.push_back(new_feat);
192 }
193 ++fi;
194 }
195 return rval;
196 }
197
PropagateAllReportFailures(vector<CConstRef<CSeq_feat>> & failures)198 vector<CRef<CSeq_feat> > CFeaturePropagator::PropagateAllReportFailures(vector<CConstRef<CSeq_feat> >& failures)
199 {
200 vector<CRef<CSeq_feat> > rval;
201 CFeat_CI fi(m_Src);
202 while (fi) {
203 auto old_feat = fi->GetOriginalSeq_feat();
204 CRef<CSeq_feat> new_feat = Propagate(*old_feat);
205 if (new_feat) {
206 rval.push_back(new_feat);
207 }
208 else {
209 failures.push_back(old_feat);
210 }
211 ++fi;
212 }
213 return rval;
214 }
215
PropagateFeatureList(const vector<CConstRef<CSeq_feat>> & orig_feats)216 vector<CRef<CSeq_feat>> CFeaturePropagator::PropagateFeatureList(const vector<CConstRef<CSeq_feat>>& orig_feats)
217 {
218 vector<CRef<CSeq_feat>> propagated_feats;
219
220 for (auto&& it : orig_feats) {
221 CRef<CSeq_feat> new_feat = Propagate(*it);
222 if (new_feat) {
223 propagated_feats.push_back(new_feat);
224 if (new_feat->IsSetData() && new_feat->GetData().IsCdregion()) {
225 if (it->IsSetProduct()) {
226 CRef<CSeq_feat> prot_feat = ConstructProteinFeatureForPropagatedCodingRegion(*it, *new_feat);
227 if (prot_feat) {
228 propagated_feats.push_back(prot_feat);
229 }
230 }
231 }
232 }
233 }
234
235 if (!m_FeatIdMap.empty()) {
236 for (auto&& it : propagated_feats) {
237 if (it->IsSetXref()) {
238 auto xref_it = it->SetXref().begin();
239 while (xref_it != it->SetXref().end()) {
240 if ((*xref_it)->IsSetId()) {
241 CFeat_id& feat_id = (*xref_it)->SetId();
242 if (feat_id.IsLocal() && feat_id.GetLocal().IsId()) {
243 auto orig_featid = m_FeatIdMap.find(feat_id.GetLocal().GetId());
244 if (orig_featid != m_FeatIdMap.end()) {
245 feat_id.SetLocal().SetId(orig_featid->second);
246 }
247 else {
248 // it hasn't been propagated yet or it will not be propagated
249 xref_it = it->SetXref().erase(xref_it);
250 continue;
251 }
252 }
253 }
254 ++xref_it;
255 }
256 if (it->GetXref().empty()) {
257 it->ResetXref();
258 }
259 }
260 }
261 }
262
263 return propagated_feats;
264 }
265
SeqPosToAlignPos(TSignedSeqPos pos,CDense_seg::TDim row,bool left,bool & partial5,bool & partial3)266 TSignedSeqPos CFeaturePropagator::SeqPosToAlignPos(TSignedSeqPos pos, CDense_seg::TDim row, bool left, bool &partial5, bool &partial3)
267 {
268 TSignedSeqPos res = -1;
269 const CDense_seg& denseg = m_Alignment->GetSegs().GetDenseg();
270 const CSeq_id& id = denseg.GetSeq_id(row);
271 CBioseq_Handle bsh = m_Scope.GetBioseqHandle(id);
272 if (!bsh)
273 return -1;
274 //TSignedSeqPos length = bsh.GetBioseqLength();
275 CDense_seg::TNumseg num_segs = denseg.GetNumseg();
276 CDense_seg::TDim num_rows = denseg.GetDim();
277 CDense_seg::TNumseg seg = 0;
278 TSignedSeqPos total_len = 0;
279 TSignedSeqPos found_len = -1;
280 bool found = false;
281 while ( seg < num_segs )
282 {
283 TSignedSeqPos start = denseg.GetStarts()[seg * num_rows + row];
284 TSignedSeqPos len = denseg.GetLens()[seg];
285 ENa_strand strand = eNa_strand_plus;
286 if (denseg.IsSetStrands())
287 strand = denseg.GetStrands()[seg * num_rows + row];
288 if (strand == eNa_strand_minus)
289 {
290 NCBI_THROW(CException, eUnknown, "Cannot propagate through alignment on negative strand");
291 }
292 if (start >= 0 && pos >= start && pos < start + len)
293 {
294 res = total_len + pos - start;
295 found = true;
296 break;
297 }
298 if (start >= 0 && start > pos && left)
299 {
300 res = total_len;
301 partial5 = true;
302 found = true;
303 break;
304 }
305 if (start >= 0 && start + len - 1 < pos && !left)
306 {
307 found_len = total_len + len - 1;
308 }
309 total_len += len;
310 ++seg;
311 }
312 if (!found)
313 {
314 res = found_len;
315 if (!left)
316 partial3 = true;
317 }
318
319 return res;
320 }
321
AlignPosToSeqPos(TSignedSeqPos pos,CDense_seg::TDim row,bool left,bool & partial5,bool & partial3)322 TSignedSeqPos CFeaturePropagator::AlignPosToSeqPos(TSignedSeqPos pos, CDense_seg::TDim row, bool left, bool &partial5, bool &partial3)
323 {
324 TSignedSeqPos res = -1;
325 const CDense_seg& denseg = m_Alignment->GetSegs().GetDenseg();
326 const CSeq_id& id = denseg.GetSeq_id(row);
327 CBioseq_Handle bsh = m_Scope.GetBioseqHandle(id);
328 if (!bsh)
329 return -1;
330 //TSignedSeqPos length = bsh.GetBioseqLength();
331 CDense_seg::TNumseg num_segs = denseg.GetNumseg();
332 CDense_seg::TDim num_rows = denseg.GetDim();
333 CDense_seg::TNumseg seg = 0;
334 TSignedSeqPos total_len = 0;
335 TSignedSeqPos found_seg = -1;
336 while ( seg < num_segs )
337 {
338 TSignedSeqPos start = denseg.GetStarts()[seg * num_rows + row];
339 TSignedSeqPos len = denseg.GetLens()[seg];
340 ENa_strand strand = eNa_strand_plus;
341 if (denseg.IsSetStrands())
342 strand = denseg.GetStrands()[seg * num_rows + row];
343 if (strand == eNa_strand_minus)
344 {
345 NCBI_THROW(CException, eUnknown, "Cannot propagate through alignment on negative strand");
346 }
347 if (total_len <= pos && total_len + len > pos)
348 {
349 if (start >= 0)
350 {
351 res = start + pos - total_len;
352 break;
353 }
354 else
355 {
356 found_seg = seg;
357 break;
358 }
359 }
360 total_len += len;
361 ++seg;
362 }
363 if (found_seg >= 0)
364 {
365 seg = found_seg;
366 if (left)
367 {
368 ++seg;
369 while ( seg < num_segs )
370 {
371 TSignedSeqPos start = denseg.GetStarts()[seg * num_rows + row];
372 //TSignedSeqPos len = denseg.GetLens()[seg];
373 ENa_strand strand = eNa_strand_plus;
374 if (denseg.IsSetStrands())
375 strand = denseg.GetStrands()[seg * num_rows + row];
376 if (strand == eNa_strand_minus)
377 {
378 NCBI_THROW(CException, eUnknown, "Cannot propagate through alignment on negative strand");
379 }
380 if (start >= 0)
381 {
382 res = start;
383 break;
384 }
385 ++seg;
386 }
387 partial5 = true;
388 }
389 else
390 {
391 while (seg > 0)
392 {
393 --seg;
394 TSignedSeqPos start = denseg.GetStarts()[seg * num_rows + row];
395 TSignedSeqPos len = denseg.GetLens()[seg];
396 ENa_strand strand = eNa_strand_plus;
397 if (denseg.IsSetStrands())
398 strand = denseg.GetStrands()[seg * num_rows + row];
399 if (strand == eNa_strand_minus)
400 {
401 NCBI_THROW(CException, eUnknown, "Cannot propagate through alignment on negative strand");
402 }
403 if (start >= 0)
404 {
405 res = start + len - 1;
406 break;
407 }
408 }
409 partial3 = true;
410 }
411 }
412 return res;
413 }
414
FindRow(const CSeq_align & align,CBioseq_Handle bsh)415 CDense_seg::TDim CFeaturePropagator::FindRow(const CSeq_align& align, CBioseq_Handle bsh)
416 {
417 const CDense_seg& denseg = align.GetSegs().GetDenseg();
418 CDense_seg::TDim num_rows = denseg.GetDim();
419 for (CDense_seg::TDim i = 0; i < num_rows; i++)
420 {
421 const CSeq_id& id = denseg.GetSeq_id(i);
422 CBioseq_Handle bsh2 = m_Scope.GetBioseqHandle(id);
423 if (bsh == bsh2)
424 {
425 return i;
426 }
427 }
428 return -1;
429 }
430
431 // For reasons unknown CSeq_align::CreateRowSeq_loc is returning only a single interval for Dense-seg alignments
432 // Therefore we need to roll out our own version which creates a real multi-interval location.
CreateRowSeq_loc(const CSeq_align & align,CDense_seg::TDim row)433 CRef<CSeq_loc> CFeaturePropagator::CreateRowSeq_loc(const CSeq_align& align, CDense_seg::TDim row)
434 {
435 CRef<CSeq_loc> loc(new CSeq_loc);
436 const CDense_seg& denseg = align.GetSegs().GetDenseg();
437 const CSeq_id& id = denseg.GetSeq_id(row);
438 CDense_seg::TNumseg num_segs = denseg.GetNumseg();
439 CDense_seg::TDim num_rows = denseg.GetDim();
440 for (int seg = 0; seg < num_segs; seg++)
441 {
442 TSignedSeqPos start = denseg.GetStarts()[seg*num_rows + row];
443 if (start < 0)
444 continue;
445
446 TSignedSeqPos len = denseg.GetLens()[seg];
447 CRef<CSeq_interval> ret(new CSeq_interval);
448 ret->SetId().Assign(id);
449 ret->SetFrom(start);
450 ret->SetTo(start + len - 1);
451 if ( denseg.IsSetStrands() )
452 {
453 ENa_strand strand = denseg.GetStrands()[seg * num_rows + row];
454 ret->SetStrand(strand);
455 }
456
457 loc->SetPacked_int().Set().push_back(ret);
458 }
459 if (!loc->IsPacked_int())
460 loc.Reset();
461 return loc;
462 }
463
464 // An ordered Seq-loc should altername between non-NULL and NULL locations
IsOrdered(const CSeq_loc & loc)465 bool CFeaturePropagator::IsOrdered(const CSeq_loc &loc)
466 {
467 if (loc.IsMix() && loc.GetMix().Get().size() > 1)
468 {
469 bool should_be_null = false;
470 for (const auto& it : loc.GetMix().Get())
471 {
472 if (it->IsNull() != should_be_null)
473 return false;
474 should_be_null = !should_be_null;
475 }
476 return should_be_null;
477 }
478 return false;
479 }
480
MakeOrdered(const CSeq_loc & loc)481 CRef<CSeq_loc> CFeaturePropagator::MakeOrdered(const CSeq_loc &loc)
482 {
483 CRef<CSeq_loc> mix(new CSeq_loc());
484 for (const auto& it : loc.GetMix().Get())
485 {
486 mix->SetMix().Set().push_back(it);
487 CRef<CSeq_loc> null_loc(new CSeq_loc());
488 null_loc->CSeq_loc_Base::SetNull();
489 mix->SetMix().Set().push_back(null_loc);
490 }
491 if (mix->IsMix() && mix->GetMix().IsSet() && !mix->GetMix().Get().empty() && mix->GetMix().Get().back()->IsNull())
492 {
493 mix->SetMix().Set().pop_back();
494 }
495 return mix;
496 }
497
x_MapLocation(const CSeq_loc & sourceLoc,const CSeq_id & targetId)498 CRef<CSeq_loc> CFeaturePropagator::x_MapLocation(const CSeq_loc& sourceLoc, const CSeq_id& targetId)
499 {
500 // string label;
501 // targetId.GetLabel(&label);
502 CRef<CSeq_loc> target;
503 if (!m_Alignment)
504 return target;
505 bool partial5 = sourceLoc.IsPartialStart(eExtreme_Positional);
506 bool partial3 = sourceLoc.IsPartialStop(eExtreme_Positional);
507
508 CDense_seg::TDim source_row = FindRow(*m_Alignment, m_Src);
509 CDense_seg::TDim target_row = FindRow(*m_Alignment, m_Target);
510 if (source_row == -1 || target_row == -1)
511 {
512 return target;
513 }
514 CRef<CSeq_loc> new_loc(new CSeq_loc);
515 new_loc->Assign(sourceLoc);
516 if (!m_ExpandOverGaps)
517 {
518 CRef<CSeq_loc> mapped_loc = CreateRowSeq_loc(*m_Alignment, source_row);
519 if (!mapped_loc)
520 return target;
521 const TSeqPos start_before = new_loc->GetStart(eExtreme_Biological);
522 const TSeqPos stop_before = new_loc->GetStop(eExtreme_Biological);
523 new_loc = mapped_loc->Intersect(*new_loc, 0, &m_synonym_mapper);
524 const TSeqPos start_after = new_loc->GetStart(eExtreme_Biological);
525 const TSeqPos stop_after = new_loc->GetStop(eExtreme_Biological);
526 if (start_before != start_after)
527 partial5 = true;
528 if (stop_before != stop_after)
529 partial3 = true;
530 if (sourceLoc.IsSetStrand()) // Intersect eats up the strand information for some reason
531 new_loc->SetStrand(sourceLoc.GetStrand());
532 }
533 TSignedSeqPos seq_start = m_Src.GetRangeSeq_loc(0,0)->GetStart(objects::eExtreme_Positional);
534 new_loc->SetId(targetId);
535 CSeq_loc_I loc_it(*new_loc);
536 while(loc_it)
537 {
538 if (loc_it.IsEmpty())
539 {
540 loc_it.Delete();
541 continue;
542 }
543 CSeq_loc_CI::TRange range = loc_it.GetRange();
544 TSignedSeqPos start = range.GetFrom() - seq_start;
545 TSignedSeqPos stop = range.GetTo() - seq_start;
546 ENa_strand strand = loc_it.GetStrand();
547 bool sub_partial5(false), sub_partial3(false);
548 TSignedSeqPos align_start = -1, align_stop = -1;
549 try
550 {
551 align_start = SeqPosToAlignPos(start, source_row, true, sub_partial5, sub_partial3);
552 align_stop = SeqPosToAlignPos(stop, source_row, false, sub_partial5, sub_partial3);
553 } catch (const CException &e)
554 {
555 if (m_MessageListener) {
556 m_MessageListener->PostMessage(
557 CMessage_Basic(e.what(),
558 eDiag_Error,
559 eFeaturePropagationProblem_FeatureLocation));
560 }
561 return target;
562 }
563 if (align_start < 0 || align_stop < 0)
564 {
565 if (loc_it.GetPos() == 0)
566 {
567 if (IsReverse(strand))
568 partial3 = true;
569 else
570 partial5 = true;
571 }
572 if (loc_it.GetPos() == loc_it.GetSize() - 1)
573 {
574 if (IsReverse(strand))
575 partial5 = true;
576 else
577 partial3 = true;
578 }
579 loc_it.Delete();
580 continue;
581 }
582 TSignedSeqPos new_start = -1, new_stop = -1;
583 try
584 {
585 new_start = AlignPosToSeqPos(align_start, target_row, true, sub_partial5, sub_partial3);
586 new_stop = AlignPosToSeqPos(align_stop, target_row, false, sub_partial5, sub_partial3);
587 }
588 catch(const CException &e)
589 {
590 if (m_MessageListener) {
591 m_MessageListener->PostMessage(
592 CMessage_Basic(e.what(),
593 eDiag_Error,
594 eFeaturePropagationProblem_FeatureLocation));
595 }
596 return target;
597 }
598 if (new_start < 0 || new_stop < 0 || new_stop < new_start)
599 {
600 if (loc_it.GetPos() == 0)
601 {
602 if (IsReverse(strand))
603 partial3 = true;
604 else
605 partial5 = true;
606 }
607 if (loc_it.GetPos() == loc_it.GetSize() - 1)
608 {
609 if (IsReverse(strand))
610 partial5 = true;
611 else
612 partial3 = true;
613 }
614 loc_it.Delete();
615 continue;
616 }
617 if (sub_partial5 && loc_it.GetPos() == 0 && !IsReverse(strand))
618 partial5 = true;
619 if (sub_partial5 && loc_it.GetPos() == loc_it.GetSize() - 1 && IsReverse(strand))
620 partial5 = true;
621 if (sub_partial3 && loc_it.GetPos() == loc_it.GetSize() - 1 && !IsReverse(strand))
622 partial3 = true;
623 if (sub_partial3 && loc_it.GetPos() == 0 && IsReverse(strand))
624 partial3 = true;
625 loc_it.SetFrom(new_start);
626 loc_it.SetTo(new_stop);
627 ++loc_it;
628 }
629 target = loc_it.MakeSeq_loc();
630 if (!m_ExpandOverGaps && target)
631 {
632 CRef<CSeq_loc> mapped_loc = CreateRowSeq_loc(*m_Alignment, target_row);
633 if (!mapped_loc)
634 {
635 target.Reset();
636 return target;
637 }
638 target = mapped_loc->Intersect(*target, 0, &m_synonym_mapper);
639 if (sourceLoc.IsSetStrand()) // Intersect eats up the strand information for some reason
640 target->SetStrand(sourceLoc.GetStrand());
641 }
642 if (target && (target->IsNull() || target->IsEmpty()
643 || (target->IsMix() && (!target->GetMix().IsSet() || target->GetMix().Get().empty()))
644 || (target->IsPacked_int() && (!target->GetPacked_int().IsSet() || target->GetPacked_int().Get().empty()))
645 || (target->IsPacked_pnt() && (!target->GetPacked_pnt().IsSetPoints() || target->GetPacked_pnt().GetPoints().empty())) ))
646 {
647 target.Reset();
648 }
649 if (m_MergeAbutting && target)
650 {
651 target = target->Merge(CSeq_loc::fMerge_All, nullptr);
652 }
653 if (partial5 && target)
654 {
655 target->SetPartialStart(true, eExtreme_Positional);
656 }
657 if (partial3 && target)
658 {
659 target->SetPartialStop(true, eExtreme_Positional);
660 }
661 if (sourceLoc.IsMix() && target && target->IsPacked_int())
662 {
663 target->ChangeToMix();
664 }
665 if (target && target->IsMix() && target->GetMix().IsSet() && target->GetMix().Get().size() > 1
666 && sourceLoc.IsMix() && IsOrdered(sourceLoc))
667 {
668 target = MakeOrdered(*target);
669 }
670 return target;
671 }
672
x_TruncateToStopCodon(const CSeq_loc & loc,unsigned int truncLen)673 CRef<CSeq_loc> CFeaturePropagator::x_TruncateToStopCodon(const CSeq_loc& loc, unsigned int truncLen)
674 {
675 CRef<CSeq_loc> pTruncated;
676
677 unsigned int curLen = 0;
678 for (CSeq_loc_CI it(loc); it && curLen < truncLen; ++it) {
679 unsigned int thisLen = it.GetRange().GetLength();
680 CConstRef<CSeq_loc> pThisLoc = it.GetRangeAsSeq_loc();
681 if (curLen + thisLen <= truncLen) {
682 if (pTruncated) {
683 pTruncated->Add(*pThisLoc);
684 } else {
685 pTruncated.Reset(new CSeq_loc());
686 pTruncated->Assign(*pThisLoc);
687 }
688 curLen += thisLen;
689 } else {
690 CRef<CSeq_loc> pPartialLoc(new CSeq_loc());
691 unsigned int missingLen = truncLen - curLen;
692 const TSeqPos start = pThisLoc->GetStart(eExtreme_Biological);
693 if (missingLen == 1) {
694 // make a point
695 pPartialLoc->SetPnt().SetPoint(start);
696 } else {
697 // make an interval
698 if (pThisLoc->IsSetStrand() && pThisLoc->GetStrand() == eNa_strand_minus) {
699 pPartialLoc->SetInt().SetFrom(start - missingLen + 1);
700 pPartialLoc->SetInt().SetTo(start);
701 } else {
702 pPartialLoc->SetInt().SetFrom(start);
703 pPartialLoc->SetInt().SetTo(start + missingLen - 1);
704 }
705 }
706 pPartialLoc->SetId(*pThisLoc->GetId());
707 if (pThisLoc->IsSetStrand()) {
708 pPartialLoc->SetStrand(pThisLoc->GetStrand());
709 }
710 if (pTruncated) {
711 pTruncated->Add(*pPartialLoc);
712 } else {
713 pTruncated.Reset(new CSeq_loc());
714 pTruncated->Assign(*pPartialLoc);
715 }
716 curLen += missingLen;
717 }
718 }
719
720 return pTruncated;
721 }
722
x_ExtendToStopCodon(CSeq_feat & feat)723 CRef<CSeq_loc> CFeaturePropagator::x_ExtendToStopCodon (CSeq_feat& feat)
724 {
725 CRef<CSeq_loc> new_loc;
726
727 if (!feat.IsSetLocation()) {
728 return new_loc;
729 }
730 const CSeq_loc& loc = feat.GetLocation();
731
732 const CGenetic_code* code = nullptr;
733 if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.GetData().GetCdregion().IsSetCode()) {
734 code = &(feat.GetData().GetCdregion().GetCode());
735 }
736
737 const TSeqPos stop = loc.GetStop(eExtreme_Biological);
738 // figure out if we have a partial codon at the end
739 size_t orig_len = sequence::GetLength(loc, &m_Scope);
740 size_t nuc_len = orig_len;
741 if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.GetData().GetCdregion().IsSetFrame()) {
742 CCdregion::EFrame frame = feat.GetData().GetCdregion().GetFrame();
743 if (frame == CCdregion::eFrame_two) {
744 nuc_len -= 1;
745 } else if (frame == CCdregion::eFrame_three) {
746 nuc_len -= 2;
747 }
748 }
749 unsigned int mod = nuc_len % 3;
750 CRef<CSeq_loc> vector_loc(new CSeq_loc());
751 CSeq_interval& vector_int = vector_loc->SetInt();
752 vector_int.SetId().Assign(*(loc.GetId()));
753
754 if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
755 vector_int.SetFrom(0);
756 vector_int.SetTo(stop + mod - 1);
757 vector_loc->SetStrand(eNa_strand_minus);
758 } else {
759 vector_int.SetFrom(stop - mod + 1);
760 vector_int.SetTo(m_Target.GetInst_Length() - 1);
761 }
762
763 CSeqVector seq(*vector_loc, m_Scope, CBioseq_Handle::eCoding_Iupac);
764 // reserve our space
765 const unsigned int usable_size = seq.size();
766
767 // get appropriate translation table
768 const CTrans_table & tbl =
769 (code ? CGen_code_table::GetTransTable(*code) :
770 CGen_code_table::GetTransTable(1));
771
772 // main loop through bases
773 CSeqVector::const_iterator start = seq.begin();
774
775 int state = 0;
776 unsigned int prot_len = usable_size / 3;
777
778 for (unsigned int i = 0; i < prot_len; ++i) {
779 // loop through one codon at a time
780 for (size_t k = 0; k < 3; ++k, ++start) {
781 state = tbl.NextCodonState(state, *start);
782 }
783
784 if (tbl.GetCodonResidue (state) == '*') {
785 CSeq_loc_CI it(loc);
786 CSeq_loc_CI it_next = it;
787 ++it_next;
788 while (it_next) {
789 CConstRef<CSeq_loc> this_loc = it.GetRangeAsSeq_loc();
790 if (new_loc) {
791 new_loc->Add(*this_loc);
792 } else {
793 new_loc.Reset(new CSeq_loc());
794 new_loc->Assign(*this_loc);
795 }
796 it = it_next;
797 ++it_next;
798 }
799 CRef<CSeq_loc> last_interval(new CSeq_loc());
800 CConstRef<CSeq_loc> this_loc = it.GetRangeAsSeq_loc();
801 const TSeqPos this_start = this_loc->GetStart(eExtreme_Positional);
802 const TSeqPos this_stop = this_loc->GetStop(eExtreme_Positional);
803 unsigned int extension = ((i + 1) * 3) - mod;
804 last_interval->SetInt().SetId().Assign(*(this_loc->GetId()));
805 if (this_loc->IsSetStrand() && this_loc->GetStrand() == eNa_strand_minus) {
806 last_interval->SetStrand(eNa_strand_minus);
807 last_interval->SetInt().SetFrom(this_start - extension);
808 last_interval->SetInt().SetTo(this_stop);
809 } else {
810 last_interval->SetInt().SetFrom(this_start);
811 last_interval->SetInt().SetTo(this_stop + extension);
812 }
813
814 if (new_loc) {
815 new_loc->Add(*last_interval);
816 } else {
817 new_loc.Reset(new CSeq_loc());
818 new_loc->Assign(*last_interval);
819 }
820
821 return new_loc;
822 }
823 }
824
825 return new_loc;
826 }
827
828
x_PropagateCds(CSeq_feat & feat,const CSeq_id & targetId,bool origIsPartialStart)829 void CFeaturePropagator::x_PropagateCds(CSeq_feat& feat, const CSeq_id& targetId, bool origIsPartialStart)
830 {
831 bool ambiguous = false;
832 feat.SetData().SetCdregion().SetFrame(CSeqTranslator::FindBestFrame(feat, m_Scope, ambiguous));
833
834 x_CdsMapCodeBreaks(feat, targetId);
835 if (m_CdsStopAtStopCodon) {
836 x_CdsStopAtStopCodon(feat);
837 }
838 if (m_CdsCleanupPartials) {
839 x_CdsCleanupPartials(feat, origIsPartialStart);
840 }
841 }
842
843
x_CdsMapCodeBreaks(CSeq_feat & feat,const CSeq_id & targetId)844 void CFeaturePropagator::x_CdsMapCodeBreaks(CSeq_feat& feat, const CSeq_id& targetId)
845 {
846 CCdregion& cds = feat.SetData().SetCdregion();
847 if (cds.IsSetCode_break()) {
848 auto it = cds.SetCode_break().begin();
849 while (it != cds.SetCode_break().end()) {
850 bool remove = false;
851 if ((*it)->IsSetLoc()) {
852 const CSeq_loc& codebreak = (*it)->GetLoc();
853 CRef<CSeq_loc> new_codebreak = x_MapLocation(codebreak, targetId);
854 if (new_codebreak) {
855 (*it)->SetLoc(*new_codebreak);
856 } else {
857 if (m_MessageListener) {
858 string loc_label;
859 (*it)->GetLoc().GetLabel(&loc_label);
860 string target_label;
861 targetId.GetLabel(&target_label);
862 m_MessageListener->PostMessage(
863 CMessage_Basic("Unable to propagate location of translation exception " + loc_label + " to " + target_label,
864 eDiag_Error,
865 eFeaturePropagationProblem_CodeBreakLocation));
866 }
867 remove = true;
868 }
869 }
870 if (remove) {
871 it = cds.SetCode_break().erase(it);
872 } else {
873 ++it;
874 }
875 }
876 if (cds.GetCode_break().size() == 0) {
877 cds.ResetCode_break();
878 }
879 }
880 }
881
882
x_CdsCleanupPartials(CSeq_feat & cds,bool origIsPartialStart)883 void CFeaturePropagator::x_CdsCleanupPartials(CSeq_feat& cds, bool origIsPartialStart)
884 {
885 if (cds.GetData().GetSubtype() != CSeqFeatData::eSubtype_cdregion) {
886 return;
887 }
888
889 // assume that the original is marked correctly, i.e. if there is not fuzz-from
890 // then it is 5'complete and it there is no fuzz-to then it is 3' prime complete.
891 // no need to recheck whther there are actual start and stop codons.
892
893 // for the copies:
894 // the copy ends in a stop codon: it is 3' complete. Nothing needs to be done.
895 // the copy does not end in a stop codon. hence it is not 3' complete.
896 // the original is 5' incomplete: the copy is 5' incomplete
897 // the original is 5' complete, copy starts with a start codon: the copy is
898 // 5' complete
899
900 try {
901 string prot;
902 CSeqTranslator::Translate(cds, m_Scope, prot);
903
904 bool targetIsPartialStart = ! (NStr::StartsWith(prot, 'M')) || cds.GetLocation().IsPartialStart(eExtreme_Biological);
905 bool targetIsPartialStop = ! (NStr::EndsWith(prot, '*'));
906
907 if (!targetIsPartialStart && !origIsPartialStart) {
908 //propagated feature is 5' complete
909 cds.SetLocation().SetPartialStart(false, eExtreme_Biological);
910 }
911 else {
912 //propagated feature is 5' partial
913 cds.SetLocation().SetPartialStart(true, eExtreme_Biological);
914 }
915 if (!targetIsPartialStop) {
916 //propagated feature ends in stop codon hence is 3' complete
917 cds.SetLocation().SetPartialStop(false, eExtreme_Biological);
918 }
919 else {
920 //propagated feature is 3' partial
921 cds.SetLocation().SetPartialStop(true, eExtreme_Biological);
922 }
923
924 if (cds.GetLocation().IsPartialStart(eExtreme_Biological) ||
925 cds.GetLocation().IsPartialStop(eExtreme_Biological)) {
926 cds.SetPartial(true);
927 }
928 }
929 catch (...) {
930 }
931 }
932
933
x_CdsStopAtStopCodon(CSeq_feat & cds)934 void CFeaturePropagator::x_CdsStopAtStopCodon(CSeq_feat& cds)
935 {
936 if (cds.GetData().GetSubtype() != CSeqFeatData::eSubtype_cdregion) {
937 return;
938 }
939 CRef<CBioseq> pTranslated = CSeqTranslator::TranslateToProtein(cds, m_Scope);
940 if (!pTranslated) {
941 return;
942 }
943 CRef<CSeq_loc> pAdjustedLoc;
944 CSeqVector seqv(*pTranslated, &m_Scope, CBioseq_Handle::eCoding_Ncbi);
945 seqv.SetCoding(CSeq_data::e_Ncbieaa);
946 CSeqVector_CI it(seqv);
947 while (it && *it != '*') {
948 it++;
949 }
950 if (it) {
951 //found stop codon inside the given feature
952 unsigned int newSize = 3*(it.GetPos()+1);
953 if (cds.GetData().GetCdregion().IsSetFrame()) {
954 switch(cds.GetData().GetCdregion().GetFrame()) {
955 default:
956 break;
957 case CCdregion::eFrame_two:
958 newSize += 1;
959 break;
960 case CCdregion::eFrame_three:
961 newSize += 2;
962 break;
963 }
964 }
965 pAdjustedLoc = x_TruncateToStopCodon(cds.GetLocation(), newSize);
966 }
967 else {
968 pAdjustedLoc = x_ExtendToStopCodon(cds);
969 }
970 if (pAdjustedLoc) {
971 cds.SetLocation(*pAdjustedLoc);
972 }
973 }
974
975
x_PropagatetRNA(CSeq_feat & feat,const CSeq_id & targetId)976 void CFeaturePropagator::x_PropagatetRNA(CSeq_feat& feat, const CSeq_id& targetId)
977 {
978 if (feat.GetData().GetRna().IsSetExt()) {
979 const CRNA_ref::C_Ext& ext = feat.GetData().GetRna().GetExt();
980 if (ext.IsTRNA()) {
981 const CTrna_ext& trna_ext = ext.GetTRNA();
982 if (trna_ext.IsSetAnticodon()) {
983 const CSeq_loc& anticodon = trna_ext.GetAnticodon();
984 CRef<CSeq_loc> new_anticodon = x_MapLocation(anticodon, targetId);
985 if (new_anticodon) {
986 feat.SetData().SetRna().SetExt().SetTRNA().SetAnticodon(*new_anticodon);
987 } else {
988 if (m_MessageListener) {
989 string loc_label;
990 anticodon.GetLabel(&loc_label);
991 string target_label;
992 targetId.GetLabel(&target_label);
993 m_MessageListener->PostMessage(
994 CMessage_Basic("Unable to propagate location of anticodon " + loc_label + " to " + target_label,
995 eDiag_Error,
996 eFeaturePropagationProblem_AnticodonLocation));
997 }
998 feat.SetData().SetRna().SetExt().SetTRNA().ResetAnticodon();
999 }
1000 }
1001 }
1002 }
1003 }
1004
1005
ConstructProteinFeatureForPropagatedCodingRegion(const CSeq_feat & orig_cds,const CSeq_feat & new_cds)1006 CRef<CSeq_feat> CFeaturePropagator::ConstructProteinFeatureForPropagatedCodingRegion(const CSeq_feat& orig_cds, const CSeq_feat& new_cds)
1007 {
1008 if (!orig_cds.IsSetProduct()) {
1009 return CRef<CSeq_feat>(nullptr);
1010 }
1011 CRef<CSeq_feat> prot_feat;
1012 CBioseq_Handle prot = m_Scope.GetBioseqHandle(orig_cds.GetProduct());
1013 if (prot) {
1014 CFeat_CI pf(prot, CSeqFeatData::eSubtype_prot);
1015 if (pf) {
1016 prot_feat.Reset(new CSeq_feat());
1017 const CSeq_feat& orig_prot = *pf->GetOriginalSeq_feat();
1018 prot_feat->Assign(orig_prot);
1019
1020 if ((m_MaxFeatId && *m_MaxFeatId > 0) && (pf->GetOriginalSeq_feat()->IsSetId() || orig_cds.IsSetId())) {
1021 prot_feat->SetId().SetLocal().SetId(++(*m_MaxFeatId));
1022 // protein features don't have Xrefs
1023 }
1024 // fix partials
1025 prot_feat->SetLocation().SetPartialStart(new_cds.GetLocation().IsPartialStart(eExtreme_Biological), eExtreme_Biological);
1026 prot_feat->SetLocation().SetPartialStop(new_cds.GetLocation().IsPartialStop(eExtreme_Biological), eExtreme_Biological);
1027 prot_feat->SetPartial(prot_feat->GetLocation().IsPartialStart(eExtreme_Biological) ||
1028 prot_feat->GetLocation().IsPartialStop(eExtreme_Biological));
1029 if (new_cds.IsSetProduct() && !new_cds.GetProduct().GetId()->Equals(*(orig_cds.GetProduct().GetId()))) {
1030 prot_feat->SetLocation().SetId(new_cds.GetProduct().GetWhole());
1031 }
1032 }
1033 }
1034 return prot_feat;
1035 }
1036
1037 END_SCOPE(edit)
1038 END_SCOPE(objects)
1039 END_NCBI_SCOPE
1040
1041