1 /*  $Id: feature_propagate.cpp 632623 2021-06-03 17:38:11Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin, based on work done by Frank Ludwig
27 *
28 * File Description:
29 *   functions for propagating a feature using an alignment
30 */
31 #include <ncbi_pch.hpp>
32 #include <corelib/ncbistd.hpp>
33 
34 #include <objects/seqfeat/Cdregion.hpp>
35 #include <objects/seqfeat/RNA_ref.hpp>
36 #include <objects/seqfeat/Trna_ext.hpp>
37 #include <objects/seqfeat/Code_break.hpp>
38 #include <objects/seqfeat/Genetic_code_table.hpp>
39 #include <objects/seqfeat/Feat_id.hpp>
40 #include <util/range_coll.hpp>
41 #include <objects/seqalign/Spliced_seg.hpp>
42 
43 #include <objmgr/util/seq_loc_util.hpp>
44 #include <objmgr/seq_vector.hpp>
45 #include <objmgr/seq_vector_ci.hpp>
46 
47 #include <objtools/edit/cds_fix.hpp>
48 #include <objtools/edit/feature_propagate.hpp>
49 
50 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)51 BEGIN_SCOPE(objects)
52 BEGIN_SCOPE(edit)
53 
54 
55 CFeaturePropagator::CFeaturePropagator
56 (CBioseq_Handle src, CBioseq_Handle target,
57  const CSeq_align& align,
58  bool stop_at_stop, bool cleanup_partials, bool merge_abutting,
59  CMessageListener_Basic* pMessageListener, CObject_id::TId* feat_id)
60 :     m_Src(src), m_Target(target),
61     m_Scope(m_Target.GetScope()),
62     m_CdsStopAtStopCodon(stop_at_stop),
63     m_CdsCleanupPartials(cleanup_partials),
64     m_MessageListener(pMessageListener),
65     m_MaxFeatId(feat_id),
66     m_MergeAbutting(merge_abutting),
67       m_ExpandOverGaps(true),
68       m_synonym_mapper(this)
69 {
70     if (align.GetSegs().IsDenseg()) {
71         m_Alignment.Reset(&align);
72     } else if (align.GetSegs().IsDisc()) {
73         m_Alignment.Reset(&*align.CreateDensegFromDisc());
74     } else if (align.GetSegs().IsStd()) {
75         m_Alignment.Reset(&*align.CreateDensegFromStdseg());
76     } else if (align.GetSegs().IsSpliced()) {
77         CRef<CSeq_align> disc_seg = align.GetSegs().GetSpliced().AsDiscSeg();
78         m_Alignment.Reset(&*disc_seg->CreateDensegFromDisc());
79 //    } else if (align.GetSegs().IsDendiag()) {
80 //        m_Alignment.Reset(&*align_format::CAlignFormatUtil::CreateDensegFromDendiag(align));
81     } else {
82         if (m_MessageListener) {
83             m_MessageListener->PostMessage(
84                 CMessage_Basic("Unsupported alignment type",
85                                eDiag_Error,
86                                eFeaturePropagationProblem_FeatureLocation));
87         }
88         m_Alignment.Reset();
89     }
90 }
91 
CFeaturePropagator(CBioseq_Handle src,CBioseq_Handle target,const CSeq_align & align,bool stop_at_stop,bool cleanup_partials,bool merge_abutting,bool expand_over_gaps,CMessageListener_Basic * pMessageListener,CObject_id::TId * feat_id)92 CFeaturePropagator::CFeaturePropagator
93 (CBioseq_Handle src, CBioseq_Handle target,
94  const CSeq_align& align,
95  bool stop_at_stop, bool cleanup_partials, bool merge_abutting, bool expand_over_gaps,
96  CMessageListener_Basic* pMessageListener, CObject_id::TId* feat_id)
97     :     m_Src(src), m_Target(target),
98           m_Scope(m_Target.GetScope()),
99           m_CdsStopAtStopCodon(stop_at_stop),
100           m_CdsCleanupPartials(cleanup_partials),
101           m_MessageListener(pMessageListener),
102           m_MaxFeatId(feat_id),
103           m_MergeAbutting(merge_abutting),
104     m_ExpandOverGaps(expand_over_gaps),
105     m_synonym_mapper(this)
106 {
107     if (align.GetSegs().IsDenseg()) {
108         m_Alignment.Reset(&align);
109     } else if (align.GetSegs().IsDisc()) {
110         m_Alignment.Reset(&*align.CreateDensegFromDisc());
111     } else if (align.GetSegs().IsStd()) {
112         m_Alignment.Reset(&*align.CreateDensegFromStdseg());
113     } else if (align.GetSegs().IsSpliced()) {
114         CRef<CSeq_align> disc_seg = align.GetSegs().GetSpliced().AsDiscSeg();
115         m_Alignment.Reset(&*disc_seg->CreateDensegFromDisc());
116 //    } else if (align.GetSegs().IsDendiag()) {
117 //        m_Alignment.Reset(&*align_format::CAlignFormatUtil::CreateDensegFromDendiag(align));
118     } else {
119         if (m_MessageListener) {
120             m_MessageListener->PostMessage(
121                 CMessage_Basic("Unsupported alignment type",
122                                eDiag_Error,
123                                eFeaturePropagationProblem_FeatureLocation));
124         }
125         m_Alignment.Reset();
126     }
127 }
128 
Propagate(const CSeq_feat & orig_feat)129 CRef<CSeq_feat> CFeaturePropagator::Propagate(const CSeq_feat& orig_feat)
130 {
131     CRef<CSeq_feat> rval(new CSeq_feat());
132     rval->Assign(orig_feat);
133 
134     if (rval->IsSetId() && (m_MaxFeatId && *m_MaxFeatId > 0)) {
135         rval->SetId().SetLocal().SetId(++(*m_MaxFeatId));
136         m_FeatIdMap.emplace(orig_feat.GetId().GetLocal().GetId(), rval->GetId().GetLocal().GetId());
137     }
138 
139     // propagate feature location
140     CSeq_id_Handle best_idh = sequence::GetId(m_Target, sequence::eGetId_Best);
141     CConstRef<CSeq_id> pTargetId = best_idh.GetSeqId();
142     CRef<CSeq_loc> new_loc = x_MapLocation(orig_feat.GetLocation(), *pTargetId);
143     if (!new_loc) {
144         if (m_MessageListener) {
145             string loc_label;
146             orig_feat.GetLocation().GetLabel(&loc_label);
147             string target_label;
148             pTargetId->GetLabel(&target_label);
149             m_MessageListener->PostMessage(
150                 CMessage_Basic("Unable to propagate location of feature " + loc_label + " to " + target_label,
151                                eDiag_Error,
152                                eFeaturePropagationProblem_FeatureLocation));
153         }
154         rval.Reset(nullptr);
155         return rval;
156     }
157 
158     rval->SetLocation().Assign(*new_loc);
159     rval->ResetPartial();
160     rval->ResetProduct();
161     if (new_loc->IsPartialStart(eExtreme_Biological) || new_loc->IsPartialStop(eExtreme_Biological)) {
162         rval->SetPartial(true);
163     }
164 
165     const CSeq_loc& location = orig_feat.GetLocation();
166     bool origIsPartialStart = location.IsPartialStart(eExtreme_Biological);
167 
168     // depending on feature type, propagate locations in data
169     switch(orig_feat.GetData().GetSubtype()) {
170     case CSeqFeatData::eSubtype_cdregion:
171         x_PropagateCds(*rval, *pTargetId, origIsPartialStart);
172         break;
173     case CSeqFeatData::eSubtype_tRNA:
174         x_PropagatetRNA(*rval, *pTargetId);
175         break;
176     default:
177         break;
178     }
179 
180     return rval;
181 }
182 
183 
PropagateAll()184 vector<CRef<CSeq_feat> > CFeaturePropagator::PropagateAll()
185 {
186     vector<CRef<CSeq_feat> > rval;
187     CFeat_CI fi(m_Src);
188     while (fi) {
189         CRef<CSeq_feat> new_feat = Propagate(*(fi->GetOriginalSeq_feat()));
190         if (new_feat) {
191             rval.push_back(new_feat);
192         }
193         ++fi;
194     }
195     return rval;
196 }
197 
PropagateAllReportFailures(vector<CConstRef<CSeq_feat>> & failures)198 vector<CRef<CSeq_feat> > CFeaturePropagator::PropagateAllReportFailures(vector<CConstRef<CSeq_feat> >& failures)
199 {
200     vector<CRef<CSeq_feat> > rval;
201     CFeat_CI fi(m_Src);
202     while (fi) {
203         auto old_feat = fi->GetOriginalSeq_feat();
204         CRef<CSeq_feat> new_feat = Propagate(*old_feat);
205         if (new_feat) {
206             rval.push_back(new_feat);
207         }
208         else {
209             failures.push_back(old_feat);
210         }
211         ++fi;
212     }
213     return rval;
214 }
215 
PropagateFeatureList(const vector<CConstRef<CSeq_feat>> & orig_feats)216 vector<CRef<CSeq_feat>> CFeaturePropagator::PropagateFeatureList(const vector<CConstRef<CSeq_feat>>& orig_feats)
217 {
218     vector<CRef<CSeq_feat>> propagated_feats;
219 
220     for (auto&& it : orig_feats) {
221         CRef<CSeq_feat> new_feat = Propagate(*it);
222         if (new_feat) {
223             propagated_feats.push_back(new_feat);
224             if (new_feat->IsSetData() && new_feat->GetData().IsCdregion()) {
225                 if (it->IsSetProduct()) {
226                     CRef<CSeq_feat> prot_feat = ConstructProteinFeatureForPropagatedCodingRegion(*it, *new_feat);
227                     if (prot_feat) {
228                         propagated_feats.push_back(prot_feat);
229                     }
230                 }
231             }
232         }
233     }
234 
235     if (!m_FeatIdMap.empty()) {
236         for (auto&& it : propagated_feats) {
237             if (it->IsSetXref()) {
238                 auto xref_it = it->SetXref().begin();
239                 while (xref_it != it->SetXref().end()) {
240                     if ((*xref_it)->IsSetId()) {
241                         CFeat_id& feat_id = (*xref_it)->SetId();
242                         if (feat_id.IsLocal() && feat_id.GetLocal().IsId()) {
243                             auto orig_featid = m_FeatIdMap.find(feat_id.GetLocal().GetId());
244                             if (orig_featid != m_FeatIdMap.end()) {
245                                 feat_id.SetLocal().SetId(orig_featid->second);
246                             }
247                             else {
248                                 // it hasn't been propagated yet or it will not be propagated
249                                 xref_it = it->SetXref().erase(xref_it);
250                                 continue;
251                             }
252                         }
253                     }
254                     ++xref_it;
255                 }
256                 if (it->GetXref().empty()) {
257                     it->ResetXref();
258                 }
259             }
260         }
261     }
262 
263     return propagated_feats;
264 }
265 
SeqPosToAlignPos(TSignedSeqPos pos,CDense_seg::TDim row,bool left,bool & partial5,bool & partial3)266 TSignedSeqPos CFeaturePropagator::SeqPosToAlignPos(TSignedSeqPos pos, CDense_seg::TDim row, bool left, bool &partial5, bool &partial3)
267 {
268     TSignedSeqPos res = -1;
269     const CDense_seg& denseg = m_Alignment->GetSegs().GetDenseg();
270     const CSeq_id& id = denseg.GetSeq_id(row);
271     CBioseq_Handle bsh = m_Scope.GetBioseqHandle(id);
272     if (!bsh)
273         return -1;
274     //TSignedSeqPos length = bsh.GetBioseqLength();
275     CDense_seg::TNumseg num_segs = denseg.GetNumseg();
276     CDense_seg::TDim num_rows = denseg.GetDim();
277     CDense_seg::TNumseg seg = 0;
278     TSignedSeqPos total_len = 0;
279     TSignedSeqPos found_len = -1;
280     bool found = false;
281     while ( seg < num_segs )
282     {
283         TSignedSeqPos start = denseg.GetStarts()[seg * num_rows + row];
284         TSignedSeqPos len   = denseg.GetLens()[seg];
285         ENa_strand strand = eNa_strand_plus;
286         if (denseg.IsSetStrands())
287             strand = denseg.GetStrands()[seg * num_rows + row];
288         if (strand == eNa_strand_minus)
289         {
290             NCBI_THROW(CException, eUnknown, "Cannot propagate through alignment on negative strand");
291         }
292         if (start >= 0 && pos >= start && pos < start + len)
293         {
294             res = total_len + pos - start;
295             found = true;
296             break;
297         }
298         if (start >= 0 && start > pos && left)
299         {
300             res = total_len;
301             partial5 = true;
302             found = true;
303             break;
304         }
305         if (start >= 0 && start + len - 1 < pos && !left)
306         {
307             found_len = total_len + len - 1;
308         }
309         total_len += len;
310         ++seg;
311     }
312     if (!found)
313     {
314         res = found_len;
315         if (!left)
316             partial3 = true;
317     }
318 
319     return res;
320 }
321 
AlignPosToSeqPos(TSignedSeqPos pos,CDense_seg::TDim row,bool left,bool & partial5,bool & partial3)322 TSignedSeqPos CFeaturePropagator::AlignPosToSeqPos(TSignedSeqPos pos, CDense_seg::TDim row, bool left, bool &partial5, bool &partial3)
323 {
324     TSignedSeqPos res = -1;
325     const CDense_seg& denseg = m_Alignment->GetSegs().GetDenseg();
326     const CSeq_id& id = denseg.GetSeq_id(row);
327     CBioseq_Handle bsh = m_Scope.GetBioseqHandle(id);
328     if (!bsh)
329         return -1;
330     //TSignedSeqPos length = bsh.GetBioseqLength();
331     CDense_seg::TNumseg num_segs = denseg.GetNumseg();
332     CDense_seg::TDim num_rows = denseg.GetDim();
333     CDense_seg::TNumseg seg = 0;
334     TSignedSeqPos total_len = 0;
335     TSignedSeqPos found_seg = -1;
336     while ( seg < num_segs )
337     {
338         TSignedSeqPos start = denseg.GetStarts()[seg * num_rows + row];
339         TSignedSeqPos len   = denseg.GetLens()[seg];
340         ENa_strand strand = eNa_strand_plus;
341         if (denseg.IsSetStrands())
342             strand = denseg.GetStrands()[seg * num_rows + row];
343         if (strand == eNa_strand_minus)
344         {
345             NCBI_THROW(CException, eUnknown, "Cannot propagate through alignment on negative strand");
346         }
347         if (total_len <= pos && total_len + len > pos)
348         {
349             if (start >= 0)
350             {
351                 res = start + pos - total_len;
352                 break;
353             }
354             else
355             {
356                 found_seg = seg;
357                 break;
358             }
359         }
360         total_len += len;
361         ++seg;
362     }
363     if (found_seg >= 0)
364     {
365         seg = found_seg;
366         if (left)
367         {
368             ++seg;
369             while ( seg < num_segs )
370             {
371                 TSignedSeqPos start = denseg.GetStarts()[seg * num_rows + row];
372                 //TSignedSeqPos len   = denseg.GetLens()[seg];
373                 ENa_strand strand = eNa_strand_plus;
374                 if (denseg.IsSetStrands())
375                     strand = denseg.GetStrands()[seg * num_rows + row];
376                 if (strand == eNa_strand_minus)
377                 {
378                     NCBI_THROW(CException, eUnknown, "Cannot propagate through alignment on negative strand");
379                 }
380                 if (start >= 0)
381                 {
382                     res = start;
383                     break;
384                 }
385                 ++seg;
386             }
387             partial5 = true;
388         }
389         else
390         {
391             while (seg > 0)
392             {
393                 --seg;
394                 TSignedSeqPos start = denseg.GetStarts()[seg * num_rows + row];
395                 TSignedSeqPos len   = denseg.GetLens()[seg];
396                 ENa_strand strand = eNa_strand_plus;
397                 if (denseg.IsSetStrands())
398                     strand = denseg.GetStrands()[seg * num_rows + row];
399                 if (strand == eNa_strand_minus)
400                 {
401                     NCBI_THROW(CException, eUnknown, "Cannot propagate through alignment on negative strand");
402                 }
403                 if (start >= 0)
404                 {
405                     res = start + len - 1;
406                     break;
407                 }
408             }
409             partial3 = true;
410         }
411     }
412     return res;
413 }
414 
FindRow(const CSeq_align & align,CBioseq_Handle bsh)415 CDense_seg::TDim  CFeaturePropagator::FindRow(const CSeq_align& align, CBioseq_Handle bsh)
416 {
417     const CDense_seg& denseg = align.GetSegs().GetDenseg();
418     CDense_seg::TDim num_rows = denseg.GetDim();
419     for (CDense_seg::TDim  i = 0; i < num_rows; i++)
420     {
421         const CSeq_id& id = denseg.GetSeq_id(i);
422         CBioseq_Handle bsh2 = m_Scope.GetBioseqHandle(id);
423         if (bsh == bsh2)
424         {
425             return i;
426         }
427     }
428     return -1;
429 }
430 
431 // For reasons unknown CSeq_align::CreateRowSeq_loc is returning only a single interval for Dense-seg alignments
432 // Therefore we need to roll out our own version which creates a real multi-interval location.
CreateRowSeq_loc(const CSeq_align & align,CDense_seg::TDim row)433 CRef<CSeq_loc>  CFeaturePropagator::CreateRowSeq_loc(const CSeq_align& align, CDense_seg::TDim  row)
434 {
435     CRef<CSeq_loc> loc(new CSeq_loc);
436     const CDense_seg& denseg = align.GetSegs().GetDenseg();
437     const CSeq_id& id = denseg.GetSeq_id(row);
438     CDense_seg::TNumseg num_segs = denseg.GetNumseg();
439     CDense_seg::TDim num_rows = denseg.GetDim();
440     for (int seg = 0; seg < num_segs; seg++)
441     {
442         TSignedSeqPos start = denseg.GetStarts()[seg*num_rows + row];
443         if (start < 0)
444             continue;
445 
446         TSignedSeqPos len = denseg.GetLens()[seg];
447         CRef<CSeq_interval> ret(new CSeq_interval);
448         ret->SetId().Assign(id);
449         ret->SetFrom(start);
450         ret->SetTo(start + len - 1);
451         if ( denseg.IsSetStrands() )
452         {
453            ENa_strand  strand = denseg.GetStrands()[seg * num_rows + row];
454            ret->SetStrand(strand);
455         }
456 
457         loc->SetPacked_int().Set().push_back(ret);
458     }
459     if (!loc->IsPacked_int())
460         loc.Reset();
461     return loc;
462 }
463 
464 // An ordered Seq-loc should altername between non-NULL and NULL locations
IsOrdered(const CSeq_loc & loc)465 bool CFeaturePropagator::IsOrdered(const CSeq_loc &loc)
466 {
467     if (loc.IsMix() && loc.GetMix().Get().size() > 1)
468     {
469         bool should_be_null = false;
470         for (const auto& it : loc.GetMix().Get())
471         {
472             if (it->IsNull() != should_be_null)
473                 return false;
474             should_be_null = !should_be_null;
475         }
476         return should_be_null;
477     }
478     return false;
479 }
480 
MakeOrdered(const CSeq_loc & loc)481 CRef<CSeq_loc> CFeaturePropagator::MakeOrdered(const CSeq_loc &loc)
482 {
483     CRef<CSeq_loc> mix(new CSeq_loc());
484     for (const auto& it : loc.GetMix().Get())
485     {
486         mix->SetMix().Set().push_back(it);
487         CRef<CSeq_loc> null_loc(new CSeq_loc());
488         null_loc->CSeq_loc_Base::SetNull();
489         mix->SetMix().Set().push_back(null_loc);
490     }
491     if (mix->IsMix() && mix->GetMix().IsSet() && !mix->GetMix().Get().empty() && mix->GetMix().Get().back()->IsNull())
492     {
493         mix->SetMix().Set().pop_back();
494     }
495     return mix;
496 }
497 
x_MapLocation(const CSeq_loc & sourceLoc,const CSeq_id & targetId)498 CRef<CSeq_loc> CFeaturePropagator::x_MapLocation(const CSeq_loc& sourceLoc, const CSeq_id& targetId)
499 {
500 //    string label;
501 //    targetId.GetLabel(&label);
502     CRef<CSeq_loc> target;
503     if (!m_Alignment)
504         return target;
505     bool partial5 = sourceLoc.IsPartialStart(eExtreme_Positional);
506     bool partial3 = sourceLoc.IsPartialStop(eExtreme_Positional);
507 
508     CDense_seg::TDim  source_row = FindRow(*m_Alignment, m_Src);
509     CDense_seg::TDim  target_row = FindRow(*m_Alignment, m_Target);
510     if (source_row == -1 || target_row == -1)
511     {
512         return target;
513     }
514     CRef<CSeq_loc> new_loc(new CSeq_loc);
515     new_loc->Assign(sourceLoc);
516     if (!m_ExpandOverGaps)
517     {
518         CRef<CSeq_loc> mapped_loc = CreateRowSeq_loc(*m_Alignment, source_row);
519         if (!mapped_loc)
520             return target;
521         const TSeqPos start_before = new_loc->GetStart(eExtreme_Biological);
522         const TSeqPos stop_before = new_loc->GetStop(eExtreme_Biological);
523         new_loc = mapped_loc->Intersect(*new_loc, 0, &m_synonym_mapper);
524         const TSeqPos start_after = new_loc->GetStart(eExtreme_Biological);
525         const TSeqPos stop_after = new_loc->GetStop(eExtreme_Biological);
526         if (start_before != start_after)
527             partial5 = true;
528         if (stop_before != stop_after)
529             partial3 = true;
530         if (sourceLoc.IsSetStrand()) // Intersect eats up the strand information for some reason
531             new_loc->SetStrand(sourceLoc.GetStrand());
532     }
533     TSignedSeqPos seq_start = m_Src.GetRangeSeq_loc(0,0)->GetStart(objects::eExtreme_Positional);
534     new_loc->SetId(targetId);
535     CSeq_loc_I loc_it(*new_loc);
536     while(loc_it)
537     {
538         if (loc_it.IsEmpty())
539         {
540             loc_it.Delete();
541             continue;
542         }
543         CSeq_loc_CI::TRange range = loc_it.GetRange();
544         TSignedSeqPos start = range.GetFrom() - seq_start;
545         TSignedSeqPos stop = range.GetTo() - seq_start;
546         ENa_strand strand = loc_it.GetStrand();
547         bool sub_partial5(false), sub_partial3(false);
548         TSignedSeqPos align_start = -1,  align_stop = -1;
549         try
550         {
551             align_start = SeqPosToAlignPos(start, source_row, true, sub_partial5, sub_partial3);
552             align_stop = SeqPosToAlignPos(stop, source_row, false, sub_partial5, sub_partial3);
553         } catch (const CException &e)
554         {
555             if (m_MessageListener) {
556                 m_MessageListener->PostMessage(
557                     CMessage_Basic(e.what(),
558                                    eDiag_Error,
559                                    eFeaturePropagationProblem_FeatureLocation));
560             }
561             return target;
562         }
563         if (align_start < 0 || align_stop < 0)
564         {
565             if (loc_it.GetPos() == 0)
566             {
567                 if (IsReverse(strand))
568                     partial3 = true;
569                 else
570                     partial5 = true;
571             }
572             if (loc_it.GetPos() == loc_it.GetSize() - 1)
573             {
574                  if (IsReverse(strand))
575                      partial5 = true;
576                  else
577                      partial3 = true;
578             }
579             loc_it.Delete();
580             continue;
581         }
582         TSignedSeqPos new_start = -1, new_stop = -1;
583         try
584         {
585             new_start = AlignPosToSeqPos(align_start, target_row, true, sub_partial5, sub_partial3);
586             new_stop = AlignPosToSeqPos(align_stop, target_row, false, sub_partial5, sub_partial3);
587         }
588         catch(const CException &e)
589         {
590             if (m_MessageListener) {
591                 m_MessageListener->PostMessage(
592                     CMessage_Basic(e.what(),
593                                    eDiag_Error,
594                                    eFeaturePropagationProblem_FeatureLocation));
595             }
596             return target;
597         }
598         if (new_start < 0 || new_stop < 0 || new_stop < new_start)
599         {
600             if (loc_it.GetPos() == 0)
601             {
602                 if (IsReverse(strand))
603                     partial3 = true;
604                 else
605                     partial5 = true;
606             }
607             if (loc_it.GetPos() == loc_it.GetSize() - 1)
608             {
609                 if (IsReverse(strand))
610                     partial5 = true;
611                 else
612                     partial3 = true;
613             }
614             loc_it.Delete();
615             continue;
616         }
617         if (sub_partial5 && loc_it.GetPos() == 0 && !IsReverse(strand))
618             partial5 = true;
619         if (sub_partial5 && loc_it.GetPos() == loc_it.GetSize() - 1 && IsReverse(strand))
620             partial5 = true;
621         if (sub_partial3 && loc_it.GetPos() == loc_it.GetSize() - 1 && !IsReverse(strand))
622             partial3 = true;
623         if (sub_partial3 && loc_it.GetPos() == 0 && IsReverse(strand))
624             partial3 = true;
625         loc_it.SetFrom(new_start);
626         loc_it.SetTo(new_stop);
627         ++loc_it;
628     }
629     target = loc_it.MakeSeq_loc();
630     if (!m_ExpandOverGaps && target)
631     {
632         CRef<CSeq_loc> mapped_loc = CreateRowSeq_loc(*m_Alignment, target_row);
633         if (!mapped_loc)
634         {
635             target.Reset();
636             return target;
637         }
638         target = mapped_loc->Intersect(*target, 0, &m_synonym_mapper);
639         if (sourceLoc.IsSetStrand()) // Intersect eats up the strand information for some reason
640             target->SetStrand(sourceLoc.GetStrand());
641     }
642     if (target && (target->IsNull() || target->IsEmpty()
643                    || (target->IsMix() && (!target->GetMix().IsSet() || target->GetMix().Get().empty()))
644                    || (target->IsPacked_int() && (!target->GetPacked_int().IsSet() || target->GetPacked_int().Get().empty()))
645                    || (target->IsPacked_pnt() && (!target->GetPacked_pnt().IsSetPoints() || target->GetPacked_pnt().GetPoints().empty())) ))
646     {
647         target.Reset();
648     }
649     if (m_MergeAbutting && target)
650     {
651         target = target->Merge(CSeq_loc::fMerge_All, nullptr);
652     }
653     if (partial5 && target)
654     {
655         target->SetPartialStart(true, eExtreme_Positional);
656     }
657     if (partial3 && target)
658     {
659         target->SetPartialStop(true, eExtreme_Positional);
660     }
661     if (sourceLoc.IsMix() && target && target->IsPacked_int())
662     {
663         target->ChangeToMix();
664     }
665     if (target && target->IsMix() && target->GetMix().IsSet() && target->GetMix().Get().size() > 1
666         && sourceLoc.IsMix() && IsOrdered(sourceLoc))
667     {
668         target = MakeOrdered(*target);
669     }
670     return target;
671 }
672 
x_TruncateToStopCodon(const CSeq_loc & loc,unsigned int truncLen)673 CRef<CSeq_loc> CFeaturePropagator::x_TruncateToStopCodon(const CSeq_loc& loc, unsigned int truncLen)
674 {
675     CRef<CSeq_loc> pTruncated;
676 
677     unsigned int curLen = 0;
678     for (CSeq_loc_CI it(loc); it && curLen < truncLen; ++it) {
679         unsigned int thisLen = it.GetRange().GetLength();
680         CConstRef<CSeq_loc> pThisLoc = it.GetRangeAsSeq_loc();
681         if (curLen + thisLen <= truncLen) {
682             if (pTruncated) {
683                 pTruncated->Add(*pThisLoc);
684             } else {
685                 pTruncated.Reset(new CSeq_loc());
686                 pTruncated->Assign(*pThisLoc);
687             }
688             curLen += thisLen;
689         } else {
690             CRef<CSeq_loc> pPartialLoc(new CSeq_loc());
691             unsigned int missingLen = truncLen - curLen;
692             const TSeqPos start = pThisLoc->GetStart(eExtreme_Biological);
693             if (missingLen == 1) {
694                 // make a point
695                 pPartialLoc->SetPnt().SetPoint(start);
696             } else {
697                 // make an interval
698                 if (pThisLoc->IsSetStrand() && pThisLoc->GetStrand() == eNa_strand_minus) {
699                     pPartialLoc->SetInt().SetFrom(start - missingLen + 1);
700                     pPartialLoc->SetInt().SetTo(start);
701                 } else {
702                     pPartialLoc->SetInt().SetFrom(start);
703                     pPartialLoc->SetInt().SetTo(start + missingLen - 1);
704                 }
705             }
706             pPartialLoc->SetId(*pThisLoc->GetId());
707             if (pThisLoc->IsSetStrand()) {
708                 pPartialLoc->SetStrand(pThisLoc->GetStrand());
709             }
710             if (pTruncated) {
711                 pTruncated->Add(*pPartialLoc);
712             } else {
713                 pTruncated.Reset(new CSeq_loc());
714                 pTruncated->Assign(*pPartialLoc);
715             }
716             curLen += missingLen;
717         }
718     }
719 
720     return pTruncated;
721 }
722 
x_ExtendToStopCodon(CSeq_feat & feat)723 CRef<CSeq_loc> CFeaturePropagator::x_ExtendToStopCodon (CSeq_feat& feat)
724 {
725     CRef<CSeq_loc> new_loc;
726 
727     if (!feat.IsSetLocation()) {
728         return new_loc;
729     }
730     const CSeq_loc& loc = feat.GetLocation();
731 
732     const CGenetic_code* code = nullptr;
733     if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.GetData().GetCdregion().IsSetCode()) {
734         code = &(feat.GetData().GetCdregion().GetCode());
735     }
736 
737     const TSeqPos stop = loc.GetStop(eExtreme_Biological);
738     // figure out if we have a partial codon at the end
739     size_t orig_len = sequence::GetLength(loc, &m_Scope);
740     size_t nuc_len = orig_len;
741     if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.GetData().GetCdregion().IsSetFrame()) {
742         CCdregion::EFrame frame = feat.GetData().GetCdregion().GetFrame();
743         if (frame == CCdregion::eFrame_two) {
744             nuc_len -= 1;
745         } else if (frame == CCdregion::eFrame_three) {
746             nuc_len -= 2;
747         }
748     }
749     unsigned int mod = nuc_len % 3;
750     CRef<CSeq_loc> vector_loc(new CSeq_loc());
751     CSeq_interval& vector_int = vector_loc->SetInt();
752     vector_int.SetId().Assign(*(loc.GetId()));
753 
754     if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
755         vector_int.SetFrom(0);
756         vector_int.SetTo(stop + mod - 1);
757         vector_loc->SetStrand(eNa_strand_minus);
758     } else {
759         vector_int.SetFrom(stop - mod + 1);
760         vector_int.SetTo(m_Target.GetInst_Length() - 1);
761     }
762 
763     CSeqVector seq(*vector_loc, m_Scope, CBioseq_Handle::eCoding_Iupac);
764     // reserve our space
765     const unsigned int usable_size = seq.size();
766 
767     // get appropriate translation table
768     const CTrans_table & tbl =
769         (code ? CGen_code_table::GetTransTable(*code) :
770                 CGen_code_table::GetTransTable(1));
771 
772     // main loop through bases
773     CSeqVector::const_iterator start = seq.begin();
774 
775     int state = 0;
776     unsigned int prot_len = usable_size / 3;
777 
778     for (unsigned int i = 0;  i < prot_len;  ++i) {
779         // loop through one codon at a time
780         for (size_t k = 0;  k < 3;  ++k, ++start) {
781             state = tbl.NextCodonState(state, *start);
782         }
783 
784         if (tbl.GetCodonResidue (state) == '*') {
785             CSeq_loc_CI it(loc);
786             CSeq_loc_CI it_next = it;
787             ++it_next;
788             while (it_next) {
789                 CConstRef<CSeq_loc> this_loc = it.GetRangeAsSeq_loc();
790                 if (new_loc) {
791                     new_loc->Add(*this_loc);
792                 } else {
793                     new_loc.Reset(new CSeq_loc());
794                     new_loc->Assign(*this_loc);
795                 }
796                 it = it_next;
797                 ++it_next;
798             }
799             CRef<CSeq_loc> last_interval(new CSeq_loc());
800             CConstRef<CSeq_loc> this_loc = it.GetRangeAsSeq_loc();
801             const TSeqPos this_start = this_loc->GetStart(eExtreme_Positional);
802             const TSeqPos this_stop = this_loc->GetStop(eExtreme_Positional);
803             unsigned int extension = ((i + 1) * 3) - mod;
804             last_interval->SetInt().SetId().Assign(*(this_loc->GetId()));
805             if (this_loc->IsSetStrand() && this_loc->GetStrand() == eNa_strand_minus) {
806                 last_interval->SetStrand(eNa_strand_minus);
807                 last_interval->SetInt().SetFrom(this_start - extension);
808                 last_interval->SetInt().SetTo(this_stop);
809             } else {
810                 last_interval->SetInt().SetFrom(this_start);
811                 last_interval->SetInt().SetTo(this_stop + extension);
812             }
813 
814             if (new_loc) {
815                 new_loc->Add(*last_interval);
816             } else {
817                 new_loc.Reset(new CSeq_loc());
818                 new_loc->Assign(*last_interval);
819             }
820 
821             return new_loc;
822         }
823     }
824 
825     return new_loc;
826 }
827 
828 
x_PropagateCds(CSeq_feat & feat,const CSeq_id & targetId,bool origIsPartialStart)829 void CFeaturePropagator::x_PropagateCds(CSeq_feat& feat, const CSeq_id& targetId, bool origIsPartialStart)
830 {
831     bool ambiguous = false;
832     feat.SetData().SetCdregion().SetFrame(CSeqTranslator::FindBestFrame(feat, m_Scope, ambiguous));
833 
834     x_CdsMapCodeBreaks(feat, targetId);
835     if (m_CdsStopAtStopCodon) {
836         x_CdsStopAtStopCodon(feat);
837     }
838     if (m_CdsCleanupPartials) {
839         x_CdsCleanupPartials(feat, origIsPartialStart);
840     }
841 }
842 
843 
x_CdsMapCodeBreaks(CSeq_feat & feat,const CSeq_id & targetId)844 void CFeaturePropagator::x_CdsMapCodeBreaks(CSeq_feat& feat, const CSeq_id& targetId)
845 {
846     CCdregion& cds = feat.SetData().SetCdregion();
847     if (cds.IsSetCode_break()) {
848         auto it = cds.SetCode_break().begin();
849         while (it != cds.SetCode_break().end()) {
850             bool remove = false;
851             if ((*it)->IsSetLoc()) {
852                 const CSeq_loc& codebreak = (*it)->GetLoc();
853                 CRef<CSeq_loc> new_codebreak = x_MapLocation(codebreak, targetId);
854                 if (new_codebreak) {
855                     (*it)->SetLoc(*new_codebreak);
856                 } else {
857                     if (m_MessageListener) {
858                         string loc_label;
859                         (*it)->GetLoc().GetLabel(&loc_label);
860                         string target_label;
861                         targetId.GetLabel(&target_label);
862                         m_MessageListener->PostMessage(
863                             CMessage_Basic("Unable to propagate location of translation exception " + loc_label + " to " + target_label,
864                                            eDiag_Error,
865                                            eFeaturePropagationProblem_CodeBreakLocation));
866                     }
867                     remove = true;
868                 }
869             }
870             if (remove) {
871                 it = cds.SetCode_break().erase(it);
872             } else {
873                 ++it;
874             }
875         }
876         if (cds.GetCode_break().size() == 0) {
877             cds.ResetCode_break();
878         }
879     }
880 }
881 
882 
x_CdsCleanupPartials(CSeq_feat & cds,bool origIsPartialStart)883 void CFeaturePropagator::x_CdsCleanupPartials(CSeq_feat& cds, bool origIsPartialStart)
884 {
885     if (cds.GetData().GetSubtype() != CSeqFeatData::eSubtype_cdregion) {
886         return;
887     }
888 
889     // assume that the original is marked correctly, i.e. if there is not fuzz-from
890     // then it is 5'complete and it there is no fuzz-to then it is 3' prime complete.
891     // no need to recheck whther there are actual start and stop codons.
892 
893     // for the copies:
894     // the copy ends in a stop codon: it is 3' complete. Nothing needs to be done.
895     // the copy does not end in a stop codon. hence it is not 3' complete.
896     // the original is 5' incomplete: the copy is 5' incomplete
897     // the original is 5' complete, copy starts with a start codon: the copy is
898     // 5' complete
899 
900     try {
901         string prot;
902         CSeqTranslator::Translate(cds, m_Scope, prot);
903 
904         bool targetIsPartialStart = ! (NStr::StartsWith(prot, 'M')) || cds.GetLocation().IsPartialStart(eExtreme_Biological);
905         bool targetIsPartialStop = ! (NStr::EndsWith(prot, '*'));
906 
907         if (!targetIsPartialStart  && !origIsPartialStart) {
908             //propagated feature is 5' complete
909             cds.SetLocation().SetPartialStart(false, eExtreme_Biological);
910         }
911         else {
912             //propagated feature is 5' partial
913             cds.SetLocation().SetPartialStart(true, eExtreme_Biological);
914         }
915         if (!targetIsPartialStop) {
916             //propagated feature ends in stop codon hence is 3' complete
917             cds.SetLocation().SetPartialStop(false, eExtreme_Biological);
918         }
919         else {
920             //propagated feature is 3' partial
921             cds.SetLocation().SetPartialStop(true, eExtreme_Biological);
922         }
923 
924         if (cds.GetLocation().IsPartialStart(eExtreme_Biological) ||
925             cds.GetLocation().IsPartialStop(eExtreme_Biological)) {
926             cds.SetPartial(true);
927         }
928     }
929     catch (...) {
930     }
931 }
932 
933 
x_CdsStopAtStopCodon(CSeq_feat & cds)934 void CFeaturePropagator::x_CdsStopAtStopCodon(CSeq_feat& cds)
935 {
936     if (cds.GetData().GetSubtype() != CSeqFeatData::eSubtype_cdregion) {
937         return;
938     }
939     CRef<CBioseq> pTranslated = CSeqTranslator::TranslateToProtein(cds, m_Scope);
940     if (!pTranslated) {
941         return;
942     }
943     CRef<CSeq_loc> pAdjustedLoc;
944     CSeqVector seqv(*pTranslated, &m_Scope, CBioseq_Handle::eCoding_Ncbi);
945     seqv.SetCoding(CSeq_data::e_Ncbieaa);
946     CSeqVector_CI it(seqv);
947     while (it && *it != '*') {
948         it++;
949     }
950     if (it) {
951         //found stop codon inside the given feature
952         unsigned int newSize = 3*(it.GetPos()+1);
953         if (cds.GetData().GetCdregion().IsSetFrame()) {
954             switch(cds.GetData().GetCdregion().GetFrame()) {
955             default:
956                 break;
957             case CCdregion::eFrame_two:
958                 newSize += 1;
959                 break;
960             case CCdregion::eFrame_three:
961                 newSize += 2;
962                 break;
963             }
964         }
965         pAdjustedLoc = x_TruncateToStopCodon(cds.GetLocation(), newSize);
966     }
967     else {
968         pAdjustedLoc = x_ExtendToStopCodon(cds);
969     }
970     if (pAdjustedLoc) {
971         cds.SetLocation(*pAdjustedLoc);
972     }
973 }
974 
975 
x_PropagatetRNA(CSeq_feat & feat,const CSeq_id & targetId)976 void CFeaturePropagator::x_PropagatetRNA(CSeq_feat& feat, const CSeq_id& targetId)
977 {
978     if (feat.GetData().GetRna().IsSetExt()) {
979         const CRNA_ref::C_Ext& ext = feat.GetData().GetRna().GetExt();
980         if (ext.IsTRNA()) {
981             const CTrna_ext& trna_ext = ext.GetTRNA();
982             if (trna_ext.IsSetAnticodon()) {
983                 const CSeq_loc& anticodon = trna_ext.GetAnticodon();
984                 CRef<CSeq_loc> new_anticodon = x_MapLocation(anticodon, targetId);
985                 if (new_anticodon) {
986                     feat.SetData().SetRna().SetExt().SetTRNA().SetAnticodon(*new_anticodon);
987                 } else {
988                     if (m_MessageListener) {
989                         string loc_label;
990                         anticodon.GetLabel(&loc_label);
991                         string target_label;
992                         targetId.GetLabel(&target_label);
993                         m_MessageListener->PostMessage(
994                             CMessage_Basic("Unable to propagate location of anticodon " + loc_label + " to " + target_label,
995                                            eDiag_Error,
996                                            eFeaturePropagationProblem_AnticodonLocation));
997                     }
998                     feat.SetData().SetRna().SetExt().SetTRNA().ResetAnticodon();
999                 }
1000             }
1001         }
1002     }
1003 }
1004 
1005 
ConstructProteinFeatureForPropagatedCodingRegion(const CSeq_feat & orig_cds,const CSeq_feat & new_cds)1006 CRef<CSeq_feat> CFeaturePropagator::ConstructProteinFeatureForPropagatedCodingRegion(const CSeq_feat& orig_cds, const CSeq_feat& new_cds)
1007 {
1008     if (!orig_cds.IsSetProduct()) {
1009         return CRef<CSeq_feat>(nullptr);
1010     }
1011     CRef<CSeq_feat> prot_feat;
1012     CBioseq_Handle prot = m_Scope.GetBioseqHandle(orig_cds.GetProduct());
1013     if (prot) {
1014         CFeat_CI pf(prot, CSeqFeatData::eSubtype_prot);
1015         if (pf) {
1016             prot_feat.Reset(new CSeq_feat());
1017             const CSeq_feat& orig_prot = *pf->GetOriginalSeq_feat();
1018             prot_feat->Assign(orig_prot);
1019 
1020             if ((m_MaxFeatId && *m_MaxFeatId > 0) && (pf->GetOriginalSeq_feat()->IsSetId() || orig_cds.IsSetId())) {
1021                 prot_feat->SetId().SetLocal().SetId(++(*m_MaxFeatId));
1022                 // protein features don't have Xrefs
1023             }
1024             // fix partials
1025             prot_feat->SetLocation().SetPartialStart(new_cds.GetLocation().IsPartialStart(eExtreme_Biological), eExtreme_Biological);
1026             prot_feat->SetLocation().SetPartialStop(new_cds.GetLocation().IsPartialStop(eExtreme_Biological), eExtreme_Biological);
1027             prot_feat->SetPartial(prot_feat->GetLocation().IsPartialStart(eExtreme_Biological) ||
1028                                   prot_feat->GetLocation().IsPartialStop(eExtreme_Biological));
1029             if (new_cds.IsSetProduct() && !new_cds.GetProduct().GetId()->Equals(*(orig_cds.GetProduct().GetId()))) {
1030                 prot_feat->SetLocation().SetId(new_cds.GetProduct().GetWhole());
1031             }
1032         }
1033     }
1034     return prot_feat;
1035 }
1036 
1037 END_SCOPE(edit)
1038 END_SCOPE(objects)
1039 END_NCBI_SCOPE
1040 
1041