1 /* $Id: gap_trim.cpp 632623 2021-06-03 17:38:11Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Colleen Bollin
27  *
28  * File Description:
29  *   For adjusting features for gaps
30  *
31  * ===========================================================================
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <objtools/edit/gap_trim.hpp>
36 #include <objtools/edit/loc_edit.hpp>
37 #include <objtools/edit/cds_fix.hpp>
38 #include <objmgr/util/feature.hpp>
39 #include <objmgr/seq_map.hpp>
40 #include <objmgr/seq_map_ci.hpp>
41 #include <objmgr/seq_vector.hpp>
42 
43 #include <objmgr/util/seq_loc_util.hpp>
44 #include <objmgr/util/sequence.hpp>
45 #include <objects/seqfeat/Seq_feat.hpp>
46 #include <objects/seqfeat/Code_break.hpp>
47 #include <objects/seqfeat/Feat_id.hpp>
48 #include <objects/seqfeat/RNA_ref.hpp>
49 #include <objects/seqfeat/Trna_ext.hpp>
50 #include <objects/general/Dbtag.hpp>
51 #include <objects/general/Object_id.hpp>
52 #include <objects/seq/Seq_descr.hpp>
53 #include <objects/seq/Seqdesc.hpp>
54 #include <objtools/edit/cds_fix.hpp>
55 
56 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)57 BEGIN_SCOPE(objects)
58 BEGIN_SCOPE(edit)
59 
60 CFeatGapInfo::CFeatGapInfo(CSeq_feat_Handle sf)
61 {
62     m_Feature = sf;
63     CollectGaps(sf.GetLocation(), sf.GetScope());
64 }
65 
66 
CollectGaps(const CSeq_loc & feat_loc,CScope & scope)67 void CFeatGapInfo::CollectGaps(const CSeq_loc& feat_loc, CScope& scope)
68 {
69     m_Gaps.clear();
70     m_Unknown = false;
71     m_Known = false;
72     m_Ns = false;
73 
74     m_Start = feat_loc.GetStart(objects::eExtreme_Positional);
75     m_Stop = feat_loc.GetStop(objects::eExtreme_Positional);
76     CRef<CSeq_loc> total_loc = sequence::Seq_loc_Merge(feat_loc, CSeq_loc::fMerge_SingleRange, &scope);
77     if (total_loc->IsSetStrand()) total_loc->ResetStrand();
78     CConstRef<objects::CSeqMap> seq_map = CSeqMap::GetSeqMapForSeq_loc(*total_loc, &scope);
79 
80     // use CSeqVector for finding Ns
81     CSeqVector vec(*seq_map, scope, CBioseq_Handle::eCoding_Iupac);
82 
83     CSeqMap_CI seq_map_ci = seq_map->ResolvedRangeIterator(&scope,
84         0,
85         m_Stop - m_Start + 1,
86         eNa_strand_plus, //feat_loc.GetStrand(),
87         size_t(-1),
88         objects::CSeqMap::fFindGap | objects::CSeqMap::fFindData);
89 
90     for (; seq_map_ci; ++seq_map_ci)
91     {
92 
93         if (seq_map_ci.GetType() == objects::CSeqMap::eSeqGap)
94         {
95             TSeqPos gap_start = m_Start + seq_map_ci.GetPosition();
96             TSeqPos gap_stop = gap_start + seq_map_ci.GetLength() - 1;
97             bool is_unknown = seq_map_ci.IsUnknownLength();
98             if (is_unknown) {
99                 m_Unknown = true;
100             } else {
101                 m_Known = true;
102             }
103             m_Gaps.push_back(TGapInterval(is_unknown ? eGapIntervalType_unknown : eGapIntervalType_known, pair<size_t, size_t>(gap_start, gap_stop)));
104         } else {
105             // look for Ns
106             TSeqPos map_start = seq_map_ci.GetPosition();
107             TSeqPos map_stop = map_start + seq_map_ci.GetLength() - 1;
108             bool in_ns = false;
109             TSeqPos gap_start;
110             for (TSeqPos i = map_start; i <= map_stop; ++i) {
111                 char letter = vec[i];
112                 if (letter == 'N') {
113                     if (!in_ns) {
114                         // start new gap
115                         gap_start = m_Start + i;
116                         in_ns = true;
117                     }
118                 } else if (in_ns) {
119                     // end previous gap
120                     TSeqPos gap_stop = m_Start + i - 1;
121                     m_Gaps.push_back(TGapInterval(eGapIntervalType_n, pair<size_t, size_t>(gap_start, gap_stop)));
122                     m_Ns = true;
123                     in_ns = false;
124                 }
125             }
126             if (in_ns) {
127                 TSeqPos gap_stop = m_Start + map_stop;
128                 m_Gaps.push_back(TGapInterval(eGapIntervalType_n, pair<size_t, size_t>(gap_start, gap_stop)));
129                 m_Ns = true;
130             }
131         }
132     }
133 }
134 
135 
x_UsableInterval(const TGapInterval & interval,bool unknown_length,bool known_length,bool ns)136 bool CFeatGapInfo::x_UsableInterval(const TGapInterval& interval, bool unknown_length, bool known_length, bool ns)
137 {
138     if (interval.first == eGapIntervalType_unknown && !unknown_length) {
139         return false;
140     } else if (interval.first == eGapIntervalType_known && !known_length) {
141         return false;
142     } else if (interval.first == eGapIntervalType_n && !ns) {
143         return false;
144     } else {
145         return true;
146     }
147 }
148 
149 
CalculateRelevantIntervals(bool unknown_length,bool known_length,bool ns)150 void CFeatGapInfo::CalculateRelevantIntervals(bool unknown_length, bool known_length, bool ns)
151 {
152     m_InsideGaps.clear();
153     m_LeftGaps.clear();
154     m_RightGaps.clear();
155 
156     if (!m_Gaps.empty()) {
157         // find left offset
158         size_t skip_left = 0;
159         TGapIntervalList::iterator it = m_Gaps.begin();
160         while (it != m_Gaps.end()) {
161             if (!x_UsableInterval(*it, unknown_length, known_length, ns)) {
162                 break;
163             }
164 
165             if (m_LeftGaps.empty()) {
166                 if (it->second.first <= m_Start && it->second.second >= m_Start) {
167                     m_LeftGaps.push_back(it->second);
168                     skip_left++;
169                 } else {
170                     break;
171                 }
172             } else if (it->second.first <= m_LeftGaps.front().second + 1 && it->second.second >= m_LeftGaps.front().second) {
173                 m_LeftGaps.front().second = it->second.second;
174                 skip_left++;
175             } else {
176                 break;
177             }
178             ++it;
179         }
180         TGapIntervalList::reverse_iterator rit = m_Gaps.rbegin();
181         size_t skip_right = 0;
182         while (rit != m_Gaps.rend()) {
183             if (!x_UsableInterval(*rit, unknown_length, known_length, ns)) {
184                 break;
185             }
186             if (m_RightGaps.empty()) {
187                 if (rit->second.first <= m_Stop && rit->second.second >= m_Stop) {
188                     m_RightGaps.push_back(rit->second);
189                     skip_right++;
190                 } else {
191                     break;
192                 }
193             } else if (rit->second.first <= m_RightGaps.front().first - 1 && rit->second.second >= m_RightGaps.front().second) {
194                 m_RightGaps.front().first = rit->second.first;
195                 skip_right++;
196             } else {
197                 break;
198             }
199             ++rit;
200         }
201         for (size_t offset = skip_left; offset < m_Gaps.size() - skip_right; offset++) {
202             if (x_UsableInterval(m_Gaps[offset], unknown_length, known_length, ns)) {
203                 m_InsideGaps.push_back(m_Gaps[offset].second);
204             }
205         }
206     }
207 }
208 
209 
Trimmable() const210 bool CFeatGapInfo::Trimmable() const
211 {
212     if (ShouldRemove()) {
213         return false;
214     } else if (!m_LeftGaps.empty() || !m_RightGaps.empty()) {
215         return true;
216     } else {
217         return false;
218     }
219 }
220 
221 
Splittable() const222 bool CFeatGapInfo::Splittable() const
223 {
224     if (!m_InsideGaps.empty()) {
225         return true;
226     } else {
227         return false;
228     }
229 }
230 
231 
ShouldRemove() const232 bool CFeatGapInfo::ShouldRemove() const
233 {
234     if (!m_LeftGaps.empty() && m_LeftGaps.front().second >= m_Stop) {
235         return true;
236     } else {
237         return false;
238     }
239 }
240 
241 
Trim(CSeq_loc & loc,bool make_partial,CScope & scope)242 void CFeatGapInfo::Trim(CSeq_loc& loc, bool make_partial, CScope& scope)
243 {
244   CRef<CSeq_id> seqid(new CSeq_id);
245   seqid->Assign(*loc.GetId());
246     for (vector<pair<size_t, size_t> >::reverse_iterator b = m_LeftGaps.rbegin(); b != m_LeftGaps.rend(); ++b)
247     {
248         size_t start = b->first;
249         size_t stop = b->second;
250         CRef<CSeq_loc> loc2(new CSeq_loc());
251         int options = edit::eSplitLocOption_split_in_exon | edit::eSplitLocOption_split_in_intron;
252         if (make_partial)
253             options |= edit::eSplitLocOption_make_partial;
254         edit::SplitLocationForGap(loc, *loc2, start, stop, seqid.GetPointer(), options);
255         if (loc2->Which() != CSeq_loc::e_not_set)
256         {
257             loc.Assign(*loc2);
258         }
259     }
260     for (vector<pair<size_t, size_t> >::reverse_iterator b = m_RightGaps.rbegin(); b != m_RightGaps.rend(); ++b)
261     {
262         size_t start = b->first;
263         size_t stop = b->second;
264         CRef<CSeq_loc> loc2(new CSeq_loc());
265         int options = edit::eSplitLocOption_split_in_exon;
266         if (make_partial)
267             options |= edit::eSplitLocOption_make_partial;
268         edit::SplitLocationForGap(loc, *loc2, start, stop, seqid.GetPointer(), options);
269     }
270 }
271 
272 
Split(const CSeq_loc & orig,bool in_intron,bool make_partial)273 CFeatGapInfo::TLocList CFeatGapInfo::Split(const CSeq_loc& orig, bool in_intron, bool make_partial)
274 {
275     TLocList locs;
276     CRef<CSeq_loc> left_loc(new CSeq_loc());
277     left_loc->Assign(orig);
278     CRef<CSeq_id> seqid(new CSeq_id);
279     seqid->Assign(*orig.GetId());
280     for (vector<pair<size_t, size_t> >::reverse_iterator b = m_InsideGaps.rbegin(); b != m_InsideGaps.rend(); ++b)
281     {
282         size_t start = b->first;
283         size_t stop = b->second;
284         CRef<CSeq_loc> loc2(new CSeq_loc());
285         int options = edit::eSplitLocOption_make_partial | edit::eSplitLocOption_split_in_exon;
286         if (in_intron)
287             options |= edit::eSplitLocOption_split_in_intron;
288         edit::SplitLocationForGap(*left_loc, *loc2, start, stop, seqid.GetPointer(), options);
289         if (loc2->GetId())
290         {
291             if (make_partial)
292             {
293                 loc2->SetPartialStart(true, objects::eExtreme_Positional);
294                 if (left_loc->Which() != CSeq_loc::e_not_set)
295                 {
296                     left_loc->SetPartialStop(true, objects::eExtreme_Positional);
297                 }
298             }
299             locs.push_back(loc2);
300         }
301     }
302     if (locs.size() > 0) {
303         if (left_loc->Which() != CSeq_loc::e_not_set)
304         {
305             locs.push_back(left_loc);
306         }
307         reverse(locs.begin(), locs.end());
308     }
309     return locs;
310 }
311 
312 
x_AdjustOrigLabel(CSeq_feat & feat,size_t & id_offset,string & id_label,const string & qual)313 void CFeatGapInfo::x_AdjustOrigLabel(CSeq_feat& feat, size_t& id_offset, string& id_label, const string& qual)
314 {
315     if (!feat.IsSetQual()) {
316         return;
317     }
318     NON_CONST_ITERATE(CSeq_feat::TQual, it, feat.SetQual()) {
319         if ((*it)->IsSetQual() && (*it)->IsSetVal() &&
320             !NStr::IsBlank((*it)->GetVal()) &&
321             NStr::EqualNocase((*it)->GetQual(), qual) &&
322             (id_label.empty() || NStr::Equal((*it)->GetVal(), id_label) || NStr::Equal((*it)->GetVal(), id_label + "_1"))) {
323             if (id_label.empty()) {
324                 id_label = (*it)->GetVal();
325             }
326             if (id_offset != 0)
327                 (*it)->SetVal(id_label + "_" + NStr::NumericToString(id_offset));
328             id_offset++;
329         }
330     }
331 }
332 
AdjustProteinSeq(const CBioseq & seq,const CSeq_feat & feat,const CSeq_feat & orig_cds,CScope & scope)333 CRef<CBioseq> CFeatGapInfo::AdjustProteinSeq(const CBioseq& seq, const CSeq_feat& feat, const CSeq_feat& orig_cds, CScope& scope)
334 {
335     if (!feat.IsSetProduct() || !feat.GetProduct().IsWhole() || !seq.IsAa()) {
336         return CRef<CBioseq>(NULL);
337     }
338 
339     TSeqPos orig_len = seq.GetInst().GetLength();
340 
341     string prot;
342     CSeqTranslator::Translate(feat, scope, prot);
343     if (prot.empty()) {
344         return CRef<CBioseq>(NULL);
345     }
346 
347     CRef<CBioseq> prot_seq(new CBioseq);
348     prot_seq->Assign(seq);
349     prot_seq->SetInst().ResetExt();
350     prot_seq->SetInst().SetRepr(objects::CSeq_inst::eRepr_raw);
351     prot_seq->SetInst().SetSeq_data().SetIupacaa().Set(prot);
352     prot_seq->SetInst().SetLength(TSeqPos(prot.length()));
353     prot_seq->SetInst().SetMol(objects::CSeq_inst::eMol_aa);
354     // fix sequence ID
355     const CSeq_id& feat_prod = feat.GetProduct().GetWhole();
356     if (!feat_prod.Equals(orig_cds.GetProduct().GetWhole())) {
357         NON_CONST_ITERATE(CBioseq::TId, id, prot_seq->SetId()) {
358             if ((*id)->Which() == feat_prod.Which()) {
359                 bool do_replace = false;
360                 if ((*id)->IsGeneral()) {
361                     if ((*id)->GetGeneral().IsSetDb()) {
362                         if (feat_prod.GetGeneral().IsSetDb() &&
363                             NStr::Equal(feat_prod.GetGeneral().GetDb(), (*id)->GetGeneral().GetDb())) {
364                             do_replace = true;
365                         }
366                     } else if (!feat_prod.GetGeneral().IsSetDb()) {
367                         do_replace = true;
368                     }
369                 } else {
370                     do_replace = true;
371                 }
372                 if (do_replace) {
373                     (*id)->Assign(feat.GetProduct().GetWhole());
374                 }
375             }
376         }
377     }
378     // fix molinfo
379     if (prot_seq->IsSetDescr()) {
380         NON_CONST_ITERATE(CBioseq::TDescr::Tdata, mi, prot_seq->SetDescr().Set()) {
381             if ((*mi)->IsMolinfo()) {
382                 feature::AdjustProteinMolInfoToMatchCDS((*mi)->SetMolinfo(), feat);
383             }
384         }
385     }
386 
387     // also fix protein feature locations
388     if (prot_seq->IsSetAnnot()) {
389         CRef<CSeq_loc_Mapper> nuc2prot_mapper(
390             new CSeq_loc_Mapper(orig_cds, CSeq_loc_Mapper::eLocationToProduct, &scope));
391         CRef<CSeq_loc> prot_shadow = nuc2prot_mapper->Map(feat.GetLocation());
392         TSeqPos start = prot_shadow->GetStart(eExtreme_Positional);
393         TSeqPos stop = prot_shadow->GetStop(eExtreme_Positional);
394         NON_CONST_ITERATE(CBioseq::TAnnot, ait, prot_seq->SetAnnot()) {
395             if ((*ait)->IsFtable()) {
396                 CSeq_annot::TData::TFtable::iterator fit = (*ait)->SetData().SetFtable().begin();
397                 while (fit != (*ait)->SetData().SetFtable().end()) {
398                     CRef<CSeq_loc> new_prot_loc(new CSeq_loc());
399                     new_prot_loc->Assign((*fit)->GetLocation());
400                     bool complete_cut = false;
401                     bool adjusted = false;
402                     TSeqPos removed = 0;
403                     if (start > 0) {
404                         SeqLocAdjustForTrim(*new_prot_loc, 0, start - 1, orig_cds.GetProduct().GetId(), complete_cut, removed, adjusted);
405                     }
406                     if (!complete_cut && stop < orig_len - 1) {
407                         SeqLocAdjustForTrim(*new_prot_loc, stop + 1, orig_len - 1, orig_cds.GetProduct().GetId(), complete_cut, removed, adjusted);
408                     }
409                     if (complete_cut) {
410                         // out of range, drop this one
411                         fit = (*ait)->SetData().SetFtable().erase(fit);
412                     } else {
413                         new_prot_loc->SetId(feat_prod);
414                         AdjustProteinFeaturePartialsToMatchCDS(**fit, feat);
415                         if (!feat_prod.Equals(orig_cds.GetProduct().GetWhole())) {
416                             // fix sequence ID
417                             (*fit)->SetLocation().Assign(*new_prot_loc);
418                         }
419                     }
420                     ++fit;
421                 }
422             }
423         }
424     }
425 
426     return prot_seq;
427 }
428 
429 
x_AdjustCodebreaks(CSeq_feat & feat)430 void CFeatGapInfo::x_AdjustCodebreaks(CSeq_feat& feat)
431 {
432     if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
433         !feat.GetData().GetCdregion().IsSetCode_break()) {
434         return;
435     }
436     CCdregion& cdr = feat.SetData().SetCdregion();
437     CCdregion::TCode_break::iterator cit = cdr.SetCode_break().begin();
438     while (cit != cdr.SetCode_break().end()) {
439         bool do_remove = false;
440         if ((*cit)->IsSetLoc()) {
441             CRef<CSeq_loc> new_loc = feat.GetLocation().Intersect((*cit)->GetLoc(), 0, NULL);
442             if (new_loc && !new_loc->IsEmpty() && !new_loc->IsNull()) {
443                 (*cit)->SetLoc().Assign(*new_loc);
444             } else {
445                 do_remove = true;
446             }
447         }
448         if (do_remove) {
449             cit = cdr.SetCode_break().erase(cit);
450         } else {
451             ++cit;
452         }
453     }
454     if (cdr.GetCode_break().empty()) {
455         cdr.ResetCode_break();
456     }
457 }
458 
459 
x_AdjustAnticodons(CSeq_feat & feat)460 void CFeatGapInfo::x_AdjustAnticodons(CSeq_feat& feat)
461 {
462     if (!feat.IsSetData() || !feat.GetData().IsRna() ||
463         !feat.GetData().GetRna().IsSetExt() ||
464         !feat.GetData().GetRna().GetExt().IsTRNA()) {
465         return;
466     }
467     CTrna_ext& trna = feat.SetData().SetRna().SetExt().SetTRNA();
468     if (!trna.IsSetAnticodon()) {
469         return;
470     }
471     CRef<CSeq_loc> new_loc = feat.GetLocation().Intersect(trna.GetAnticodon(), 0, NULL);
472     if (new_loc && !new_loc->IsEmpty() && !new_loc->IsNull()) {
473         trna.SetAnticodon().Assign(*new_loc);
474     } else {
475         trna.ResetAnticodon();
476     }
477 }
478 
479 
s_FixPartial(CSeq_feat & feat)480 void s_FixPartial(CSeq_feat& feat)
481 {
482     if (feat.GetLocation().IsPartialStart(eExtreme_Biological) ||
483         feat.GetLocation().IsPartialStop(eExtreme_Biological)) {
484         feat.SetPartial(true);
485     }
486 }
487 
488 
489 // returns a list of features to replace the original
490 // if list is empty, feature should be removed
491 // list should only contain one element if split is not specified
492 // coding regions should be retranslated after split
AdjustForRelevantGapIntervals(bool make_partial,bool trim,bool split,bool in_intron,bool create_general_only)493 vector<CRef<CSeq_feat> > CFeatGapInfo::AdjustForRelevantGapIntervals(bool make_partial, bool trim, bool split, bool in_intron, bool create_general_only)
494 {
495     CRef<objects::CSeq_feat> new_feat(new CSeq_feat);
496     new_feat->Assign(*m_Feature.GetOriginalSeq_feat());
497     vector<CRef<CSeq_feat> > rval;
498 
499     if (!trim && !split) {
500         rval.push_back(new_feat);
501         return rval;
502     } else if (ShouldRemove()) {
503         return rval;
504     }
505 
506     if (trim && Trimmable()) {
507         Trim(new_feat->SetLocation(), make_partial, m_Feature.GetScope());
508         new_feat->SetPartial(new_feat->GetLocation().IsPartialStart(objects::eExtreme_Positional) || new_feat->GetLocation().IsPartialStop(objects::eExtreme_Positional));
509         if (new_feat->GetData().IsCdregion()) {
510             // adjust frame
511             TSeqPos frame_adjust = sequence::LocationOffset(m_Feature.GetLocation(), new_feat->GetLocation(),
512                 sequence::eOffset_FromStart, &(m_Feature.GetScope()));
513             x_AdjustFrame(new_feat->SetData().SetCdregion(), frame_adjust);
514         }
515     }
516 
517     if (split) {
518         const string cds_gap_comment = "coding region disrupted by sequencing gap";
519 
520         vector<CRef<CSeq_loc> > locs = Split(new_feat->GetLocation(), in_intron, make_partial);
521         if (locs.size() > 0) {
522             // set comment
523             if (!new_feat->IsSetComment())
524             {
525                 new_feat->SetComment(cds_gap_comment);
526             } else if (new_feat->IsSetComment() && new_feat->GetComment().find(cds_gap_comment) == string::npos)
527             {
528                 string comment = new_feat->GetComment();
529                 comment = comment + "; " + cds_gap_comment;
530                 new_feat->SetComment(comment);
531             }
532 
533             // adjust transcript id if splitting
534             size_t transcript_id_offset = 0;
535             string transcript_id_label;
536             size_t protein_id_offset = 0;
537             string protein_id_label;
538             int protein_seqid_offset = 0;
539             string protein_seqid_label;
540 
541 
542             ITERATE(vector<CRef<CSeq_loc> >, lit, locs) {
543                 CRef<CSeq_feat> split_feat(new CSeq_feat());
544                 // copy from original
545                 split_feat->Assign(*new_feat);
546                 // with new location
547                 split_feat->SetLocation().Assign(**lit);
548                 split_feat->SetPartial(split_feat->GetLocation().IsPartialStart(eExtreme_Positional) || new_feat->GetLocation().IsPartialStop(eExtreme_Positional));
549                 //adjust transcript id
550                 x_AdjustOrigLabel(*split_feat, transcript_id_offset, transcript_id_label, "orig_transcript_id");
551                 x_AdjustOrigLabel(*split_feat, protein_id_offset, protein_id_label, "orig_protein_id");
552                 if (split_feat->GetData().IsCdregion()) {
553                     // adjust frame
554                     TSeqPos frame_adjust = sequence::LocationOffset(new_feat->GetLocation(), split_feat->GetLocation(),
555                         sequence::eOffset_FromStart, &(m_Feature.GetScope()));
556                     x_AdjustFrame(split_feat->SetData().SetCdregion(), frame_adjust);
557                     // adjust product ID
558                     if (split_feat->IsSetProduct() && split_feat->GetProduct().IsWhole() && protein_seqid_offset > 0) {
559                         objects::CBioseq_Handle product = m_Feature.GetScope().GetBioseqHandle(split_feat->GetProduct());
560                         CRef<CSeq_id> new_id;
561                         if (product)
562                         {
563                             vector<CRef<objects::CSeq_id> > new_ids = objects::edit::GetNewProtIdFromExistingProt(product, protein_seqid_offset, protein_seqid_label);
564                             new_id = FindBestChoice(new_ids, CSeq_id::Score);
565                         }
566                         else
567                         {
568                             objects::CBioseq_Handle bsh = m_Feature.GetScope().GetBioseqHandle(split_feat->GetLocation());
569                             new_id = objects::edit::GetNewProtId(bsh, protein_seqid_offset, protein_seqid_label, create_general_only);
570                         }
571                         split_feat->SetProduct().SetWhole().Assign(*new_id);
572                     }
573                     protein_seqid_offset++;
574                 }
575                 rval.push_back(split_feat);
576             }
577 
578         } else {
579             rval.push_back(new_feat);
580         }
581     } else {
582         rval.push_back(new_feat);
583     }
584     NON_CONST_ITERATE(vector<CRef<CSeq_feat> >, it, rval) {
585         x_AdjustCodebreaks(**it);
586         x_AdjustAnticodons(**it);
587         s_FixPartial(**it);
588     }
589     return rval;
590 }
591 
592 
x_AdjustFrame(CCdregion & cdregion,TSeqPos frame_adjust)593 void CFeatGapInfo::x_AdjustFrame(CCdregion& cdregion, TSeqPos frame_adjust)
594 {
595     frame_adjust = frame_adjust % 3;
596     if (frame_adjust == 0) {
597         return;
598     }
599 
600     CCdregion::TFrame orig_frame = cdregion.SetFrame();
601     if (orig_frame == CCdregion::eFrame_not_set) {
602         orig_frame = CCdregion::eFrame_one;
603     }
604 
605     if (frame_adjust == 1) {
606         if (orig_frame == CCdregion::eFrame_one) {
607             cdregion.SetFrame(CCdregion::eFrame_three); //
608         } else if (orig_frame == CCdregion::eFrame_two) {
609             cdregion.SetFrame(CCdregion::eFrame_one); //
610         } else if (orig_frame == CCdregion::eFrame_three) {
611             cdregion.SetFrame(CCdregion::eFrame_two);
612         }
613     } else if (frame_adjust == 2) {
614         if (orig_frame == CCdregion::eFrame_one) {
615             cdregion.SetFrame(CCdregion::eFrame_two);
616         } else if (orig_frame == CCdregion::eFrame_two) {
617             cdregion.SetFrame(CCdregion::eFrame_three);
618         } else if (orig_frame == CCdregion::eFrame_three) {
619             cdregion.SetFrame(CCdregion::eFrame_one); //
620         }
621     }
622 }
623 
624 
ListGappedFeatures(CFeat_CI & feat_it,CScope & scope)625 TGappedFeatList ListGappedFeatures(CFeat_CI& feat_it, CScope& scope)
626 {
627     TGappedFeatList gapped_feats;
628     while (feat_it) {
629         if (!feat_it->GetData().IsProt()) {
630             CRef<CFeatGapInfo> fgap(new CFeatGapInfo(*feat_it));
631             if (fgap->HasKnown() || fgap->HasUnknown() || fgap->HasNs()) {
632                 gapped_feats.push_back(fgap);
633             }
634         }
635         ++feat_it;
636     }
637     return gapped_feats;
638 }
639 
640 
ProcessForTrimAndSplitUpdates(CSeq_feat_Handle cds,vector<CRef<CSeq_feat>> updates)641 void ProcessForTrimAndSplitUpdates(CSeq_feat_Handle cds, vector<CRef<CSeq_feat> > updates)
642 {
643     CBioseq_Handle orig_prot_handle = cds.GetScope().GetBioseqHandle(cds.GetProduct());
644     CConstRef<CBioseq> orig_prot = orig_prot_handle.GetCompleteBioseq();
645     CBioseq_set_Handle nph = orig_prot_handle.GetParentBioseq_set();
646     CBioseq_set_EditHandle npeh(nph);
647     // need to remove original first if not changing sequence IDs
648     CBioseq_EditHandle beh(orig_prot_handle);
649     beh.Remove();
650     ITERATE(vector<CRef<CSeq_feat> >, it, updates) {
651         CRef<CBioseq> new_prot = CFeatGapInfo::AdjustProteinSeq(*orig_prot, **it, *(cds.GetSeq_feat()), cds.GetScope());
652         if (new_prot) {
653             npeh.AttachBioseq(*new_prot);
654         }
655     }
656 
657     CSeq_annot_Handle sah = cds.GetAnnot();
658     CSeq_annot_EditHandle saeh = sah.GetEditHandle();
659     CSeq_feat_EditHandle feh(cds);
660     if (updates.size() == 0) {
661         feh.Remove();
662     } else {
663         feh.Replace(*(updates[0]));
664         for (size_t i = 1; i < updates.size(); i++) {
665             saeh.AddFeat(*(updates[i]));
666         }
667     }
668 }
669 
670 
671 
672 // for fixing feature IDs and feature ID xrefs
FixFeatureIdsForUpdates(CSeq_feat & feat,CObject_id::TId & next_id)673 void FixFeatureIdsForUpdates(CSeq_feat& feat, CObject_id::TId& next_id)
674 {
675     if (feat.IsSetId() && feat.GetId().IsLocal() &&
676         feat.GetId().GetLocal().IsId()) {
677         feat.SetId().SetLocal().SetId(next_id);
678         ++next_id;
679     }
680 
681 }
682 
683 
FixFeatureIdsForUpdates(vector<CRef<CSeq_feat>> updates,CObject_id::TId & next_id)684 void FixFeatureIdsForUpdates(vector<CRef<CSeq_feat> > updates, CObject_id::TId& next_id)
685 {
686     for (size_t i = 1; i < updates.size(); i++) {
687         FixFeatureIdsForUpdates(*(updates[i]), next_id);
688     }
689 }
690 
691 
s_IsRelated(const CSeq_feat & f1,CObject_id::TId search)692 bool s_IsRelated(const CSeq_feat& f1, CObject_id::TId search)
693 {
694     if (!f1.IsSetXref()) {
695         return false;
696     }
697     ITERATE(CSeq_feat::TXref, xit, f1.GetXref()) {
698         if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal() &&
699             (*xit)->GetId().GetLocal().IsId() &&
700             (*xit)->GetId().GetLocal().GetId() == search) {
701             return true;
702         }
703     }
704     return false;
705 }
706 
707 
s_IsRelated(const CSeq_feat & f1,const CSeq_feat & f2)708 bool s_IsRelated(const CSeq_feat& f1, const CSeq_feat& f2)
709 {
710     if (f1.IsSetId() && f1.GetId().IsLocal() && f1.GetId().GetLocal().IsId() &&
711         s_IsRelated(f2, f1.GetId().GetLocal().GetId())) {
712         return true;
713     } else if (f2.IsSetId() && f2.GetId().IsLocal() && f2.GetId().GetLocal().IsId() &&
714         s_IsRelated(f1, f2.GetId().GetLocal().GetId())) {
715         return true;
716     } else {
717         return false;
718     }
719 }
720 
721 
IsRelatedByCrossRef(const CFeatGapInfo & other) const722 bool CFeatGapInfo::IsRelatedByCrossRef(const CFeatGapInfo& other) const
723 {
724     return s_IsRelated(*(GetFeature().GetSeq_feat()), *(other.GetFeature().GetSeq_feat()));
725 }
726 
727 
728 
s_ReplaceFeatureIdXref(CSeq_feat & f,CObject_id::TId orig_id,CObject_id::TId new_id)729 void s_ReplaceFeatureIdXref(CSeq_feat& f, CObject_id::TId orig_id, CObject_id::TId new_id)
730 {
731     if (orig_id > 0 && new_id > 0 && f.IsSetXref()) {
732         NON_CONST_ITERATE(CSeq_feat::TXref, xit, f.SetXref()) {
733             if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal() &&
734                 (*xit)->GetId().GetLocal().IsId() &&
735                 (*xit)->GetId().GetLocal().GetId() == orig_id) {
736                 (*xit)->SetId().SetLocal().SetId(new_id);
737             }
738         }
739     }
740 }
741 
742 
FixFeatureIdsForUpdatePair(vector<CRef<CSeq_feat>> & updates1,vector<CRef<CSeq_feat>> & updates2)743 void FixFeatureIdsForUpdatePair(vector<CRef<CSeq_feat> >& updates1, vector<CRef<CSeq_feat> >& updates2)
744 {
745     if (updates1.size() != updates2.size()) {
746         // can't fix if lists are different lengths
747         return;
748     }
749     if (updates1.size() < 2) {
750         // nothing to fix if there's only one object
751     }
752     vector<CRef<CSeq_feat> >::iterator u1 = updates1.begin();
753     vector<CRef<CSeq_feat> >::iterator u2 = updates2.begin();
754 
755     CObject_id::TId orig_id_1 = 0;
756     if ((*u1)->IsSetId() && (*u1)->GetId().IsLocal() && (*u1)->GetId().GetLocal().IsId()) {
757         orig_id_1 = (*u1)->GetId().GetLocal().GetId();
758     }
759     CObject_id::TId orig_id_2 = 0;
760     if ((*u2)->IsSetId() && (*u2)->GetId().IsLocal() && (*u2)->GetId().GetLocal().IsId()) {
761         orig_id_2 = (*u2)->GetId().GetLocal().GetId();
762     }
763     u1++;
764     u2++;
765 
766     while (u1 != updates1.end() && u2 != updates2.end()) {
767         CObject_id::TId new_id_1 = 0;
768         if ((*u1)->IsSetId() && (*u1)->GetId().IsLocal() && (*u1)->GetId().GetLocal().IsId()) {
769             new_id_1 = (*u1)->GetId().GetLocal().GetId();
770         }
771         CObject_id::TId new_id_2 = 0;
772         if ((*u2)->IsSetId() && (*u2)->GetId().IsLocal() && (*u2)->GetId().GetLocal().IsId()) {
773             new_id_2 = (*u2)->GetId().GetLocal().GetId();
774         }
775         s_ReplaceFeatureIdXref(**u1, orig_id_2, new_id_2);
776         s_ReplaceFeatureIdXref(**u2, orig_id_1, new_id_1);
777         ++u1;
778         ++u2;
779     }
780 }
781 
782 
783 END_SCOPE(edit)
784 END_SCOPE(objects)
785 END_NCBI_SCOPE
786