1 /* $Id: gap_trim.cpp 632623 2021-06-03 17:38:11Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin
27 *
28 * File Description:
29 * For adjusting features for gaps
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <objtools/edit/gap_trim.hpp>
36 #include <objtools/edit/loc_edit.hpp>
37 #include <objtools/edit/cds_fix.hpp>
38 #include <objmgr/util/feature.hpp>
39 #include <objmgr/seq_map.hpp>
40 #include <objmgr/seq_map_ci.hpp>
41 #include <objmgr/seq_vector.hpp>
42
43 #include <objmgr/util/seq_loc_util.hpp>
44 #include <objmgr/util/sequence.hpp>
45 #include <objects/seqfeat/Seq_feat.hpp>
46 #include <objects/seqfeat/Code_break.hpp>
47 #include <objects/seqfeat/Feat_id.hpp>
48 #include <objects/seqfeat/RNA_ref.hpp>
49 #include <objects/seqfeat/Trna_ext.hpp>
50 #include <objects/general/Dbtag.hpp>
51 #include <objects/general/Object_id.hpp>
52 #include <objects/seq/Seq_descr.hpp>
53 #include <objects/seq/Seqdesc.hpp>
54 #include <objtools/edit/cds_fix.hpp>
55
56 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)57 BEGIN_SCOPE(objects)
58 BEGIN_SCOPE(edit)
59
60 CFeatGapInfo::CFeatGapInfo(CSeq_feat_Handle sf)
61 {
62 m_Feature = sf;
63 CollectGaps(sf.GetLocation(), sf.GetScope());
64 }
65
66
CollectGaps(const CSeq_loc & feat_loc,CScope & scope)67 void CFeatGapInfo::CollectGaps(const CSeq_loc& feat_loc, CScope& scope)
68 {
69 m_Gaps.clear();
70 m_Unknown = false;
71 m_Known = false;
72 m_Ns = false;
73
74 m_Start = feat_loc.GetStart(objects::eExtreme_Positional);
75 m_Stop = feat_loc.GetStop(objects::eExtreme_Positional);
76 CRef<CSeq_loc> total_loc = sequence::Seq_loc_Merge(feat_loc, CSeq_loc::fMerge_SingleRange, &scope);
77 if (total_loc->IsSetStrand()) total_loc->ResetStrand();
78 CConstRef<objects::CSeqMap> seq_map = CSeqMap::GetSeqMapForSeq_loc(*total_loc, &scope);
79
80 // use CSeqVector for finding Ns
81 CSeqVector vec(*seq_map, scope, CBioseq_Handle::eCoding_Iupac);
82
83 CSeqMap_CI seq_map_ci = seq_map->ResolvedRangeIterator(&scope,
84 0,
85 m_Stop - m_Start + 1,
86 eNa_strand_plus, //feat_loc.GetStrand(),
87 size_t(-1),
88 objects::CSeqMap::fFindGap | objects::CSeqMap::fFindData);
89
90 for (; seq_map_ci; ++seq_map_ci)
91 {
92
93 if (seq_map_ci.GetType() == objects::CSeqMap::eSeqGap)
94 {
95 TSeqPos gap_start = m_Start + seq_map_ci.GetPosition();
96 TSeqPos gap_stop = gap_start + seq_map_ci.GetLength() - 1;
97 bool is_unknown = seq_map_ci.IsUnknownLength();
98 if (is_unknown) {
99 m_Unknown = true;
100 } else {
101 m_Known = true;
102 }
103 m_Gaps.push_back(TGapInterval(is_unknown ? eGapIntervalType_unknown : eGapIntervalType_known, pair<size_t, size_t>(gap_start, gap_stop)));
104 } else {
105 // look for Ns
106 TSeqPos map_start = seq_map_ci.GetPosition();
107 TSeqPos map_stop = map_start + seq_map_ci.GetLength() - 1;
108 bool in_ns = false;
109 TSeqPos gap_start;
110 for (TSeqPos i = map_start; i <= map_stop; ++i) {
111 char letter = vec[i];
112 if (letter == 'N') {
113 if (!in_ns) {
114 // start new gap
115 gap_start = m_Start + i;
116 in_ns = true;
117 }
118 } else if (in_ns) {
119 // end previous gap
120 TSeqPos gap_stop = m_Start + i - 1;
121 m_Gaps.push_back(TGapInterval(eGapIntervalType_n, pair<size_t, size_t>(gap_start, gap_stop)));
122 m_Ns = true;
123 in_ns = false;
124 }
125 }
126 if (in_ns) {
127 TSeqPos gap_stop = m_Start + map_stop;
128 m_Gaps.push_back(TGapInterval(eGapIntervalType_n, pair<size_t, size_t>(gap_start, gap_stop)));
129 m_Ns = true;
130 }
131 }
132 }
133 }
134
135
x_UsableInterval(const TGapInterval & interval,bool unknown_length,bool known_length,bool ns)136 bool CFeatGapInfo::x_UsableInterval(const TGapInterval& interval, bool unknown_length, bool known_length, bool ns)
137 {
138 if (interval.first == eGapIntervalType_unknown && !unknown_length) {
139 return false;
140 } else if (interval.first == eGapIntervalType_known && !known_length) {
141 return false;
142 } else if (interval.first == eGapIntervalType_n && !ns) {
143 return false;
144 } else {
145 return true;
146 }
147 }
148
149
CalculateRelevantIntervals(bool unknown_length,bool known_length,bool ns)150 void CFeatGapInfo::CalculateRelevantIntervals(bool unknown_length, bool known_length, bool ns)
151 {
152 m_InsideGaps.clear();
153 m_LeftGaps.clear();
154 m_RightGaps.clear();
155
156 if (!m_Gaps.empty()) {
157 // find left offset
158 size_t skip_left = 0;
159 TGapIntervalList::iterator it = m_Gaps.begin();
160 while (it != m_Gaps.end()) {
161 if (!x_UsableInterval(*it, unknown_length, known_length, ns)) {
162 break;
163 }
164
165 if (m_LeftGaps.empty()) {
166 if (it->second.first <= m_Start && it->second.second >= m_Start) {
167 m_LeftGaps.push_back(it->second);
168 skip_left++;
169 } else {
170 break;
171 }
172 } else if (it->second.first <= m_LeftGaps.front().second + 1 && it->second.second >= m_LeftGaps.front().second) {
173 m_LeftGaps.front().second = it->second.second;
174 skip_left++;
175 } else {
176 break;
177 }
178 ++it;
179 }
180 TGapIntervalList::reverse_iterator rit = m_Gaps.rbegin();
181 size_t skip_right = 0;
182 while (rit != m_Gaps.rend()) {
183 if (!x_UsableInterval(*rit, unknown_length, known_length, ns)) {
184 break;
185 }
186 if (m_RightGaps.empty()) {
187 if (rit->second.first <= m_Stop && rit->second.second >= m_Stop) {
188 m_RightGaps.push_back(rit->second);
189 skip_right++;
190 } else {
191 break;
192 }
193 } else if (rit->second.first <= m_RightGaps.front().first - 1 && rit->second.second >= m_RightGaps.front().second) {
194 m_RightGaps.front().first = rit->second.first;
195 skip_right++;
196 } else {
197 break;
198 }
199 ++rit;
200 }
201 for (size_t offset = skip_left; offset < m_Gaps.size() - skip_right; offset++) {
202 if (x_UsableInterval(m_Gaps[offset], unknown_length, known_length, ns)) {
203 m_InsideGaps.push_back(m_Gaps[offset].second);
204 }
205 }
206 }
207 }
208
209
Trimmable() const210 bool CFeatGapInfo::Trimmable() const
211 {
212 if (ShouldRemove()) {
213 return false;
214 } else if (!m_LeftGaps.empty() || !m_RightGaps.empty()) {
215 return true;
216 } else {
217 return false;
218 }
219 }
220
221
Splittable() const222 bool CFeatGapInfo::Splittable() const
223 {
224 if (!m_InsideGaps.empty()) {
225 return true;
226 } else {
227 return false;
228 }
229 }
230
231
ShouldRemove() const232 bool CFeatGapInfo::ShouldRemove() const
233 {
234 if (!m_LeftGaps.empty() && m_LeftGaps.front().second >= m_Stop) {
235 return true;
236 } else {
237 return false;
238 }
239 }
240
241
Trim(CSeq_loc & loc,bool make_partial,CScope & scope)242 void CFeatGapInfo::Trim(CSeq_loc& loc, bool make_partial, CScope& scope)
243 {
244 CRef<CSeq_id> seqid(new CSeq_id);
245 seqid->Assign(*loc.GetId());
246 for (vector<pair<size_t, size_t> >::reverse_iterator b = m_LeftGaps.rbegin(); b != m_LeftGaps.rend(); ++b)
247 {
248 size_t start = b->first;
249 size_t stop = b->second;
250 CRef<CSeq_loc> loc2(new CSeq_loc());
251 int options = edit::eSplitLocOption_split_in_exon | edit::eSplitLocOption_split_in_intron;
252 if (make_partial)
253 options |= edit::eSplitLocOption_make_partial;
254 edit::SplitLocationForGap(loc, *loc2, start, stop, seqid.GetPointer(), options);
255 if (loc2->Which() != CSeq_loc::e_not_set)
256 {
257 loc.Assign(*loc2);
258 }
259 }
260 for (vector<pair<size_t, size_t> >::reverse_iterator b = m_RightGaps.rbegin(); b != m_RightGaps.rend(); ++b)
261 {
262 size_t start = b->first;
263 size_t stop = b->second;
264 CRef<CSeq_loc> loc2(new CSeq_loc());
265 int options = edit::eSplitLocOption_split_in_exon;
266 if (make_partial)
267 options |= edit::eSplitLocOption_make_partial;
268 edit::SplitLocationForGap(loc, *loc2, start, stop, seqid.GetPointer(), options);
269 }
270 }
271
272
Split(const CSeq_loc & orig,bool in_intron,bool make_partial)273 CFeatGapInfo::TLocList CFeatGapInfo::Split(const CSeq_loc& orig, bool in_intron, bool make_partial)
274 {
275 TLocList locs;
276 CRef<CSeq_loc> left_loc(new CSeq_loc());
277 left_loc->Assign(orig);
278 CRef<CSeq_id> seqid(new CSeq_id);
279 seqid->Assign(*orig.GetId());
280 for (vector<pair<size_t, size_t> >::reverse_iterator b = m_InsideGaps.rbegin(); b != m_InsideGaps.rend(); ++b)
281 {
282 size_t start = b->first;
283 size_t stop = b->second;
284 CRef<CSeq_loc> loc2(new CSeq_loc());
285 int options = edit::eSplitLocOption_make_partial | edit::eSplitLocOption_split_in_exon;
286 if (in_intron)
287 options |= edit::eSplitLocOption_split_in_intron;
288 edit::SplitLocationForGap(*left_loc, *loc2, start, stop, seqid.GetPointer(), options);
289 if (loc2->GetId())
290 {
291 if (make_partial)
292 {
293 loc2->SetPartialStart(true, objects::eExtreme_Positional);
294 if (left_loc->Which() != CSeq_loc::e_not_set)
295 {
296 left_loc->SetPartialStop(true, objects::eExtreme_Positional);
297 }
298 }
299 locs.push_back(loc2);
300 }
301 }
302 if (locs.size() > 0) {
303 if (left_loc->Which() != CSeq_loc::e_not_set)
304 {
305 locs.push_back(left_loc);
306 }
307 reverse(locs.begin(), locs.end());
308 }
309 return locs;
310 }
311
312
x_AdjustOrigLabel(CSeq_feat & feat,size_t & id_offset,string & id_label,const string & qual)313 void CFeatGapInfo::x_AdjustOrigLabel(CSeq_feat& feat, size_t& id_offset, string& id_label, const string& qual)
314 {
315 if (!feat.IsSetQual()) {
316 return;
317 }
318 NON_CONST_ITERATE(CSeq_feat::TQual, it, feat.SetQual()) {
319 if ((*it)->IsSetQual() && (*it)->IsSetVal() &&
320 !NStr::IsBlank((*it)->GetVal()) &&
321 NStr::EqualNocase((*it)->GetQual(), qual) &&
322 (id_label.empty() || NStr::Equal((*it)->GetVal(), id_label) || NStr::Equal((*it)->GetVal(), id_label + "_1"))) {
323 if (id_label.empty()) {
324 id_label = (*it)->GetVal();
325 }
326 if (id_offset != 0)
327 (*it)->SetVal(id_label + "_" + NStr::NumericToString(id_offset));
328 id_offset++;
329 }
330 }
331 }
332
AdjustProteinSeq(const CBioseq & seq,const CSeq_feat & feat,const CSeq_feat & orig_cds,CScope & scope)333 CRef<CBioseq> CFeatGapInfo::AdjustProteinSeq(const CBioseq& seq, const CSeq_feat& feat, const CSeq_feat& orig_cds, CScope& scope)
334 {
335 if (!feat.IsSetProduct() || !feat.GetProduct().IsWhole() || !seq.IsAa()) {
336 return CRef<CBioseq>(NULL);
337 }
338
339 TSeqPos orig_len = seq.GetInst().GetLength();
340
341 string prot;
342 CSeqTranslator::Translate(feat, scope, prot);
343 if (prot.empty()) {
344 return CRef<CBioseq>(NULL);
345 }
346
347 CRef<CBioseq> prot_seq(new CBioseq);
348 prot_seq->Assign(seq);
349 prot_seq->SetInst().ResetExt();
350 prot_seq->SetInst().SetRepr(objects::CSeq_inst::eRepr_raw);
351 prot_seq->SetInst().SetSeq_data().SetIupacaa().Set(prot);
352 prot_seq->SetInst().SetLength(TSeqPos(prot.length()));
353 prot_seq->SetInst().SetMol(objects::CSeq_inst::eMol_aa);
354 // fix sequence ID
355 const CSeq_id& feat_prod = feat.GetProduct().GetWhole();
356 if (!feat_prod.Equals(orig_cds.GetProduct().GetWhole())) {
357 NON_CONST_ITERATE(CBioseq::TId, id, prot_seq->SetId()) {
358 if ((*id)->Which() == feat_prod.Which()) {
359 bool do_replace = false;
360 if ((*id)->IsGeneral()) {
361 if ((*id)->GetGeneral().IsSetDb()) {
362 if (feat_prod.GetGeneral().IsSetDb() &&
363 NStr::Equal(feat_prod.GetGeneral().GetDb(), (*id)->GetGeneral().GetDb())) {
364 do_replace = true;
365 }
366 } else if (!feat_prod.GetGeneral().IsSetDb()) {
367 do_replace = true;
368 }
369 } else {
370 do_replace = true;
371 }
372 if (do_replace) {
373 (*id)->Assign(feat.GetProduct().GetWhole());
374 }
375 }
376 }
377 }
378 // fix molinfo
379 if (prot_seq->IsSetDescr()) {
380 NON_CONST_ITERATE(CBioseq::TDescr::Tdata, mi, prot_seq->SetDescr().Set()) {
381 if ((*mi)->IsMolinfo()) {
382 feature::AdjustProteinMolInfoToMatchCDS((*mi)->SetMolinfo(), feat);
383 }
384 }
385 }
386
387 // also fix protein feature locations
388 if (prot_seq->IsSetAnnot()) {
389 CRef<CSeq_loc_Mapper> nuc2prot_mapper(
390 new CSeq_loc_Mapper(orig_cds, CSeq_loc_Mapper::eLocationToProduct, &scope));
391 CRef<CSeq_loc> prot_shadow = nuc2prot_mapper->Map(feat.GetLocation());
392 TSeqPos start = prot_shadow->GetStart(eExtreme_Positional);
393 TSeqPos stop = prot_shadow->GetStop(eExtreme_Positional);
394 NON_CONST_ITERATE(CBioseq::TAnnot, ait, prot_seq->SetAnnot()) {
395 if ((*ait)->IsFtable()) {
396 CSeq_annot::TData::TFtable::iterator fit = (*ait)->SetData().SetFtable().begin();
397 while (fit != (*ait)->SetData().SetFtable().end()) {
398 CRef<CSeq_loc> new_prot_loc(new CSeq_loc());
399 new_prot_loc->Assign((*fit)->GetLocation());
400 bool complete_cut = false;
401 bool adjusted = false;
402 TSeqPos removed = 0;
403 if (start > 0) {
404 SeqLocAdjustForTrim(*new_prot_loc, 0, start - 1, orig_cds.GetProduct().GetId(), complete_cut, removed, adjusted);
405 }
406 if (!complete_cut && stop < orig_len - 1) {
407 SeqLocAdjustForTrim(*new_prot_loc, stop + 1, orig_len - 1, orig_cds.GetProduct().GetId(), complete_cut, removed, adjusted);
408 }
409 if (complete_cut) {
410 // out of range, drop this one
411 fit = (*ait)->SetData().SetFtable().erase(fit);
412 } else {
413 new_prot_loc->SetId(feat_prod);
414 AdjustProteinFeaturePartialsToMatchCDS(**fit, feat);
415 if (!feat_prod.Equals(orig_cds.GetProduct().GetWhole())) {
416 // fix sequence ID
417 (*fit)->SetLocation().Assign(*new_prot_loc);
418 }
419 }
420 ++fit;
421 }
422 }
423 }
424 }
425
426 return prot_seq;
427 }
428
429
x_AdjustCodebreaks(CSeq_feat & feat)430 void CFeatGapInfo::x_AdjustCodebreaks(CSeq_feat& feat)
431 {
432 if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
433 !feat.GetData().GetCdregion().IsSetCode_break()) {
434 return;
435 }
436 CCdregion& cdr = feat.SetData().SetCdregion();
437 CCdregion::TCode_break::iterator cit = cdr.SetCode_break().begin();
438 while (cit != cdr.SetCode_break().end()) {
439 bool do_remove = false;
440 if ((*cit)->IsSetLoc()) {
441 CRef<CSeq_loc> new_loc = feat.GetLocation().Intersect((*cit)->GetLoc(), 0, NULL);
442 if (new_loc && !new_loc->IsEmpty() && !new_loc->IsNull()) {
443 (*cit)->SetLoc().Assign(*new_loc);
444 } else {
445 do_remove = true;
446 }
447 }
448 if (do_remove) {
449 cit = cdr.SetCode_break().erase(cit);
450 } else {
451 ++cit;
452 }
453 }
454 if (cdr.GetCode_break().empty()) {
455 cdr.ResetCode_break();
456 }
457 }
458
459
x_AdjustAnticodons(CSeq_feat & feat)460 void CFeatGapInfo::x_AdjustAnticodons(CSeq_feat& feat)
461 {
462 if (!feat.IsSetData() || !feat.GetData().IsRna() ||
463 !feat.GetData().GetRna().IsSetExt() ||
464 !feat.GetData().GetRna().GetExt().IsTRNA()) {
465 return;
466 }
467 CTrna_ext& trna = feat.SetData().SetRna().SetExt().SetTRNA();
468 if (!trna.IsSetAnticodon()) {
469 return;
470 }
471 CRef<CSeq_loc> new_loc = feat.GetLocation().Intersect(trna.GetAnticodon(), 0, NULL);
472 if (new_loc && !new_loc->IsEmpty() && !new_loc->IsNull()) {
473 trna.SetAnticodon().Assign(*new_loc);
474 } else {
475 trna.ResetAnticodon();
476 }
477 }
478
479
s_FixPartial(CSeq_feat & feat)480 void s_FixPartial(CSeq_feat& feat)
481 {
482 if (feat.GetLocation().IsPartialStart(eExtreme_Biological) ||
483 feat.GetLocation().IsPartialStop(eExtreme_Biological)) {
484 feat.SetPartial(true);
485 }
486 }
487
488
489 // returns a list of features to replace the original
490 // if list is empty, feature should be removed
491 // list should only contain one element if split is not specified
492 // coding regions should be retranslated after split
AdjustForRelevantGapIntervals(bool make_partial,bool trim,bool split,bool in_intron,bool create_general_only)493 vector<CRef<CSeq_feat> > CFeatGapInfo::AdjustForRelevantGapIntervals(bool make_partial, bool trim, bool split, bool in_intron, bool create_general_only)
494 {
495 CRef<objects::CSeq_feat> new_feat(new CSeq_feat);
496 new_feat->Assign(*m_Feature.GetOriginalSeq_feat());
497 vector<CRef<CSeq_feat> > rval;
498
499 if (!trim && !split) {
500 rval.push_back(new_feat);
501 return rval;
502 } else if (ShouldRemove()) {
503 return rval;
504 }
505
506 if (trim && Trimmable()) {
507 Trim(new_feat->SetLocation(), make_partial, m_Feature.GetScope());
508 new_feat->SetPartial(new_feat->GetLocation().IsPartialStart(objects::eExtreme_Positional) || new_feat->GetLocation().IsPartialStop(objects::eExtreme_Positional));
509 if (new_feat->GetData().IsCdregion()) {
510 // adjust frame
511 TSeqPos frame_adjust = sequence::LocationOffset(m_Feature.GetLocation(), new_feat->GetLocation(),
512 sequence::eOffset_FromStart, &(m_Feature.GetScope()));
513 x_AdjustFrame(new_feat->SetData().SetCdregion(), frame_adjust);
514 }
515 }
516
517 if (split) {
518 const string cds_gap_comment = "coding region disrupted by sequencing gap";
519
520 vector<CRef<CSeq_loc> > locs = Split(new_feat->GetLocation(), in_intron, make_partial);
521 if (locs.size() > 0) {
522 // set comment
523 if (!new_feat->IsSetComment())
524 {
525 new_feat->SetComment(cds_gap_comment);
526 } else if (new_feat->IsSetComment() && new_feat->GetComment().find(cds_gap_comment) == string::npos)
527 {
528 string comment = new_feat->GetComment();
529 comment = comment + "; " + cds_gap_comment;
530 new_feat->SetComment(comment);
531 }
532
533 // adjust transcript id if splitting
534 size_t transcript_id_offset = 0;
535 string transcript_id_label;
536 size_t protein_id_offset = 0;
537 string protein_id_label;
538 int protein_seqid_offset = 0;
539 string protein_seqid_label;
540
541
542 ITERATE(vector<CRef<CSeq_loc> >, lit, locs) {
543 CRef<CSeq_feat> split_feat(new CSeq_feat());
544 // copy from original
545 split_feat->Assign(*new_feat);
546 // with new location
547 split_feat->SetLocation().Assign(**lit);
548 split_feat->SetPartial(split_feat->GetLocation().IsPartialStart(eExtreme_Positional) || new_feat->GetLocation().IsPartialStop(eExtreme_Positional));
549 //adjust transcript id
550 x_AdjustOrigLabel(*split_feat, transcript_id_offset, transcript_id_label, "orig_transcript_id");
551 x_AdjustOrigLabel(*split_feat, protein_id_offset, protein_id_label, "orig_protein_id");
552 if (split_feat->GetData().IsCdregion()) {
553 // adjust frame
554 TSeqPos frame_adjust = sequence::LocationOffset(new_feat->GetLocation(), split_feat->GetLocation(),
555 sequence::eOffset_FromStart, &(m_Feature.GetScope()));
556 x_AdjustFrame(split_feat->SetData().SetCdregion(), frame_adjust);
557 // adjust product ID
558 if (split_feat->IsSetProduct() && split_feat->GetProduct().IsWhole() && protein_seqid_offset > 0) {
559 objects::CBioseq_Handle product = m_Feature.GetScope().GetBioseqHandle(split_feat->GetProduct());
560 CRef<CSeq_id> new_id;
561 if (product)
562 {
563 vector<CRef<objects::CSeq_id> > new_ids = objects::edit::GetNewProtIdFromExistingProt(product, protein_seqid_offset, protein_seqid_label);
564 new_id = FindBestChoice(new_ids, CSeq_id::Score);
565 }
566 else
567 {
568 objects::CBioseq_Handle bsh = m_Feature.GetScope().GetBioseqHandle(split_feat->GetLocation());
569 new_id = objects::edit::GetNewProtId(bsh, protein_seqid_offset, protein_seqid_label, create_general_only);
570 }
571 split_feat->SetProduct().SetWhole().Assign(*new_id);
572 }
573 protein_seqid_offset++;
574 }
575 rval.push_back(split_feat);
576 }
577
578 } else {
579 rval.push_back(new_feat);
580 }
581 } else {
582 rval.push_back(new_feat);
583 }
584 NON_CONST_ITERATE(vector<CRef<CSeq_feat> >, it, rval) {
585 x_AdjustCodebreaks(**it);
586 x_AdjustAnticodons(**it);
587 s_FixPartial(**it);
588 }
589 return rval;
590 }
591
592
x_AdjustFrame(CCdregion & cdregion,TSeqPos frame_adjust)593 void CFeatGapInfo::x_AdjustFrame(CCdregion& cdregion, TSeqPos frame_adjust)
594 {
595 frame_adjust = frame_adjust % 3;
596 if (frame_adjust == 0) {
597 return;
598 }
599
600 CCdregion::TFrame orig_frame = cdregion.SetFrame();
601 if (orig_frame == CCdregion::eFrame_not_set) {
602 orig_frame = CCdregion::eFrame_one;
603 }
604
605 if (frame_adjust == 1) {
606 if (orig_frame == CCdregion::eFrame_one) {
607 cdregion.SetFrame(CCdregion::eFrame_three); //
608 } else if (orig_frame == CCdregion::eFrame_two) {
609 cdregion.SetFrame(CCdregion::eFrame_one); //
610 } else if (orig_frame == CCdregion::eFrame_three) {
611 cdregion.SetFrame(CCdregion::eFrame_two);
612 }
613 } else if (frame_adjust == 2) {
614 if (orig_frame == CCdregion::eFrame_one) {
615 cdregion.SetFrame(CCdregion::eFrame_two);
616 } else if (orig_frame == CCdregion::eFrame_two) {
617 cdregion.SetFrame(CCdregion::eFrame_three);
618 } else if (orig_frame == CCdregion::eFrame_three) {
619 cdregion.SetFrame(CCdregion::eFrame_one); //
620 }
621 }
622 }
623
624
ListGappedFeatures(CFeat_CI & feat_it,CScope & scope)625 TGappedFeatList ListGappedFeatures(CFeat_CI& feat_it, CScope& scope)
626 {
627 TGappedFeatList gapped_feats;
628 while (feat_it) {
629 if (!feat_it->GetData().IsProt()) {
630 CRef<CFeatGapInfo> fgap(new CFeatGapInfo(*feat_it));
631 if (fgap->HasKnown() || fgap->HasUnknown() || fgap->HasNs()) {
632 gapped_feats.push_back(fgap);
633 }
634 }
635 ++feat_it;
636 }
637 return gapped_feats;
638 }
639
640
ProcessForTrimAndSplitUpdates(CSeq_feat_Handle cds,vector<CRef<CSeq_feat>> updates)641 void ProcessForTrimAndSplitUpdates(CSeq_feat_Handle cds, vector<CRef<CSeq_feat> > updates)
642 {
643 CBioseq_Handle orig_prot_handle = cds.GetScope().GetBioseqHandle(cds.GetProduct());
644 CConstRef<CBioseq> orig_prot = orig_prot_handle.GetCompleteBioseq();
645 CBioseq_set_Handle nph = orig_prot_handle.GetParentBioseq_set();
646 CBioseq_set_EditHandle npeh(nph);
647 // need to remove original first if not changing sequence IDs
648 CBioseq_EditHandle beh(orig_prot_handle);
649 beh.Remove();
650 ITERATE(vector<CRef<CSeq_feat> >, it, updates) {
651 CRef<CBioseq> new_prot = CFeatGapInfo::AdjustProteinSeq(*orig_prot, **it, *(cds.GetSeq_feat()), cds.GetScope());
652 if (new_prot) {
653 npeh.AttachBioseq(*new_prot);
654 }
655 }
656
657 CSeq_annot_Handle sah = cds.GetAnnot();
658 CSeq_annot_EditHandle saeh = sah.GetEditHandle();
659 CSeq_feat_EditHandle feh(cds);
660 if (updates.size() == 0) {
661 feh.Remove();
662 } else {
663 feh.Replace(*(updates[0]));
664 for (size_t i = 1; i < updates.size(); i++) {
665 saeh.AddFeat(*(updates[i]));
666 }
667 }
668 }
669
670
671
672 // for fixing feature IDs and feature ID xrefs
FixFeatureIdsForUpdates(CSeq_feat & feat,CObject_id::TId & next_id)673 void FixFeatureIdsForUpdates(CSeq_feat& feat, CObject_id::TId& next_id)
674 {
675 if (feat.IsSetId() && feat.GetId().IsLocal() &&
676 feat.GetId().GetLocal().IsId()) {
677 feat.SetId().SetLocal().SetId(next_id);
678 ++next_id;
679 }
680
681 }
682
683
FixFeatureIdsForUpdates(vector<CRef<CSeq_feat>> updates,CObject_id::TId & next_id)684 void FixFeatureIdsForUpdates(vector<CRef<CSeq_feat> > updates, CObject_id::TId& next_id)
685 {
686 for (size_t i = 1; i < updates.size(); i++) {
687 FixFeatureIdsForUpdates(*(updates[i]), next_id);
688 }
689 }
690
691
s_IsRelated(const CSeq_feat & f1,CObject_id::TId search)692 bool s_IsRelated(const CSeq_feat& f1, CObject_id::TId search)
693 {
694 if (!f1.IsSetXref()) {
695 return false;
696 }
697 ITERATE(CSeq_feat::TXref, xit, f1.GetXref()) {
698 if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal() &&
699 (*xit)->GetId().GetLocal().IsId() &&
700 (*xit)->GetId().GetLocal().GetId() == search) {
701 return true;
702 }
703 }
704 return false;
705 }
706
707
s_IsRelated(const CSeq_feat & f1,const CSeq_feat & f2)708 bool s_IsRelated(const CSeq_feat& f1, const CSeq_feat& f2)
709 {
710 if (f1.IsSetId() && f1.GetId().IsLocal() && f1.GetId().GetLocal().IsId() &&
711 s_IsRelated(f2, f1.GetId().GetLocal().GetId())) {
712 return true;
713 } else if (f2.IsSetId() && f2.GetId().IsLocal() && f2.GetId().GetLocal().IsId() &&
714 s_IsRelated(f1, f2.GetId().GetLocal().GetId())) {
715 return true;
716 } else {
717 return false;
718 }
719 }
720
721
IsRelatedByCrossRef(const CFeatGapInfo & other) const722 bool CFeatGapInfo::IsRelatedByCrossRef(const CFeatGapInfo& other) const
723 {
724 return s_IsRelated(*(GetFeature().GetSeq_feat()), *(other.GetFeature().GetSeq_feat()));
725 }
726
727
728
s_ReplaceFeatureIdXref(CSeq_feat & f,CObject_id::TId orig_id,CObject_id::TId new_id)729 void s_ReplaceFeatureIdXref(CSeq_feat& f, CObject_id::TId orig_id, CObject_id::TId new_id)
730 {
731 if (orig_id > 0 && new_id > 0 && f.IsSetXref()) {
732 NON_CONST_ITERATE(CSeq_feat::TXref, xit, f.SetXref()) {
733 if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal() &&
734 (*xit)->GetId().GetLocal().IsId() &&
735 (*xit)->GetId().GetLocal().GetId() == orig_id) {
736 (*xit)->SetId().SetLocal().SetId(new_id);
737 }
738 }
739 }
740 }
741
742
FixFeatureIdsForUpdatePair(vector<CRef<CSeq_feat>> & updates1,vector<CRef<CSeq_feat>> & updates2)743 void FixFeatureIdsForUpdatePair(vector<CRef<CSeq_feat> >& updates1, vector<CRef<CSeq_feat> >& updates2)
744 {
745 if (updates1.size() != updates2.size()) {
746 // can't fix if lists are different lengths
747 return;
748 }
749 if (updates1.size() < 2) {
750 // nothing to fix if there's only one object
751 }
752 vector<CRef<CSeq_feat> >::iterator u1 = updates1.begin();
753 vector<CRef<CSeq_feat> >::iterator u2 = updates2.begin();
754
755 CObject_id::TId orig_id_1 = 0;
756 if ((*u1)->IsSetId() && (*u1)->GetId().IsLocal() && (*u1)->GetId().GetLocal().IsId()) {
757 orig_id_1 = (*u1)->GetId().GetLocal().GetId();
758 }
759 CObject_id::TId orig_id_2 = 0;
760 if ((*u2)->IsSetId() && (*u2)->GetId().IsLocal() && (*u2)->GetId().GetLocal().IsId()) {
761 orig_id_2 = (*u2)->GetId().GetLocal().GetId();
762 }
763 u1++;
764 u2++;
765
766 while (u1 != updates1.end() && u2 != updates2.end()) {
767 CObject_id::TId new_id_1 = 0;
768 if ((*u1)->IsSetId() && (*u1)->GetId().IsLocal() && (*u1)->GetId().GetLocal().IsId()) {
769 new_id_1 = (*u1)->GetId().GetLocal().GetId();
770 }
771 CObject_id::TId new_id_2 = 0;
772 if ((*u2)->IsSetId() && (*u2)->GetId().IsLocal() && (*u2)->GetId().GetLocal().IsId()) {
773 new_id_2 = (*u2)->GetId().GetLocal().GetId();
774 }
775 s_ReplaceFeatureIdXref(**u1, orig_id_2, new_id_2);
776 s_ReplaceFeatureIdXref(**u2, orig_id_1, new_id_1);
777 ++u1;
778 ++u2;
779 }
780 }
781
782
783 END_SCOPE(edit)
784 END_SCOPE(objects)
785 END_NCBI_SCOPE
786