1 /* $Id: cds_fix.cpp 632623 2021-06-03 17:38:11Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin, NCBI
27 *
28 * File Description:
29 * functions for editing and working with coding regions
30 */
31 #include <ncbi_pch.hpp>
32 #include <corelib/ncbistd.hpp>
33 #include <corelib/ncbiobj.hpp>
34 #include <objtools/edit/cds_fix.hpp>
35 #include <objtools/edit/loc_edit.hpp>
36 #include <objects/seq/Seq_descr.hpp>
37 #include <objects/seq/Seqdesc.hpp>
38 #include <objects/seq/Seq_inst.hpp>
39 #include <objects/general/Object_id.hpp>
40 #include <objects/seqfeat/Cdregion.hpp>
41 #include <objects/seqfeat/Code_break.hpp>
42 #include <objects/seqfeat/Imp_feat.hpp>
43 #include <objects/seqfeat/Org_ref.hpp>
44 #include <objects/seqfeat/RNA_ref.hpp>
45 #include <objects/seqfeat/SeqFeatXref.hpp>
46 #include <objects/seqfeat/seqfeat_macros.hpp>
47 #include <objects/seqfeat/Genetic_code_table.hpp>
48 #include <util/checksum.hpp>
49 #include <objmgr/bioseq_handle.hpp>
50 #include <objmgr/seqdesc_ci.hpp>
51 #include <objmgr/seq_annot_ci.hpp>
52 #include <util/sequtil/sequtil_convert.hpp>
53 #include <objmgr/util/sequence.hpp>
54 #include <objmgr/seq_vector.hpp>
55 #include <objects/general/Dbtag.hpp>
56
57 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)58 BEGIN_SCOPE(objects)
59 BEGIN_SCOPE(edit)
60
61 unsigned char GetCodeBreakCharacter(const CCode_break& cbr)
62 {
63 unsigned char ex = 0;
64 vector<char> seqData;
65 string str;
66
67 if (!cbr.IsSetAa()) {
68 return ex;
69 }
70 switch (cbr.GetAa().Which()) {
71 case CCode_break::C_Aa::e_Ncbi8aa:
72 str = cbr.GetAa().GetNcbi8aa();
73 CSeqConvert::Convert(str, CSeqUtil::e_Ncbi8aa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
74 ex = seqData[0];
75 break;
76 case CCode_break::C_Aa::e_Ncbistdaa:
77 str = cbr.GetAa().GetNcbi8aa();
78 CSeqConvert::Convert(str, CSeqUtil::e_Ncbistdaa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
79 ex = seqData[0];
80 break;
81 case CCode_break::C_Aa::e_Ncbieaa:
82 seqData.push_back(cbr.GetAa().GetNcbieaa());
83 ex = seqData[0];
84 break;
85 default:
86 // do nothing, code break wasn't actually set
87
88 break;
89 }
90 return ex;
91 }
92
93
DoesCodingRegionHaveTerminalCodeBreak(const objects::CCdregion & cdr)94 bool DoesCodingRegionHaveTerminalCodeBreak(const objects::CCdregion& cdr)
95 {
96 if (!cdr.IsSetCode_break()) {
97 return false;
98 }
99 bool rval = false;
100 ITERATE(objects::CCdregion::TCode_break, it, cdr.GetCode_break()) {
101 if (GetCodeBreakCharacter(**it) == '*') {
102 rval = true;
103 break;
104 }
105 }
106 return rval;
107 }
108
109
GetLastPartialCodonLength(const objects::CSeq_feat & cds,objects::CScope & scope)110 TSeqPos GetLastPartialCodonLength(const objects::CSeq_feat& cds, objects::CScope& scope)
111 {
112 if (!cds.IsSetData() || !cds.GetData().IsCdregion()) {
113 return 0;
114 }
115 // find the length of the last partial codon.
116 const objects::CCdregion& cdr = cds.GetData().GetCdregion();
117 int dna_len = sequence::GetLength(cds.GetLocation(), &scope);
118 TSeqPos except_len = 0;
119 if (cds.GetLocation().IsPartialStart(eExtreme_Biological) && cdr.IsSetFrame()) {
120 switch (cdr.GetFrame()) {
121 case objects::CCdregion::eFrame_two:
122 except_len = (dna_len - 1) % 3;
123 break;
124 case objects::CCdregion::eFrame_three:
125 except_len = (dna_len - 2) % 3;
126 break;
127 default:
128 except_len = dna_len % 3;
129 break;
130 }
131 } else {
132 except_len = dna_len % 3;
133 }
134 return except_len;
135 }
136
137
GetLastCodonLoc(const CSeq_feat & cds,CScope & scope)138 CRef<CSeq_loc> GetLastCodonLoc(const CSeq_feat& cds, CScope& scope)
139 {
140 TSeqPos except_len = GetLastPartialCodonLength(cds, scope);
141 if (except_len == 0) {
142 except_len = 3;
143 }
144 const CSeq_loc& cds_loc = cds.GetLocation();
145 TSeqPos stop = cds_loc.GetStop(eExtreme_Biological);
146 CRef<CSeq_id> new_id(new CSeq_id());
147 new_id->Assign(*(cds_loc.GetId()));
148 CRef<CSeq_loc> codon_loc(new CSeq_loc());
149 codon_loc->SetInt().SetId(*new_id);
150 if (cds_loc.GetStrand() == eNa_strand_minus) {
151 codon_loc->SetInt().SetFrom(stop);
152 codon_loc->SetInt().SetTo(stop + except_len - 1);
153 codon_loc->SetInt().SetStrand(eNa_strand_minus);
154 } else {
155 codon_loc->SetInt().SetFrom(stop - except_len + 1);
156 codon_loc->SetInt().SetTo(stop);
157 }
158 return codon_loc;
159 }
160
161
AddTerminalCodeBreak(CSeq_feat & cds,CScope & scope)162 bool AddTerminalCodeBreak(CSeq_feat& cds, CScope& scope)
163 {
164 CRef<CSeq_loc> codon_loc = GetLastCodonLoc(cds, scope);
165 if (!codon_loc) {
166 return false;
167 }
168
169 CRef<objects::CCode_break> cbr(new objects::CCode_break());
170 cbr->SetAa().SetNcbieaa('*');
171 cbr->SetLoc().Assign(*codon_loc);
172 cds.SetData().SetCdregion().SetCode_break().push_back(cbr);
173 return true;
174 }
175
176
DoesCodingRegionEndWithStopCodon(const objects::CSeq_feat & cds,objects::CScope & scope)177 bool DoesCodingRegionEndWithStopCodon(const objects::CSeq_feat& cds, objects::CScope& scope)
178 {
179 string transl_prot;
180 bool alt_start = false;
181 bool rval = false;
182 try {
183 CSeqTranslator::Translate(cds, scope, transl_prot,
184 true, // include stop codons
185 false, // do not remove trailing X/B/Z
186 &alt_start);
187 if (NStr::EndsWith(transl_prot, "*")) {
188 rval = true;
189 }
190 } catch (CException&) {
191 // can't translate
192 }
193 return rval;
194 }
195
196
ExtendStop(CSeq_loc & loc,TSeqPos len,CScope & scope)197 void ExtendStop(CSeq_loc& loc, TSeqPos len, CScope& scope)
198 {
199 if (len == 0) {
200 return;
201 }
202 TSeqPos stop = loc.GetStop(eExtreme_Biological);
203 CRef<CSeq_loc> add(new CSeq_loc());
204 add->SetInt().SetId().Assign(*(loc.GetId()));
205 if (loc.GetStrand() == eNa_strand_minus) {
206 add->SetInt().SetFrom(stop - len);
207 add->SetInt().SetTo(stop - 1);
208 add->SetInt().SetStrand(eNa_strand_minus);
209 } else {
210 add->SetInt().SetId().Assign(*(loc.GetId()));
211 add->SetInt().SetFrom(stop + 1);
212 add->SetInt().SetTo(stop + len);
213 }
214 loc.Assign(*(sequence::Seq_loc_Add(loc, *add, CSeq_loc::fMerge_All|CSeq_loc::fSort, &scope)));
215 }
216
217
ExtendLocationForTranslExcept(objects::CSeq_loc & loc,objects::CScope & scope)218 TSeqPos ExtendLocationForTranslExcept(objects::CSeq_loc& loc, objects::CScope& scope)
219 {
220 CBioseq_Handle bsh = scope.GetBioseqHandle(loc);
221 TSeqPos stop = loc.GetStop(eExtreme_Biological);
222 TSeqPos len = 0;
223 TSeqPos except_len = 0;
224 CRef<CSeq_loc> overhang(new CSeq_loc());
225 overhang->SetInt().SetId().Assign(*loc.GetId());
226
227 if (loc.GetStrand() == eNa_strand_minus) {
228 if (stop < 3) {
229 len = stop;
230 } else {
231 len = 3;
232 }
233 if (len > 0) {
234 overhang->SetInt().SetFrom(0);
235 overhang->SetInt().SetTo(stop - 1);
236 overhang->SetStrand(eNa_strand_minus);
237 }
238 } else {
239 len = bsh.GetBioseqLength() - stop - 1;
240 if (len > 3) {
241 len = 3;
242 }
243 if (len > 0) {
244 overhang->SetInt().SetFrom(stop + 1);
245 overhang->SetInt().SetTo(bsh.GetBioseqLength() - 1);
246 }
247 }
248 if (len > 0) {
249 CSeqVector vec(*overhang, scope, CBioseq_Handle::eCoding_Iupac);
250 string seq_string;
251 vec.GetSeqData(0, len, seq_string);
252 if (vec[0] == 'T') {
253 except_len++;
254 if (len > 1 && vec[1] == 'A') {
255 except_len++;
256 if (len > 2 && vec[2] == 'A') {
257 // adding a real stop codon
258 except_len ++;
259 }
260 }
261 }
262 }
263 // extend
264 if (except_len > 0) {
265 ExtendStop(loc, except_len, scope);
266 }
267 return except_len;
268 }
269
270
IsOverhangOkForTerminalCodeBreak(const CSeq_feat & cds,CScope & scope,bool strict)271 bool IsOverhangOkForTerminalCodeBreak(const CSeq_feat& cds, CScope& scope, bool strict)
272 {
273 CRef<CSeq_loc> loc = GetLastCodonLoc(cds, scope);
274 if (!loc) {
275 return false;
276 }
277 TSeqPos len = sequence::GetLength(*loc, &scope);
278 CSeqVector vec(*loc, scope, CBioseq_Handle::eCoding_Iupac);
279 bool rval = true;
280 if (strict) {
281 if (vec[0] != 'T') {
282 rval = false;
283 } else if (len > 1 && vec[1] != 'A') {
284 rval = false;
285 }
286 } else {
287 if (vec[0] != 'T' && vec[0] != 'N') {
288 rval = false;
289 }
290 }
291 return rval;
292 }
293
294
295 /// SetTranslExcept
296 /// A function to set a code break at the 3' end of a coding region to indicate
297 /// that the stop codon is formed by the addition of a poly-A tail.
298 /// @param cds The coding region to be adjusted (if necessary)
299 /// @param comment The string to place in the note on cds if a code break is added
300 /// @param strict Only add code break if last partial codon consists of "TA" or just "T".
301 /// If strict is false, add code break if first NT of last partial codon
302 /// is T or N.
303 /// @param extend If true, extend coding region to cover partial stop codon
SetTranslExcept(objects::CSeq_feat & cds,const string & comment,bool strict,bool extend,objects::CScope & scope)304 bool SetTranslExcept(objects::CSeq_feat& cds, const string& comment, bool strict, bool extend, objects::CScope& scope)
305 {
306 // do nothing if this isn't a coding region
307 if (!cds.IsSetData() || !cds.GetData().IsCdregion()) {
308 return false;
309 }
310 // do nothing if coding region is 3' partial
311 if (cds.GetLocation().IsPartialStop(eExtreme_Biological)) {
312 return false;
313 }
314
315 objects::CCdregion& cdr = cds.SetData().SetCdregion();
316 // do nothing if coding region already has terminal code break
317 if (DoesCodingRegionHaveTerminalCodeBreak(cdr)) {
318 return false;
319 }
320
321 // find the length of the last partial codon.
322 TSeqPos except_len = GetLastPartialCodonLength(cds, scope);
323
324 bool extended = false;
325 if (except_len == 0 && extend && !DoesCodingRegionEndWithStopCodon(cds, scope)) {
326 except_len = ExtendLocationForTranslExcept(cds.SetLocation(), scope);
327 if (except_len > 0) {
328 extended = true;
329 }
330 }
331
332 if (except_len == 0) {
333 return false;
334 }
335
336 bool added_code_break = false;
337 if (!extend || !DoesCodingRegionEndWithStopCodon(cds, scope)) {
338 // TODO: check for strictness
339 if (IsOverhangOkForTerminalCodeBreak(cds, scope, strict)
340 && AddTerminalCodeBreak(cds, scope)) {
341 if (!NStr::IsBlank(comment)) {
342 if (cds.IsSetComment() && !NStr::IsBlank(cds.GetComment())) {
343 string orig_comment = cds.GetComment();
344 if (NStr::Find(orig_comment, comment) == string::npos) {
345 cds.SetComment(cds.GetComment() + ";" + comment);
346 }
347 } else {
348 cds.SetComment(comment);
349 }
350 }
351 added_code_break = true;
352 }
353 }
354
355 return extended || added_code_break;
356 }
357
358
359 /// AdjustProteinMolInfoToMatchCDS
360 /// A function to change an existing MolInfo to match a coding region
361 /// @param molinfo The MolInfo to be adjusted (if necessary)
362 /// @param cds The CDS to match
363 ///
364 /// @return Boolean to indicate whether molinfo was changed
365
AdjustProteinMolInfoToMatchCDS(CMolInfo & molinfo,const CSeq_feat & cds)366 bool AdjustProteinMolInfoToMatchCDS(CMolInfo& molinfo, const CSeq_feat& cds)
367 {
368 bool rval = false;
369 if (!molinfo.IsSetBiomol() || molinfo.GetBiomol() != CMolInfo::eBiomol_peptide) {
370 molinfo.SetBiomol(CMolInfo::eBiomol_peptide);
371 rval = true;
372 }
373
374 bool partial5 = cds.GetLocation().IsPartialStart(eExtreme_Biological);
375 bool partial3 = cds.GetLocation().IsPartialStop(eExtreme_Biological);
376 CMolInfo::ECompleteness completeness = CMolInfo::eCompleteness_complete;
377 if (partial5 && partial3) {
378 completeness = CMolInfo::eCompleteness_no_ends;
379 } else if (partial5) {
380 completeness = CMolInfo::eCompleteness_no_left;
381 } else if (partial3) {
382 completeness = CMolInfo::eCompleteness_no_right;
383 }
384 if (!molinfo.IsSetCompleteness() || molinfo.GetCompleteness() != completeness) {
385 molinfo.SetCompleteness(completeness);
386 rval = true;
387 }
388 return rval;
389 }
390
391
392 /// AdjustProteinFeaturePartialsToMatchCDS
393 /// A function to change an existing MolInfo to match a coding region
394 /// @param new_prot The protein feature to be adjusted (if necessary)
395 /// @param cds The CDS to match
396 ///
397 /// @return Boolean to indicate whether the protein feature was changed
AdjustProteinFeaturePartialsToMatchCDS(CSeq_feat & new_prot,const CSeq_feat & cds)398 bool AdjustProteinFeaturePartialsToMatchCDS(CSeq_feat& new_prot, const CSeq_feat& cds)
399 {
400 bool any_change = false;
401 bool partial5 = cds.GetLocation().IsPartialStart(eExtreme_Biological);
402 bool partial3 = cds.GetLocation().IsPartialStop(eExtreme_Biological);
403 bool prot_5 = new_prot.GetLocation().IsPartialStart(eExtreme_Biological);
404 bool prot_3 = new_prot.GetLocation().IsPartialStop(eExtreme_Biological);
405 if ((partial5 && !prot_5) || (!partial5 && prot_5)
406 || (partial3 && !prot_3) || (!partial3 && prot_3)) {
407 new_prot.SetLocation().SetPartialStart(partial5, eExtreme_Biological);
408 new_prot.SetLocation().SetPartialStop(partial3, eExtreme_Biological);
409 any_change = true;
410 }
411 any_change |= feature::AdjustFeaturePartialFlagForLocation(new_prot);
412 return any_change;
413 }
414
415
416 /// AdjustForCDSPartials
417 /// A function to make all of the necessary related changes to
418 /// a Seq-entry after the partialness of a coding region has been
419 /// changed.
420 /// @param cds The feature for which adjustments are to be made
421 /// @param seh The Seq-entry-handle to be adjusted (if necessary)
422 ///
423 /// @return Boolean to indicate whether the Seq-entry-handle was changed
AdjustForCDSPartials(const CSeq_feat & cds,CSeq_entry_Handle seh)424 bool AdjustForCDSPartials(const CSeq_feat& cds, CSeq_entry_Handle seh)
425 {
426 if (!cds.IsSetProduct() || !seh) {
427 return false;
428 }
429
430 // find Bioseq for product
431 CBioseq_Handle product = seh.GetScope().GetBioseqHandle(cds.GetProduct());
432 if (!product) {
433 return false;
434 }
435
436 bool any_change = false;
437 // adjust protein feature
438 CFeat_CI f(product, SAnnotSelector(CSeqFeatData::eSubtype_prot));
439 if (f) {
440 // This is necessary, to make sure that we are in "editing mode"
441 const CSeq_annot_Handle& annot_handle = f->GetAnnot();
442 CSeq_entry_EditHandle eh = annot_handle.GetParentEntry().GetEditHandle();
443 CSeq_feat_EditHandle feh(*f);
444 CRef<CSeq_feat> new_feat(new CSeq_feat());
445 new_feat->Assign(*(f->GetSeq_feat()));
446 if (AdjustProteinFeaturePartialsToMatchCDS(*new_feat, cds)) {
447 feh.Replace(*new_feat);
448 any_change = true;
449 }
450 }
451
452 // change or create molinfo on protein bioseq
453 bool found = false;
454 CBioseq_EditHandle beh = product.GetEditHandle();
455 NON_CONST_ITERATE(CBioseq::TDescr::Tdata, it, beh.SetDescr().Set()) {
456 if ((*it)->IsMolinfo()) {
457 any_change |= AdjustProteinMolInfoToMatchCDS((*it)->SetMolinfo(), cds);
458 found = true;
459 }
460 }
461 if (!found) {
462 CRef<objects::CSeqdesc> new_molinfo_desc( new CSeqdesc );
463 AdjustProteinMolInfoToMatchCDS(new_molinfo_desc->SetMolinfo(), cds);
464 beh.SetDescr().Set().push_back(new_molinfo_desc);
465 any_change = true;
466 }
467
468 return any_change;
469 }
470
471
s_GetProductName(const CProt_ref & prot)472 string s_GetProductName (const CProt_ref& prot)
473 {
474 string prot_nm(kEmptyStr);
475 if (prot.IsSetName() && prot.GetName().size() > 0) {
476 prot_nm = prot.GetName().front();
477 }
478 return prot_nm;
479 }
480
481
s_GetProductName(const CSeq_feat & cds,CScope & scope)482 string s_GetProductName (const CSeq_feat& cds, CScope& scope)
483 {
484 string prot_nm(kEmptyStr);
485 if (cds.IsSetProduct()) {
486 CBioseq_Handle prot_bsq = sequence::GetBioseqFromSeqLoc(cds.GetProduct(), scope);
487 if (prot_bsq) {
488 CFeat_CI prot_ci(prot_bsq, CSeqFeatData::e_Prot);
489 if (prot_ci) {
490 prot_nm = s_GetProductName(prot_ci->GetOriginalFeature().GetData().GetProt());
491 }
492 }
493 } else if (cds.IsSetXref()) {
494 ITERATE(CSeq_feat::TXref, it, cds.GetXref()) {
495 if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
496 prot_nm = s_GetProductName((*it)->GetData().GetProt());
497 }
498 }
499 }
500 return prot_nm;
501 }
502
503
s_GetmRNAName(const CSeq_feat & mrna)504 string s_GetmRNAName (const CSeq_feat& mrna)
505 {
506 if (!mrna.IsSetData() || mrna.GetData().GetSubtype() != CSeqFeatData::eSubtype_mRNA
507 || !mrna.GetData().IsRna() || !mrna.GetData().GetRna().IsSetExt()
508 || !mrna.GetData().GetRna().GetExt().IsName()) {
509 return "";
510 } else {
511 return mrna.GetData().GetRna().GetExt().GetName();
512 }
513 }
514
515
516 /// MakemRNAforCDS
517 /// A function to create a CSeq_feat that represents the
518 /// appropriate mRNA for a given CDS. Note that this feature
519 /// is not added to the Seq-annot in the record; this step is
520 /// left for the caller.
521 /// @param cds The feature for which the mRNA to be made, if one is not already present
522 /// @param scope The scope in which adjustments are to be made (if necessary)
523 ///
524 /// @return CRef<CSeq_feat> for new mRNA (will be NULL if one was already present)
MakemRNAforCDS(const CSeq_feat & cds,CScope & scope)525 CRef<CSeq_feat> MakemRNAforCDS(const CSeq_feat& cds, CScope& scope)
526 {
527 CRef <CSeq_feat> new_mrna(NULL);
528 string prot_nm = s_GetProductName(cds, scope);
529 const CSeq_loc& cd_loc = cds.GetLocation();
530
531 CConstRef <CSeq_feat> mrna(NULL);
532 CBioseq_Handle bsh = scope.GetBioseqHandle(*cd_loc.GetId());
533 CSeq_feat_Handle fh = scope.GetSeq_featHandle(cds);
534 CSeq_annot_Handle sah;
535 if (fh) {
536 sah = fh.GetAnnot();
537 }
538 // can only look for overlapping mRNA with sequence::GetOverlappingmRNA
539 // if Bioseq is in scope.
540 if (bsh) {
541 mrna = sequence::GetOverlappingmRNA(cd_loc, scope);
542 } else if (sah) {
543 size_t best_len = 0;
544 for (CFeat_CI mrna_find(sah, CSeqFeatData::eSubtype_mRNA); mrna_find; ++mrna_find) {
545 if (sequence::TestForOverlap64(mrna_find->GetLocation(), cd_loc, sequence::eOverlap_CheckIntervals) != -1) {
546 size_t len = sequence::GetLength(mrna_find->GetLocation(), &scope);
547 if (best_len == 0 || len < best_len) {
548 best_len = len;
549 mrna = &(mrna_find->GetOriginalFeature());
550 }
551 }
552 }
553 }
554
555 if (!mrna || !NStr::Equal(prot_nm, s_GetmRNAName(*mrna))) {
556 new_mrna.Reset (new CSeq_feat());
557 new_mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
558 new_mrna->SetLocation().Assign(cd_loc);
559 new_mrna->SetData().SetRna().SetExt().SetName(prot_nm);
560
561 bool found3 = false;
562 bool found5 = false;
563 CRef<CSeq_loc> loc(new CSeq_loc());
564 loc->Assign(new_mrna->GetLocation());
565 CFeat_CI utr1, utr2;
566 if (bsh)
567 {
568 utr1 = CFeat_CI(bsh, cd_loc.IsReverseStrand() ? CSeqFeatData::eSubtype_5UTR : CSeqFeatData::eSubtype_3UTR);
569 utr2 = CFeat_CI(bsh, cd_loc.IsReverseStrand() ? CSeqFeatData::eSubtype_3UTR : CSeqFeatData::eSubtype_5UTR);
570 }
571 else if (sah)
572 {
573 utr1 = CFeat_CI(sah, cd_loc.IsReverseStrand() ? CSeqFeatData::eSubtype_5UTR : CSeqFeatData::eSubtype_3UTR);
574 utr2 = CFeat_CI(sah, cd_loc.IsReverseStrand() ? CSeqFeatData::eSubtype_3UTR : CSeqFeatData::eSubtype_5UTR);
575 }
576
577 for (; utr1; ++utr1)
578 {
579 if (utr1->GetLocation().GetStart(eExtreme_Positional) == cd_loc.GetStop(eExtreme_Positional) + 1)
580 {
581 loc = sequence::Seq_loc_Add(*loc, utr1->GetLocation(), CSeq_loc::fMerge_All|CSeq_loc::fSort, &scope);
582 if (cd_loc.IsReverseStrand())
583 found5 = true;
584 else
585 found3 = true;
586 break;
587 }
588 }
589 for (; utr2; ++utr2)
590 {
591 if (utr2->GetLocation().GetStop(eExtreme_Positional) + 1 == cd_loc.GetStart(eExtreme_Positional) )
592 {
593 loc = sequence::Seq_loc_Add(*loc, utr2->GetLocation(), CSeq_loc::fMerge_All|CSeq_loc::fSort, &scope);
594 if (cd_loc.IsReverseStrand())
595 found3 = true;
596 else
597 found5 = true;
598 break;
599 }
600 }
601
602 CConstRef <CSeq_feat> gene = sequence::GetBestGeneForCds(cds, scope);
603 const CSeq_loc *overlap_loc = &cd_loc;
604 if (gene && gene->IsSetLocation())
605 {
606 overlap_loc = &gene->GetLocation();
607 }
608 auto gene_start = overlap_loc->GetStart(eExtreme_Positional);
609 auto gene_stop = overlap_loc->GetStop(eExtreme_Positional);
610
611 CFeat_CI exon;
612 if (bsh)
613 exon = CFeat_CI(bsh, CSeqFeatData::eSubtype_exon);
614 else if (sah)
615 exon = CFeat_CI(sah, CSeqFeatData::eSubtype_exon);
616 for (; exon; ++exon)
617 {
618 if (!sequence::IsSameBioseq(*exon->GetLocation().GetId(), *overlap_loc->GetId(), &scope))
619 continue;
620 auto exon_start = exon->GetLocation().GetStart(eExtreme_Positional);
621 auto exon_stop = exon->GetLocation().GetStop(eExtreme_Positional);
622 auto mrna_start = loc->GetStart(eExtreme_Positional);
623 auto mrna_stop = loc->GetStop(eExtreme_Positional);
624 if (exon_start >= gene_start && exon_stop <= gene_stop)
625 {
626 bool exon_found = false;
627 if (exon_start < mrna_start )
628 {
629 if (loc->IsReverseStrand())
630 found3 = true;
631 else
632 found5 = true;
633 exon_found = true;
634 }
635 if (exon_stop > mrna_stop)
636 {
637 if (loc->IsReverseStrand())
638 found5 = true;
639 else
640 found3 = true;
641 exon_found = true;
642 }
643 if (exon_found)
644 {
645 loc = sequence::Seq_loc_Add(*loc, exon->GetLocation(), CSeq_loc::fMerge_All|CSeq_loc::fSort, &scope);
646 }
647 }
648 }
649
650 new_mrna->SetLocation(*loc);
651
652 if (!found5)
653 new_mrna->SetLocation().SetPartialStart(true, eExtreme_Positional);
654 if (!found3)
655 new_mrna->SetLocation().SetPartialStop(true, eExtreme_Positional);
656
657 new_mrna->SetPartial(new_mrna->GetLocation().IsPartialStart(eExtreme_Positional) || new_mrna->GetLocation().IsPartialStop(eExtreme_Positional));
658 }
659 return new_mrna;
660 }
661
662 /// GetmRNAforCDS
663 /// A function to find a CSeq_feat representing the
664 /// appropriate mRNA for a given CDS.
665 /// @param cds The feature for which the mRNA to be found
666 /// @param scope The scope
667 ///
668 /// @return CConstRef<CSeq_feat> for new mRNA (will be NULL if none is found)
669
GetmRNAforCDS(const CSeq_feat & cds,CScope & scope)670 CConstRef<CSeq_feat> GetmRNAforCDS(const CSeq_feat& cds, CScope& scope)
671 {
672 CConstRef<CSeq_feat> mrna;
673 bool has_xref = false;
674 if (cds.IsSetXref()) {
675 /* using FeatID from feature cross-references:
676 * if CDS refers to an mRNA by feature ID, use that feature
677 */
678 CBioseq_Handle bsh = scope.GetBioseqHandle(cds.GetLocation());
679 CTSE_Handle tse = bsh.GetTSE_Handle();
680 FOR_EACH_SEQFEATXREF_ON_SEQFEAT (it, cds) {
681 if ((*it)->IsSetId() && (*it)->GetId().IsLocal() && (*it)->GetId().GetLocal().IsId()) {
682 CSeq_feat_Handle mrna_h = tse.GetFeatureWithId(CSeqFeatData::eSubtype_mRNA, (*it)->GetId().GetLocal().GetId());
683 if (mrna_h) {
684 mrna = mrna_h.GetSeq_feat();
685 }
686 has_xref = true;
687 }
688 }
689 }
690 if (!has_xref) {
691 /* using original location to find mRNA:
692 * mRNA must include the CDS location and the internal interval boundaries need to be identical
693 */
694 mrna = sequence::GetBestOverlappingFeat( cds.GetLocation(), CSeqFeatData::eSubtype_mRNA, sequence::eOverlap_CheckIntRev, scope);
695 }
696 return mrna;
697 }
698
699 /// GetGeneticCodeForBioseq
700 /// A function to construct the appropriate CGenetic_code object to use
701 /// when constructing a coding region for a given Bioseq (if the default code
702 /// should not be used).
703 /// @param bh The Bioseq_Handle of the nucleotide sequence on which the
704 /// coding region is to be created.
705 ///
706 /// @return CRef<CGenetic_code> for new CGenetic_code (will be NULL if default should be used)
GetGeneticCodeForBioseq(CBioseq_Handle bh)707 CRef<CGenetic_code> GetGeneticCodeForBioseq(CBioseq_Handle bh)
708 {
709 CRef<CGenetic_code> code(NULL);
710 if (!bh) {
711 return code;
712 }
713 CSeqdesc_CI src(bh, CSeqdesc::e_Source);
714 if (src && src->GetSource().IsSetOrg() && src->GetSource().GetOrg().IsSetOrgname()) {
715 const CBioSource & bsrc = src->GetSource();
716 int bioseqGenCode = bsrc.GetGenCode(0);
717 if (bioseqGenCode > 0) {
718 code.Reset(new CGenetic_code());
719 code->SetId(bioseqGenCode);
720 }
721 }
722 return code;
723 }
724
725
TruncateSeqLoc(const CSeq_loc & orig_loc,size_t new_len)726 static CRef<CSeq_loc> TruncateSeqLoc (const CSeq_loc& orig_loc, size_t new_len)
727 {
728 CRef<CSeq_loc> new_loc;
729
730 size_t len = 0;
731 for (CSeq_loc_CI it(orig_loc); it && len < new_len; ++it) {
732 size_t this_len = it.GetRange().GetLength();
733 CConstRef<CSeq_loc> this_loc = it.GetRangeAsSeq_loc();
734 if (len + this_len <= new_len) {
735 if (new_loc) {
736 new_loc->Add(*this_loc);
737 } else {
738 new_loc.Reset(new CSeq_loc());
739 new_loc->Assign(*this_loc);
740 }
741 len += this_len;
742 } else {
743 CRef<CSeq_loc> partial_loc(new CSeq_loc());
744 size_t len_wanted = new_len - len;
745 size_t start = this_loc->GetStart(eExtreme_Biological);
746 if (len_wanted == 1) {
747 // make a point
748 partial_loc->SetPnt().SetPoint(start);
749 } else {
750 // make an interval
751 if (this_loc->IsSetStrand() && this_loc->GetStrand() == eNa_strand_minus) {
752 partial_loc->SetInt().SetFrom(start - len_wanted + 1);
753 partial_loc->SetInt().SetTo(start);
754 } else {
755 partial_loc->SetInt().SetFrom(start);
756 partial_loc->SetInt().SetTo(start + len_wanted - 1);
757 }
758 }
759 partial_loc->SetId(*this_loc->GetId());
760 if (this_loc->IsSetStrand()) {
761 partial_loc->SetStrand(this_loc->GetStrand());
762 }
763 if (new_loc) {
764 new_loc->Add(*partial_loc);
765 } else {
766 new_loc.Reset(new CSeq_loc());
767 new_loc->Assign(*partial_loc);
768 }
769 len += len_wanted;
770 }
771 }
772
773 return new_loc;
774 }
775
776
777 /// TruncateCDSAtStop
778 /// A function to truncate a CDS location after the first stop codon in the
779 /// protein translation. Note that adjustments are not made to the protein
780 /// sequence in this function, only the location and partialness of the coding
781 /// region.
782 /// @param cds The feature to adjust
783 /// @param scope The scope in which adjustments are to be made (if necessary)
784 ///
785 /// @return true if stop codon was found, false otherwise
TruncateCDSAtStop(CSeq_feat & cds,CScope & scope)786 bool TruncateCDSAtStop(CSeq_feat& cds, CScope& scope)
787 {
788 bool rval = false;
789 CRef<CBioseq> bioseq = CSeqTranslator::TranslateToProtein (cds, scope);
790 if (bioseq) {
791 CRef<CSeq_loc> new_loc(NULL);
792 string prot_str;
793 CSeqTranslator::Translate(cds, scope, prot_str);
794 size_t pos = NStr::Find(prot_str, "*");
795 if (pos != string::npos) {
796 // want to truncate the location and retranslate
797 size_t len_wanted = 3 * (pos + 1);
798 if (cds.GetData().GetCdregion().IsSetFrame()) {
799 CCdregion::EFrame frame = cds.GetData().GetCdregion().GetFrame();
800 if (frame == CCdregion::eFrame_two) {
801 len_wanted += 1;
802 } else if (frame == CCdregion::eFrame_three) {
803 len_wanted += 2;
804 }
805 }
806 if (len_wanted > 0) {
807 new_loc = TruncateSeqLoc (cds.GetLocation(), len_wanted);
808 if (new_loc) {
809 new_loc->SetPartialStart(cds.GetLocation().IsPartialStart(eExtreme_Biological), eExtreme_Biological);
810 new_loc->SetPartialStop(false, eExtreme_Biological);
811 cds.SetLocation().Assign(*new_loc);
812 if (cds.GetLocation().IsPartialStart(eExtreme_Biological)) {
813 cds.SetPartial(true);
814 } else {
815 cds.ResetPartial();
816 }
817 rval = true;
818 }
819 }
820 }
821 }
822 return rval;
823 }
824
825
826 /// ExtendCDSToStopCodon
827 /// A function to extend a CDS location to the first in-frame stop codon in the
828 /// protein translation. Note that adjustments are not made to the protein
829 /// sequence in this function, only the location and partialness of the coding
830 /// region.
831 /// @param cds The feature to adjust
832 /// @param scope The scope in which adjustments are to be made (if necessary)
833 ///
834 /// @return true if stop codon was found, false otherwise
ExtendCDSToStopCodon(CSeq_feat & cds,CScope & scope)835 bool ExtendCDSToStopCodon (CSeq_feat& cds, CScope& scope)
836 {
837 if (!cds.IsSetLocation()) {
838 return false;
839 }
840
841 const CSeq_loc& loc = cds.GetLocation();
842
843 CBioseq_Handle bsh = scope.GetBioseqHandle(*(loc.GetId()));
844 if (!bsh) {
845 return false;
846 }
847
848 const CGenetic_code* code = NULL;
849 if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetCode()) {
850 code = &(cds.GetData().GetCdregion().GetCode());
851 }
852
853 size_t stop = loc.GetStop(eExtreme_Biological);
854 // figure out if we have a partial codon at the end
855 size_t orig_len = sequence::GetLength(loc, &scope);
856 size_t len = orig_len;
857 if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetFrame()) {
858 CCdregion::EFrame frame = cds.GetData().GetCdregion().GetFrame();
859 if (frame == CCdregion::eFrame_two) {
860 len -= 1;
861 } else if (frame == CCdregion::eFrame_three) {
862 len -= 2;
863 }
864 }
865 size_t mod = len % 3;
866 CRef<CSeq_loc> vector_loc(new CSeq_loc());
867 vector_loc->SetInt().SetId().Assign(*(loc.GetId()));
868
869 if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
870 vector_loc->SetInt().SetFrom(0);
871 vector_loc->SetInt().SetTo(stop + mod - 1);
872 vector_loc->SetStrand(eNa_strand_minus);
873 } else {
874 vector_loc->SetInt().SetFrom(stop - mod + 1);
875 vector_loc->SetInt().SetTo(bsh.GetInst_Length() - 1);
876 }
877
878 CSeqVector seq(*vector_loc, scope, CBioseq_Handle::eCoding_Iupac);
879 // reserve our space
880 const size_t usable_size = seq.size();
881
882 // get appropriate translation table
883 const CTrans_table & tbl =
884 (code ? CGen_code_table::GetTransTable(*code) :
885 CGen_code_table::GetTransTable(1));
886
887 // main loop through bases
888 CSeqVector::const_iterator start = seq.begin();
889
890 size_t i;
891 size_t k;
892 size_t state = 0;
893 size_t length = usable_size / 3;
894
895 CRef<CSeq_loc> new_loc(NULL);
896
897 for (i = 0; i < length; ++i) {
898 // loop through one codon at a time
899 for (k = 0; k < 3; ++k, ++start) {
900 state = tbl.NextCodonState(state, *start);
901 }
902
903 if (tbl.GetCodonResidue (state) == '*') {
904 CSeq_loc_CI it(loc);
905 CSeq_loc_CI it_next = it;
906 ++it_next;
907 while (it_next) {
908 CConstRef<CSeq_loc> this_loc = it.GetRangeAsSeq_loc();
909 if (new_loc) {
910 new_loc->Add(*this_loc);
911 } else {
912 new_loc.Reset(new CSeq_loc());
913 new_loc->Assign(*this_loc);
914 }
915 it = it_next;
916 ++it_next;
917 }
918 CRef<CSeq_loc> last_interval(new CSeq_loc());
919 CConstRef<CSeq_loc> this_loc = it.GetRangeAsSeq_loc();
920 size_t this_start = this_loc->GetStart(eExtreme_Positional);
921 size_t this_stop = this_loc->GetStop(eExtreme_Positional);
922 size_t extension = ((i + 1) * 3) - mod;
923 last_interval->SetInt().SetId().Assign(*(this_loc->GetId()));
924 if (this_loc->IsSetStrand() && this_loc->GetStrand() == eNa_strand_minus) {
925 last_interval->SetStrand(eNa_strand_minus);
926 last_interval->SetInt().SetFrom(this_start - extension);
927 last_interval->SetInt().SetTo(this_stop);
928 } else {
929 last_interval->SetInt().SetFrom(this_start);
930 last_interval->SetInt().SetTo(this_stop + extension);
931 }
932
933 if (new_loc) {
934 new_loc->Add(*last_interval);
935 } else {
936 new_loc.Reset(new CSeq_loc());
937 new_loc->Assign(*last_interval);
938 }
939 new_loc->SetPartialStop(false, eExtreme_Biological);
940
941 cds.SetLocation().Assign(*new_loc);
942 return true;
943 }
944 }
945
946 if (usable_size < 3 && !new_loc) {
947 if (loc.GetStrand() == eNa_strand_minus) {
948 new_loc = SeqLocExtend(loc, 0, &scope);
949 } else {
950 new_loc = SeqLocExtend(loc, bsh.GetInst_Length() - 1, &scope);
951 }
952 new_loc->SetPartialStop(true, eExtreme_Biological);
953 cds.SetLocation().Assign(*new_loc);
954 return true;
955 }
956
957 return false;
958 }
959
960
AdjustCDSFrameForStartChange(CCdregion & cds,int change)961 void AdjustCDSFrameForStartChange(CCdregion& cds, int change)
962 {
963 TSeqPos old_frame = CCdregion::eFrame_one;
964 if (cds.IsSetFrame() && cds.GetFrame() != CCdregion::eFrame_not_set) {
965 old_frame = cds.GetFrame();
966 }
967
968 TSignedSeqPos new_frame = old_frame - (change % 3);
969 if (new_frame < 1)
970 {
971 new_frame += 3;
972 }
973 cds.SetFrame((CCdregion::EFrame)new_frame);
974 }
975
976
DemoteCDSToNucSeq(objects::CSeq_feat_Handle & orig_feat)977 bool DemoteCDSToNucSeq(objects::CSeq_feat_Handle& orig_feat)
978 {
979 CSeq_feat_EditHandle feh(orig_feat);
980 CSeq_entry_Handle parent_entry = feh.GetAnnot().GetParentEntry();
981
982 if (!parent_entry.IsSet() || !parent_entry.GetSet().IsSetClass() ||
983 parent_entry.GetSet().GetClass() != CBioseq_set::eClass_nuc_prot) {
984 // no change, not on nuc-prot set
985 return false;
986 }
987
988 CBioseq_CI bi(parent_entry, CSeq_inst::eMol_na);
989 if (!bi) {
990 // no nucleotide sequence to move to
991 return false;
992 }
993
994
995 // This is necessary, to make sure that we are in "editing mode"
996 const CSeq_annot_Handle& annot_handle = orig_feat.GetAnnot();
997 CSeq_entry_EditHandle eh = annot_handle.GetParentEntry().GetEditHandle();
998
999 CSeq_annot_Handle ftable;
1000 CSeq_entry_Handle nuc_seh = bi->GetSeq_entry_Handle();
1001 CSeq_annot_CI annot_ci(nuc_seh, CSeq_annot_CI::eSearch_entry);
1002 for (; annot_ci; ++annot_ci) {
1003 if ((*annot_ci).IsFtable()) {
1004 ftable = *annot_ci;
1005 break;
1006 }
1007 }
1008
1009 if (!ftable) {
1010 CRef<CSeq_annot> new_annot(new CSeq_annot());
1011 new_annot->SetData().SetFtable();
1012 CSeq_entry_EditHandle eh = nuc_seh.GetEditHandle();
1013 ftable = eh.AttachAnnot(*new_annot);
1014 }
1015
1016 CSeq_annot_EditHandle old_annot = annot_handle.GetEditHandle();
1017 CSeq_annot_EditHandle new_annot = ftable.GetEditHandle();
1018 orig_feat = new_annot.TakeFeat(feh);
1019 const list< CRef< CSeq_feat > > &feat_list = old_annot.GetSeq_annotCore()->GetData().GetFtable();
1020 if (feat_list.empty())
1021 {
1022 old_annot.Remove();
1023 }
1024 return true;
1025 }
1026
1027
s_SetCDSFrame(CSeq_feat & cds,ECdsFrame frame_type,CScope & scope)1028 bool ApplyCDSFrame::s_SetCDSFrame(CSeq_feat& cds, ECdsFrame frame_type, CScope& scope)
1029 {
1030 if (!cds.IsSetData() || !cds.GetData().IsCdregion())
1031 return false;
1032
1033 CCdregion::TFrame orig_frame = CCdregion::eFrame_not_set;
1034 if (cds.GetData().GetCdregion().IsSetFrame()) {
1035 orig_frame = cds.GetData().GetCdregion().GetFrame();
1036 }
1037 // retrieve the new frame
1038 CCdregion::TFrame new_frame = orig_frame;
1039 switch (frame_type) {
1040 case eNotSet:
1041 break;
1042 case eBest:
1043 new_frame = CSeqTranslator::FindBestFrame(cds, scope);
1044 break;
1045 case eMatch:
1046 new_frame = s_FindMatchingFrame(cds, scope);
1047 break;
1048 case eOne:
1049 new_frame = CCdregion::eFrame_one;
1050 break;
1051 case eTwo:
1052 new_frame = CCdregion::eFrame_two;
1053 break;
1054 case eThree:
1055 new_frame = CCdregion::eFrame_three;
1056 break;
1057 }
1058
1059 bool modified = false;
1060 if (orig_frame != new_frame) {
1061 cds.SetData().SetCdregion().SetFrame(new_frame);
1062 modified = true;
1063 }
1064 return modified;
1065 }
1066
s_FindMatchingFrame(const CSeq_feat & cds,CScope & scope)1067 CCdregion::TFrame ApplyCDSFrame::s_FindMatchingFrame(const CSeq_feat& cds, CScope& scope)
1068 {
1069 CCdregion::TFrame new_frame = CCdregion::eFrame_not_set;
1070 //return the frame that matches the protein sequence if it can find one
1071 if (!cds.IsSetData() || !cds.GetData().IsCdregion() || !cds.IsSetLocation() || !cds.IsSetProduct()) {
1072 return new_frame;
1073 }
1074
1075 // get the protein sequence
1076 CBioseq_Handle product = scope.GetBioseqHandle(cds.GetProduct());
1077 if (!product || !product.IsProtein()) {
1078 return new_frame;
1079 }
1080
1081 // obtaining the original protein sequence
1082 CSeqVector prot_vec = product.GetSeqVector(CBioseq_Handle::eCoding_Ncbi);
1083 prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
1084 string orig_prot_seq;
1085 prot_vec.GetSeqData(0, prot_vec.size(), orig_prot_seq);
1086 if (NStr::IsBlank(orig_prot_seq)) {
1087 return new_frame;
1088 }
1089
1090 CRef<CSeq_feat> tmp_cds(new CSeq_feat);
1091 tmp_cds->Assign(cds);
1092 for (int enumI = CCdregion::eFrame_one; enumI < CCdregion::eFrame_three + 1; ++enumI) {
1093 CCdregion::EFrame fr = (CCdregion::EFrame) (enumI);
1094 tmp_cds->SetData().SetCdregion().SetFrame(fr);
1095
1096 string new_prot_seq;
1097 CSeqTranslator::Translate(*tmp_cds, scope, new_prot_seq);
1098 if (NStr::EndsWith(new_prot_seq, '*'))
1099 new_prot_seq.erase(new_prot_seq.end() - 1);
1100 if (NStr::EqualNocase(new_prot_seq, orig_prot_seq)) {
1101 new_frame = fr;
1102 break;
1103 }
1104 }
1105
1106 return new_frame;
1107 }
1108
s_GetFrameFromName(const string & name)1109 ApplyCDSFrame::ECdsFrame ApplyCDSFrame::s_GetFrameFromName(const string& name)
1110 {
1111 ECdsFrame frame = eNotSet;
1112 if (NStr::EqualNocase(name, "best")) {
1113 frame = eBest;
1114 } else if (NStr::EqualNocase(name, "match")) {
1115 frame = eMatch;
1116 } else if (NStr::Equal(name, "1") || NStr::EqualNocase(name, "one")) {
1117 frame = eOne;
1118 } else if (NStr::Equal(name, "2") || NStr::EqualNocase(name, "two")) {
1119 frame = eTwo;
1120 } else if (NStr::Equal(name, "3") || NStr::EqualNocase(name, "three")) {
1121 frame = eThree;
1122 }
1123 return frame;
1124 }
1125
1126 const unsigned int MAX_ID_LENGTH = 50;
1127
GetIdHash(const string & str)1128 static inline string GetIdHash(const string &str)
1129 {
1130 return NStr::NumericToString(NHash::CityHash64(str), 0, 16);
1131 }
1132
GetIdHashOrValue(const string & base,int offset)1133 string GetIdHashOrValue(const string &base, int offset)
1134 {
1135 string new_str = base;
1136 if (offset > 0)
1137 new_str += "_" + NStr::NumericToString(offset);
1138 if (new_str.length() <= MAX_ID_LENGTH)
1139 return new_str;
1140 string new_hash = GetIdHash(base);
1141 if (offset > 0)
1142 new_hash += "_" + NStr::NumericToString(offset);
1143 return new_hash;
1144 }
1145
GetNewLocalProtId(const string & id_base,CScope & scope,int & offset)1146 CRef<objects::CSeq_id> GetNewLocalProtId(const string &id_base, CScope &scope, int &offset)
1147 {
1148 string id_base_hash = GetIdHash(id_base);
1149 CRef<objects::CSeq_id> new_id(new objects::CSeq_id());
1150 string new_str = id_base;
1151 if (offset > 0)
1152 new_str += "_" + NStr::NumericToString(offset);
1153 new_id->SetLocal().SetStr(new_str);
1154 CRef<objects::CSeq_id> new_id_hash(new objects::CSeq_id());
1155 string new_hash = id_base_hash;
1156 if (offset > 0)
1157 new_hash += "_" + NStr::NumericToString(offset);
1158 new_id_hash->SetLocal().SetStr(new_hash);
1159 objects::CBioseq_Handle b_found = scope.GetBioseqHandle(*new_id);
1160 objects::CBioseq_Handle b_found_hash = scope.GetBioseqHandle(*new_id_hash); // as we consider ID_NUM and HASH_NUM to be synonyms we need to check for the existence of both at the same time
1161 // to avoid a situation where ID_1 and HASH_1 are both created
1162 while (b_found || b_found_hash)
1163 {
1164 offset++;
1165 new_id->SetLocal().SetStr(id_base + "_" + NStr::NumericToString(offset));
1166 b_found = scope.GetBioseqHandle(*new_id);
1167 new_id_hash->SetLocal().SetStr(id_base_hash + "_" + NStr::NumericToString(offset));
1168 b_found_hash = scope.GetBioseqHandle(*new_id_hash);
1169 }
1170 if (new_id->GetLocal().GetStr().size() <= MAX_ID_LENGTH)
1171 return new_id;
1172 return new_id_hash;
1173 }
1174
GetNewGeneralProtId(const string & id_base,const string & db,CScope & scope,int & offset)1175 static CRef<objects::CSeq_id> GetNewGeneralProtId(const string &id_base, const string &db, CScope &scope, int &offset)
1176 {
1177 string id_base_hash = GetIdHash(id_base);
1178 CRef<objects::CSeq_id> new_id(new objects::CSeq_id());
1179 new_id->SetGeneral().SetDb(db);
1180 string new_str = id_base;
1181 if (offset > 0)
1182 new_str += "_" + NStr::NumericToString(offset);
1183 new_id->SetGeneral().SetTag().SetStr(new_str);
1184 CRef<objects::CSeq_id> new_id_hash(new objects::CSeq_id());
1185 new_id_hash->SetGeneral().SetDb(db);
1186 string new_hash = id_base_hash;
1187 if (offset > 0)
1188 new_hash += "_" + NStr::NumericToString(offset);
1189 new_id_hash->SetGeneral().SetTag().SetStr(new_hash);
1190 objects::CBioseq_Handle b_found = scope.GetBioseqHandle(*new_id);
1191 objects::CBioseq_Handle b_found_hash = scope.GetBioseqHandle(*new_id_hash); // as we consider ID_NUM and HASH_NUM to be synonyms we need to check for the existence of both at the same time
1192 // to avoid a situation where ID_1 and HASH_1 are both created
1193 while (b_found || b_found_hash)
1194 {
1195 offset++;
1196 new_id->SetGeneral().SetTag().SetStr(id_base + "_" + NStr::NumericToString(offset));
1197 b_found = scope.GetBioseqHandle(*new_id);
1198 new_id_hash->SetGeneral().SetTag().SetStr(id_base_hash + "_" + NStr::NumericToString(offset));
1199 b_found_hash = scope.GetBioseqHandle(*new_id_hash);
1200 }
1201 if (new_id->GetGeneral().GetTag().GetStr().size() <= MAX_ID_LENGTH)
1202 return new_id;
1203 return new_id_hash;
1204 }
1205
GetGeneralOrLocal(objects::CSeq_id_Handle hid,CScope & scope,int & offset,bool fall_through)1206 static CRef<objects::CSeq_id> GetGeneralOrLocal(objects::CSeq_id_Handle hid, CScope &scope, int &offset, bool fall_through)
1207 {
1208 CRef<objects::CSeq_id> new_id;
1209 if (hid.GetSeqId()->IsLocal())
1210 {
1211 string id_base;
1212 if (hid.GetSeqId()->GetLocal().IsId())
1213 {
1214 id_base = NStr::NumericToString(hid.GetSeqId()->GetLocal().GetId());
1215 }
1216 else
1217 {
1218 id_base = hid.GetSeqId()->GetLocal().GetStr();
1219 }
1220 new_id = GetNewLocalProtId(id_base, scope, offset);
1221 }
1222 else if (hid.GetSeqId()->IsGeneral() && hid.GetSeqId()->GetGeneral().IsSetTag()
1223 && (!hid.GetSeqId()->GetGeneral().IsSetDb() || hid.GetSeqId()->GetGeneral().GetDb() != "TMSMART"))
1224 {
1225 string id_base;
1226 if (hid.GetSeqId()->GetGeneral().GetTag().IsId())
1227 {
1228 id_base = NStr::NumericToString(hid.GetSeqId()->GetGeneral().GetTag().GetId());
1229 }
1230 else
1231 {
1232 id_base = hid.GetSeqId()->GetGeneral().GetTag().GetStr();
1233 }
1234 new_id = GetNewGeneralProtId(id_base, hid.GetSeqId()->GetGeneral().GetDb(), scope, offset);
1235 }
1236 else if (fall_through) // if we don't care for the incoming seq-id to be specifically local or general take any input and create a local seq-id output
1237 {
1238 string id_base;
1239 hid.GetSeqId()->GetLabel(&id_base, objects::CSeq_id::eContent);
1240 new_id = GetNewLocalProtId(id_base, scope, offset);
1241 }
1242 return new_id;
1243 }
1244
GetNewProtIdFromExistingProt(objects::CBioseq_Handle bsh,int & offset,string & id_label)1245 vector<CRef<objects::CSeq_id> > GetNewProtIdFromExistingProt(objects::CBioseq_Handle bsh, int &offset, string& id_label)
1246 {
1247 vector<CRef<objects::CSeq_id> > ids;
1248 for(auto it : bsh.GetId())
1249 {
1250 if (it.GetSeqIdOrNull())
1251 {
1252 objects::CSeq_id_Handle hid = it;
1253 CRef<objects::CSeq_id> new_id = GetGeneralOrLocal(hid, bsh.GetScope(), offset, false);
1254 if (new_id)
1255 ids.push_back(new_id);
1256 }
1257 }
1258
1259 if (ids.empty() && !bsh.GetId().empty())
1260 {
1261 CRef<objects::CSeq_id> new_id = GetGeneralOrLocal(bsh.GetId().front(), bsh.GetScope(), offset, true);
1262 ids.push_back(new_id);
1263 }
1264
1265 if (ids.empty())
1266 NCBI_THROW(CException, eUnknown, "Seq-id not found");
1267
1268 ids.front()->GetLabel(&id_label, objects::CSeq_id::eBoth);
1269 return ids;
1270 }
1271
GetNewProtId(objects::CBioseq_Handle bsh,int & offset,string & id_label,bool general_only)1272 CRef<objects::CSeq_id> GetNewProtId(objects::CBioseq_Handle bsh, int &offset, string& id_label, bool general_only)
1273 {
1274 objects::CSeq_id_Handle hid = sequence::GetId(bsh, sequence::eGetId_Best);
1275 objects::CSeq_id_Handle gen_id;
1276
1277 for (auto it : bsh.GetId())
1278 {
1279 if (it.GetSeqId()->IsGeneral() && it.GetSeqId()->GetGeneral().IsSetDb() &&
1280 !it.GetSeqId()->GetGeneral().IsSkippable())
1281 {
1282 gen_id = it;
1283 }
1284 }
1285 if (gen_id && general_only)
1286 {
1287 hid = gen_id;
1288 }
1289
1290 if (!hid)
1291 NCBI_THROW(CException, eUnknown, "Seq-id of the requested type not found");
1292
1293 CRef<objects::CSeq_id> new_id = GetGeneralOrLocal(hid, bsh.GetScope(), offset, true);
1294 new_id->GetLabel(&id_label, objects::CSeq_id::eBoth);
1295 return new_id;
1296 }
1297
IsGeneralIdProtPresent(objects::CSeq_entry_Handle tse)1298 bool IsGeneralIdProtPresent(objects::CSeq_entry_Handle tse)
1299 {
1300 bool found = false;
1301 for (CBioseq_CI b_iter(tse, CSeq_inst::eMol_aa); b_iter; ++b_iter)
1302 {
1303 for (auto it : b_iter->GetId())
1304 {
1305 if (it.GetSeqId()->IsGeneral() && it.GetSeqId()->GetGeneral().IsSetDb() &&
1306 !it.GetSeqId()->GetGeneral().IsSkippable())
1307 {
1308 found = true;
1309 break;
1310 }
1311 }
1312 }
1313 return found;
1314 }
1315
1316 END_SCOPE(edit)
1317 END_SCOPE(objects)
1318 END_NCBI_SCOPE
1319
1320