1 /*  $Id: cds_fix.cpp 632623 2021-06-03 17:38:11Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin, NCBI
27 *
28 * File Description:
29 *   functions for editing and working with coding regions
30 */
31 #include <ncbi_pch.hpp>
32 #include <corelib/ncbistd.hpp>
33 #include <corelib/ncbiobj.hpp>
34 #include <objtools/edit/cds_fix.hpp>
35 #include <objtools/edit/loc_edit.hpp>
36 #include <objects/seq/Seq_descr.hpp>
37 #include <objects/seq/Seqdesc.hpp>
38 #include <objects/seq/Seq_inst.hpp>
39 #include <objects/general/Object_id.hpp>
40 #include <objects/seqfeat/Cdregion.hpp>
41 #include <objects/seqfeat/Code_break.hpp>
42 #include <objects/seqfeat/Imp_feat.hpp>
43 #include <objects/seqfeat/Org_ref.hpp>
44 #include <objects/seqfeat/RNA_ref.hpp>
45 #include <objects/seqfeat/SeqFeatXref.hpp>
46 #include <objects/seqfeat/seqfeat_macros.hpp>
47 #include <objects/seqfeat/Genetic_code_table.hpp>
48 #include <util/checksum.hpp>
49 #include <objmgr/bioseq_handle.hpp>
50 #include <objmgr/seqdesc_ci.hpp>
51 #include <objmgr/seq_annot_ci.hpp>
52 #include <util/sequtil/sequtil_convert.hpp>
53 #include <objmgr/util/sequence.hpp>
54 #include <objmgr/seq_vector.hpp>
55 #include <objects/general/Dbtag.hpp>
56 
57 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)58 BEGIN_SCOPE(objects)
59 BEGIN_SCOPE(edit)
60 
61 unsigned char GetCodeBreakCharacter(const CCode_break& cbr)
62 {
63     unsigned char ex = 0;
64     vector<char> seqData;
65     string str;
66 
67     if (!cbr.IsSetAa()) {
68         return ex;
69     }
70     switch (cbr.GetAa().Which()) {
71         case CCode_break::C_Aa::e_Ncbi8aa:
72             str = cbr.GetAa().GetNcbi8aa();
73             CSeqConvert::Convert(str, CSeqUtil::e_Ncbi8aa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
74             ex = seqData[0];
75             break;
76         case CCode_break::C_Aa::e_Ncbistdaa:
77             str = cbr.GetAa().GetNcbi8aa();
78             CSeqConvert::Convert(str, CSeqUtil::e_Ncbistdaa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
79             ex = seqData[0];
80             break;
81         case CCode_break::C_Aa::e_Ncbieaa:
82             seqData.push_back(cbr.GetAa().GetNcbieaa());
83             ex = seqData[0];
84             break;
85         default:
86             // do nothing, code break wasn't actually set
87 
88             break;
89     }
90     return ex;
91 }
92 
93 
DoesCodingRegionHaveTerminalCodeBreak(const objects::CCdregion & cdr)94 bool DoesCodingRegionHaveTerminalCodeBreak(const objects::CCdregion& cdr)
95 {
96     if (!cdr.IsSetCode_break()) {
97         return false;
98     }
99     bool rval = false;
100     ITERATE(objects::CCdregion::TCode_break, it, cdr.GetCode_break()) {
101         if (GetCodeBreakCharacter(**it) == '*') {
102             rval = true;
103             break;
104         }
105     }
106     return rval;
107 }
108 
109 
GetLastPartialCodonLength(const objects::CSeq_feat & cds,objects::CScope & scope)110 TSeqPos GetLastPartialCodonLength(const objects::CSeq_feat& cds, objects::CScope& scope)
111 {
112     if (!cds.IsSetData() || !cds.GetData().IsCdregion()) {
113         return 0;
114     }
115     // find the length of the last partial codon.
116     const objects::CCdregion& cdr = cds.GetData().GetCdregion();
117     int dna_len = sequence::GetLength(cds.GetLocation(), &scope);
118     TSeqPos except_len = 0;
119     if (cds.GetLocation().IsPartialStart(eExtreme_Biological) && cdr.IsSetFrame()) {
120         switch (cdr.GetFrame()) {
121             case objects::CCdregion::eFrame_two:
122                 except_len = (dna_len - 1) % 3;
123                 break;
124             case objects::CCdregion::eFrame_three:
125                 except_len = (dna_len - 2) % 3;
126                 break;
127             default:
128                 except_len = dna_len % 3;
129                 break;
130         }
131     } else {
132         except_len = dna_len % 3;
133     }
134     return except_len;
135 }
136 
137 
GetLastCodonLoc(const CSeq_feat & cds,CScope & scope)138 CRef<CSeq_loc> GetLastCodonLoc(const CSeq_feat& cds, CScope& scope)
139 {
140     TSeqPos except_len = GetLastPartialCodonLength(cds, scope);
141     if (except_len == 0) {
142         except_len = 3;
143     }
144     const CSeq_loc& cds_loc = cds.GetLocation();
145     TSeqPos stop = cds_loc.GetStop(eExtreme_Biological);
146     CRef<CSeq_id> new_id(new CSeq_id());
147     new_id->Assign(*(cds_loc.GetId()));
148     CRef<CSeq_loc> codon_loc(new CSeq_loc());
149     codon_loc->SetInt().SetId(*new_id);
150     if (cds_loc.GetStrand() == eNa_strand_minus) {
151         codon_loc->SetInt().SetFrom(stop);
152         codon_loc->SetInt().SetTo(stop + except_len - 1);
153         codon_loc->SetInt().SetStrand(eNa_strand_minus);
154     } else {
155         codon_loc->SetInt().SetFrom(stop - except_len + 1);
156         codon_loc->SetInt().SetTo(stop);
157     }
158     return codon_loc;
159 }
160 
161 
AddTerminalCodeBreak(CSeq_feat & cds,CScope & scope)162 bool AddTerminalCodeBreak(CSeq_feat& cds, CScope& scope)
163 {
164     CRef<CSeq_loc> codon_loc = GetLastCodonLoc(cds, scope);
165     if (!codon_loc) {
166         return false;
167     }
168 
169     CRef<objects::CCode_break> cbr(new objects::CCode_break());
170     cbr->SetAa().SetNcbieaa('*');
171     cbr->SetLoc().Assign(*codon_loc);
172     cds.SetData().SetCdregion().SetCode_break().push_back(cbr);
173     return true;
174 }
175 
176 
DoesCodingRegionEndWithStopCodon(const objects::CSeq_feat & cds,objects::CScope & scope)177 bool DoesCodingRegionEndWithStopCodon(const objects::CSeq_feat& cds, objects::CScope& scope)
178 {
179     string transl_prot;
180     bool alt_start = false;
181     bool rval = false;
182     try {
183         CSeqTranslator::Translate(cds, scope, transl_prot,
184                                     true,   // include stop codons
185                                     false,  // do not remove trailing X/B/Z
186                                     &alt_start);
187         if (NStr::EndsWith(transl_prot, "*")) {
188             rval = true;
189         }
190     } catch (CException&) {
191         // can't translate
192     }
193     return rval;
194 }
195 
196 
ExtendStop(CSeq_loc & loc,TSeqPos len,CScope & scope)197 void ExtendStop(CSeq_loc& loc, TSeqPos len, CScope& scope)
198 {
199     if (len == 0) {
200         return;
201     }
202     TSeqPos stop = loc.GetStop(eExtreme_Biological);
203     CRef<CSeq_loc> add(new CSeq_loc());
204     add->SetInt().SetId().Assign(*(loc.GetId()));
205     if (loc.GetStrand() == eNa_strand_minus) {
206         add->SetInt().SetFrom(stop - len);
207         add->SetInt().SetTo(stop - 1);
208         add->SetInt().SetStrand(eNa_strand_minus);
209     } else {
210         add->SetInt().SetId().Assign(*(loc.GetId()));
211         add->SetInt().SetFrom(stop + 1);
212         add->SetInt().SetTo(stop + len);
213     }
214     loc.Assign(*(sequence::Seq_loc_Add(loc, *add, CSeq_loc::fMerge_All|CSeq_loc::fSort, &scope)));
215 }
216 
217 
ExtendLocationForTranslExcept(objects::CSeq_loc & loc,objects::CScope & scope)218 TSeqPos ExtendLocationForTranslExcept(objects::CSeq_loc& loc, objects::CScope& scope)
219 {
220     CBioseq_Handle bsh = scope.GetBioseqHandle(loc);
221     TSeqPos stop = loc.GetStop(eExtreme_Biological);
222     TSeqPos len = 0;
223     TSeqPos except_len = 0;
224     CRef<CSeq_loc> overhang(new CSeq_loc());
225     overhang->SetInt().SetId().Assign(*loc.GetId());
226 
227     if (loc.GetStrand() == eNa_strand_minus) {
228         if (stop < 3) {
229             len = stop;
230         } else {
231             len = 3;
232         }
233         if (len > 0) {
234             overhang->SetInt().SetFrom(0);
235             overhang->SetInt().SetTo(stop - 1);
236             overhang->SetStrand(eNa_strand_minus);
237         }
238     } else {
239         len = bsh.GetBioseqLength() - stop - 1;
240         if (len > 3) {
241             len = 3;
242         }
243         if (len > 0) {
244             overhang->SetInt().SetFrom(stop + 1);
245             overhang->SetInt().SetTo(bsh.GetBioseqLength() - 1);
246         }
247     }
248     if (len > 0) {
249         CSeqVector vec(*overhang, scope, CBioseq_Handle::eCoding_Iupac);
250         string seq_string;
251         vec.GetSeqData(0, len, seq_string);
252         if (vec[0] == 'T') {
253             except_len++;
254             if (len > 1 && vec[1] == 'A') {
255                 except_len++;
256                 if (len > 2 && vec[2] == 'A') {
257                     // adding a real stop codon
258                     except_len ++;
259                 }
260             }
261         }
262     }
263     // extend
264     if (except_len > 0) {
265         ExtendStop(loc, except_len, scope);
266     }
267     return except_len;
268 }
269 
270 
IsOverhangOkForTerminalCodeBreak(const CSeq_feat & cds,CScope & scope,bool strict)271 bool IsOverhangOkForTerminalCodeBreak(const CSeq_feat& cds, CScope& scope, bool strict)
272 {
273     CRef<CSeq_loc> loc = GetLastCodonLoc(cds, scope);
274     if (!loc) {
275         return false;
276     }
277     TSeqPos len = sequence::GetLength(*loc, &scope);
278     CSeqVector vec(*loc, scope, CBioseq_Handle::eCoding_Iupac);
279     bool rval = true;
280     if (strict) {
281         if (vec[0] != 'T') {
282             rval = false;
283         } else if (len > 1 && vec[1] != 'A') {
284             rval = false;
285         }
286     } else {
287         if (vec[0] != 'T' && vec[0] != 'N') {
288             rval = false;
289         }
290     }
291     return rval;
292 }
293 
294 
295 /// SetTranslExcept
296 /// A function to set a code break at the 3' end of a coding region to indicate
297 /// that the stop codon is formed by the addition of a poly-A tail.
298 /// @param cds      The coding region to be adjusted (if necessary)
299 /// @param comment  The string to place in the note on cds if a code break is added
300 /// @param strict   Only add code break if last partial codon consists of "TA" or just "T".
301 ///                 If strict is false, add code break if first NT of last partial codon
302 ///                 is T or N.
303 /// @param extend   If true, extend coding region to cover partial stop codon
SetTranslExcept(objects::CSeq_feat & cds,const string & comment,bool strict,bool extend,objects::CScope & scope)304 bool SetTranslExcept(objects::CSeq_feat& cds, const string& comment, bool strict, bool extend, objects::CScope& scope)
305 {
306     // do nothing if this isn't a coding region
307     if (!cds.IsSetData() || !cds.GetData().IsCdregion()) {
308         return false;
309     }
310     // do nothing if coding region is 3' partial
311     if (cds.GetLocation().IsPartialStop(eExtreme_Biological)) {
312         return false;
313     }
314 
315     objects::CCdregion& cdr = cds.SetData().SetCdregion();
316     // do nothing if coding region already has terminal code break
317     if (DoesCodingRegionHaveTerminalCodeBreak(cdr)) {
318         return false;
319     }
320 
321     // find the length of the last partial codon.
322     TSeqPos except_len = GetLastPartialCodonLength(cds, scope);
323 
324     bool extended = false;
325     if (except_len == 0 && extend && !DoesCodingRegionEndWithStopCodon(cds, scope)) {
326         except_len = ExtendLocationForTranslExcept(cds.SetLocation(), scope);
327         if (except_len > 0) {
328             extended = true;
329         }
330     }
331 
332     if (except_len == 0) {
333         return false;
334     }
335 
336     bool added_code_break = false;
337     if (!extend || !DoesCodingRegionEndWithStopCodon(cds, scope)) {
338         // TODO: check for strictness
339         if (IsOverhangOkForTerminalCodeBreak(cds, scope, strict)
340             && AddTerminalCodeBreak(cds, scope)) {
341             if (!NStr::IsBlank(comment)) {
342                 if (cds.IsSetComment() && !NStr::IsBlank(cds.GetComment())) {
343                     string orig_comment = cds.GetComment();
344                     if (NStr::Find(orig_comment, comment) == string::npos) {
345                         cds.SetComment(cds.GetComment() + ";" + comment);
346                     }
347                 } else {
348                     cds.SetComment(comment);
349                 }
350             }
351             added_code_break = true;
352         }
353     }
354 
355     return extended || added_code_break;
356 }
357 
358 
359 /// AdjustProteinMolInfoToMatchCDS
360 /// A function to change an existing MolInfo to match a coding region
361 /// @param molinfo  The MolInfo to be adjusted (if necessary)
362 /// @param cds      The CDS to match
363 ///
364 /// @return         Boolean to indicate whether molinfo was changed
365 
AdjustProteinMolInfoToMatchCDS(CMolInfo & molinfo,const CSeq_feat & cds)366 bool AdjustProteinMolInfoToMatchCDS(CMolInfo& molinfo, const CSeq_feat& cds)
367 {
368     bool rval = false;
369     if (!molinfo.IsSetBiomol() || molinfo.GetBiomol() != CMolInfo::eBiomol_peptide) {
370         molinfo.SetBiomol(CMolInfo::eBiomol_peptide);
371         rval = true;
372     }
373 
374     bool partial5 = cds.GetLocation().IsPartialStart(eExtreme_Biological);
375     bool partial3 = cds.GetLocation().IsPartialStop(eExtreme_Biological);
376     CMolInfo::ECompleteness completeness = CMolInfo::eCompleteness_complete;
377     if (partial5 && partial3) {
378         completeness = CMolInfo::eCompleteness_no_ends;
379     } else if (partial5) {
380         completeness = CMolInfo::eCompleteness_no_left;
381     } else if (partial3) {
382         completeness = CMolInfo::eCompleteness_no_right;
383     }
384     if (!molinfo.IsSetCompleteness() || molinfo.GetCompleteness() != completeness) {
385         molinfo.SetCompleteness(completeness);
386         rval = true;
387     }
388     return rval;
389 }
390 
391 
392 /// AdjustProteinFeaturePartialsToMatchCDS
393 /// A function to change an existing MolInfo to match a coding region
394 /// @param new_prot  The protein feature to be adjusted (if necessary)
395 /// @param cds       The CDS to match
396 ///
397 /// @return          Boolean to indicate whether the protein feature was changed
AdjustProteinFeaturePartialsToMatchCDS(CSeq_feat & new_prot,const CSeq_feat & cds)398 bool AdjustProteinFeaturePartialsToMatchCDS(CSeq_feat& new_prot, const CSeq_feat& cds)
399 {
400     bool any_change = false;
401     bool partial5 = cds.GetLocation().IsPartialStart(eExtreme_Biological);
402     bool partial3 = cds.GetLocation().IsPartialStop(eExtreme_Biological);
403     bool prot_5 = new_prot.GetLocation().IsPartialStart(eExtreme_Biological);
404     bool prot_3 = new_prot.GetLocation().IsPartialStop(eExtreme_Biological);
405     if ((partial5 && !prot_5) || (!partial5 && prot_5)
406         || (partial3 && !prot_3) || (!partial3 && prot_3)) {
407         new_prot.SetLocation().SetPartialStart(partial5, eExtreme_Biological);
408         new_prot.SetLocation().SetPartialStop(partial3, eExtreme_Biological);
409         any_change = true;
410     }
411     any_change |= feature::AdjustFeaturePartialFlagForLocation(new_prot);
412     return any_change;
413 }
414 
415 
416 /// AdjustForCDSPartials
417 /// A function to make all of the necessary related changes to
418 /// a Seq-entry after the partialness of a coding region has been
419 /// changed.
420 /// @param cds        The feature for which adjustments are to be made
421 /// @param seh        The Seq-entry-handle to be adjusted (if necessary)
422 ///
423 /// @return           Boolean to indicate whether the Seq-entry-handle was changed
AdjustForCDSPartials(const CSeq_feat & cds,CSeq_entry_Handle seh)424 bool AdjustForCDSPartials(const CSeq_feat& cds, CSeq_entry_Handle seh)
425 {
426     if (!cds.IsSetProduct() || !seh) {
427         return false;
428     }
429 
430     // find Bioseq for product
431     CBioseq_Handle product = seh.GetScope().GetBioseqHandle(cds.GetProduct());
432     if (!product) {
433         return false;
434     }
435 
436     bool any_change = false;
437     // adjust protein feature
438     CFeat_CI f(product, SAnnotSelector(CSeqFeatData::eSubtype_prot));
439     if (f) {
440         // This is necessary, to make sure that we are in "editing mode"
441         const CSeq_annot_Handle& annot_handle = f->GetAnnot();
442         CSeq_entry_EditHandle eh = annot_handle.GetParentEntry().GetEditHandle();
443         CSeq_feat_EditHandle feh(*f);
444         CRef<CSeq_feat> new_feat(new CSeq_feat());
445         new_feat->Assign(*(f->GetSeq_feat()));
446         if (AdjustProteinFeaturePartialsToMatchCDS(*new_feat, cds)) {
447             feh.Replace(*new_feat);
448             any_change = true;
449         }
450     }
451 
452     // change or create molinfo on protein bioseq
453     bool found = false;
454     CBioseq_EditHandle beh = product.GetEditHandle();
455     NON_CONST_ITERATE(CBioseq::TDescr::Tdata, it, beh.SetDescr().Set()) {
456         if ((*it)->IsMolinfo()) {
457             any_change |= AdjustProteinMolInfoToMatchCDS((*it)->SetMolinfo(), cds);
458             found = true;
459         }
460     }
461     if (!found) {
462         CRef<objects::CSeqdesc> new_molinfo_desc( new CSeqdesc );
463         AdjustProteinMolInfoToMatchCDS(new_molinfo_desc->SetMolinfo(), cds);
464         beh.SetDescr().Set().push_back(new_molinfo_desc);
465         any_change = true;
466     }
467 
468     return any_change;
469 }
470 
471 
s_GetProductName(const CProt_ref & prot)472 string s_GetProductName (const CProt_ref& prot)
473 {
474     string prot_nm(kEmptyStr);
475     if (prot.IsSetName() && prot.GetName().size() > 0) {
476         prot_nm = prot.GetName().front();
477     }
478     return prot_nm;
479 }
480 
481 
s_GetProductName(const CSeq_feat & cds,CScope & scope)482 string s_GetProductName (const CSeq_feat& cds, CScope& scope)
483 {
484     string prot_nm(kEmptyStr);
485     if (cds.IsSetProduct()) {
486         CBioseq_Handle prot_bsq = sequence::GetBioseqFromSeqLoc(cds.GetProduct(), scope);
487         if (prot_bsq) {
488             CFeat_CI prot_ci(prot_bsq, CSeqFeatData::e_Prot);
489             if (prot_ci) {
490                 prot_nm = s_GetProductName(prot_ci->GetOriginalFeature().GetData().GetProt());
491             }
492         }
493     } else if (cds.IsSetXref()) {
494         ITERATE(CSeq_feat::TXref, it, cds.GetXref()) {
495             if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
496                 prot_nm = s_GetProductName((*it)->GetData().GetProt());
497             }
498         }
499     }
500     return prot_nm;
501 }
502 
503 
s_GetmRNAName(const CSeq_feat & mrna)504 string s_GetmRNAName (const CSeq_feat& mrna)
505 {
506     if (!mrna.IsSetData() || mrna.GetData().GetSubtype() != CSeqFeatData::eSubtype_mRNA
507         || !mrna.GetData().IsRna() || !mrna.GetData().GetRna().IsSetExt()
508         || !mrna.GetData().GetRna().GetExt().IsName()) {
509         return "";
510     } else {
511         return mrna.GetData().GetRna().GetExt().GetName();
512     }
513 }
514 
515 
516 /// MakemRNAforCDS
517 /// A function to create a CSeq_feat that represents the
518 /// appropriate mRNA for a given CDS. Note that this feature
519 /// is not added to the Seq-annot in the record; this step is
520 /// left for the caller.
521 /// @param cds        The feature for which the mRNA to be made, if one is not already present
522 /// @param scope      The scope in which adjustments are to be made (if necessary)
523 ///
524 /// @return           CRef<CSeq_feat> for new mRNA (will be NULL if one was already present)
MakemRNAforCDS(const CSeq_feat & cds,CScope & scope)525 CRef<CSeq_feat> MakemRNAforCDS(const CSeq_feat& cds, CScope& scope)
526 {
527     CRef <CSeq_feat> new_mrna(NULL);
528     string prot_nm = s_GetProductName(cds, scope);
529     const CSeq_loc& cd_loc = cds.GetLocation();
530 
531     CConstRef <CSeq_feat> mrna(NULL);
532     CBioseq_Handle bsh = scope.GetBioseqHandle(*cd_loc.GetId());
533     CSeq_feat_Handle fh = scope.GetSeq_featHandle(cds);
534     CSeq_annot_Handle sah;
535     if (fh) {
536         sah = fh.GetAnnot();
537     }
538     // can only look for overlapping mRNA with sequence::GetOverlappingmRNA
539     // if Bioseq is in scope.
540     if (bsh) {
541         mrna = sequence::GetOverlappingmRNA(cd_loc, scope);
542     } else if (sah) {
543         size_t best_len = 0;
544         for (CFeat_CI mrna_find(sah, CSeqFeatData::eSubtype_mRNA); mrna_find; ++mrna_find) {
545             if (sequence::TestForOverlap64(mrna_find->GetLocation(), cd_loc, sequence::eOverlap_CheckIntervals) != -1) {
546                 size_t len = sequence::GetLength(mrna_find->GetLocation(), &scope);
547                 if (best_len == 0 || len < best_len) {
548                     best_len = len;
549                     mrna = &(mrna_find->GetOriginalFeature());
550                 }
551             }
552         }
553     }
554 
555     if (!mrna || !NStr::Equal(prot_nm, s_GetmRNAName(*mrna))) {
556         new_mrna.Reset (new CSeq_feat());
557         new_mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
558         new_mrna->SetLocation().Assign(cd_loc);
559         new_mrna->SetData().SetRna().SetExt().SetName(prot_nm);
560 
561         bool found3 = false;
562         bool found5 = false;
563         CRef<CSeq_loc> loc(new CSeq_loc());
564         loc->Assign(new_mrna->GetLocation());
565         CFeat_CI utr1, utr2;
566         if (bsh)
567         {
568             utr1 = CFeat_CI(bsh, cd_loc.IsReverseStrand() ? CSeqFeatData::eSubtype_5UTR : CSeqFeatData::eSubtype_3UTR);
569             utr2 = CFeat_CI(bsh, cd_loc.IsReverseStrand() ? CSeqFeatData::eSubtype_3UTR : CSeqFeatData::eSubtype_5UTR);
570         }
571         else if (sah)
572         {
573             utr1 = CFeat_CI(sah, cd_loc.IsReverseStrand() ? CSeqFeatData::eSubtype_5UTR : CSeqFeatData::eSubtype_3UTR);
574             utr2 = CFeat_CI(sah, cd_loc.IsReverseStrand() ? CSeqFeatData::eSubtype_3UTR : CSeqFeatData::eSubtype_5UTR);
575         }
576 
577         for (; utr1; ++utr1)
578         {
579             if (utr1->GetLocation().GetStart(eExtreme_Positional) == cd_loc.GetStop(eExtreme_Positional) + 1)
580             {
581                 loc = sequence::Seq_loc_Add(*loc, utr1->GetLocation(), CSeq_loc::fMerge_All|CSeq_loc::fSort, &scope);
582                 if (cd_loc.IsReverseStrand())
583                     found5 = true;
584                 else
585                     found3 = true;
586                 break;
587             }
588         }
589         for (; utr2; ++utr2)
590         {
591             if (utr2->GetLocation().GetStop(eExtreme_Positional) + 1 == cd_loc.GetStart(eExtreme_Positional) )
592             {
593                 loc = sequence::Seq_loc_Add(*loc, utr2->GetLocation(), CSeq_loc::fMerge_All|CSeq_loc::fSort, &scope);
594                 if (cd_loc.IsReverseStrand())
595                     found3 = true;
596                 else
597                     found5 = true;
598                 break;
599             }
600         }
601 
602         CConstRef <CSeq_feat> gene = sequence::GetBestGeneForCds(cds, scope);
603         const CSeq_loc *overlap_loc = &cd_loc;
604         if (gene && gene->IsSetLocation())
605         {
606             overlap_loc = &gene->GetLocation();
607         }
608         auto gene_start = overlap_loc->GetStart(eExtreme_Positional);
609         auto gene_stop = overlap_loc->GetStop(eExtreme_Positional);
610 
611         CFeat_CI exon;
612         if (bsh)
613             exon = CFeat_CI(bsh, CSeqFeatData::eSubtype_exon);
614         else if (sah)
615             exon = CFeat_CI(sah, CSeqFeatData::eSubtype_exon);
616         for (; exon; ++exon)
617         {
618             if (!sequence::IsSameBioseq(*exon->GetLocation().GetId(), *overlap_loc->GetId(), &scope))
619                 continue;
620             auto exon_start = exon->GetLocation().GetStart(eExtreme_Positional);
621             auto exon_stop = exon->GetLocation().GetStop(eExtreme_Positional);
622             auto mrna_start = loc->GetStart(eExtreme_Positional);
623             auto mrna_stop = loc->GetStop(eExtreme_Positional);
624             if (exon_start >= gene_start && exon_stop <= gene_stop)
625             {
626                 bool exon_found = false;
627                 if (exon_start < mrna_start )
628                 {
629                     if (loc->IsReverseStrand())
630                         found3 = true;
631                     else
632                         found5 = true;
633                     exon_found = true;
634                 }
635                 if (exon_stop > mrna_stop)
636                 {
637                     if (loc->IsReverseStrand())
638                         found5 = true;
639                     else
640                         found3 = true;
641                     exon_found = true;
642                 }
643                 if (exon_found)
644                 {
645                     loc = sequence::Seq_loc_Add(*loc, exon->GetLocation(), CSeq_loc::fMerge_All|CSeq_loc::fSort, &scope);
646                 }
647             }
648         }
649 
650         new_mrna->SetLocation(*loc);
651 
652         if (!found5)
653             new_mrna->SetLocation().SetPartialStart(true, eExtreme_Positional);
654         if (!found3)
655             new_mrna->SetLocation().SetPartialStop(true, eExtreme_Positional);
656 
657         new_mrna->SetPartial(new_mrna->GetLocation().IsPartialStart(eExtreme_Positional) || new_mrna->GetLocation().IsPartialStop(eExtreme_Positional));
658     }
659     return new_mrna;
660 }
661 
662 /// GetmRNAforCDS
663 /// A function to find a CSeq_feat representing the
664 /// appropriate mRNA for a given CDS.
665 /// @param cds        The feature for which the mRNA to be found
666 /// @param scope      The scope
667 ///
668 /// @return           CConstRef<CSeq_feat> for new mRNA (will be NULL if none is found)
669 
GetmRNAforCDS(const CSeq_feat & cds,CScope & scope)670 CConstRef<CSeq_feat> GetmRNAforCDS(const CSeq_feat& cds, CScope& scope)
671 {
672     CConstRef<CSeq_feat> mrna;
673     bool has_xref = false;
674     if (cds.IsSetXref()) {
675         /* using FeatID from feature cross-references:
676         * if CDS refers to an mRNA by feature ID, use that feature
677         */
678         CBioseq_Handle bsh = scope.GetBioseqHandle(cds.GetLocation());
679         CTSE_Handle tse = bsh.GetTSE_Handle();
680         FOR_EACH_SEQFEATXREF_ON_SEQFEAT (it, cds) {
681             if ((*it)->IsSetId() && (*it)->GetId().IsLocal() && (*it)->GetId().GetLocal().IsId()) {
682                 CSeq_feat_Handle mrna_h = tse.GetFeatureWithId(CSeqFeatData::eSubtype_mRNA, (*it)->GetId().GetLocal().GetId());
683                 if (mrna_h) {
684                     mrna = mrna_h.GetSeq_feat();
685                 }
686                 has_xref = true;
687             }
688         }
689     }
690     if (!has_xref) {
691         /* using original location to find mRNA:
692         * mRNA must include the CDS location and the internal interval boundaries need to be identical
693         */
694         mrna = sequence::GetBestOverlappingFeat( cds.GetLocation(), CSeqFeatData::eSubtype_mRNA, sequence::eOverlap_CheckIntRev, scope);
695     }
696     return mrna;
697 }
698 
699 /// GetGeneticCodeForBioseq
700 /// A function to construct the appropriate CGenetic_code object to use
701 /// when constructing a coding region for a given Bioseq (if the default code
702 /// should not be used).
703 /// @param bh         The Bioseq_Handle of the nucleotide sequence on which the
704 ///                   coding region is to be created.
705 ///
706 /// @return           CRef<CGenetic_code> for new CGenetic_code (will be NULL if default should be used)
GetGeneticCodeForBioseq(CBioseq_Handle bh)707 CRef<CGenetic_code> GetGeneticCodeForBioseq(CBioseq_Handle bh)
708 {
709     CRef<CGenetic_code> code(NULL);
710     if (!bh) {
711         return code;
712     }
713     CSeqdesc_CI src(bh, CSeqdesc::e_Source);
714     if (src && src->GetSource().IsSetOrg() && src->GetSource().GetOrg().IsSetOrgname()) {
715         const CBioSource & bsrc = src->GetSource();
716         int bioseqGenCode = bsrc.GetGenCode(0);
717         if (bioseqGenCode > 0) {
718             code.Reset(new CGenetic_code());
719             code->SetId(bioseqGenCode);
720         }
721     }
722     return code;
723 }
724 
725 
TruncateSeqLoc(const CSeq_loc & orig_loc,size_t new_len)726 static CRef<CSeq_loc> TruncateSeqLoc (const CSeq_loc& orig_loc, size_t new_len)
727 {
728     CRef<CSeq_loc> new_loc;
729 
730     size_t len = 0;
731     for (CSeq_loc_CI it(orig_loc); it && len < new_len; ++it) {
732         size_t this_len = it.GetRange().GetLength();
733         CConstRef<CSeq_loc> this_loc = it.GetRangeAsSeq_loc();
734         if (len + this_len <= new_len) {
735             if (new_loc) {
736                 new_loc->Add(*this_loc);
737             } else {
738                 new_loc.Reset(new CSeq_loc());
739                 new_loc->Assign(*this_loc);
740             }
741             len += this_len;
742         } else {
743             CRef<CSeq_loc> partial_loc(new CSeq_loc());
744             size_t len_wanted = new_len - len;
745             size_t start = this_loc->GetStart(eExtreme_Biological);
746             if (len_wanted == 1) {
747                 // make a point
748                 partial_loc->SetPnt().SetPoint(start);
749             } else {
750                 // make an interval
751                 if (this_loc->IsSetStrand() && this_loc->GetStrand() == eNa_strand_minus) {
752                     partial_loc->SetInt().SetFrom(start - len_wanted + 1);
753                     partial_loc->SetInt().SetTo(start);
754                 } else {
755                     partial_loc->SetInt().SetFrom(start);
756                     partial_loc->SetInt().SetTo(start + len_wanted - 1);
757                 }
758             }
759             partial_loc->SetId(*this_loc->GetId());
760             if (this_loc->IsSetStrand()) {
761                 partial_loc->SetStrand(this_loc->GetStrand());
762             }
763             if (new_loc) {
764                 new_loc->Add(*partial_loc);
765             } else {
766                 new_loc.Reset(new CSeq_loc());
767                 new_loc->Assign(*partial_loc);
768             }
769             len += len_wanted;
770         }
771     }
772 
773     return new_loc;
774 }
775 
776 
777 /// TruncateCDSAtStop
778 /// A function to truncate a CDS location after the first stop codon in the
779 /// protein translation. Note that adjustments are not made to the protein
780 /// sequence in this function, only the location and partialness of the coding
781 /// region.
782 /// @param cds        The feature to adjust
783 /// @param scope      The scope in which adjustments are to be made (if necessary)
784 ///
785 /// @return           true if stop codon was found, false otherwise
TruncateCDSAtStop(CSeq_feat & cds,CScope & scope)786 bool TruncateCDSAtStop(CSeq_feat& cds, CScope& scope)
787 {
788     bool rval = false;
789     CRef<CBioseq> bioseq = CSeqTranslator::TranslateToProtein (cds, scope);
790     if (bioseq) {
791         CRef<CSeq_loc> new_loc(NULL);
792         string prot_str;
793         CSeqTranslator::Translate(cds, scope, prot_str);
794         size_t pos = NStr::Find(prot_str, "*");
795         if (pos != string::npos) {
796             // want to truncate the location and retranslate
797             size_t len_wanted =  3 * (pos + 1);
798             if (cds.GetData().GetCdregion().IsSetFrame()) {
799                 CCdregion::EFrame frame = cds.GetData().GetCdregion().GetFrame();
800                 if (frame == CCdregion::eFrame_two) {
801                     len_wanted += 1;
802                 } else if (frame == CCdregion::eFrame_three) {
803                     len_wanted += 2;
804                 }
805             }
806             if (len_wanted > 0) {
807                 new_loc = TruncateSeqLoc (cds.GetLocation(), len_wanted);
808                 if (new_loc) {
809                     new_loc->SetPartialStart(cds.GetLocation().IsPartialStart(eExtreme_Biological), eExtreme_Biological);
810                     new_loc->SetPartialStop(false, eExtreme_Biological);
811                     cds.SetLocation().Assign(*new_loc);
812                     if (cds.GetLocation().IsPartialStart(eExtreme_Biological)) {
813                         cds.SetPartial(true);
814                     } else {
815                         cds.ResetPartial();
816                     }
817                     rval = true;
818                 }
819             }
820         }
821     }
822     return rval;
823 }
824 
825 
826 /// ExtendCDSToStopCodon
827 /// A function to extend a CDS location to the first in-frame stop codon in the
828 /// protein translation. Note that adjustments are not made to the protein
829 /// sequence in this function, only the location and partialness of the coding
830 /// region.
831 /// @param cds        The feature to adjust
832 /// @param scope      The scope in which adjustments are to be made (if necessary)
833 ///
834 /// @return           true if stop codon was found, false otherwise
ExtendCDSToStopCodon(CSeq_feat & cds,CScope & scope)835 bool ExtendCDSToStopCodon (CSeq_feat& cds, CScope& scope)
836 {
837     if (!cds.IsSetLocation()) {
838         return false;
839     }
840 
841     const CSeq_loc& loc = cds.GetLocation();
842 
843     CBioseq_Handle bsh = scope.GetBioseqHandle(*(loc.GetId()));
844     if (!bsh) {
845         return false;
846     }
847 
848     const CGenetic_code* code = NULL;
849     if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetCode()) {
850         code = &(cds.GetData().GetCdregion().GetCode());
851     }
852 
853     size_t stop = loc.GetStop(eExtreme_Biological);
854     // figure out if we have a partial codon at the end
855     size_t orig_len = sequence::GetLength(loc, &scope);
856     size_t len = orig_len;
857     if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetFrame()) {
858         CCdregion::EFrame frame = cds.GetData().GetCdregion().GetFrame();
859         if (frame == CCdregion::eFrame_two) {
860             len -= 1;
861         } else if (frame == CCdregion::eFrame_three) {
862             len -= 2;
863         }
864     }
865     size_t mod = len % 3;
866     CRef<CSeq_loc> vector_loc(new CSeq_loc());
867     vector_loc->SetInt().SetId().Assign(*(loc.GetId()));
868 
869     if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
870         vector_loc->SetInt().SetFrom(0);
871         vector_loc->SetInt().SetTo(stop + mod - 1);
872         vector_loc->SetStrand(eNa_strand_minus);
873     } else {
874         vector_loc->SetInt().SetFrom(stop - mod + 1);
875         vector_loc->SetInt().SetTo(bsh.GetInst_Length() - 1);
876     }
877 
878     CSeqVector seq(*vector_loc, scope, CBioseq_Handle::eCoding_Iupac);
879     // reserve our space
880     const size_t usable_size = seq.size();
881 
882     // get appropriate translation table
883     const CTrans_table & tbl =
884         (code ? CGen_code_table::GetTransTable(*code) :
885                 CGen_code_table::GetTransTable(1));
886 
887     // main loop through bases
888     CSeqVector::const_iterator start = seq.begin();
889 
890     size_t i;
891     size_t k;
892     size_t state = 0;
893     size_t length = usable_size / 3;
894 
895     CRef<CSeq_loc> new_loc(NULL);
896 
897     for (i = 0;  i < length;  ++i) {
898         // loop through one codon at a time
899         for (k = 0;  k < 3;  ++k, ++start) {
900             state = tbl.NextCodonState(state, *start);
901         }
902 
903         if (tbl.GetCodonResidue (state) == '*') {
904             CSeq_loc_CI it(loc);
905             CSeq_loc_CI it_next = it;
906             ++it_next;
907             while (it_next) {
908                 CConstRef<CSeq_loc> this_loc = it.GetRangeAsSeq_loc();
909                 if (new_loc) {
910                     new_loc->Add(*this_loc);
911                 } else {
912                     new_loc.Reset(new CSeq_loc());
913                     new_loc->Assign(*this_loc);
914                 }
915                 it = it_next;
916                 ++it_next;
917             }
918             CRef<CSeq_loc> last_interval(new CSeq_loc());
919             CConstRef<CSeq_loc> this_loc = it.GetRangeAsSeq_loc();
920             size_t this_start = this_loc->GetStart(eExtreme_Positional);
921             size_t this_stop = this_loc->GetStop(eExtreme_Positional);
922             size_t extension = ((i + 1) * 3) - mod;
923             last_interval->SetInt().SetId().Assign(*(this_loc->GetId()));
924             if (this_loc->IsSetStrand() && this_loc->GetStrand() == eNa_strand_minus) {
925                 last_interval->SetStrand(eNa_strand_minus);
926                 last_interval->SetInt().SetFrom(this_start - extension);
927                 last_interval->SetInt().SetTo(this_stop);
928             } else {
929                 last_interval->SetInt().SetFrom(this_start);
930                 last_interval->SetInt().SetTo(this_stop + extension);
931             }
932 
933             if (new_loc) {
934                 new_loc->Add(*last_interval);
935             } else {
936                 new_loc.Reset(new CSeq_loc());
937                 new_loc->Assign(*last_interval);
938             }
939             new_loc->SetPartialStop(false, eExtreme_Biological);
940 
941             cds.SetLocation().Assign(*new_loc);
942             return true;
943         }
944     }
945 
946     if (usable_size < 3 && !new_loc) {
947         if (loc.GetStrand() == eNa_strand_minus) {
948             new_loc = SeqLocExtend(loc, 0, &scope);
949         } else {
950             new_loc = SeqLocExtend(loc, bsh.GetInst_Length() - 1, &scope);
951         }
952         new_loc->SetPartialStop(true, eExtreme_Biological);
953         cds.SetLocation().Assign(*new_loc);
954         return true;
955     }
956 
957     return false;
958 }
959 
960 
AdjustCDSFrameForStartChange(CCdregion & cds,int change)961 void AdjustCDSFrameForStartChange(CCdregion& cds, int change)
962 {
963     TSeqPos old_frame = CCdregion::eFrame_one;
964     if (cds.IsSetFrame() && cds.GetFrame() != CCdregion::eFrame_not_set) {
965         old_frame = cds.GetFrame();
966     }
967 
968     TSignedSeqPos new_frame = old_frame - (change % 3);
969     if (new_frame < 1)
970     {
971         new_frame += 3;
972     }
973     cds.SetFrame((CCdregion::EFrame)new_frame);
974 }
975 
976 
DemoteCDSToNucSeq(objects::CSeq_feat_Handle & orig_feat)977 bool DemoteCDSToNucSeq(objects::CSeq_feat_Handle& orig_feat)
978 {
979     CSeq_feat_EditHandle feh(orig_feat);
980     CSeq_entry_Handle parent_entry = feh.GetAnnot().GetParentEntry();
981 
982     if (!parent_entry.IsSet() || !parent_entry.GetSet().IsSetClass() ||
983         parent_entry.GetSet().GetClass() != CBioseq_set::eClass_nuc_prot) {
984         // no change, not on nuc-prot set
985         return false;
986     }
987 
988     CBioseq_CI bi(parent_entry, CSeq_inst::eMol_na);
989     if (!bi) {
990         // no nucleotide sequence to move to
991         return false;
992     }
993 
994 
995     // This is necessary, to make sure that we are in "editing mode"
996     const CSeq_annot_Handle& annot_handle = orig_feat.GetAnnot();
997     CSeq_entry_EditHandle eh = annot_handle.GetParentEntry().GetEditHandle();
998 
999     CSeq_annot_Handle ftable;
1000     CSeq_entry_Handle nuc_seh = bi->GetSeq_entry_Handle();
1001     CSeq_annot_CI annot_ci(nuc_seh, CSeq_annot_CI::eSearch_entry);
1002     for (; annot_ci; ++annot_ci) {
1003         if ((*annot_ci).IsFtable()) {
1004             ftable = *annot_ci;
1005             break;
1006         }
1007     }
1008 
1009     if (!ftable) {
1010         CRef<CSeq_annot> new_annot(new CSeq_annot());
1011         new_annot->SetData().SetFtable();
1012         CSeq_entry_EditHandle eh = nuc_seh.GetEditHandle();
1013         ftable = eh.AttachAnnot(*new_annot);
1014     }
1015 
1016     CSeq_annot_EditHandle old_annot = annot_handle.GetEditHandle();
1017     CSeq_annot_EditHandle new_annot = ftable.GetEditHandle();
1018     orig_feat = new_annot.TakeFeat(feh);
1019     const list< CRef< CSeq_feat > > &feat_list = old_annot.GetSeq_annotCore()->GetData().GetFtable();
1020     if (feat_list.empty())
1021     {
1022         old_annot.Remove();
1023     }
1024     return true;
1025 }
1026 
1027 
s_SetCDSFrame(CSeq_feat & cds,ECdsFrame frame_type,CScope & scope)1028 bool ApplyCDSFrame::s_SetCDSFrame(CSeq_feat& cds, ECdsFrame frame_type, CScope& scope)
1029 {
1030     if (!cds.IsSetData() || !cds.GetData().IsCdregion())
1031         return false;
1032 
1033     CCdregion::TFrame orig_frame = CCdregion::eFrame_not_set;
1034     if (cds.GetData().GetCdregion().IsSetFrame()) {
1035         orig_frame = cds.GetData().GetCdregion().GetFrame();
1036     }
1037     // retrieve the new frame
1038     CCdregion::TFrame new_frame = orig_frame;
1039     switch (frame_type) {
1040     case eNotSet:
1041         break;
1042     case eBest:
1043         new_frame = CSeqTranslator::FindBestFrame(cds, scope);
1044         break;
1045     case eMatch:
1046         new_frame = s_FindMatchingFrame(cds, scope);
1047         break;
1048     case eOne:
1049         new_frame = CCdregion::eFrame_one;
1050         break;
1051     case eTwo:
1052         new_frame = CCdregion::eFrame_two;
1053         break;
1054     case eThree:
1055         new_frame = CCdregion::eFrame_three;
1056         break;
1057     }
1058 
1059     bool modified = false;
1060     if (orig_frame != new_frame) {
1061         cds.SetData().SetCdregion().SetFrame(new_frame);
1062         modified = true;
1063     }
1064     return modified;
1065 }
1066 
s_FindMatchingFrame(const CSeq_feat & cds,CScope & scope)1067 CCdregion::TFrame ApplyCDSFrame::s_FindMatchingFrame(const CSeq_feat& cds, CScope& scope)
1068 {
1069     CCdregion::TFrame new_frame = CCdregion::eFrame_not_set;
1070     //return the frame that matches the protein sequence if it can find one
1071     if (!cds.IsSetData() || !cds.GetData().IsCdregion() || !cds.IsSetLocation() || !cds.IsSetProduct()) {
1072         return new_frame;
1073     }
1074 
1075     // get the protein sequence
1076     CBioseq_Handle product = scope.GetBioseqHandle(cds.GetProduct());
1077     if (!product || !product.IsProtein()) {
1078         return new_frame;
1079     }
1080 
1081     // obtaining the original protein sequence
1082     CSeqVector prot_vec = product.GetSeqVector(CBioseq_Handle::eCoding_Ncbi);
1083     prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
1084     string orig_prot_seq;
1085     prot_vec.GetSeqData(0, prot_vec.size(), orig_prot_seq);
1086     if (NStr::IsBlank(orig_prot_seq)) {
1087         return new_frame;
1088     }
1089 
1090     CRef<CSeq_feat> tmp_cds(new CSeq_feat);
1091     tmp_cds->Assign(cds);
1092     for (int enumI = CCdregion::eFrame_one; enumI < CCdregion::eFrame_three + 1; ++enumI) {
1093         CCdregion::EFrame fr = (CCdregion::EFrame) (enumI);
1094         tmp_cds->SetData().SetCdregion().SetFrame(fr);
1095 
1096         string new_prot_seq;
1097         CSeqTranslator::Translate(*tmp_cds, scope, new_prot_seq);
1098         if (NStr::EndsWith(new_prot_seq, '*'))
1099             new_prot_seq.erase(new_prot_seq.end() - 1);
1100         if (NStr::EqualNocase(new_prot_seq, orig_prot_seq)) {
1101             new_frame = fr;
1102             break;
1103         }
1104     }
1105 
1106     return new_frame;
1107 }
1108 
s_GetFrameFromName(const string & name)1109 ApplyCDSFrame::ECdsFrame ApplyCDSFrame::s_GetFrameFromName(const string& name)
1110 {
1111     ECdsFrame frame = eNotSet;
1112     if (NStr::EqualNocase(name, "best")) {
1113         frame = eBest;
1114     } else if (NStr::EqualNocase(name, "match")) {
1115         frame = eMatch;
1116     } else if (NStr::Equal(name, "1") || NStr::EqualNocase(name, "one")) {
1117         frame = eOne;
1118     } else if (NStr::Equal(name, "2") || NStr::EqualNocase(name, "two")) {
1119         frame = eTwo;
1120     } else if (NStr::Equal(name, "3") || NStr::EqualNocase(name, "three")) {
1121         frame = eThree;
1122     }
1123     return frame;
1124 }
1125 
1126 const unsigned int MAX_ID_LENGTH = 50;
1127 
GetIdHash(const string & str)1128 static inline string GetIdHash(const string &str)
1129 {
1130     return NStr::NumericToString(NHash::CityHash64(str), 0, 16);
1131 }
1132 
GetIdHashOrValue(const string & base,int offset)1133 string GetIdHashOrValue(const string &base, int offset)
1134 {
1135     string new_str = base;
1136     if (offset > 0)
1137         new_str += "_" + NStr::NumericToString(offset);
1138     if (new_str.length() <= MAX_ID_LENGTH)
1139         return new_str;
1140     string new_hash = GetIdHash(base);
1141     if (offset > 0)
1142         new_hash += "_" + NStr::NumericToString(offset);
1143     return new_hash;
1144 }
1145 
GetNewLocalProtId(const string & id_base,CScope & scope,int & offset)1146 CRef<objects::CSeq_id> GetNewLocalProtId(const string &id_base, CScope &scope, int &offset)
1147 {
1148     string id_base_hash = GetIdHash(id_base);
1149     CRef<objects::CSeq_id> new_id(new objects::CSeq_id());
1150     string new_str = id_base;
1151     if (offset > 0)
1152         new_str += "_" + NStr::NumericToString(offset);
1153     new_id->SetLocal().SetStr(new_str);
1154     CRef<objects::CSeq_id> new_id_hash(new objects::CSeq_id());
1155     string new_hash = id_base_hash;
1156     if (offset > 0)
1157         new_hash += "_" + NStr::NumericToString(offset);
1158     new_id_hash->SetLocal().SetStr(new_hash);
1159     objects::CBioseq_Handle b_found = scope.GetBioseqHandle(*new_id);
1160     objects::CBioseq_Handle b_found_hash = scope.GetBioseqHandle(*new_id_hash); // as we consider ID_NUM and HASH_NUM to be synonyms we need to check for the existence of both at the same time
1161     // to avoid a situation where ID_1 and HASH_1 are both created
1162     while (b_found || b_found_hash)
1163     {
1164         offset++;
1165         new_id->SetLocal().SetStr(id_base + "_" + NStr::NumericToString(offset));
1166         b_found = scope.GetBioseqHandle(*new_id);
1167         new_id_hash->SetLocal().SetStr(id_base_hash + "_" + NStr::NumericToString(offset));
1168         b_found_hash = scope.GetBioseqHandle(*new_id_hash);
1169     }
1170     if (new_id->GetLocal().GetStr().size() <= MAX_ID_LENGTH)
1171         return new_id;
1172     return new_id_hash;
1173 }
1174 
GetNewGeneralProtId(const string & id_base,const string & db,CScope & scope,int & offset)1175 static CRef<objects::CSeq_id> GetNewGeneralProtId(const string &id_base, const string &db, CScope &scope, int &offset)
1176 {
1177     string id_base_hash = GetIdHash(id_base);
1178     CRef<objects::CSeq_id> new_id(new objects::CSeq_id());
1179     new_id->SetGeneral().SetDb(db);
1180     string new_str = id_base;
1181     if (offset > 0)
1182         new_str += "_" + NStr::NumericToString(offset);
1183     new_id->SetGeneral().SetTag().SetStr(new_str);
1184     CRef<objects::CSeq_id> new_id_hash(new objects::CSeq_id());
1185     new_id_hash->SetGeneral().SetDb(db);
1186     string new_hash = id_base_hash;
1187     if (offset > 0)
1188         new_hash += "_" + NStr::NumericToString(offset);
1189     new_id_hash->SetGeneral().SetTag().SetStr(new_hash);
1190     objects::CBioseq_Handle b_found = scope.GetBioseqHandle(*new_id);
1191     objects::CBioseq_Handle b_found_hash = scope.GetBioseqHandle(*new_id_hash); // as we consider ID_NUM and HASH_NUM to be synonyms we need to check for the existence of both at the same time
1192     // to avoid a situation where ID_1 and HASH_1 are both created
1193     while (b_found || b_found_hash)
1194     {
1195         offset++;
1196         new_id->SetGeneral().SetTag().SetStr(id_base + "_" + NStr::NumericToString(offset));
1197         b_found = scope.GetBioseqHandle(*new_id);
1198         new_id_hash->SetGeneral().SetTag().SetStr(id_base_hash + "_" + NStr::NumericToString(offset));
1199         b_found_hash = scope.GetBioseqHandle(*new_id_hash);
1200     }
1201     if (new_id->GetGeneral().GetTag().GetStr().size() <= MAX_ID_LENGTH)
1202         return  new_id;
1203     return new_id_hash;
1204 }
1205 
GetGeneralOrLocal(objects::CSeq_id_Handle hid,CScope & scope,int & offset,bool fall_through)1206 static CRef<objects::CSeq_id> GetGeneralOrLocal(objects::CSeq_id_Handle hid, CScope &scope, int &offset, bool fall_through)
1207 {
1208     CRef<objects::CSeq_id> new_id;
1209     if (hid.GetSeqId()->IsLocal())
1210     {
1211         string id_base;
1212         if (hid.GetSeqId()->GetLocal().IsId())
1213         {
1214             id_base =  NStr::NumericToString(hid.GetSeqId()->GetLocal().GetId());
1215         }
1216         else
1217         {
1218             id_base = hid.GetSeqId()->GetLocal().GetStr();
1219         }
1220         new_id = GetNewLocalProtId(id_base, scope, offset);
1221     }
1222     else if (hid.GetSeqId()->IsGeneral() && hid.GetSeqId()->GetGeneral().IsSetTag()
1223              && (!hid.GetSeqId()->GetGeneral().IsSetDb() || hid.GetSeqId()->GetGeneral().GetDb() != "TMSMART"))
1224     {
1225         string id_base;
1226         if (hid.GetSeqId()->GetGeneral().GetTag().IsId())
1227         {
1228             id_base =  NStr::NumericToString(hid.GetSeqId()->GetGeneral().GetTag().GetId());
1229         }
1230         else
1231         {
1232             id_base = hid.GetSeqId()->GetGeneral().GetTag().GetStr();
1233         }
1234         new_id = GetNewGeneralProtId(id_base, hid.GetSeqId()->GetGeneral().GetDb(), scope, offset);
1235     }
1236     else if (fall_through) // if we don't care for the incoming seq-id to be specifically local or general take any input and create a local seq-id output
1237     {
1238         string id_base;
1239         hid.GetSeqId()->GetLabel(&id_base, objects::CSeq_id::eContent);
1240         new_id = GetNewLocalProtId(id_base, scope, offset);
1241     }
1242     return new_id;
1243 }
1244 
GetNewProtIdFromExistingProt(objects::CBioseq_Handle bsh,int & offset,string & id_label)1245 vector<CRef<objects::CSeq_id> > GetNewProtIdFromExistingProt(objects::CBioseq_Handle bsh, int &offset, string& id_label)
1246 {
1247     vector<CRef<objects::CSeq_id> > ids;
1248     for(auto it : bsh.GetId())
1249     {
1250         if (it.GetSeqIdOrNull())
1251         {
1252             objects::CSeq_id_Handle hid = it;
1253             CRef<objects::CSeq_id> new_id = GetGeneralOrLocal(hid, bsh.GetScope(), offset, false);
1254             if (new_id)
1255                 ids.push_back(new_id);
1256         }
1257     }
1258 
1259     if (ids.empty() && !bsh.GetId().empty())
1260     {
1261         CRef<objects::CSeq_id> new_id = GetGeneralOrLocal(bsh.GetId().front(), bsh.GetScope(), offset, true);
1262         ids.push_back(new_id);
1263     }
1264 
1265     if (ids.empty())
1266         NCBI_THROW(CException, eUnknown, "Seq-id not found");
1267 
1268     ids.front()->GetLabel(&id_label, objects::CSeq_id::eBoth);
1269     return ids;
1270 }
1271 
GetNewProtId(objects::CBioseq_Handle bsh,int & offset,string & id_label,bool general_only)1272 CRef<objects::CSeq_id> GetNewProtId(objects::CBioseq_Handle bsh, int &offset, string& id_label, bool general_only)
1273 {
1274     objects::CSeq_id_Handle hid = sequence::GetId(bsh, sequence::eGetId_Best);
1275     objects::CSeq_id_Handle gen_id;
1276 
1277     for (auto it : bsh.GetId())
1278     {
1279         if (it.GetSeqId()->IsGeneral() && it.GetSeqId()->GetGeneral().IsSetDb() &&
1280             !it.GetSeqId()->GetGeneral().IsSkippable())
1281         {
1282             gen_id = it;
1283         }
1284     }
1285     if (gen_id && general_only)
1286     {
1287         hid = gen_id;
1288     }
1289 
1290     if (!hid)
1291         NCBI_THROW(CException, eUnknown, "Seq-id of the requested type not found");
1292 
1293     CRef<objects::CSeq_id> new_id = GetGeneralOrLocal(hid, bsh.GetScope(), offset, true);
1294     new_id->GetLabel(&id_label, objects::CSeq_id::eBoth);
1295     return new_id;
1296 }
1297 
IsGeneralIdProtPresent(objects::CSeq_entry_Handle tse)1298 bool IsGeneralIdProtPresent(objects::CSeq_entry_Handle tse)
1299 {
1300     bool found = false;
1301     for (CBioseq_CI b_iter(tse, CSeq_inst::eMol_aa); b_iter; ++b_iter)
1302     {
1303         for (auto it : b_iter->GetId())
1304         {
1305             if (it.GetSeqId()->IsGeneral() && it.GetSeqId()->GetGeneral().IsSetDb() &&
1306                 !it.GetSeqId()->GetGeneral().IsSkippable())
1307             {
1308                 found = true;
1309                 break;
1310             }
1311         }
1312     }
1313     return found;
1314 }
1315 
1316 END_SCOPE(edit)
1317 END_SCOPE(objects)
1318 END_NCBI_SCOPE
1319 
1320