1 /* $Id: cleanup.cpp 632626 2021-06-03 17:38:42Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Robert Smith
27  *
28  * File Description:
29  *   Basic Cleanup of CSeq_entries.
30  *
31  */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <serial/serialbase.hpp>
35 #include <objects/seq/Bioseq.hpp>
36 #include <objects/seq/Seq_annot.hpp>
37 // included for GetPubdescLabels and GetCitationList
38 #include <objects/pub/Pub.hpp>
39 #include <objects/pub/Pub_equiv.hpp>
40 #include <objects/seq/Pubdesc.hpp>
41 #include <objects/biblio/Author.hpp>
42 #include <objects/biblio/Auth_list.hpp>
43 #include <objects/general/Person_id.hpp>
44 #include <objects/general/Name_std.hpp>
45 #include <objects/misc/sequence_macros.hpp>
46 
47 #include <objects/seqset/Seq_entry.hpp>
48 #include <objects/seqset/Bioseq_set.hpp>
49 #include <objects/seqset/seqset_macros.hpp>
50 #include <objects/seqfeat/Org_ref.hpp>
51 #include <objects/seqfeat/Seq_feat.hpp>
52 #include <objects/seqfeat/SeqFeatXref.hpp>
53 #include <objects/general/Object_id.hpp>
54 #include <objects/general/User_object.hpp>
55 #include <objects/submit/Seq_submit.hpp>
56 #include <objects/taxon3/taxon3.hpp>
57 
58 #include <objmgr/object_manager.hpp>
59 #include <objmgr/util/sequence.hpp>
60 #include <objmgr/util/feature.hpp>
61 #include <objmgr/util/autodef.hpp>
62 #include <objmgr/seq_annot_ci.hpp>
63 #include <objmgr/seqdesc_ci.hpp>
64 #include <objmgr/seq_vector.hpp>
65 #include <objmgr/seq_vector_ci.hpp>
66 #include <objtools/edit/cds_fix.hpp>
67 #include <objtools/cleanup/cleanup.hpp>
68 #include "cleanup_utils.hpp"
69 #include <objtools/cleanup/cleanup_message.hpp>
70 
71 #include <util/strsearch.hpp>
72 
73 #include "newcleanupp.hpp"
74 
75 #include <objtools/logging/listener.hpp>
76 
77 BEGIN_NCBI_SCOPE
78 BEGIN_SCOPE(objects)
79 
/// Change-type enumeration; only a single "unknown" value is defined.
enum EChangeType {
    eChange_UNKNOWN
};
83 
84 // *********************** CCleanup implementation **********************
85 
86 
CCleanup(CScope * scope,EScopeOptions scope_handling)87 CCleanup::CCleanup(CScope* scope, EScopeOptions scope_handling)
88 {
89     if (scope && scope_handling == eScope_UseInPlace) {
90         m_Scope = scope;
91     }
92     else {
93         m_Scope = new CScope(*(CObjectManager::GetInstance()));
94         if (scope) {
95             m_Scope->AddScope(*scope);
96         }
97     }
98 }
99 
100 
~CCleanup(void)101 CCleanup::~CCleanup(void)
102 {
103 }
104 
105 
SetScope(CScope * scope)106 void CCleanup::SetScope(CScope* scope)
107 {
108     m_Scope.Reset(new CScope(*(CObjectManager::GetInstance())));
109     if (scope) {
110         m_Scope->AddScope(*scope);
111     }
112 }
113 
114 
115 static
makeCleanupChange(Uint4 options)116 CRef<CCleanupChange> makeCleanupChange(Uint4 options)
117 {
118     CRef<CCleanupChange> changes;
119     if (! (options  &  CCleanup::eClean_NoReporting)) {
120         changes.Reset(new CCleanupChange);
121     }
122     return changes;
123 }
124 
// Common prologue for the CCleanup entry points that work in m_Scope.
// Expects a variable named `options` in the enclosing function and declares
// two locals there: `changes` (the change record, possibly empty) and
// `clean_i` (the implementation object, with its scope set to *m_Scope).
#define CLEANUP_SETUP \
    CRef<CCleanupChange> changes(makeCleanupChange(options)); \
    CNewCleanup_imp clean_i(changes, options); \
    clean_i.SetScope(*m_Scope);
129 
BasicCleanup(CSeq_entry & se,Uint4 options)130 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_entry& se, Uint4 options)
131 {
132     CLEANUP_SETUP
133     clean_i.BasicCleanupSeqEntry(se);
134     return changes;
135 }
136 
137 
BasicCleanup(CSeq_submit & ss,Uint4 options)138 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_submit& ss, Uint4 options)
139 {
140     CLEANUP_SETUP
141     clean_i.BasicCleanupSeqSubmit(ss);
142     return changes;
143 }
144 
145 
BasicCleanup(CSubmit_block & block,Uint4 options)146 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSubmit_block& block, Uint4 options)
147 {
148     CLEANUP_SETUP
149     clean_i.BasicCleanupSubmitblock(block);
150     return changes;
151 }
152 
153 
154 /// Cleanup a Bioseq.
BasicCleanup(CBioseq & bs,Uint4 options)155 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CBioseq& bs, Uint4 options)
156 {
157     CLEANUP_SETUP
158     clean_i.BasicCleanupBioseq(bs);
159     return changes;
160 }
161 
162 
BasicCleanup(CBioseq_set & bss,Uint4 options)163 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CBioseq_set& bss, Uint4 options)
164 {
165     CLEANUP_SETUP
166     clean_i.BasicCleanupBioseqSet(bss);
167     return changes;
168 }
169 
170 
BasicCleanup(CSeq_annot & sa,Uint4 options)171 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_annot& sa, Uint4 options)
172 {
173     CLEANUP_SETUP
174     clean_i.BasicCleanupSeqAnnot(sa);
175     return changes;
176 }
177 
178 
BasicCleanup(CSeq_feat & sf,Uint4 options)179 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_feat& sf, Uint4 options)
180 {
181     CLEANUP_SETUP
182     clean_i.BasicCleanupSeqFeat(sf);
183     return changes;
184 }
185 
186 
BasicCleanup(CBioSource & src,Uint4 options)187 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CBioSource& src, Uint4 options)
188 {
189     CLEANUP_SETUP
190     clean_i.BasicCleanupBioSource(src);
191     return changes;
192 }
193 
194 
BasicCleanup(CSeq_entry_Handle & seh,Uint4 options)195 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_entry_Handle& seh, Uint4 options)
196 {
197     CRef<CCleanupChange> changes(makeCleanupChange(options));
198     CNewCleanup_imp clean_i(changes, options);
199     clean_i.SetScope(seh.GetScope());
200     clean_i.BasicCleanupSeqEntryHandle(seh);
201     return changes;
202 }
203 
204 
BasicCleanup(CBioseq_Handle & bsh,Uint4 options)205 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CBioseq_Handle& bsh,    Uint4 options)
206 {
207     CRef<CCleanupChange> changes(makeCleanupChange(options));
208     CNewCleanup_imp clean_i(changes, options);
209     clean_i.SetScope(bsh.GetScope());
210     clean_i.BasicCleanupBioseqHandle(bsh);
211     return changes;
212 }
213 
214 
BasicCleanup(CBioseq_set_Handle & bssh,Uint4 options)215 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CBioseq_set_Handle& bssh, Uint4 options)
216 {
217     CRef<CCleanupChange> changes(makeCleanupChange(options));
218     CNewCleanup_imp clean_i(changes, options);
219     clean_i.SetScope(bssh.GetScope());
220     clean_i.BasicCleanupBioseqSetHandle(bssh);
221     return changes;
222 }
223 
224 
BasicCleanup(CSeq_annot_Handle & sah,Uint4 options)225 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_annot_Handle& sah, Uint4 options)
226 {
227     CRef<CCleanupChange> changes(makeCleanupChange(options));
228     CNewCleanup_imp clean_i(changes, options);
229     clean_i.SetScope(sah.GetScope());
230     clean_i.BasicCleanupSeqAnnotHandle(sah);
231     return changes;
232 }
233 
234 
BasicCleanup(CSeq_feat_Handle & sfh,Uint4 options)235 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_feat_Handle& sfh,  Uint4 options)
236 {
237     CRef<CCleanupChange> changes(makeCleanupChange(options));
238     CNewCleanup_imp clean_i(changes, options);
239     clean_i.SetScope(sfh.GetScope());
240     clean_i.BasicCleanupSeqFeatHandle(sfh);
241     return changes;
242 }
243 
244 
BasicCleanup(CSeqdesc & desc,Uint4 options)245 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeqdesc& desc, Uint4 options)
246 {
247     CLEANUP_SETUP
248     clean_i.BasicCleanup(desc);
249     return changes;
250 
251 }
252 
253 
BasicCleanup(CSeq_descr & desc,Uint4 options)254 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_descr & desc, Uint4 options)
255 {
256     CLEANUP_SETUP
257 
258     for (auto& it : desc.Set()) {
259         clean_i.BasicCleanup(*it);
260     }
261     return changes;
262 }
263 
264 
265 // *********************** Extended Cleanup implementation ********************
ExtendedCleanup(CSeq_entry & se,Uint4 options)266 CConstRef<CCleanupChange> CCleanup::ExtendedCleanup(CSeq_entry& se,  Uint4 options)
267 {
268     CLEANUP_SETUP
269     clean_i.ExtendedCleanupSeqEntry(se);
270 
271     return changes;
272 }
273 
274 
ExtendedCleanup(CSeq_submit & ss,Uint4 options)275 CConstRef<CCleanupChange> CCleanup::ExtendedCleanup(CSeq_submit& ss,  Uint4 options)
276 {
277     CLEANUP_SETUP
278     clean_i.ExtendedCleanupSeqSubmit(ss);
279     return changes;
280 }
281 
282 
ExtendedCleanup(CSeq_annot & sa,Uint4 options)283 CConstRef<CCleanupChange> CCleanup::ExtendedCleanup(CSeq_annot& sa,  Uint4 options)
284 {
285     CLEANUP_SETUP
286     clean_i.ExtendedCleanupSeqAnnot(sa); // (m_Scope->GetSeq_annotHandle(sa));
287     return changes;
288 }
289 
ExtendedCleanup(CSeq_entry_Handle & seh,Uint4 options)290 CConstRef<CCleanupChange> CCleanup::ExtendedCleanup(CSeq_entry_Handle& seh,  Uint4 options)
291 {
292     CRef<CCleanupChange> changes(makeCleanupChange(options));
293     CNewCleanup_imp clean_i(changes, options);
294     clean_i.ExtendedCleanupSeqEntryHandle(seh); // (m_Scope->GetSeq_annotHandle(sa));
295     return changes;
296 }
297 
298 
299 // *********************** CCleanupChange implementation **********************
300 
301 
CCleanupChange()302 CCleanupChange::CCleanupChange()
303 {
304 }
305 
306 
ChangeCount() const307 size_t CCleanupChange::ChangeCount() const
308 {
309     return m_Changes.count();
310 }
311 
312 
IsChanged(CCleanupChange::EChanges e) const313 bool CCleanupChange::IsChanged(CCleanupChange::EChanges e) const
314 {
315     return m_Changes.test(e);
316 }
317 
318 
SetChanged(CCleanupChange::EChanges e)319 void CCleanupChange::SetChanged(CCleanupChange::EChanges e)
320 {
321     m_Changes.set(e);
322 }
323 
324 
GetAllChanges() const325 vector<CCleanupChange::EChanges> CCleanupChange::GetAllChanges() const
326 {
327     vector<EChanges>  result;
328     for (size_t i = eNoChange + 1; i < m_Changes.size(); ++i) {
329         if (m_Changes.test(i)) {
330             result.push_back( (EChanges) i);
331         }
332     }
333     return result;
334 }
335 
336 
GetAllDescriptions() const337 vector<string> CCleanupChange::GetAllDescriptions() const
338 {
339     vector<string>  result;
340     for (size_t i = eNoChange + 1; i < m_Changes.size(); ++i) {
341         if (m_Changes.test(i)) {
342             result.push_back( GetDescription((EChanges) i) );
343         }
344     }
345     return result;
346 }
347 
348 
GetDescription(EChanges e)349 string CCleanupChange::GetDescription(EChanges e)
350 {
351     if (e <= eNoChange  ||  e >= eNumberofChangeTypes) {
352         return sm_ChangeDesc[eNoChange];
353     }
354     return sm_ChangeDesc[e];
355 }
356 
357 // corresponds to the values in CCleanupChange::EChanges.
358 // They must be edited together.
359 const char* const CCleanupChange::sm_ChangeDesc[eNumberofChangeTypes + 1] = {
360     "Invalid Change Code",
361     // set when strings are changed.
362     "Trim Spaces",
363     "Clean Double Quotes",
364     "Append To String",
365     // set when lists are sorted or uniqued.
366     "Clean Qualifiers List",
367     "Clean Dbxrefs List",
368     "Clean CitonFeat List",
369     "Clean Keywords List",
370     "Clean Subsource List",
371     "Clean Orgmod List",
372     // Set when fields are moved or have content changes
373     "Repair BioseqMol", //10
374     "Change Feature Key",
375     "Normalize Authors",
376     "Change Publication",
377     "Change Qualifiers",
378     "Change Dbxrefs",
379     "Change Keywords",
380     "Change Subsource",
381     "Change Orgmod",
382     "Change Exception",
383     "Change Comment", //20
384     // Set when fields are rescued
385     "Change tRna",
386     "Change rRna",
387     "Change ITS",
388     "Change Anticodon",
389     "Change Code Break",
390     "Change Genetic Code",
391     "Copy GeneXref",
392     "Copy ProtXref",
393     // set when locations are repaired
394     "Change Seqloc",
395     "Change Strand", //30
396     "Change WholeLocation",
397     // set when MolInfo descriptors are affected
398     "Change MolInfo Descriptor",
399     // set when prot-xref is removed
400     "Remove ProtXref",
401     // set when gene-xref is removed
402     "Remove GeneXref",
403     // set when protein feature is added
404     "Add Protein Feature",
405     // set when feature is removed
406     "Remove Feature",
407     // set when feature is moved
408     "Move Feature",
409     // set when qualifier is removed
410     "Remove Qualifier",
411     // set when Gene Xref is created
412     "Add GeneXref",
413     // set when descriptor is removed
414     "Remove Descriptor", //40
415     "Remove Keyword",
416     "Add Descriptor",
417     "Move Descriptor",
418     "Convert Feature to Descriptor",
419     "Collapse Set",
420     "Change Feature Location",
421     "Remove Annotation",
422     "Convert Feature",
423     "Remove Comment",
424     "Add BioSource OrgMod", //50
425     "Add BioSource SubSource",
426     "Change BioSource Genome",
427     "Change BioSource Origin",
428     "Change BioSource Other",
429     "Change SeqId",
430     "Remove Empty Publication",
431     "Add Qualifier",
432     "Cleanup Date",
433     "Change BioseqInst",
434     "Remove SeqID", // 60
435     "Add ProtXref",
436     "Change Partial",
437     "Change Prot Names",
438     "Change Prot Activities",
439     "Change Site",
440     "Change PCR Primers",
441     "Change RNA-ref",
442     "Move To Prot Xref",
443     "Compress Spaces",
444     "Strip serial", // 70
445     "Remove Orgmod",
446     "Remove SubSource",
447     "Create Gene Nomenclature",
448     "Clean Seq-feat xref",
449     "Clean User-Object Or -Field",
450     "Letter Case Change",
451     "Change Bioseq-set Class",
452     "Unique Without Sort",
453     "Add RNA-ref",
454     "Change Gene-ref", // 80
455     "Clean Dbtag",
456     "Change Biomol",
457     "Change Cdregion",
458     "Clean EC Number",
459     "Remove Exception",
460     "Add NcbiCleanupObject",
461     "Clean Delta-ext",
462     "Trim Flanking Quotes",
463     "Clean Bioseq Title",
464     "Decode XML", // 90
465     "Remove Dup BioSource",
466     "Clean Org-ref",
467     "Trim Internal Semicolons",
468     "Add SeqFeatXref",
469     "Convert Unstructured Org-ref Modifier",
470     "Change taxname",
471     "Move GO term to GeneOntology object",
472 
473     // set when any other change is made.
474     "Change Other",
475     "Invalid Change Code"
476 };
477 
478 
s_ProcessedFromKey(const string & key)479 CProt_ref::EProcessed s_ProcessedFromKey(const string& key)
480 {
481     if (NStr::Equal(key, "sig_peptide")) {
482         return CProt_ref::eProcessed_signal_peptide;
483     } else if (NStr::Equal(key, "mat_peptide")) {
484         return CProt_ref::eProcessed_mature;
485     } else if (NStr::Equal(key, "transit_peptide")) {
486         return CProt_ref::eProcessed_transit_peptide;
487     } else if (NStr::Equal(key, "preprotein") || NStr::Equal(key, "proprotein")) {
488         return CProt_ref::eProcessed_preprotein;
489     } else if (NStr::Equal(key, "propeptide")) {
490         return CProt_ref::eProcessed_propeptide;
491     } else {
492         return CProt_ref::eProcessed_not_set;
493     }
494 }
495 
s_KeyFromProcessed(CProt_ref::EProcessed processed)496 string s_KeyFromProcessed(CProt_ref::EProcessed processed)
497 {
498     switch (processed) {
499     case CProt_ref::eProcessed_mature:
500         return "mat_peptide";
501         break;
502     case CProt_ref::eProcessed_preprotein:
503         return "preprotein";
504         break;
505     case CProt_ref::eProcessed_signal_peptide:
506         return "sig_peptide";
507         break;
508     case CProt_ref::eProcessed_transit_peptide:
509         return "transit_peptide";
510         break;
511     case CProt_ref::eProcessed_propeptide:
512         return "propeptide";
513         break;
514     case CProt_ref::eProcessed_not_set:
515         return kEmptyStr;
516         break;
517     }
518     return kEmptyStr;
519 }
520 
521 
ConvertProteinToImp(CSeq_feat_Handle fh)522 bool ConvertProteinToImp(CSeq_feat_Handle fh)
523 {
524     if (fh.GetData().IsProt() && fh.GetData().GetProt().IsSetProcessed()) {
525         string key = s_KeyFromProcessed(fh.GetData().GetProt().GetProcessed());
526         if (!NStr::IsBlank(key)) {
527             CRef<CSeq_feat> new_feat(new CSeq_feat());
528             new_feat->Assign(*(fh.GetSeq_feat()));
529             if (fh.GetData().GetProt().IsSetName() && !fh.GetData().GetProt().GetName().empty()) {
530                 CRef<CGb_qual> q(new CGb_qual());
531                 q->SetQual("product");
532                 q->SetVal(fh.GetData().GetProt().GetName().front());
533                 new_feat->SetQual().push_back(q);
534             }
535             new_feat->SetData().SetImp().SetKey(key);
536             CSeq_feat_EditHandle efh(fh);
537             efh.Replace(*new_feat);
538             return true;
539         }
540     }
541     return false;
542 }
543 
544 
s_IsPreprotein(CSeq_feat_Handle fh)545 bool s_IsPreprotein(CSeq_feat_Handle fh)
546 {
547     if (!fh.IsSetData()) {
548         return false;
549     } else if (fh.GetData().IsProt() &&
550         fh.GetData().GetProt().IsSetProcessed() &&
551         fh.GetData().GetProt().GetProcessed() == CProt_ref::eProcessed_preprotein) {
552         return true;
553     } else if (fh.GetData().IsImp() &&
554         fh.GetData().GetImp().IsSetKey() &&
555         s_ProcessedFromKey(fh.GetData().GetImp().GetKey()) == CProt_ref::eProcessed_preprotein) {
556         return true;
557     } else {
558         return false;
559     }
560 }
561 
562 
RescueProtProductQual(CSeq_feat & feat)563 void RescueProtProductQual(CSeq_feat& feat)
564 {
565     if (!feat.IsSetQual() ||
566         !feat.IsSetData() ||
567         !feat.GetData().IsProt() ||
568         feat.GetData().GetProt().IsSetName()) {
569         return;
570     }
571     CSeq_feat::TQual::iterator it = feat.SetQual().begin();
572     while (it != feat.SetQual().end()) {
573         if ((*it)->IsSetQual() &&
574             NStr::Equal((*it)->GetQual(), "product")) {
575             if ((*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal())) {
576                 feat.SetData().SetProt().SetName().push_back((*it)->GetVal());
577             }
578             it = feat.SetQual().erase(it);
579         } else {
580             ++it;
581         }
582     }
583 
584     if (feat.SetQual().empty()) {
585         feat.ResetQual();
586     }
587 }
588 
589 
s_GetCdsByProduct(CScope & scope,const CSeq_loc & product)590 static CConstRef<CSeq_feat> s_GetCdsByProduct(CScope& scope, const CSeq_loc& product)
591 {
592     const bool feat_by_product = true;
593     SAnnotSelector sel(CSeqFeatData::e_Cdregion, feat_by_product);
594     CFeat_CI fi(scope, product, sel);
595     if (fi) {
596         return ConstRef(&(fi->GetOriginalFeature()));
597     }
598     return CConstRef<CSeq_feat>();
599 };
600 
s_GetCdsByLocation(CScope & scope,const CSeq_loc & feat_loc)601 static CConstRef<CSeq_feat> s_GetCdsByLocation(CScope& scope, const CSeq_loc& feat_loc)
602 {
603     sequence::TFeatScores cdsScores;
604     sequence::GetOverlappingFeatures(
605             feat_loc,
606             CSeqFeatData::e_Cdregion,
607             CSeqFeatData::eSubtype_cdregion,
608             sequence::eOverlap_Contained,
609             cdsScores,
610             scope);
611 
612     if (cdsScores.empty()) {
613         return CConstRef<CSeq_feat>();
614     }
615 
616     if (!feat_loc.IsPartialStart(eExtreme_Biological)) {
617         for (auto cdsScore : cdsScores) {
618             if (feature::IsLocationInFrame(scope.GetSeq_featHandle(*cdsScore.second), feat_loc)
619                     == feature::eLocationInFrame_InFrame) {
620                 return cdsScore.second;
621             }
622         }
623     }
624 
625     return cdsScores.front().second;
626 }
627 
628 
629 
MoveFeatToProtein(CSeq_feat_Handle fh)630 bool CCleanup::MoveFeatToProtein(CSeq_feat_Handle fh)
631 {
632     CProt_ref::EProcessed processed = CProt_ref::eProcessed_not_set;
633     if (fh.GetData().IsImp()) {
634         if (!fh.GetData().GetImp().IsSetKey()) {
635             return false;
636         }
637         processed = s_ProcessedFromKey(fh.GetData().GetImp().GetKey());
638         if (processed == CProt_ref::eProcessed_not_set || processed == CProt_ref::eProcessed_preprotein) {
639             return false;
640         }
641     } else if (s_IsPreprotein(fh)) {
642         return ConvertProteinToImp(fh);
643     }
644 
645     CBioseq_Handle parent_bsh = fh.GetScope().GetBioseqHandle(fh.GetLocation());
646 
647     if (!parent_bsh) {
648         // feature is mispackaged
649         return false;
650     }
651     if (parent_bsh.IsAa()) {
652         // feature is already on protein sequence
653         return false;
654     }
655 
656     CConstRef<CSeq_feat> cds;
657     bool matched_by_product = false;
658 
659     if (fh.IsSetProduct() &&
660         fh.GetData().IsProt() &&
661         fh.GetData().GetProt().IsSetProcessed() &&
662         fh.GetData().GetProt().GetProcessed() == CProt_ref::eProcessed_mature) {
663         cds = s_GetCdsByProduct(fh.GetScope(), fh.GetProduct());
664         if (cds) {
665             matched_by_product = true;
666         }
667     }
668     if (!matched_by_product) {
669         cds = s_GetCdsByLocation(fh.GetScope(), fh.GetLocation());
670     }
671     if (!cds || !cds->IsSetProduct()) {
672         // there is no overlapping coding region feature, so there is no appropriate
673         // protein sequence to move to
674         return ConvertProteinToImp(fh);
675     }
676 
677     bool require_frame = false;
678     if (!require_frame) {
679         ITERATE(CBioseq::TId, id_it, parent_bsh.GetBioseqCore()->GetId()) {
680             if ((*id_it)->IsEmbl() || (*id_it)->IsDdbj()) {
681                 require_frame = true;
682                 break;
683             }
684         }
685     }
686 
687     CRef<CSeq_loc> prot_loc = GetProteinLocationFromNucleotideLocation(fh.GetLocation(), *cds, fh.GetScope(), require_frame);
688 
689     if (!prot_loc) {
690         return false;
691     }
692 
693     CConstRef<CSeq_feat> orig_feat = fh.GetSeq_feat();
694     CRef<CSeq_feat> new_feat(new CSeq_feat());
695     new_feat->Assign(*orig_feat);
696     if (new_feat->GetData().Which() == CSeqFeatData::e_Imp) {
697         new_feat->SetData().SetProt().SetProcessed(processed);
698         // if possible, rescue product qual
699         RescueProtProductQual(*new_feat);
700         if (processed == CProt_ref::eProcessed_mature &&
701             !new_feat->GetData().GetProt().IsSetName()) {
702             if (orig_feat->IsSetComment() && !NStr::IsBlank(orig_feat->GetComment())) {
703                 new_feat->SetData().SetProt().SetName().push_back(orig_feat->GetComment());
704                 new_feat->ResetComment();
705             } else {
706                 new_feat->SetData().SetProt().SetName().push_back("unnamed");
707             }
708         }
709     }
710 
711     // change location to protein
712     new_feat->ResetLocation();
713     new_feat->SetLocation(*prot_loc);
714     SetFeaturePartial(*new_feat);
715     if (matched_by_product) {
716         new_feat->ResetProduct();
717     }
718 
719     CSeq_feat_EditHandle edh(fh);
720     edh.Replace(*new_feat);
721     CRef<CCleanupChange> changes(makeCleanupChange(0));
722     CNewCleanup_imp clean_i(changes, 0);
723     clean_i.SetScope(fh.GetScope());
724     clean_i.BasicCleanupSeqFeat(*new_feat);
725 
726     CSeq_annot_Handle ah = fh.GetAnnot();
727 
728     CBioseq_Handle target_bsh = fh.GetScope().GetBioseqHandle(new_feat->GetLocation());
729     if (!target_bsh) {
730         return false;
731     }
732 
733     CBioseq_EditHandle eh = target_bsh.GetEditHandle();
734 
735     // Find a feature table on the protein sequence to add the feature to.
736     CSeq_annot_Handle ftable;
737     if (target_bsh.GetCompleteBioseq()->IsSetAnnot()) {
738         ITERATE(CBioseq::TAnnot, annot_it, target_bsh.GetCompleteBioseq()->GetAnnot()) {
739             if ((*annot_it)->IsFtable()) {
740                 ftable = fh.GetScope().GetSeq_annotHandle(**annot_it);
741             }
742         }
743     }
744 
745     // If there is no feature table present, make one
746     if (!ftable) {
747         CRef<CSeq_annot> new_annot(new CSeq_annot());
748         ftable = eh.AttachAnnot(*new_annot);
749     }
750 
751     // add feature to the protein bioseq
752     CSeq_annot_EditHandle aeh(ftable);
753     aeh.TakeFeat(edh);
754 
755     // remove old annot if now empty
756     if (CNewCleanup_imp::ShouldRemoveAnnot(*(ah.GetCompleteSeq_annot()))) {
757         CSeq_annot_EditHandle orig(ah);
758         orig.Remove();
759     }
760 
761     return true;
762 }
763 
764 
MoveProteinSpecificFeats(CSeq_entry_Handle seh)765 bool CCleanup::MoveProteinSpecificFeats(CSeq_entry_Handle seh)
766 {
767     bool any_change = false;
768     CBioseq_CI bi(seh, CSeq_inst::eMol_na);
769     while (bi) {
770         SAnnotSelector sel(CSeqFeatData::e_Prot);
771         sel.IncludeFeatType(CSeqFeatData::e_Psec_str);
772         sel.IncludeFeatType(CSeqFeatData::e_Bond);
773         for (CFeat_CI prot_it(*bi, sel); prot_it; ++prot_it) {
774             any_change |= MoveFeatToProtein(*prot_it);
775         }
776         for (CFeat_CI imp_it(*bi, CSeqFeatData::e_Imp); imp_it; ++imp_it) {
777             any_change |= MoveFeatToProtein(*imp_it);
778         }
779         ++bi;
780     }
781     return any_change;
782 }
783 
784 
IsGeneXrefUnnecessary(const CSeq_feat & sf,CScope & scope,const CGene_ref & gene_xref)785 bool CCleanup::IsGeneXrefUnnecessary(const CSeq_feat& sf, CScope& scope, const CGene_ref& gene_xref)
786 {
787     if (gene_xref.IsSuppressed()) {
788         return false;
789     }
790 
791     CConstRef<CSeq_feat> gene = sequence::GetOverlappingGene(sf.GetLocation(), scope);
792     if (!gene || !gene->IsSetData() || !gene->GetData().IsGene()) {
793         return false;
794     }
795 
796     if (!gene->GetData().GetGene().RefersToSameGene(gene_xref)) {
797         return false;
798     }
799 
800     // see if other gene might also match
801     sequence::TFeatScores scores;
802     sequence::GetOverlappingFeatures(sf.GetLocation(), CSeqFeatData::e_Gene, CSeqFeatData::eSubtype_gene,
803         sequence::eOverlap_Contained, scores, scope);
804     if (scores.size() == 1) {
805         return true;
806     } else if (scores.size() == 0) {
807         return false;
808     }
809 
810     ITERATE(sequence::TFeatScores, g, scores) {
811         if (g->second.GetPointer() != gene.GetPointer() &&
812             sequence::Compare(g->second->GetLocation(), gene->GetLocation(), &scope, sequence::fCompareOverlapping) == sequence::eSame) {
813             return false;
814         }
815     }
816     return true;
817 }
818 
819 
RemoveUnnecessaryGeneXrefs(CSeq_feat & f,CScope & scope)820 bool CCleanup::RemoveUnnecessaryGeneXrefs(CSeq_feat& f, CScope& scope)
821 {
822     if (!f.IsSetXref()) {
823         return false;
824     }
825     bool any_removed = false;
826     CSeq_feat::TXref::iterator xit = f.SetXref().begin();
827     while (xit != f.SetXref().end()) {
828         if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
829             IsGeneXrefUnnecessary(f, scope, (*xit)->GetData().GetGene())) {
830             xit = f.SetXref().erase(xit);
831             any_removed = true;
832         } else {
833             ++xit;
834         }
835     }
836     if (any_removed) {
837         if (f.IsSetXref() && f.GetXref().empty()) {
838             f.ResetXref();
839         }
840     }
841     return any_removed;
842 }
843 
844 
RemoveUnnecessaryGeneXrefs(CSeq_entry_Handle seh)845 bool CCleanup::RemoveUnnecessaryGeneXrefs(CSeq_entry_Handle seh)
846 {
847     bool any_change = false;
848     CScope& scope = seh.GetScope();
849 
850     for (CFeat_CI fi(seh); fi; ++fi) {
851         if (fi->IsSetXref()) {
852             CRef<CSeq_feat> new_feat(new CSeq_feat());
853             new_feat->Assign(*(fi->GetOriginalSeq_feat()));
854             bool any_removed = RemoveUnnecessaryGeneXrefs(*new_feat, scope);
855             if (any_removed) {
856                 CSeq_feat_EditHandle edh(*fi);
857                 edh.Replace(*new_feat);
858                 any_change = true;
859             }
860         }
861     }
862 
863     return any_change;
864 }
865 
866 
867 //LCOV_EXCL_START
868 //not used by asn_cleanup but used by other applications
RemoveNonsuppressingGeneXrefs(CSeq_feat & f)869 bool CCleanup::RemoveNonsuppressingGeneXrefs(CSeq_feat& f)
870 {
871     if (!f.IsSetXref()) {
872         return false;
873     }
874     bool any_removed = false;
875     CSeq_feat::TXref::iterator xit = f.SetXref().begin();
876     while (xit != f.SetXref().end()) {
877         if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
878             !(*xit)->GetData().GetGene().IsSuppressed()) {
879             xit = f.SetXref().erase(xit);
880             any_removed = true;
881         } else {
882             ++xit;
883         }
884     }
885     if (any_removed) {
886         if (f.IsSetXref() && f.GetXref().empty()) {
887             f.ResetXref();
888         }
889     }
890     return any_removed;
891 }
892 //LCOV_EXCL_STOP
893 
894 
RepairXrefs(const CSeq_feat & src,CSeq_feat_Handle & dst,const CTSE_Handle & tse)895 bool CCleanup::RepairXrefs(const CSeq_feat& src, CSeq_feat_Handle& dst, const CTSE_Handle& tse)
896 {
897     if (!src.IsSetId() || !src.GetId().IsLocal()) {
898         // can't create xref if no ID
899         return false;
900     }
901     if (!CSeqFeatData::AllowXref(src.GetData().GetSubtype(), dst.GetData().GetSubtype())) {
902         // only create reciprocal xrefs if permitted
903         return false;
904     }
905     // don't create xref if already have xref or if dst not gene and already has
906     // xref to feature of same type as src
907     bool has_xref = false;
908     if (dst.IsSetXref()) {
909         ITERATE(CSeq_feat::TXref, xit, dst.GetXref()) {
910             if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
911                 if ((*xit)->GetId().Equals(src.GetId())) {
912                     // already have xref
913                     has_xref = true;
914                     break;
915                 } else if (!dst.GetData().IsGene()) {
916                     const CTSE_Handle::TFeatureId& feat_id = (*xit)->GetId().GetLocal();
917                     CTSE_Handle::TSeq_feat_Handles far_feats = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, feat_id);
918                     ITERATE(CTSE_Handle::TSeq_feat_Handles, fit, far_feats) {
919                         if (fit->GetData().GetSubtype() == src.GetData().GetSubtype()) {
920                             has_xref = true;
921                             break;
922                         }
923                     }
924                     if (has_xref) {
925                         break;
926                     }
927                 }
928             }
929         }
930     }
931     bool rval = false;
932     if (!has_xref) {
933         // to put into "editing mode"
934         dst.GetAnnot().GetEditHandle();
935         CSeq_feat_EditHandle eh(dst);
936         CRef<CSeq_feat> cpy(new CSeq_feat());
937         cpy->Assign(*(dst.GetSeq_feat()));
938         cpy->AddSeqFeatXref(src.GetId());
939         eh.Replace(*cpy);
940         rval = true;
941     }
942     return rval;
943 }
944 
945 
RepairXrefs(const CSeq_feat & f,const CTSE_Handle & tse)946 bool CCleanup::RepairXrefs(const CSeq_feat& f, const CTSE_Handle& tse)
947 {
948     bool rval = false;
949 
950     if (!f.IsSetId() || !f.IsSetXref()) {
951         return rval;
952     }
953 
954     ITERATE(CSeq_feat::TXref, xit, f.GetXref()) {
955         if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
956             const CTSE_Handle::TFeatureId& x_id = (*xit)->GetId().GetLocal();
957             CTSE_Handle::TSeq_feat_Handles far_feats = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, x_id);
958             if (far_feats.size() == 1) {
959                 rval |= RepairXrefs(f, far_feats[0], tse);
960             }
961         }
962     }
963     return rval;
964 }
965 
966 
RepairXrefs(CSeq_entry_Handle seh)967 bool CCleanup::RepairXrefs(CSeq_entry_Handle seh)
968 {
969     bool rval = false;
970     const CTSE_Handle& tse = seh.GetTSE_Handle();
971 
972     CFeat_CI fi(seh);
973     while (fi) {
974         rval |= RepairXrefs(*(fi->GetSeq_feat()), tse);
975         ++fi;
976     }
977     return rval;
978 }
979 
980 
981 //LCOV_EXCL_START
982 //not used by asn_cleanup but used by other applications
FindMatchingLocusGene(CSeq_feat & f,const CGene_ref & gene_xref,CBioseq_Handle bsh)983 bool CCleanup::FindMatchingLocusGene(CSeq_feat& f, const CGene_ref& gene_xref, CBioseq_Handle bsh)
984 {
985     bool match = false;
986     string locus1;
987     if (gene_xref.IsSetLocus())
988         locus1 = gene_xref.GetLocus();
989     for (CFeat_CI feat_ci(bsh, SAnnotSelector(CSeqFeatData::eSubtype_gene)); feat_ci; ++feat_ci)
990     {
991         string locus2;
992         if ( !f.Equals(*feat_ci->GetSeq_feat()) && feat_ci->GetSeq_feat()->IsSetData() && feat_ci->GetSeq_feat()->GetData().IsGene()
993              && feat_ci->GetSeq_feat()->GetData().GetGene().IsSetLocus())
994         {
995             locus2 = feat_ci->GetSeq_feat()->GetData().GetGene().GetLocus();
996         }
997         if (!locus1.empty() && !locus2.empty() && locus1 == locus2)
998         {
999             match = true;
1000             break;
1001         }
1002     }
1003     return match;
1004 }
1005 
RemoveOrphanLocusGeneXrefs(CSeq_feat & f,CBioseq_Handle bsh)1006 bool CCleanup::RemoveOrphanLocusGeneXrefs(CSeq_feat& f, CBioseq_Handle bsh)
1007 {
1008     if (!f.IsSetXref()) {
1009         return false;
1010     }
1011     bool any_removed = false;
1012     CSeq_feat::TXref::iterator xit = f.SetXref().begin();
1013     while (xit != f.SetXref().end()) {
1014         if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
1015             !(*xit)->GetData().GetGene().IsSuppressed() && !FindMatchingLocusGene(f, (*xit)->GetData().GetGene(), bsh)) {
1016             xit = f.SetXref().erase(xit);
1017             any_removed = true;
1018         } else {
1019             ++xit;
1020         }
1021     }
1022     if (any_removed) {
1023         if (f.IsSetXref() && f.GetXref().empty()) {
1024             f.ResetXref();
1025         }
1026     }
1027     return any_removed;
1028 }
1029 
1030 
FindMatchingLocus_tagGene(CSeq_feat & f,const CGene_ref & gene_xref,CBioseq_Handle bsh)1031 bool CCleanup::FindMatchingLocus_tagGene(CSeq_feat& f, const CGene_ref& gene_xref, CBioseq_Handle bsh)
1032 {
1033     bool match = false;
1034     string locus_tag1;
1035     if (gene_xref.IsSetLocus_tag())
1036         locus_tag1 = gene_xref.GetLocus_tag();
1037     for (CFeat_CI feat_ci(bsh, SAnnotSelector(CSeqFeatData::eSubtype_gene)); feat_ci; ++feat_ci)
1038     {
1039         string locus_tag2;
1040         if ( !f.Equals(*feat_ci->GetSeq_feat()) && feat_ci->GetSeq_feat()->IsSetData() && feat_ci->GetSeq_feat()->GetData().IsGene()
1041              && feat_ci->GetSeq_feat()->GetData().GetGene().IsSetLocus_tag())
1042         {
1043             locus_tag2 = feat_ci->GetSeq_feat()->GetData().GetGene().GetLocus_tag();
1044         }
1045         if (!locus_tag1.empty() && !locus_tag2.empty() && locus_tag1 == locus_tag2)
1046         {
1047             match = true;
1048             break;
1049         }
1050     }
1051     return match;
1052 }
1053 
RemoveOrphanLocus_tagGeneXrefs(CSeq_feat & f,CBioseq_Handle bsh)1054 bool CCleanup::RemoveOrphanLocus_tagGeneXrefs(CSeq_feat& f, CBioseq_Handle bsh)
1055 {
1056     if (!f.IsSetXref()) {
1057         return false;
1058     }
1059     bool any_removed = false;
1060     CSeq_feat::TXref::iterator xit = f.SetXref().begin();
1061     while (xit != f.SetXref().end()) {
1062         if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
1063             !(*xit)->GetData().GetGene().IsSuppressed() && !FindMatchingLocus_tagGene(f, (*xit)->GetData().GetGene(), bsh)) {
1064             xit = f.SetXref().erase(xit);
1065             any_removed = true;
1066         } else {
1067             ++xit;
1068         }
1069     }
1070     if (any_removed) {
1071         if (f.IsSetXref() && f.GetXref().empty()) {
1072             f.ResetXref();
1073         }
1074     }
1075     return any_removed;
1076 }
1077 
1078 
SeqLocExtend(CSeq_loc & loc,size_t pos_,CScope & scope)1079 bool CCleanup::SeqLocExtend(CSeq_loc& loc, size_t pos_, CScope& scope)
1080 {
1081     TSeqPos pos = static_cast<TSeqPos>(pos_);
1082     TSeqPos loc_start = loc.GetStart(eExtreme_Positional);
1083     TSeqPos loc_stop = loc.GetStop(eExtreme_Positional);
1084     bool partial_start = loc.IsPartialStart(eExtreme_Positional);
1085     bool partial_stop = loc.IsPartialStop(eExtreme_Positional);
1086     ENa_strand strand = loc.GetStrand();
1087     CRef<CSeq_loc> new_loc(NULL);
1088     bool changed = false;
1089 
1090     if (pos < loc_start) {
1091         CRef<CSeq_id> id(new CSeq_id());
1092         id->Assign(*(loc.GetId()));
1093         CRef<CSeq_loc> add(new CSeq_loc(*id, pos, loc_start - 1, strand));
1094         add->SetPartialStart(partial_start, eExtreme_Positional);
1095         new_loc = sequence::Seq_loc_Add(loc, *add, CSeq_loc::fSort | CSeq_loc::fMerge_AbuttingOnly, &scope);
1096         changed = true;
1097     } else if (pos > loc_stop) {
1098         CRef<CSeq_id> id(new CSeq_id());
1099         id->Assign(*(loc.GetId()));
1100         CRef<CSeq_loc> add(new CSeq_loc(*id, loc_stop + 1, pos, strand));
1101         add->SetPartialStop(partial_stop, eExtreme_Positional);
1102         new_loc = sequence::Seq_loc_Add(loc, *add, CSeq_loc::fSort | CSeq_loc::fMerge_AbuttingOnly, &scope);
1103         changed = true;
1104     }
1105     if (changed) {
1106         loc.Assign(*new_loc);
1107     }
1108     return changed;
1109 }
1110 //LCOV_EXCL_STOP
1111 
1112 
ExtendStopPosition(CSeq_feat & f,const CSeq_feat * cdregion,size_t extension_)1113 bool CCleanup::ExtendStopPosition(CSeq_feat& f, const CSeq_feat* cdregion, size_t extension_)
1114 {
1115     TSeqPos extension = static_cast<TSeqPos>(extension_);
1116     CRef<CSeq_loc> new_loc(&f.SetLocation());
1117 
1118     CRef<CSeq_loc> last_interval;
1119     if (new_loc->IsMix()) {
1120         last_interval = new_loc->SetMix().SetLastLoc();
1121     }
1122     else
1123     {
1124         last_interval = new_loc;
1125     }
1126 
1127     CConstRef<CSeq_id> id(last_interval->GetId());
1128 
1129     TSeqPos new_start;
1130     TSeqPos new_stop;
1131 
1132     // the last element of the mix or the single location MUST be converted into interval
1133     // whethe it's whole or point, etc
1134     if (last_interval->IsSetStrand() && last_interval->GetStrand() == eNa_strand_minus) {
1135         new_start = (cdregion ? cdregion->GetLocation().GetStart(eExtreme_Positional) :
1136               last_interval->GetStart(eExtreme_Positional)) - extension;
1137 
1138         new_stop = last_interval->GetStop(eExtreme_Positional);
1139     }
1140     else {
1141         new_start = last_interval->GetStart(eExtreme_Positional);
1142         new_stop = (cdregion ? cdregion->GetLocation().GetStop(eExtreme_Positional) :
1143             last_interval->GetStop(eExtreme_Positional)) + extension;
1144     }
1145     last_interval->SetInt().SetFrom(new_start);
1146     last_interval->SetInt().SetTo(new_stop);
1147     last_interval->SetInt().SetId().Assign(*id);
1148 
1149     new_loc->SetPartialStop(false, eExtreme_Biological);
1150 
1151     return true;
1152 }
1153 
ExtendToStopCodon(CSeq_feat & f,CBioseq_Handle bsh,size_t limit)1154 bool CCleanup::ExtendToStopCodon(CSeq_feat& f, CBioseq_Handle bsh, size_t limit)
1155 {
1156     const CSeq_loc& loc = f.GetLocation();
1157 
1158     CCdregion::TFrame frame = CCdregion::eFrame_not_set;
1159     const CGenetic_code* code = NULL;
1160     // we need to extract frame and cd_region from linked cd_region
1161     if (f.IsSetData() && f.GetData().IsCdregion())
1162     {
1163         if (f.GetData().GetCdregion().IsSetCode())
1164            code = &(f.GetData().GetCdregion().GetCode());
1165         if (f.GetData().GetCdregion().IsSetFrame())
1166            frame = f.GetData().GetCdregion().GetFrame();
1167     }
1168 
1169     TSeqPos stop = loc.GetStop(eExtreme_Biological);
1170     if (stop < 1 || stop > bsh.GetBioseqLength() - 1) {
1171         // no room to extend
1172         return false;
1173     }
1174     // figure out if we have a partial codon at the end
1175     size_t orig_len = sequence::GetLength(loc, &(bsh.GetScope()));
1176     size_t len = orig_len;
1177 
1178     if (frame == CCdregion::eFrame_two) {
1179         len -= 1;
1180     } else if (frame == CCdregion::eFrame_three) {
1181         len -= 2;
1182     }
1183 
1184     TSeqPos mod = len % 3;
1185     CRef<CSeq_loc> vector_loc(new CSeq_loc());
1186     vector_loc->SetInt().SetId().Assign(*(bsh.GetId().front().GetSeqId()));
1187 
1188     if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
1189         vector_loc->SetInt().SetFrom(0);
1190         vector_loc->SetInt().SetTo(stop + mod - 1);
1191         vector_loc->SetStrand(eNa_strand_minus);
1192     } else {
1193         vector_loc->SetInt().SetFrom(stop - mod + 1);
1194         vector_loc->SetInt().SetTo(bsh.GetInst_Length() - 1);
1195     }
1196 
1197     CSeqVector seq(*vector_loc, bsh.GetScope(), CBioseq_Handle::eCoding_Iupac);
1198     // reserve our space
1199     size_t usable_size = seq.size();
1200 
1201     if (limit > 0 && usable_size > limit) {
1202         usable_size = limit;
1203     }
1204 
1205     // get appropriate translation table
1206     const CTrans_table & tbl =
1207         (code ? CGen_code_table::GetTransTable(*code) :
1208         CGen_code_table::GetTransTable(1));
1209 
1210     // main loop through bases
1211     CSeqVector::const_iterator start = seq.begin();
1212 
1213     size_t i;
1214     size_t k;
1215     int state = 0;
1216     size_t length = usable_size / 3;
1217 
1218     for (i = 0; i < length; ++i) {
1219         // loop through one codon at a time
1220         for (k = 0; k < 3; ++k, ++start) {
1221             state = tbl.NextCodonState(state, *start);
1222         }
1223 
1224         if (tbl.GetCodonResidue(state) == '*') {
1225             TSeqPos extension = static_cast<TSeqPos>(((i + 1) * 3) - mod);
1226             ExtendStopPosition(f, 0, extension);
1227             return true;
1228         }
1229     }
1230 
1231     return false;
1232 }
1233 
1234 
SetBestFrame(CSeq_feat & cds,CScope & scope)1235 bool CCleanup::SetBestFrame(CSeq_feat& cds, CScope& scope)
1236 {
1237     bool changed = false;
1238     CCdregion::TFrame frame = CCdregion::eFrame_not_set;
1239     if (cds.GetData().GetCdregion().IsSetFrame()) {
1240         frame = cds.GetData().GetCdregion().GetFrame();
1241     }
1242 
1243     CCdregion::TFrame new_frame = CSeqTranslator::FindBestFrame(cds, scope);
1244     if (frame != new_frame) {
1245         cds.SetData().SetCdregion().SetFrame(new_frame);
1246         changed = true;
1247     }
1248     return changed;
1249 }
1250 
1251 // like C's function GetFrameFromLoc, but better
SetFrameFromLoc(CCdregion::EFrame & frame,const CSeq_loc & loc,CScope & scope)1252 bool CCleanup::SetFrameFromLoc(CCdregion::EFrame &frame, const CSeq_loc& loc, CScope& scope)
1253 {
1254     if (!loc.IsPartialStart(eExtreme_Biological)) {
1255         if (frame != CCdregion::eFrame_one) {
1256             frame = CCdregion::eFrame_one;
1257             return true;
1258         }
1259         return false;
1260     }
1261     if (loc.IsPartialStop(eExtreme_Biological)) {
1262         // cannot make a determination if both ends are partial
1263         return false;
1264     }
1265 
1266     const TSeqPos seq_len = sequence::GetLength(loc, &scope);
1267 
1268     CCdregion::EFrame desired_frame = CCdregion::eFrame_not_set;
1269 
1270     // have complete last codon, get frame from length
1271     switch( (seq_len % 3) + 1 ) {
1272         case 1:
1273             desired_frame = CCdregion::eFrame_one;
1274             break;
1275         case 2:
1276             desired_frame = CCdregion::eFrame_two;
1277             break;
1278         case 3:
1279             desired_frame = CCdregion::eFrame_three;
1280             break;
1281         default:
1282             // mathematically impossible
1283             _ASSERT(false);
1284             return false;
1285     }
1286     if (frame != desired_frame) {
1287         frame = desired_frame;
1288         return true;
1289     }
1290     return false;
1291 }
1292 
1293 
SetFrameFromLoc(CCdregion & cdregion,const CSeq_loc & loc,CScope & scope)1294 bool CCleanup::SetFrameFromLoc(CCdregion &cdregion, const CSeq_loc& loc, CScope& scope)
1295 {
1296     CCdregion::EFrame frame = CCdregion::eFrame_not_set;
1297     if (cdregion.IsSetFrame()) {
1298         frame = cdregion.GetFrame();
1299     }
1300     if (SetFrameFromLoc(frame, loc, scope)) {
1301         cdregion.SetFrame(frame);
1302         return true;
1303     } else {
1304         return false;
1305     }
1306 }
1307 
1308 
s_IsLocationEndAtOtherLocationInternalEndpoint(const CSeq_loc & loc,const CSeq_loc & other_loc)1309 bool s_IsLocationEndAtOtherLocationInternalEndpoint(const CSeq_loc& loc, const CSeq_loc& other_loc)
1310 {
1311     size_t loc_end = loc.GetStop(eExtreme_Biological);
1312     CSeq_loc_CI other_int(other_loc);
1313     while (other_int) {
1314         if (other_int.IsSetStrand() &&
1315             other_int.GetStrand() == eNa_strand_minus) {
1316             if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus &&
1317                 loc_end == other_int.GetRange().GetFrom()) {
1318                 return true;
1319             }
1320         } else {
1321             if ((!loc.IsSetStrand() || loc.GetStrand() != eNa_strand_minus) &&
1322                 loc_end == other_int.GetRange().GetTo()) {
1323                 return true;
1324             }
1325         }
1326         ++other_int;
1327     }
1328     return false;
1329 }
1330 
1331 
ExtendToStopIfShortAndNotPartial(CSeq_feat & f,CBioseq_Handle bsh,bool check_for_stop)1332 bool CCleanup::ExtendToStopIfShortAndNotPartial(CSeq_feat& f, CBioseq_Handle bsh, bool check_for_stop)
1333 {
1334     if (!f.GetData().IsCdregion()) {
1335         // not coding region
1336         return false;
1337     }
1338     if (sequence::IsPseudo(f, bsh.GetScope())) {
1339         return false;
1340     }
1341     if (f.GetLocation().IsPartialStop(eExtreme_Biological)) {
1342         return false;
1343     }
1344     CConstRef<CSeq_feat> mrna = sequence::GetmRNAforCDS(f, bsh.GetScope());
1345     if (mrna) {
1346         if (mrna->GetLocation().GetStop(eExtreme_Biological) == f.GetLocation().GetStop(eExtreme_Biological)) {
1347             //ok
1348         } else if (s_IsLocationEndAtOtherLocationInternalEndpoint(f.GetLocation(), mrna->GetLocation())) {
1349             return false;
1350         }
1351     }
1352 
1353     if (check_for_stop) {
1354         string translation;
1355         try {
1356             CSeqTranslator::Translate(f, bsh.GetScope(), translation, true);
1357         } catch (CSeqMapException&) {
1358             //unable to translate
1359             return false;
1360         } catch (CSeqVectorException&) {
1361             //unable to translate
1362             return false;
1363         }
1364         if (NStr::EndsWith(translation, "*")) {
1365             //already has stop codon
1366             return false;
1367         }
1368     }
1369 
1370     return ExtendToStopCodon(f, bsh, 3);
1371 }
1372 
1373 
LocationMayBeExtendedToMatch(const CSeq_loc & orig,const CSeq_loc & improved)1374 bool CCleanup::LocationMayBeExtendedToMatch(const CSeq_loc& orig, const CSeq_loc& improved)
1375 {
1376     if ((orig.GetStrand() == eNa_strand_minus &&
1377         orig.GetStop(eExtreme_Biological) > improved.GetStop(eExtreme_Biological)) ||
1378         (orig.GetStrand() != eNa_strand_minus &&
1379         orig.GetStop(eExtreme_Biological) < improved.GetStop(eExtreme_Biological))) {
1380         return true;
1381     }
1382 
1383     return false;
1384 }
1385 
SetProteinName(CProt_ref & prot_ref,const string & protein_name,bool append)1386 void CCleanup::SetProteinName(CProt_ref& prot_ref, const string& protein_name, bool append)
1387 {
1388     if (append && prot_ref.IsSetName() && prot_ref.GetName().size() > 0) {
1389         if (!NStr::IsBlank(prot_ref.GetName().front())) {
1390             prot_ref.SetName().front() += "; ";
1391         }
1392         prot_ref.SetName().front() += protein_name;
1393     } else {
1394         prot_ref.SetName().push_back(protein_name);
1395     }
1396 }
1397 
1398 
SetMrnaName(CSeq_feat & mrna,const string & protein_name)1399 void CCleanup::SetMrnaName(CSeq_feat& mrna, const string& protein_name)
1400 {
1401     bool used_qual = false;
1402     if (mrna.IsSetQual()) {
1403         for (auto it = mrna.SetQual().begin(); it != mrna.SetQual().end(); it++) {
1404             if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1405                 (*it)->SetVal(protein_name);
1406                 used_qual = true;
1407                 break;
1408             }
1409         }
1410     }
1411     if (!used_qual || (mrna.IsSetData() && mrna.GetData().IsRna() && mrna.GetData().GetRna().IsSetExt())) {
1412         string remainder;
1413         mrna.SetData().SetRna().SetRnaProductName(protein_name, remainder);
1414     }
1415 }
1416 
1417 
1418 //LCOV_EXCL_START
1419 //seems to be unused
s_IsProductOnFeat(const CSeq_feat & cds)1420 bool CCleanup::s_IsProductOnFeat(const CSeq_feat& cds)
1421 {
1422     if (cds.IsSetXref()) {
1423         for (auto it = cds.GetXref().begin(); it != cds.GetXref().end(); it++) {
1424             if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1425                 return true;
1426             }
1427         }
1428     }
1429     if (cds.IsSetQual()) {
1430         for (auto it = cds.GetQual().begin(); it != cds.GetQual().end(); it++) {
1431             if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1432                 return true;
1433             }
1434         }
1435     }
1436     return false;
1437 }
1438 //LCOV_EXCL_STOP
1439 
1440 
s_SetProductOnFeat(CSeq_feat & feat,const string & protein_name,bool append)1441 void CCleanup::s_SetProductOnFeat(CSeq_feat& feat, const string& protein_name, bool append)
1442 {
1443     if (feat.IsSetXref()) {
1444         // see if this seq-feat already has a prot xref
1445         for (auto it = feat.SetXref().begin(); it != feat.SetXref().end(); it++) {
1446             if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1447                 SetProteinName((*it)->SetData().SetProt(), protein_name, append);
1448                 break;
1449             }
1450         }
1451     }
1452     if (feat.IsSetQual()) {
1453         for (auto it = feat.SetQual().begin(); it != feat.SetQual().end(); it++) {
1454             if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1455                 if ((*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal()) && append) {
1456                     (*it)->SetVal((*it)->GetVal() + "; " + protein_name);
1457                 } else {
1458                     (*it)->SetVal(protein_name);
1459                 }
1460             }
1461         }
1462     }
1463 }
1464 
1465 
SetProteinName(CSeq_feat & cds,const string & protein_name,bool append,CScope & scope)1466 void CCleanup::SetProteinName(CSeq_feat& cds, const string& protein_name, bool append, CScope& scope)
1467 {
1468     s_SetProductOnFeat(cds, protein_name, append);
1469     bool added = false;
1470     if (cds.IsSetProduct()) {
1471         CBioseq_Handle prot = scope.GetBioseqHandle(cds.GetProduct());
1472         if (prot) {
1473             // find main protein feature
1474             CFeat_CI feat_ci(prot, CSeqFeatData::eSubtype_prot);
1475             if (feat_ci) {
1476                 CRef<CSeq_feat> new_prot(new CSeq_feat());
1477                 new_prot->Assign(feat_ci->GetOriginalFeature());
1478                 SetProteinName(new_prot->SetData().SetProt(), protein_name, append);
1479                 CSeq_feat_EditHandle feh(feat_ci->GetSeq_feat_Handle());
1480                 feh.Replace(*new_prot);
1481             } else {
1482                 // make new protein feature
1483                 feature::AddProteinFeature(*(prot.GetCompleteBioseq()), protein_name, cds, scope);
1484             }
1485             added = true;
1486         }
1487     }
1488     if (!added) {
1489         if (cds.IsSetXref()) {
1490             // see if this seq-feat already has a prot xref
1491             NON_CONST_ITERATE(CSeq_feat::TXref, it, cds.SetXref()) {
1492                 if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1493                     SetProteinName((*it)->SetData().SetProt(), protein_name, append);
1494                     added = true;
1495                     break;
1496                 }
1497             }
1498         }
1499         if (!added) {
1500             CRef<CSeqFeatXref> xref(new CSeqFeatXref());
1501             xref->SetData().SetProt().SetName().push_back(protein_name);
1502             cds.SetXref().push_back(xref);
1503         }
1504     }
1505 }
1506 
1507 
GetProteinName(const CProt_ref & prot)1508 const string& CCleanup::GetProteinName(const CProt_ref& prot)
1509 {
1510     if (prot.IsSetName() && !prot.GetName().empty()) {
1511         return prot.GetName().front();
1512     } else {
1513         return kEmptyStr;
1514     }
1515 }
1516 
1517 
GetProteinName(const CSeq_feat & cds,CScope & scope)1518 const string& CCleanup::GetProteinName(const CSeq_feat& cds, CScope& scope)
1519 {
1520     if (cds.IsSetProduct()) {
1521         CBioseq_Handle prot = scope.GetBioseqHandle(cds.GetProduct());
1522         if (prot) {
1523             CFeat_CI f(prot, CSeqFeatData::eSubtype_prot);
1524             if (f) {
1525                 return GetProteinName(f->GetData().GetProt());
1526             }
1527         }
1528     }
1529     if (cds.IsSetXref()) {
1530         ITERATE(CSeq_feat::TXref, it, cds.GetXref()) {
1531             if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1532                 return GetProteinName((*it)->GetData().GetProt());
1533             }
1534         }
1535     }
1536     if (cds.IsSetQual()) {
1537         for (auto it = cds.GetQual().begin(); it != cds.GetQual().end(); it++) {
1538             if ((*it)->IsSetQual() && (*it)->IsSetVal() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1539                 return (*it)->GetVal();
1540             }
1541         }
1542     }
1543     return kEmptyStr;
1544 }
1545 
1546 
SetCDSPartialsByFrameAndTranslation(CSeq_feat & cds,CScope & scope)1547 bool CCleanup::SetCDSPartialsByFrameAndTranslation(CSeq_feat& cds, CScope& scope)
1548 {
1549     bool any_change = false;
1550 
1551     if (!cds.GetLocation().IsPartialStart(eExtreme_Biological) &&
1552         cds.GetData().GetCdregion().IsSetFrame() &&
1553         cds.GetData().GetCdregion().GetFrame() != CCdregion::eFrame_not_set &&
1554         cds.GetData().GetCdregion().GetFrame() != CCdregion::eFrame_one) {
1555         cds.SetLocation().SetPartialStart(true, eExtreme_Biological);
1556         any_change = true;
1557     }
1558 
1559     if (!cds.GetLocation().IsPartialStart(eExtreme_Biological) || !cds.GetLocation().IsPartialStop(eExtreme_Biological)) {
1560         // look for start and stop codon
1561         string transl_prot;
1562         try {
1563             CSeqTranslator::Translate(cds, scope, transl_prot,
1564                 true,   // include stop codons
1565                 false);  // do not remove trailing X/B/Z
1566 
1567         } catch (const runtime_error&) {
1568         }
1569         if (!NStr::IsBlank(transl_prot)) {
1570             if (!cds.GetLocation().IsPartialStart(eExtreme_Biological) && !NStr::StartsWith(transl_prot, "M")) {
1571                 cds.SetLocation().SetPartialStart(true, eExtreme_Biological);
1572                 any_change = true;
1573             }
1574             if (!cds.GetLocation().IsPartialStop(eExtreme_Biological) && !NStr::EndsWith(transl_prot, "*")) {
1575                 cds.SetLocation().SetPartialStop(true, eExtreme_Biological);
1576                 any_change = true;
1577             }
1578         }
1579     }
1580 
1581     any_change |= feature::AdjustFeaturePartialFlagForLocation(cds);
1582 
1583     return any_change;
1584 }
1585 
1586 
ClearInternalPartials(CSeq_loc & loc,bool is_first,bool is_last)1587 bool CCleanup::ClearInternalPartials(CSeq_loc& loc, bool is_first, bool is_last)
1588 {
1589     bool rval = false;
1590     switch (loc.Which()) {
1591         case CSeq_loc::e_Mix:
1592             rval |= ClearInternalPartials(loc.SetMix(), is_first, is_last);
1593             break;
1594         case CSeq_loc::e_Packed_int:
1595             rval |= ClearInternalPartials(loc.SetPacked_int(), is_first, is_last);
1596             break;
1597         default:
1598             break;
1599     }
1600     return rval;
1601 }
1602 
1603 
ClearInternalPartials(CSeq_loc_mix & mix,bool is_first,bool is_last)1604 bool CCleanup::ClearInternalPartials(CSeq_loc_mix& mix, bool is_first, bool is_last)
1605 {
1606     bool rval = false;
1607     NON_CONST_ITERATE(CSeq_loc::TMix::Tdata, it, mix.Set()) {
1608         bool this_is_last = is_last && (*it == mix.Set().back());
1609         if ((*it)->IsMix() || (*it)->IsPacked_int()) {
1610             rval |= ClearInternalPartials(**it, is_first, this_is_last);
1611         } else {
1612             if (!is_first &&
1613                 (*it)->IsPartialStart(eExtreme_Biological)) {
1614                 (*it)->SetPartialStart(false, eExtreme_Biological);
1615                 rval = true;
1616             }
1617             if (!this_is_last &&
1618                 (*it)->IsPartialStop(eExtreme_Biological)) {
1619                 (*it)->SetPartialStop(false, eExtreme_Biological);
1620                 rval = true;
1621             }
1622         }
1623         is_first = false;
1624     }
1625     return rval;
1626 }
1627 
1628 
ClearInternalPartials(CPacked_seqint & pint,bool is_first,bool is_last)1629 bool CCleanup::ClearInternalPartials(CPacked_seqint& pint, bool is_first, bool is_last)
1630 {
1631     bool rval = false;
1632 
1633     NON_CONST_ITERATE(CSeq_loc::TPacked_int::Tdata, it, pint.Set()) {
1634         bool this_is_last = is_last && (*it == pint.Set().back());
1635         if (!is_first && (*it)->IsPartialStart(eExtreme_Biological)) {
1636             (*it)->SetPartialStart(false, eExtreme_Biological);
1637             rval = true;
1638         }
1639         if (!this_is_last && (*it)->IsPartialStop(eExtreme_Biological)) {
1640             (*it)->SetPartialStop(false, eExtreme_Biological);
1641             rval = true;
1642         }
1643         is_first = false;
1644     }
1645     return rval;
1646 }
1647 
1648 
ClearInternalPartials(CSeq_entry_Handle seh)1649 bool CCleanup::ClearInternalPartials(CSeq_entry_Handle seh)
1650 {
1651     bool rval = false;
1652     CFeat_CI f(seh);
1653     while (f) {
1654         CRef<CSeq_feat> new_feat(new CSeq_feat());
1655         new_feat->Assign(*(f->GetSeq_feat()));
1656         if (ClearInternalPartials(new_feat->SetLocation())) {
1657             CSeq_feat_EditHandle eh(f->GetSeq_feat_Handle());
1658             eh.Replace(*new_feat);
1659         }
1660         ++f;
1661     }
1662 
1663     return rval;
1664 }
1665 
1666 
SetFeaturePartial(CSeq_feat & f)1667 bool CCleanup::SetFeaturePartial(CSeq_feat& f)
1668 {
1669     if (!f.IsSetLocation()) {
1670         return false;
1671     }
1672     bool partial = false;
1673     CSeq_loc_CI li(f.GetLocation());
1674     while (li && !partial) {
1675         if (li.GetFuzzFrom() || li.GetFuzzTo()) {
1676             partial = true;
1677             break;
1678         }
1679         ++li;
1680     }
1681     bool changed = false;
1682     if (f.IsSetPartial() && f.GetPartial()) {
1683         if (!partial) {
1684             f.ResetPartial();
1685             changed = true;
1686         }
1687     } else {
1688         if (partial) {
1689             f.SetPartial(true);
1690             changed = true;
1691         }
1692     }
1693     return changed;
1694 }
1695 
1696 
UpdateECNumbers(CProt_ref::TEc & ec_num_list)1697 bool CCleanup::UpdateECNumbers(CProt_ref::TEc & ec_num_list)
1698 {
1699     bool changed = false;
1700     // CProt_ref::TEc is a list, so the iterator stays valid even if we
1701     // add new entries after the current one
1702     NON_CONST_ITERATE(CProt_ref::TEc, ec_num_iter, ec_num_list) {
1703         string & ec_num = *ec_num_iter;
1704         size_t tlen = ec_num.length();
1705         CleanVisStringJunk(ec_num);
1706         if (tlen != ec_num.length()) {
1707             changed = true;
1708         }
1709         if (CProt_ref::GetECNumberStatus(ec_num) == CProt_ref::eEC_replaced &&
1710             !CProt_ref::IsECNumberSplit(ec_num)) {
1711             string new_val = CProt_ref::GetECNumberReplacement(ec_num);
1712             if (!NStr::IsBlank(new_val)) {
1713                 ec_num = new_val;
1714                 changed = true;
1715             }
1716         }
1717 
1718     }
1719     return changed;
1720 }
1721 
1722 
RemoveBadECNumbers(CProt_ref::TEc & ec_num_list)1723 bool CCleanup::RemoveBadECNumbers(CProt_ref::TEc & ec_num_list)
1724 {
1725     bool changed = false;
1726     CProt_ref::TEc::iterator ec_num_iter = ec_num_list.begin();
1727     while (ec_num_iter != ec_num_list.end()) {
1728         string & ec_num = *ec_num_iter;
1729         size_t tlen = ec_num.length();
1730         CleanVisStringJunk(ec_num);
1731         if (tlen != ec_num.length()) {
1732             changed = true;
1733         }
1734         CProt_ref::EECNumberStatus ec_status = CProt_ref::GetECNumberStatus(ec_num);
1735         if (ec_status == CProt_ref::eEC_deleted || ec_status == CProt_ref::eEC_unknown || CProt_ref::IsECNumberSplit(ec_num)) {
1736             ec_num_iter = ec_num_list.erase(ec_num_iter);
1737             changed = true;
1738         } else {
1739             ++ec_num_iter;
1740         }
1741 
1742     }
1743     return changed;
1744 }
1745 
1746 
FixECNumbers(CSeq_entry_Handle entry)1747 bool CCleanup::FixECNumbers(CSeq_entry_Handle entry)
1748 {
1749     bool any_change = false;
1750     CFeat_CI f(entry, CSeqFeatData::e_Prot);
1751     while (f) {
1752         if (f->GetData().GetProt().IsSetEc()) {
1753             bool this_change = false;
1754             CRef<CSeq_feat> new_feat(new CSeq_feat());
1755             new_feat->Assign(*(f->GetSeq_feat()));
1756             this_change = UpdateECNumbers(new_feat->SetData().SetProt().SetEc());
1757             this_change |= RemoveBadECNumbers(new_feat->SetData().SetProt().SetEc());
1758             if (new_feat->GetData().GetProt().GetEc().empty()) {
1759                 new_feat->SetData().SetProt().ResetEc();
1760                 this_change = true;
1761             }
1762             if (this_change) {
1763                 CSeq_feat_EditHandle efh(*f);
1764                 efh.Replace(*new_feat);
1765             }
1766         }
1767         ++f;
1768     }
1769     return any_change;
1770 }
1771 
1772 
SetGenePartialByLongestContainedFeature(CSeq_feat & gene,CScope & scope)1773 bool CCleanup::SetGenePartialByLongestContainedFeature(CSeq_feat& gene, CScope& scope)
1774 {
1775     CBioseq_Handle bh = scope.GetBioseqHandle(gene.GetLocation());
1776     if (!bh) {
1777         return false;
1778     }
1779     CFeat_CI under(scope, gene.GetLocation());
1780     size_t longest = 0;
1781     CConstRef<CSeq_feat> longest_feat(NULL);
1782 
1783     while (under) {
1784         // ignore genes
1785         if (under->GetData().IsGene()) {
1786 
1787         } else {
1788             // must be contained in gene location
1789             sequence::ECompare loc_cmp = sequence::Compare(gene.GetLocation(), under->GetLocation(), &scope, sequence::fCompareOverlapping);
1790 
1791             if (loc_cmp == sequence::eSame || loc_cmp == sequence::eContains) {
1792                 size_t len = sequence::GetLength(under->GetLocation(), &scope);
1793                 // if longer than longest, record new length and feature
1794                 if (len > longest) {
1795                     longest_feat.Reset(under->GetSeq_feat());
1796                 }
1797             }
1798         }
1799 
1800         ++under;
1801     }
1802     bool changed = false;
1803     if (longest_feat) {
1804         changed = feature::CopyFeaturePartials(gene, *longest_feat);
1805     }
1806     return changed;
1807 }
1808 
1809 
SetMolinfoTech(CBioseq_Handle bsh,CMolInfo::ETech tech)1810 bool CCleanup::SetMolinfoTech(CBioseq_Handle bsh, CMolInfo::ETech tech)
1811 {
1812     CSeqdesc_CI di(bsh, CSeqdesc::e_Molinfo);
1813     if (di) {
1814         if (di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetTech() == tech) {
1815             // no change necessary
1816             return false;
1817         } else {
1818             CSeqdesc* d = const_cast<CSeqdesc*>(&(*di));
1819             d->SetMolinfo().SetTech(tech);
1820             return true;
1821         }
1822     }
1823     CRef<CSeqdesc> m(new CSeqdesc());
1824     m->SetMolinfo().SetTech(tech);
1825     if (bsh.IsSetInst() && bsh.GetInst().IsSetMol() && bsh.IsAa()) {
1826         m->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1827     }
1828     CBioseq_EditHandle eh = bsh.GetEditHandle();
1829     eh.AddSeqdesc(*m);
1830     return true;
1831 }
1832 
1833 
1834 //LCOV_EXCL_START
1835 //does not appear to be used
SetMolinfoBiomol(CBioseq_Handle bsh,CMolInfo::EBiomol biomol)1836 bool CCleanup::SetMolinfoBiomol(CBioseq_Handle bsh, CMolInfo::EBiomol biomol)
1837 {
1838     CSeqdesc_CI di(bsh, CSeqdesc::e_Molinfo);
1839     if (di) {
1840         if (di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetBiomol() == biomol) {
1841             // no change necessary
1842             return false;
1843         } else {
1844             CSeqdesc* d = const_cast<CSeqdesc*>(&(*di));
1845             d->SetMolinfo().SetBiomol(biomol);
1846             return true;
1847         }
1848     }
1849     CRef<CSeqdesc> m(new CSeqdesc());
1850     m->SetMolinfo().SetBiomol(biomol);
1851     CBioseq_EditHandle eh = bsh.GetEditHandle();
1852     eh.AddSeqdesc(*m);
1853     return true;
1854 }
1855 //LCOV_EXCL_STOP
1856 
1857 
AddMissingMolInfo(CBioseq & seq,bool is_product)1858 bool CCleanup::AddMissingMolInfo(CBioseq& seq, bool is_product)
1859 {
1860     if (!seq.IsSetInst() || !seq.GetInst().IsSetMol()) {
1861         return false;
1862     }
1863     bool needs_molinfo = true;
1864 
1865     if (seq.IsSetDescr()) {
1866         NON_CONST_ITERATE(CBioseq::TDescr::Tdata, it, seq.SetDescr().Set()) {
1867             if ((*it)->IsMolinfo()) {
1868                 needs_molinfo = false;
1869                 if (seq.IsAa() &&
1870                     (!(*it)->GetMolinfo().IsSetBiomol() ||
1871                      (*it)->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_unknown)) {
1872                     (*it)->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1873                 }
1874             }
1875         }
1876     }
1877     if (needs_molinfo) {
1878         if (seq.IsAa()) {
1879             CRef<CSeqdesc> m(new CSeqdesc());
1880             m->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1881             if (is_product) {
1882                 m->SetMolinfo().SetTech(CMolInfo::eTech_concept_trans);
1883             }
1884             seq.SetDescr().Set().push_back(m);
1885         } else if (seq.GetInst().GetMol() == CSeq_inst::eMol_rna && is_product) {
1886             CRef<CSeqdesc> m(new CSeqdesc());
1887             m->SetMolinfo().SetBiomol(CMolInfo::eBiomol_mRNA);
1888             m->SetMolinfo().SetTech(CMolInfo::eTech_standard);
1889             seq.SetDescr().Set().push_back(m);
1890         } else {
1891             needs_molinfo = false;
1892         }
1893     }
1894 
1895     return needs_molinfo;
1896 }
1897 
1898 
AddProteinTitle(CBioseq_Handle bsh)1899 bool CCleanup::AddProteinTitle(CBioseq_Handle bsh)
1900 {
1901     if (!bsh.IsSetInst() || !bsh.GetInst().IsSetMol() || !bsh.IsAa()) {
1902         return false;
1903     }
1904     if (bsh.IsSetId()) {
1905         ITERATE(CBioseq_Handle::TId, it, bsh.GetId()) {
1906             // do not add titles for sequences with certain IDs
1907             switch (it->Which()) {
1908                 case CSeq_id::e_Pir:
1909                 case CSeq_id::e_Swissprot:
1910                 case CSeq_id::e_Patent:
1911                 case CSeq_id::e_Prf:
1912                 case CSeq_id::e_Pdb:
1913                     return false;
1914                     break;
1915                 default:
1916                     break;
1917             }
1918         }
1919     }
1920 
1921     string new_defline = sequence::CDeflineGenerator().GenerateDefline(bsh, sequence::CDeflineGenerator::fIgnoreExisting);
1922 
1923     CAutoAddDesc title_desc(bsh.GetEditHandle().SetDescr(), CSeqdesc::e_Title);
1924 
1925     bool modified = title_desc.Set().SetTitle() != new_defline; // get or create a title
1926     if (modified)
1927       title_desc.Set().SetTitle().swap(new_defline);
1928     return modified;
1929 }
1930 
1931 
RemoveNcbiCleanupObject(CSeq_entry & seq_entry)1932 bool CCleanup::RemoveNcbiCleanupObject(CSeq_entry &seq_entry)
1933 {
1934     bool rval = false;
1935     if (seq_entry.IsSetDescr()) {
1936         CBioseq::TDescr::Tdata::iterator it = seq_entry.SetDescr().Set().begin();
1937         while (it != seq_entry.SetDescr().Set().end()) {
1938             if ((*it)->IsUser() && (*it)->GetUser().GetObjectType() == CUser_object::eObjectType_Cleanup){
1939                 it = seq_entry.SetDescr().Set().erase(it);
1940                 rval = true;
1941             }
1942             else {
1943                 ++it;
1944             }
1945         }
1946         if (seq_entry.SetDescr().Set().empty()) {
1947             if (seq_entry.IsSeq()) {
1948                 seq_entry.SetSeq().ResetDescr();
1949             }
1950             else if (seq_entry.IsSet()) {
1951                 seq_entry.SetSet().ResetDescr();
1952             }
1953         }
1954     }
1955     if (seq_entry.IsSet() && seq_entry.GetSet().IsSetSeq_set()) {
1956         NON_CONST_ITERATE(CBioseq_set::TSeq_set, it, seq_entry.SetSet().SetSeq_set()) {
1957             rval |= RemoveNcbiCleanupObject(**it);
1958         }
1959     }
1960     return rval;
1961 }
1962 
1963 
1964 //LCOV_EXCL_START
1965 //not used by asn_cleanup but used by functions used by other applications
GetSourceDescriptors(const CSeq_entry & se,vector<const CSeqdesc * > & src_descs)1966 void GetSourceDescriptors(const CSeq_entry& se, vector<const CSeqdesc* >& src_descs)
1967 {
1968     if (se.IsSetDescr()) {
1969         ITERATE(CBioseq::TDescr::Tdata, it, se.GetDescr().Get()) {
1970             if ((*it)->IsSource() && (*it)->GetSource().IsSetOrg()) {
1971                 src_descs.push_back(*it);
1972             }
1973         }
1974     }
1975 
1976     if (se.IsSet() && se.GetSet().IsSetSeq_set()) {
1977         ITERATE(CBioseq_set::TSeq_set, it, se.GetSet().GetSeq_set()) {
1978             GetSourceDescriptors(**it, src_descs);
1979         }
1980     }
1981 }
1982 //LCOV_EXCL_STOP
1983 
1984 
1985 //LCOV_EXCL_START
1986 //not used by asn_cleanup
TaxonomyLookup(CSeq_entry_Handle seh)1987 bool CCleanup::TaxonomyLookup(CSeq_entry_Handle seh)
1988 {
1989     bool any_changes = false;
1990 
1991     vector<CRef<COrg_ref> > rq_list;
1992     vector<const CSeqdesc* > src_descs;
1993     vector<CConstRef<CSeq_feat> > src_feats;
1994 
1995     GetSourceDescriptors(*(seh.GetCompleteSeq_entry()), src_descs);
1996     vector<const CSeqdesc* >::iterator desc_it = src_descs.begin();
1997     while (desc_it != src_descs.end()) {
1998         // add org ref for descriptor to request list
1999         CRef<COrg_ref> org(new COrg_ref());
2000         org->Assign((*desc_it)->GetSource().GetOrg());
2001         rq_list.push_back(org);
2002 
2003         ++desc_it;
2004     }
2005 
2006     CFeat_CI feat(seh, SAnnotSelector(CSeqFeatData::e_Biosrc));
2007     while (feat) {
2008         if (feat->GetData().GetBiosrc().IsSetOrg()) {
2009             // add org ref for feature to request list
2010             CRef<COrg_ref> org(new COrg_ref());
2011             org->Assign(feat->GetData().GetBiosrc().GetOrg());
2012             rq_list.push_back(org);
2013             // add feature to list
2014             src_feats.push_back(feat->GetOriginalSeq_feat());
2015         }
2016         ++feat;
2017     }
2018 
2019     if (rq_list.size() > 0) {
2020         CTaxon3 taxon3;
2021         taxon3.Init();
2022         CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(rq_list);
2023         if (reply) {
2024             CTaxon3_reply::TReply::const_iterator reply_it = reply->GetReply().begin();
2025 
2026             // process descriptor responses
2027             desc_it = src_descs.begin();
2028 
2029             while (reply_it != reply->GetReply().end()
2030                 && desc_it != src_descs.end()) {
2031                 if ((*reply_it)->IsData() &&
2032                     !(*desc_it)->GetSource().GetOrg().Equals((*reply_it)->GetData().GetOrg())) {
2033                     any_changes = true;
2034                     CSeqdesc* desc = const_cast<CSeqdesc*>(*desc_it);
2035                     desc->SetSource().SetOrg().Assign((*reply_it)->GetData().GetOrg());
2036                     desc->SetSource().SetOrg().CleanForGenBank();
2037                 }
2038                 ++reply_it;
2039                 ++desc_it;
2040             }
2041 
2042             // process feature responses
2043             vector<CConstRef<CSeq_feat> >::iterator feat_it = src_feats.begin();
2044             while (reply_it != reply->GetReply().end()
2045                 && feat_it != src_feats.end()) {
2046                 if ((*reply_it)->IsData() &&
2047                     !(*feat_it)->GetData().GetBiosrc().GetOrg().Equals((*reply_it)->GetData().GetOrg())) {
2048                     any_changes = true;
2049                     CRef<CSeq_feat> new_feat(new CSeq_feat());
2050                     new_feat->Assign(**feat_it);
2051                     new_feat->SetData().SetBiosrc().SetOrg().Assign((*reply_it)->GetData().GetOrg());
2052                     CSeq_feat_Handle fh = seh.GetScope().GetSeq_featHandle(**feat_it);
2053                     CSeq_feat_EditHandle efh(fh);
2054                     efh.Replace(*new_feat);
2055                 }
2056                 ++reply_it;
2057                 ++feat_it;
2058             }
2059         }
2060     }
2061 
2062     return any_changes;
2063 }
2064 //LCOV_EXCL_STOP
2065 
2066 
AddProtein(const CSeq_feat & cds,CScope & scope)2067 CRef<CSeq_entry> CCleanup::AddProtein(const CSeq_feat& cds, CScope& scope)
2068 {
2069     CBioseq_Handle cds_bsh = scope.GetBioseqHandle(cds.GetLocation());
2070     if (!cds_bsh) {
2071         return CRef<CSeq_entry>(NULL);
2072     }
2073     CSeq_entry_Handle seh = cds_bsh.GetSeq_entry_Handle();
2074     if (!seh) {
2075         return CRef<CSeq_entry>(NULL);
2076     }
2077 
2078     CRef<CBioseq> new_product = CSeqTranslator::TranslateToProtein(cds, scope);
2079     if (new_product.Empty()) {
2080         return CRef<CSeq_entry>(NULL);
2081     }
2082 
2083     CRef<CSeqdesc> molinfo(new CSeqdesc());
2084     molinfo->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
2085     molinfo->SetMolinfo().SetTech(CMolInfo::eTech_concept_trans);
2086     new_product->SetDescr().Set().push_back(molinfo);
2087 
2088     if (cds.IsSetProduct()) {
2089         CRef<CSeq_id> prot_id(new CSeq_id());
2090         prot_id->Assign(*(cds.GetProduct().GetId()));
2091         new_product->SetId().push_back(prot_id);
2092     }
2093     CRef<CSeq_entry> prot_entry(new CSeq_entry());
2094     prot_entry->SetSeq(*new_product);
2095 
2096     CSeq_entry_EditHandle eh = seh.GetEditHandle();
2097     if (!eh.IsSet()) {
2098         CBioseq_set_Handle nuc_parent = eh.GetParentBioseq_set();
2099         if (nuc_parent && nuc_parent.IsSetClass() && nuc_parent.GetClass() == objects::CBioseq_set::eClass_nuc_prot) {
2100             eh = nuc_parent.GetParentEntry().GetEditHandle();
2101         }
2102     }
2103     if (!eh.IsSet()) {
2104         eh.ConvertSeqToSet();
2105         // move all descriptors on nucleotide sequence except molinfo, title, and create-date to set
2106         eh.SetSet().SetClass(CBioseq_set::eClass_nuc_prot);
2107         CConstRef<CBioseq_set> set = eh.GetSet().GetCompleteBioseq_set();
2108         if (set && set->IsSetSeq_set()) {
2109             CConstRef<CSeq_entry> nuc = set->GetSeq_set().front();
2110             CSeq_entry_EditHandle neh = eh.GetScope().GetSeq_entryEditHandle(*nuc);
2111             CBioseq_set::TDescr::Tdata::const_iterator it = nuc->GetDescr().Get().begin();
2112             while (it != nuc->GetDescr().Get().end()) {
2113                 if (!(*it)->IsMolinfo() && !(*it)->IsTitle() && !(*it)->IsCreate_date()) {
2114                     CRef<CSeqdesc> copy(new CSeqdesc());
2115                     copy->Assign(**it);
2116                     eh.AddSeqdesc(*copy);
2117                     neh.RemoveSeqdesc(**it);
2118                     if (nuc->IsSetDescr()) {
2119                         it = nuc->GetDescr().Get().begin();
2120                     }
2121                     else {
2122                         break;
2123                     }
2124                 }
2125                 else {
2126                     ++it;
2127                 }
2128             }
2129         }
2130     }
2131 
2132     CSeq_entry_EditHandle added = eh.AttachEntry(*prot_entry);
2133     return prot_entry;
2134 }
2135 
SetGeneticCodes(CBioseq_Handle bsh)2136 bool CCleanup::SetGeneticCodes(CBioseq_Handle bsh)
2137 {
2138     if (!bsh) {
2139         return false;
2140     }
2141     if (!bsh.IsNa()) {
2142         return false;
2143     }
2144 
2145     CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
2146     if (!src) {
2147         // no source, don't fix
2148         return false;
2149     }
2150     const auto& bsrc = src->GetSource();
2151     if (!bsrc.IsSetOrg() || !bsrc.IsSetOrgname()) {
2152         return false;
2153     }
2154     const auto& orgname = bsrc.GetOrg().GetOrgname();
2155     if (!orgname.IsSetGcode() && !orgname.IsSetMgcode() && !orgname.IsSetPgcode()) {
2156         return false;
2157     }
2158     int bioseqGenCode = src->GetSource().GetGenCode();
2159 
2160     bool any_changed = false;
2161     // set Cdregion's gcode from BioSource (unless except-text)
2162     SAnnotSelector sel(CSeqFeatData::e_Cdregion);
2163     CFeat_CI feat_ci(bsh, sel);
2164     for (; feat_ci; ++feat_ci) {
2165         const CSeq_feat& feat = feat_ci->GetOriginalFeature();
2166         const CCdregion& cds = feat.GetData().GetCdregion();
2167         int cdregionGenCode = (cds.IsSetCode() ?
2168             cds.GetCode().GetId() :
2169             0);
2170         if (cdregionGenCode != bioseqGenCode)
2171         {
2172             // make cdregion's gencode match bioseq's gencode,
2173             // if allowed
2174             if (!feat.HasExceptionText("genetic code exception"))
2175             {
2176                 CRef<CSeq_feat> new_feat(new CSeq_feat);
2177                 new_feat->Assign(feat);
2178                 CCdregion& new_cds = new_feat->SetData().SetCdregion();
2179                 new_cds.ResetCode();
2180                 new_cds.SetCode().SetId(bioseqGenCode);
2181                 CSeq_feat_EditHandle edit_handle(*feat_ci);
2182                 edit_handle.Replace(*new_feat);
2183                 any_changed = true;
2184             }
2185         }
2186     }
2187     return any_changed;
2188 }
2189 
2190 
2191 // return position of " [" + sOrganism + "]", but only if it's
2192 // at the end and there are characters before it.
2193 // Also, returns the position of the organelle prefix in the title.
s_TitleEndsInOrganism(const string & sTitle,const string & sOrganism,SIZE_TYPE & OrganellePos)2194 static SIZE_TYPE s_TitleEndsInOrganism(
2195     const string & sTitle,
2196     const string & sOrganism,
2197     SIZE_TYPE& OrganellePos)
2198 {
2199     OrganellePos = NPOS;
2200 
2201     SIZE_TYPE answer = NPOS;
2202 
2203     const string sPattern = " [" + sOrganism + "]";
2204     if (NStr::EndsWith(sTitle, sPattern, NStr::eNocase)) {
2205         answer = sTitle.length() - sPattern.length();
2206         if (answer < 1) {
2207             // title must have something before the pattern
2208             answer = NPOS;
2209         }
2210     } else {
2211         answer = NStr::Find(sTitle, sPattern, NStr::eNocase, NStr::eReverseSearch);
2212         if (answer < 1 || answer == NPOS) {
2213             // pattern not found
2214             answer = NPOS;
2215         }
2216     }
2217 
2218     if (answer != NPOS) {
2219         // find organelle prefix
2220         for (unsigned int genome = CBioSource::eGenome_chloroplast;
2221             genome <= CBioSource::eGenome_chromatophore;
2222             genome++) {
2223             if (genome != CBioSource::eGenome_extrachrom &&
2224                 genome != CBioSource::eGenome_transposon &&
2225                 genome != CBioSource::eGenome_insertion_seq &&
2226                 genome != CBioSource::eGenome_proviral &&
2227                 genome != CBioSource::eGenome_virion &&
2228                 genome != CBioSource::eGenome_chromosome)
2229             {
2230                 string organelle = " (" + CBioSource::GetOrganelleByGenome(genome) + ")";
2231                 SIZE_TYPE possible_organelle_start_pos = NStr::Find(sTitle, organelle, NStr::eNocase, NStr::eReverseSearch);
2232                 if (possible_organelle_start_pos != NPOS &&
2233                     NStr::EndsWith(CTempString(sTitle, 0, answer), organelle)) {
2234                     OrganellePos = possible_organelle_start_pos;
2235                     break;
2236                 }
2237 
2238             }
2239         }
2240     }
2241     return answer;
2242 }
2243 
2244 
s_TitleEndsInOrganism(const string & sTitle,const COrgName::TName & orgname,SIZE_TYPE & organelle_pos)2245 static SIZE_TYPE s_TitleEndsInOrganism(
2246     const string & sTitle,
2247     const COrgName::TName& orgname,
2248     SIZE_TYPE &organelle_pos)
2249 {
2250     SIZE_TYPE suffixPos = NPOS; // will point to " [${organism name}]" at end
2251     organelle_pos = NPOS;
2252 
2253     if (orgname.IsBinomial() &&
2254         orgname.GetBinomial().IsSetGenus() &&
2255         !NStr::IsBlank(orgname.GetBinomial().GetGenus()) &&
2256         orgname.GetBinomial().IsSetSpecies() &&
2257         !NStr::IsBlank(orgname.GetBinomial().GetSpecies())) {
2258         string binomial = orgname.GetBinomial().GetGenus() + " " + orgname.GetBinomial().GetSpecies();
2259         suffixPos = s_TitleEndsInOrganism(sTitle, binomial, organelle_pos);
2260     }
2261     return suffixPos;
2262 }
2263 
2264 
IsCrossKingdom(const COrg_ref & org,string & first_kingdom,string & second_kingdom)2265 bool IsCrossKingdom(const COrg_ref& org, string& first_kingdom, string& second_kingdom)
2266 {
2267     bool is_cross_kingdom = false;
2268     first_kingdom = kEmptyStr;
2269     second_kingdom = kEmptyStr;
2270     if (org.IsSetOrgname() && org.GetOrgname().IsSetName() &&
2271         org.GetOrgname().GetName().IsPartial() &&
2272         org.GetOrgname().GetName().GetPartial().IsSet()) {
2273         ITERATE(CPartialOrgName::Tdata, it, org.GetOrgname().GetName().GetPartial().Get()) {
2274             const CTaxElement& te = **it;
2275             if (te.IsSetFixed_level() && te.GetFixed_level() == 0 &&
2276                 te.IsSetLevel() &&
2277                 NStr::EqualNocase(te.GetLevel(), "superkingdom") &&
2278                 te.IsSetName() && !NStr::IsBlank(te.GetName())) {
2279                 if (first_kingdom.empty()) {
2280                     first_kingdom = te.GetName();
2281                 } else if (!NStr::EqualNocase(first_kingdom, te.GetName())) {
2282                     is_cross_kingdom = true;
2283                     second_kingdom = te.GetName();
2284                     break;
2285                 }
2286             }
2287         }
2288     }
2289     return is_cross_kingdom;
2290 }
2291 
2292 
IsCrossKingdom(const COrg_ref & org)2293 bool IsCrossKingdom(const COrg_ref& org)
2294 {
2295     string first_kingdom, second_kingdom;
2296     return IsCrossKingdom(org, first_kingdom, second_kingdom);
2297 }
2298 
2299 
s_TitleEndsInOrganism(const string & sTitle,const COrg_ref & org,SIZE_TYPE & organelle_pos)2300 static SIZE_TYPE s_TitleEndsInOrganism(
2301     const string & sTitle,
2302     const COrg_ref& org,
2303     SIZE_TYPE &organelle_pos)
2304 {
2305     SIZE_TYPE suffixPos = NPOS; // will point to " [${organism name}]" at end
2306     organelle_pos = NPOS;
2307 
2308     // first, check to see if protein title matches old-name
2309     if (org.IsSetOrgMod()) {
2310         ITERATE(COrgName::TMod, it, org.GetOrgname().GetMod()) {
2311             if ((*it)->IsSetSubtype() && (*it)->IsSetSubname() &&
2312                 (*it)->GetSubtype() == COrgMod::eSubtype_old_name &&
2313                 !NStr::IsBlank((*it)->GetSubname())) {
2314                 suffixPos = s_TitleEndsInOrganism(sTitle, (*it)->GetSubname(), organelle_pos);
2315                 if (suffixPos != NPOS) {
2316                     return suffixPos;
2317                 }
2318             }
2319         }
2320     }
2321 
2322     // next, check to see if protein title matches taxname
2323     if (org.IsSetTaxname() && !NStr::IsBlank(org.GetTaxname())) {
2324         suffixPos = s_TitleEndsInOrganism(sTitle, org.GetTaxname(), organelle_pos);
2325         if (suffixPos != NPOS) {
2326             return suffixPos;
2327         }
2328     }
2329 
2330     // try binomial if preset
2331     if (org.IsSetOrgname() && org.GetOrgname().IsSetName() &&
2332         org.GetOrgname().GetName().IsBinomial()) {
2333         suffixPos = s_TitleEndsInOrganism(sTitle, org.GetOrgname().GetName(), organelle_pos);
2334         if (suffixPos != NPOS) {
2335             return suffixPos;
2336         }
2337     }
2338 
2339     // cross-kingdom?
2340     if (IsCrossKingdom(org)) {
2341         SIZE_TYPE sep = NStr::Find(sTitle, "][");
2342         if (sep != string::npos) {
2343             suffixPos = s_TitleEndsInOrganism(sTitle.substr(0, sep + 1), org.GetTaxname(), organelle_pos);
2344         }
2345     }
2346     return suffixPos;
2347 }
2348 
2349 
s_RemoveOrgFromEndOfProtein(CBioseq & seq,string taxname)2350 static void s_RemoveOrgFromEndOfProtein(CBioseq& seq, string taxname)
2351 
2352 {
2353     if (taxname.empty()) return;
2354     SIZE_TYPE taxlen = taxname.length();
2355 
2356     EDIT_EACH_SEQANNOT_ON_BIOSEQ(annot_it, seq) {
2357         CSeq_annot& annot = **annot_it;
2358         if (!annot.IsFtable()) continue;
2359         EDIT_EACH_FEATURE_ON_ANNOT(feat_it, annot) {
2360             CSeq_feat& feat = **feat_it;
2361             CSeqFeatData& data = feat.SetData();
2362             if (!data.IsProt()) continue;
2363             CProt_ref& prot_ref = data.SetProt();
2364             EDIT_EACH_NAME_ON_PROTREF(it, prot_ref) {
2365                 string str = *it;
2366                 if (str.empty()) continue;
2367                 auto len = str.length();
2368                 if (len < 5) continue;
2369                 if (str[len - 1] != ']') continue;
2370                 SIZE_TYPE cp = NStr::Find(str, "[", NStr::eCase, NStr::eReverseSearch);
2371                 if (cp == NPOS) continue;
2372                 string suffix = str.substr(cp + 1);
2373                 if (NStr::StartsWith(suffix, "NAD")) continue;
2374                 if (suffix.length() != taxlen + 1) continue;
2375                 if (NStr::StartsWith(suffix, taxname)) {
2376                     str.erase(cp);
2377                     Asn2gnbkCompressSpaces(str);
2378                     *it = str;
2379                 }
2380             }
2381         }
2382     }
2383 }
2384 
// Rebuilds the tail of a protein title as
//   "<name>[, partial][ (organelle)] [taxname]"
// (or " [first][second]" for cross-kingdom organisms), using the MolInfo
// completeness and the BioSource found on the Bioseq or its parent sets.
// Before that, a trailing "[taxname]" is stripped from protein feature names.
// The title is only touched when it already contains a bracketed organism
// name recognised by s_TitleEndsInOrganism.
// Returns true if the title text changed; false if the sequence is not a
// protein, is Swiss-Prot, has no Org-ref or title, or nothing changed.
bool CCleanup::AddPartialToProteinTitle(CBioseq &bioseq)
{
    // Bail if not protein
    if (!bioseq.IsSetInst() || !bioseq.GetInst().IsSetMol() || !bioseq.GetInst().IsAa()) {
        return false;
    }

    // Bail if record is swissprot
    FOR_EACH_SEQID_ON_BIOSEQ(seqid_itr, bioseq) {
        if ((*seqid_itr)->IsSwissprot()) {
            return false;
        }
    }

    // gather some info from the Seqdesc's on the bioseq, into
    // the following variables
    bool bPartial = false;
    string organelle;

    // first MolInfo and first BioSource found, nearest level first
    CConstRef<CSeqdesc> molinfo_desc(NULL);
    CConstRef<CSeqdesc> src_desc(NULL);
    FOR_EACH_SEQDESC_ON_BIOSEQ(descr_iter, bioseq) {
        if (!molinfo_desc && (*descr_iter)->IsMolinfo()) {
            molinfo_desc = *descr_iter;
        }
        if (!src_desc && (*descr_iter)->IsSource()) {
            src_desc = *descr_iter;
        }
        if (molinfo_desc && src_desc) {
            break;
        }
    }
    if (!molinfo_desc || !src_desc) {
        // climb up to get parent Seqdescs
        CConstRef<CBioseq_set> bioseq_set(bioseq.GetParentSet());
        for (; bioseq_set; bioseq_set = bioseq_set->GetParentSet()) {
            FOR_EACH_SEQDESC_ON_SEQSET(descr_iter, *bioseq_set) {
                if (!molinfo_desc && (*descr_iter)->IsMolinfo()) {
                    molinfo_desc = *descr_iter;
                }
                if (!src_desc && (*descr_iter)->IsSource()) {
                    src_desc = *descr_iter;
                }
                if (molinfo_desc && src_desc) {
                    break;
                }
            }
            if (molinfo_desc && src_desc) {
                break;
            }
        }
    }

    // any of the incomplete completeness values marks the protein as partial
    if (molinfo_desc && molinfo_desc->GetMolinfo().IsSetCompleteness()) {
        switch (molinfo_desc->GetMolinfo().GetCompleteness()) {
            case NCBI_COMPLETENESS(partial):
            case NCBI_COMPLETENESS(no_left):
            case NCBI_COMPLETENESS(no_right):
            case NCBI_COMPLETENESS(no_ends):
                bPartial = true;
                break;
            default:
                break;
        }
    }

    CConstRef<COrg_ref> org(NULL);
    if (src_desc) {
        // only genome values in the chloroplast..chromatophore range, minus
        // the excluded ones below, contribute an organelle name to the title
        const TBIOSOURCE_GENOME genome = (src_desc->GetSource().IsSetGenome() ?
            src_desc->GetSource().GetGenome() : CBioSource::eGenome_unknown);
        if (genome >= CBioSource::eGenome_chloroplast &&
            genome <= CBioSource::eGenome_chromatophore &&
            genome != CBioSource::eGenome_extrachrom &&
            genome != CBioSource::eGenome_transposon &&
            genome != CBioSource::eGenome_insertion_seq &&
            genome != CBioSource::eGenome_proviral &&
            genome != CBioSource::eGenome_virion &&
            genome != CBioSource::eGenome_chromosome)
        {
            organelle = CBioSource::GetOrganelleByGenome(genome);
        }

        if (src_desc->GetSource().IsSetOrg()) {
            org.Reset(&(src_desc->GetSource().GetOrg()));
        }
    }

    // without an Org-ref there is no organism name to work with
    if (!org) {
        return false;
    }
    // side effect: protein feature names lose a trailing "[taxname]" here,
    // independently of whether the title itself ends up being modified
    if (org->IsSetTaxname() && !NStr::IsBlank(org->GetTaxname())) {
        s_RemoveOrgFromEndOfProtein(bioseq, org->GetTaxname());
    }

    // find the title to edit
    if (!bioseq.IsSetDescr()) {
        return false;
    }
    CRef<CSeqdesc> title_desc(NULL);
    // no break: if there are several title descriptors the last one is used
    NON_CONST_ITERATE(CBioseq::TDescr::Tdata, d, bioseq.SetDescr().Set()) {
        if ((*d)->IsTitle()) {
            title_desc = *d;
        }
    }
    if (!title_desc) {
        return false;
    }
    // sTitle refers to the descriptor's own string, so edits below are applied in place
    string & sTitle = title_desc->SetTitle();
    // remember original so we can see if we changed it
    const string sOriginalTitle = sTitle;

    // search for partial, must be just before bracketed organism
    SIZE_TYPE partialPos = NStr::Find(sTitle, ", partial [");
    if (partialPos == NPOS) {
        partialPos = NStr::Find(sTitle, ", partial (");
    }

    // find oldname or taxname in brackets at end of protein title
    // (penult receives the position of an organelle qualifier, if any)
    SIZE_TYPE penult = NPOS;
    SIZE_TYPE suffixPos = s_TitleEndsInOrganism(sTitle, *org, penult); // will point to " [${organism name}]" at end
    // do not change unless [genus species] was at the end
    if (suffixPos == NPOS) {
        return false;
    }

    // truncate bracketed info from end of title, will replace with current taxname
    sTitle.resize(suffixPos);
    if (penult != NPOS) {
        // also drop the organelle qualifier; it is re-added below
        sTitle.resize(penult);
    }

    // if ", partial [" was indeed just before the [genus species], it will now be ", partial"
    // Note: 9 is length of ", partial"
    if (!bPartial  &&
        partialPos != string::npos &&
        (partialPos == (sTitle.length() - 9)))
    {
        // sequence is no longer partial: remove the stale ", partial"
        sTitle.resize(partialPos);
    }
    NStr::TruncateSpacesInPlace(sTitle);

    // append ", partial" only when the original title did not already contain it
    if (bPartial && partialPos == NPOS) {
        sTitle += ", partial";
    }
    if (!NStr::IsBlank(organelle)) {
        sTitle += " (" + string(organelle) + ")";
    }
    string first_kingdom, second_kingdom;
    if (IsCrossKingdom(*org, first_kingdom, second_kingdom)) {
        sTitle += " [" + first_kingdom + "][" + second_kingdom + "]";
    } else {
        // an unset taxname yields an empty " []"
        sTitle += " [";
        if (org->IsSetTaxname()) {
            sTitle += org->GetTaxname();
        }
        sTitle += "]";
    }

    if (sTitle != sOriginalTitle) {
        return true;
    } else {
        return false;
    }
}
2550 
RemovePseudoProduct(CSeq_feat & cds,CScope & scope)2551 bool CCleanup::RemovePseudoProduct(CSeq_feat& cds, CScope& scope)
2552 {
2553     if (!sequence::IsPseudo(cds, scope) ||
2554         !cds.IsSetData() || !cds.GetData().IsCdregion() ||
2555         !cds.IsSetProduct()) {
2556         return false;
2557     }
2558     CBioseq_Handle pseq = scope.GetBioseqHandle(cds.GetProduct());
2559     if (pseq) {
2560         CFeat_CI prot(pseq, CSeqFeatData::eSubtype_prot);
2561         if (prot) {
2562             string label;
2563             if (prot->GetData().GetProt().IsSetName() &&
2564                 !prot->GetData().GetProt().GetName().empty()) {
2565                 label = prot->GetData().GetProt().GetName().front();
2566             } else if (prot->GetData().GetProt().IsSetDesc()) {
2567                 label = prot->GetData().GetProt().GetDesc();
2568             }
2569             if (!NStr::IsBlank(label)) {
2570                 if (cds.IsSetComment() && !NStr::IsBlank(cds.GetComment())) {
2571                     cds.SetComment(cds.GetComment() + "; " + label);
2572                 } else {
2573                     cds.SetComment(label);
2574                 }
2575             }
2576         }
2577         CBioseq_EditHandle pseq_e(pseq);
2578         pseq_e.Remove();
2579     }
2580     cds.ResetProduct();
2581     return true;
2582 }
2583 
2584 
ExpandGeneToIncludeChildren(CSeq_feat & gene,CTSE_Handle & tse)2585 bool CCleanup::ExpandGeneToIncludeChildren(CSeq_feat& gene, CTSE_Handle& tse)
2586 {
2587     if (!gene.IsSetXref() || !gene.IsSetLocation() || !gene.GetLocation().IsInt()) {
2588         return false;
2589     }
2590     bool any_change = false;
2591     TSeqPos gene_start = gene.GetLocation().GetStart(eExtreme_Positional);
2592     TSeqPos gene_stop = gene.GetLocation().GetStop(eExtreme_Positional);
2593     ITERATE(CSeq_feat::TXref, xit, gene.GetXref()) {
2594         if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
2595             const CTSE_Handle::TFeatureId& feat_id = (*xit)->GetId().GetLocal();
2596             CTSE_Handle::TSeq_feat_Handles far_feats = tse.GetFeaturesWithId(CSeqFeatData::eSubtype_any, feat_id);
2597             ITERATE(CTSE_Handle::TSeq_feat_Handles, f, far_feats) {
2598                 TSeqPos f_start = f->GetLocation().GetStart(eExtreme_Positional);
2599                 TSeqPos f_stop = f->GetLocation().GetStop(eExtreme_Positional);
2600                 if (f_start < gene_start) {
2601                     gene.SetLocation().SetInt().SetFrom(f_start);
2602                     gene_start = f_start;
2603                     any_change = true;
2604                 }
2605                 if (f_stop > gene_stop) {
2606                     gene.SetLocation().SetInt().SetTo(f_stop);
2607                     gene_stop = f_stop;
2608                     any_change = true;
2609                 }
2610             }
2611         }
2612     }
2613     return any_change;
2614 }
2615 
2616 
// Length rules for rRNA products, keyed by a fragment of the product name.
// The value is (minimum length, flag); when the flag is true,
// s_CleanupIsShortrRNA does not report partial features as short.
using TRNALength = pair<size_t, bool>;
using TRNALengthMap = map<string, TRNALength>;

static const TRNALengthMap kTrnaLengthMap{
    { "16S",   { 1000, false } },
    { "18S",   { 1000, false } },
    { "23S",   { 2000, false } },
    { "25S",   { 1000, false } },
    { "26S",   { 1000, false } },
    { "28S",   { 3300, false } },
    { "small", { 1000, false } },
    { "large", { 1000, false } },
    { "5.8S",  { 130,  true  } },
    { "5S",    { 90,   true  } }
    // Possible problem: a name that matches /25S/ also matches /5S/.
    // Luckily, anything that fails the /5S/ rule also fails the /25S/ rule.
};
2634 
2635 
s_CleanupIsShortrRNA(const CSeq_feat & f,CScope * scope)2636 static bool s_CleanupIsShortrRNA(const CSeq_feat& f, CScope* scope) // used in feature_tests.cpp
2637 {
2638     if (f.GetData().GetSubtype() != CSeqFeatData::eSubtype_rRNA) {
2639         return false;
2640     }
2641     bool is_bad = false;
2642     size_t len = sequence::GetLength(f.GetLocation(), scope);
2643     const CRNA_ref& rrna = f.GetData().GetRna();
2644     string rrna_name = rrna.GetRnaProductName();
2645     if (rrna_name.empty()) {
2646         // RNA name may still be in product GBQual
2647         if (f.IsSetQual()) {
2648             for (auto qit : f.GetQual()) {
2649                 const CGb_qual& gbq = *qit;
2650                 if ( gbq.IsSetQual() && gbq.GetQual() == "product" ) {
2651                     rrna_name = gbq.GetVal();
2652                     break;
2653                 }
2654             }
2655         }
2656     }
2657     ITERATE (TRNALengthMap, it, kTrnaLengthMap) {
2658         SIZE_TYPE pos = NStr::FindNoCase(rrna_name, it->first);
2659         if (pos != string::npos && len < it->second.first && !(it->second.second && f.IsSetPartial() && f.GetPartial()) ) {
2660             is_bad = true;
2661             break;
2662         }
2663     }
2664     return is_bad;
2665 }
2666 
WGSCleanup(CSeq_entry_Handle entry,bool instantiate_missing_proteins,Uint4 options,bool run_extended_cleanup)2667 bool CCleanup::WGSCleanup(CSeq_entry_Handle entry, bool instantiate_missing_proteins, Uint4 options, bool run_extended_cleanup)
2668 {
2669     bool any_changes = false;
2670 
2671     int protein_id_counter = 1;
2672     bool create_general_only = objects::edit::IsGeneralIdProtPresent(entry.GetTopLevelEntry());
2673     SAnnotSelector sel(CSeqFeatData::e_Cdregion);
2674     for (CFeat_CI cds_it(entry, sel); cds_it; ++cds_it) {
2675         bool change_this_cds = false;
2676         CRef<CSeq_feat> new_cds(new CSeq_feat());
2677         new_cds->Assign(*(cds_it->GetSeq_feat()));
2678         if (sequence::IsPseudo(*(cds_it->GetSeq_feat()), entry.GetScope())) {
2679             change_this_cds = RemovePseudoProduct(*new_cds, entry.GetScope());
2680         } else {
2681             string current_name = GetProteinName(*new_cds, entry.GetScope());
2682 
2683             change_this_cds |= SetBestFrame(*new_cds, entry.GetScope());
2684 
2685             change_this_cds |= SetCDSPartialsByFrameAndTranslation(*new_cds, entry.GetScope());
2686 
2687             // retranslate
2688             if (new_cds->IsSetProduct() && entry.GetScope().GetBioseqHandleFromTSE(*(new_cds->GetProduct().GetId()), entry)) {
2689                 any_changes |= feature::RetranslateCDS(*new_cds, entry.GetScope());
2690             } else {
2691                 // need to set product if not set
2692                 if (!new_cds->IsSetProduct() && !sequence::IsPseudo(*new_cds, entry.GetScope())) {
2693                     string id_label;
2694                     CRef<CSeq_id> new_id = objects::edit::GetNewProtId(entry.GetScope().GetBioseqHandle(new_cds->GetLocation()), protein_id_counter, id_label, create_general_only);
2695                     if (new_id) {
2696                         new_cds->SetProduct().SetWhole().Assign(*new_id);
2697                         change_this_cds = true;
2698                     }
2699                 }
2700                 if (new_cds->IsSetProduct() && instantiate_missing_proteins) {
2701                     CRef<CSeq_entry> prot = AddProtein(*new_cds, entry.GetScope());
2702                     if (prot) {
2703                         any_changes = true;
2704                     }
2705                 }
2706                 any_changes |= feature::AdjustForCDSPartials(*new_cds, entry);
2707             }
2708             //prefer ncbieaa
2709             if (new_cds->IsSetProduct()) {
2710                 CBioseq_Handle p = entry.GetScope().GetBioseqHandle(new_cds->GetProduct());
2711                 if (p && p.IsSetInst() && p.GetInst().IsSetSeq_data() && p.GetInst().GetSeq_data().IsIupacaa()) {
2712                     CBioseq_EditHandle peh(p);
2713                     string current = p.GetInst().GetSeq_data().GetIupacaa().Get();
2714                     CRef<CSeq_inst> new_inst(new CSeq_inst());
2715                     new_inst->Assign(p.GetInst());
2716                     new_inst->SetSeq_data().SetNcbieaa().Set(current);
2717                     peh.SetInst(*new_inst);
2718                     any_changes = true;
2719                 }
2720             }
2721 
2722             if (NStr::IsBlank(current_name)) {
2723                 SetProteinName(*new_cds, "hypothetical protein", false, entry.GetScope());
2724                 current_name = "hypothetical protein";
2725                 change_this_cds = true;
2726             } else if (new_cds->IsSetProduct()) {
2727                 CBioseq_Handle p = entry.GetScope().GetBioseqHandle(new_cds->GetProduct());
2728                 if (p) {
2729                     CFeat_CI feat_ci(p, CSeqFeatData::eSubtype_prot);
2730                     if (!feat_ci) {
2731                         // make new protein feature
2732                         feature::AddProteinFeature(*(p.GetCompleteBioseq()), current_name, *new_cds, entry.GetScope());
2733                     }
2734                 }
2735             }
2736 
2737             CConstRef<CSeq_feat> mrna = sequence::GetmRNAforCDS(*(cds_it->GetSeq_feat()), entry.GetScope());
2738             if (mrna) {
2739                 bool change_mrna = false;
2740                 CRef<CSeq_feat> new_mrna(new CSeq_feat());
2741                 new_mrna->Assign(*mrna);
2742                 // Make mRNA name match coding region protein
2743                 string mrna_name = new_mrna->GetData().GetRna().GetRnaProductName();
2744                 if (NStr::IsBlank(mrna_name) && new_mrna->IsSetQual()) {
2745                     for (auto it = new_mrna->GetQual().begin(); it != new_mrna->GetQual().end(); it++) {
2746                         if ((*it)->IsSetQual() && (*it)->IsSetVal() && NStr::EqualNocase((*it)->GetQual(), "product")) {
2747                             mrna_name = (*it)->GetVal();
2748                             break;
2749                         }
2750                     }
2751                 }
2752                 if (NStr::IsBlank(mrna_name)
2753                     || (!NStr::Equal(current_name, "hypothetical protein") &&
2754                     !NStr::Equal(current_name, mrna_name))) {
2755                     SetMrnaName(*new_mrna, current_name);
2756                     change_mrna = true;
2757                 }
2758                 // Adjust mRNA partials to match coding region
2759                 change_mrna |= feature::CopyFeaturePartials(*new_mrna, *new_cds);
2760                 if (change_mrna) {
2761                     CSeq_feat_Handle fh = entry.GetScope().GetSeq_featHandle(*mrna);
2762                     CSeq_feat_EditHandle feh(fh);
2763                     feh.Replace(*new_mrna);
2764                     any_changes = true;
2765                 }
2766             }
2767         }
2768 
2769         //any_changes |= feature::RetranslateCDS(*new_cds, entry.GetScope());
2770         if (change_this_cds) {
2771             CSeq_feat_EditHandle cds_h(*cds_it);
2772 
2773             cds_h.Replace(*new_cds);
2774             any_changes = true;
2775 
2776             //also need to redo protein title
2777         }
2778 
2779     }
2780 
2781     CTSE_Handle tse = entry.GetTSE_Handle();
2782 
2783     for (CFeat_CI rna_it(entry, SAnnotSelector(CSeqFeatData::e_Rna)); rna_it; ++rna_it) {
2784 
2785         const CSeq_feat& rna_feat = *(rna_it->GetSeq_feat());
2786         if (rna_feat.IsSetData() &&
2787             rna_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_rRNA &&
2788             s_CleanupIsShortrRNA(rna_feat, &(entry.GetScope()))) {
2789 
2790             bool change_this_rrna = false;
2791             CRef<CSeq_feat> new_rrna(new CSeq_feat());
2792             new_rrna->Assign(*(rna_it->GetSeq_feat()));
2793 
2794             const CSeq_loc& loc = rna_feat.GetLocation();
2795             if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
2796               if (loc.GetStart(eExtreme_Biological) >= sequence::GetLength(rna_feat.GetLocation(), &entry.GetScope())) {
2797                     new_rrna->SetLocation().SetPartialStart(true, eExtreme_Biological);
2798                     change_this_rrna = true;
2799                 }
2800                 if (loc.GetStop(eExtreme_Biological) < 1) {
2801                     new_rrna->SetLocation().SetPartialStop(true, eExtreme_Biological);
2802                     change_this_rrna = true;
2803                 }
2804             } else {
2805                 if (loc.GetStart(eExtreme_Biological) < 1) {
2806                     new_rrna->SetLocation().SetPartialStart(true, eExtreme_Biological);
2807                     change_this_rrna = true;
2808                 }
2809                 if (loc.GetStop(eExtreme_Biological) >= sequence::GetLength(rna_feat.GetLocation(), &entry.GetScope())) {
2810                     new_rrna->SetLocation().SetPartialStop(true, eExtreme_Biological);
2811                     change_this_rrna = true;
2812                 }
2813             }
2814 
2815             if (change_this_rrna) {
2816                 CSeq_feat_EditHandle rrna_h(*rna_it);
2817                 rrna_h.Replace(*new_rrna);
2818                 any_changes = true;
2819             }
2820        }
2821     }
2822 
2823     for (CFeat_CI gene_it(entry, SAnnotSelector(CSeqFeatData::e_Gene)); gene_it; ++gene_it) {
2824         bool change_this_gene;
2825         CRef<CSeq_feat> new_gene(new CSeq_feat());
2826         new_gene->Assign(*(gene_it->GetSeq_feat()));
2827 
2828         change_this_gene = ExpandGeneToIncludeChildren(*new_gene, tse);
2829 
2830         change_this_gene |= SetGenePartialByLongestContainedFeature(*new_gene, entry.GetScope());
2831 
2832         if (change_this_gene) {
2833             CSeq_feat_EditHandle gene_h(*gene_it);
2834             gene_h.Replace(*new_gene);
2835             any_changes = true;
2836         }
2837     }
2838 
2839     NormalizeDescriptorOrder(entry);
2840 
2841     for (CBioseq_CI bi(entry, CSeq_inst::eMol_na); bi; ++bi) {
2842         any_changes |= SetGeneticCodes(*bi);
2843     }
2844 
2845     if (run_extended_cleanup) {
2846         auto pChanged = CCleanup::ExtendedCleanup(entry, options);
2847         if (pChanged->ChangeCount()>0) {
2848             return true;
2849         }
2850     }
2851     return any_changes;
2852 }
2853 
2854 
x_HasShortIntron(const CSeq_loc & loc,size_t min_len)2855 bool CCleanup::x_HasShortIntron(const CSeq_loc& loc, size_t min_len)
2856 {
2857     CSeq_loc_CI li(loc);
2858     while (li && li.IsEmpty()) {
2859         ++li;
2860     }
2861     if (!li) {
2862         return false;
2863     }
2864     while (li) {
2865         TSeqPos prev_end;
2866         ENa_strand prev_strand;
2867         if (li.IsSetStrand() && li.GetStrand() == eNa_strand_minus) {
2868             prev_end = li.GetRange().GetFrom();
2869             prev_strand = eNa_strand_minus;
2870         } else {
2871             prev_end = li.GetRange().GetTo();
2872             prev_strand = eNa_strand_plus;
2873         }
2874         ++li;
2875         while (li && li.IsEmpty()) {
2876             ++li;
2877         }
2878         if (li) {
2879             TSeqPos this_start;
2880             ENa_strand this_strand;
2881             if (li.IsSetStrand() && li.GetStrand() == eNa_strand_minus) {
2882                 this_start = li.GetRange().GetTo();
2883                 this_strand = eNa_strand_minus;
2884             } else {
2885                 this_start = li.GetRange().GetFrom();
2886                 this_strand = eNa_strand_plus;
2887             }
2888             if (this_strand == prev_strand) {
2889                 if (abs((long int)this_start - (long int)prev_end) < min_len) {
2890                     return true;
2891                 }
2892             }
2893         }
2894     }
2895     return false;
2896 }
2897 
2898 //LCOV_EXCL_START
2899 //not used by asn_cleanup but used by table2asn
// exception text added to features that contain a short intron
const string kLowQualitySequence("low-quality sequence region");
2901 
x_AddLowQualityException(CSeq_feat & feat)2902 bool CCleanup::x_AddLowQualityException(CSeq_feat& feat)
2903 {
2904     bool any_change = false;
2905     if (!feat.IsSetExcept()) {
2906         any_change = true;
2907         feat.SetExcept(true);
2908     }
2909     if (!feat.IsSetExcept_text() || NStr::IsBlank(feat.GetExcept_text())) {
2910         feat.SetExcept_text(kLowQualitySequence);
2911         any_change = true;
2912     } else if (NStr::Find(feat.GetExcept_text(), kLowQualitySequence) == string::npos) {
2913         feat.SetExcept_text(feat.GetExcept_text() + "; " + kLowQualitySequence);
2914         any_change = true;
2915     }
2916     return any_change;
2917 }
2918 
2919 
x_AddLowQualityException(CSeq_entry_Handle entry,CSeqFeatData::ESubtype subtype)2920 bool CCleanup::x_AddLowQualityException(CSeq_entry_Handle entry, CSeqFeatData::ESubtype subtype)
2921 {
2922     bool any_changes = false;
2923 
2924     SAnnotSelector sel(subtype);
2925     for (CFeat_CI cds_it(entry, sel); cds_it; ++cds_it) {
2926         bool change_this_cds = false;
2927         CRef<CSeq_feat> new_cds(new CSeq_feat());
2928         new_cds->Assign(*(cds_it->GetSeq_feat()));
2929         if (!sequence::IsPseudo(*(cds_it->GetSeq_feat()), entry.GetScope()) &&
2930             x_HasShortIntron(cds_it->GetLocation())) {
2931             change_this_cds = x_AddLowQualityException(*new_cds);
2932         }
2933 
2934         if (change_this_cds) {
2935             CSeq_feat_EditHandle cds_h(*cds_it);
2936 
2937             cds_h.Replace(*new_cds);
2938             any_changes = true;
2939         }
2940     }
2941     return any_changes;
2942 }
2943 
2944 
AddLowQualityException(CSeq_entry_Handle entry)2945 bool CCleanup::AddLowQualityException(CSeq_entry_Handle entry)
2946 {
2947     bool any_changes = x_AddLowQualityException(entry, CSeqFeatData::eSubtype_cdregion);
2948     any_changes |= x_AddLowQualityException(entry, CSeqFeatData::eSubtype_mRNA);
2949     return any_changes;
2950 }
2951 //LCOV_EXCL_STOP
2952 
2953 
2954 // maps the type of seqdesc to the order it should be in
2955 // (lowest to highest)
2956 typedef SStaticPair<CSeqdesc::E_Choice, int>  TSeqdescOrderElem;
2957 static const TSeqdescOrderElem sc_seqdesc_order_map[] = {
2958     // Note that ordering must match ordering
2959     // in CSeqdesc::E_Choice
2960         { CSeqdesc::e_Mol_type, 13 },
2961         { CSeqdesc::e_Modif, 14 },
2962         { CSeqdesc::e_Method, 15 },
2963         { CSeqdesc::e_Name, 7 },
2964         { CSeqdesc::e_Title, 1 },
2965         { CSeqdesc::e_Org, 16 },
2966         { CSeqdesc::e_Comment, 6 },
2967         { CSeqdesc::e_Num, 11 },
2968         { CSeqdesc::e_Maploc, 9 },
2969         { CSeqdesc::e_Pir, 18 },
2970         { CSeqdesc::e_Genbank, 22 },
2971         { CSeqdesc::e_Pub, 5 },
2972         { CSeqdesc::e_Region, 10 },
2973         { CSeqdesc::e_User, 8 },
2974         { CSeqdesc::e_Sp, 17 },
2975         { CSeqdesc::e_Dbxref, 12 },
2976         { CSeqdesc::e_Embl, 21 },
2977         { CSeqdesc::e_Create_date, 24 },
2978         { CSeqdesc::e_Update_date, 25 },
2979         { CSeqdesc::e_Prf, 19 },
2980         { CSeqdesc::e_Pdb, 20 },
2981         { CSeqdesc::e_Het, 4 },
2982 
2983         { CSeqdesc::e_Source, 2 },
2984         { CSeqdesc::e_Molinfo, 3 },
2985         { CSeqdesc::e_Modelev, 23 }
2986 };
2987 typedef CStaticPairArrayMap<CSeqdesc::E_Choice, int> TSeqdescOrderMap;
2988 DEFINE_STATIC_ARRAY_MAP(TSeqdescOrderMap, sc_SeqdescOrderMap, sc_seqdesc_order_map);
2989 
2990 static
s_SeqDescToOrdering(CSeqdesc::E_Choice chs)2991 int s_SeqDescToOrdering(CSeqdesc::E_Choice chs) {
2992     // ordering assigned to unknown
2993     const int unknown_seqdesc = static_cast<int>(1 + sc_SeqdescOrderMap.size());
2994 
2995     TSeqdescOrderMap::const_iterator find_iter = sc_SeqdescOrderMap.find(chs);
2996     if (find_iter == sc_SeqdescOrderMap.end()) {
2997         return unknown_seqdesc;
2998     }
2999 
3000     return find_iter->second;
3001 }
3002 
3003 static
s_SeqDescLessThan(const CRef<CSeqdesc> & desc1,const CRef<CSeqdesc> & desc2)3004 bool s_SeqDescLessThan(const CRef<CSeqdesc> &desc1, const CRef<CSeqdesc> &desc2)
3005 {
3006     CSeqdesc::E_Choice chs1, chs2;
3007 
3008     chs1 = desc1->Which();
3009     chs2 = desc2->Which();
3010 
3011     return (s_SeqDescToOrdering(chs1) < s_SeqDescToOrdering(chs2));
3012 }
3013 
NormalizeDescriptorOrder(CSeq_descr & descr)3014 bool CCleanup::NormalizeDescriptorOrder(CSeq_descr& descr)
3015 {
3016     bool rval = false;
3017     if (!seq_mac_is_sorted(descr.Set().begin(), descr.Set().end(), s_SeqDescLessThan)) {
3018         descr.Set().sort(s_SeqDescLessThan);
3019         rval = true;
3020     }
3021     return rval;
3022 }
3023 
NormalizeDescriptorOrder(CSeq_entry_Handle seh)3024 bool CCleanup::NormalizeDescriptorOrder(CSeq_entry_Handle seh)
3025 {
3026     bool rval = false;
3027 
3028     CSeq_entry_CI ci(seh, CSeq_entry_CI::fRecursive | CSeq_entry_CI::fIncludeGivenEntry);
3029     while (ci) {
3030         CSeq_entry_EditHandle edit(*ci);
3031         if (edit.IsSetDescr()) {
3032             rval |= NormalizeDescriptorOrder(edit.SetDescr());
3033         }
3034         ++ci;
3035     }
3036 
3037     return rval;
3038 }
3039 
3040 
RemoveUnseenTitles(CSeq_entry_EditHandle::TSeq seq)3041 bool CCleanup::RemoveUnseenTitles(CSeq_entry_EditHandle::TSeq seq)
3042 {
3043     bool removed = false;
3044     if (seq.IsSetDescr()) {
3045         CConstRef<CSeqdesc> last_title(NULL);
3046         ITERATE(CBioseq::TDescr::Tdata, d, seq.GetDescr().Get()) {
3047             if ((*d)->IsTitle()) {
3048                 if (last_title) {
3049                     seq.RemoveSeqdesc(*last_title);
3050                     removed = true;
3051                 }
3052                 last_title.Reset(d->GetPointer());
3053             }
3054         }
3055     }
3056     return removed;
3057 }
3058 
3059 
RemoveUnseenTitles(CSeq_entry_EditHandle::TSet set)3060 bool CCleanup::RemoveUnseenTitles(CSeq_entry_EditHandle::TSet set)
3061 {
3062     bool removed = false;
3063     if (set.IsSetDescr()) {
3064         CConstRef<CSeqdesc> last_title(NULL);
3065         ITERATE(CBioseq::TDescr::Tdata, d, set.GetDescr().Get()) {
3066             if ((*d)->IsTitle()) {
3067                 if (last_title) {
3068                     set.RemoveSeqdesc(*last_title);
3069                     removed = true;
3070                 }
3071                 last_title.Reset(d->GetPointer());
3072             }
3073         }
3074     }
3075     return removed;
3076 }
3077 
3078 
AddGenBankWrapper(CSeq_entry_Handle seh)3079 bool CCleanup::AddGenBankWrapper(CSeq_entry_Handle seh)
3080 {
3081     if (seh.IsSet() && seh.GetSet().IsSetClass() &&
3082         seh.GetSet().GetClass() == CBioseq_set::eClass_genbank) {
3083         return false;
3084     }
3085     CSeq_entry_EditHandle eh(seh);
3086     eh.ConvertSeqToSet(CBioseq_set::eClass_genbank);
3087     return true;
3088 }
3089 
3090 
s_GetAuthorsString(string * out_authors,const CAuth_list & auth_list)3091 void s_GetAuthorsString(string *out_authors, const CAuth_list& auth_list)
3092 {
3093     string & auth_str = *out_authors;
3094     auth_str.clear();
3095 
3096     if (!auth_list.IsSetNames()) {
3097         return;
3098     }
3099 
3100     vector<string> name_list;
3101 
3102     if (auth_list.GetNames().IsStd()) {
3103         ITERATE(CAuth_list::TNames::TStd, auth_it, auth_list.GetNames().GetStd()) {
3104             if ((*auth_it)->IsSetName()) {
3105                 string label;
3106                 (*auth_it)->GetName().GetLabel(&label);
3107                 name_list.push_back(label);
3108             }
3109         }
3110     } else if (auth_list.GetNames().IsMl()) {
3111         copy(BEGIN_COMMA_END(auth_list.GetNames().GetMl()),
3112             back_inserter(name_list));
3113     } else if (auth_list.GetNames().IsStr()) {
3114         copy(BEGIN_COMMA_END(auth_list.GetNames().GetStr()),
3115             back_inserter(name_list));
3116     }
3117 
3118     if (name_list.size() == 0) {
3119         return;
3120     } else if (name_list.size() == 1) {
3121         auth_str = name_list.back();
3122         return;
3123     }
3124 
3125     // join most of them by commas, but the last one gets an "and"
3126     string last_author;
3127     last_author.swap(name_list.back());
3128     name_list.pop_back();
3129     // swap is faster than assignment
3130     NStr::Join(name_list, ", ").swap(auth_str);
3131     auth_str += "and ";
3132     auth_str += last_author;
3133 
3134     return;
3135 }
3136 
3137 
s_GetAuthorsString(string * out_authors_string,const CPubdesc & pd)3138 void s_GetAuthorsString(
3139     string *out_authors_string, const CPubdesc& pd)
3140 {
3141     string & authors_string = *out_authors_string;
3142     authors_string.clear();
3143 
3144     FOR_EACH_PUB_ON_PUBDESC(pub, pd) {
3145         if ((*pub)->IsSetAuthors()) {
3146             s_GetAuthorsString(&authors_string, (*pub)->GetAuthors());
3147             break;
3148         }
3149     }
3150 }
3151 
3152 
GetPubdescLabels(const CPubdesc & pd,vector<TEntrezId> & pmids,vector<TEntrezId> & muids,vector<int> & serials,vector<string> & published_labels,vector<string> & unpublished_labels)3153 void CCleanup::GetPubdescLabels
3154 (const CPubdesc& pd,
3155 vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
3156 vector<string>& published_labels,
3157 vector<string>& unpublished_labels)
3158 {
3159     string label;
3160     bool   is_published = false;
3161     bool   need_label = false;
3162 
3163     if (!pd.IsSetPub()) {
3164         return;
3165     }
3166     ITERATE(CPubdesc::TPub::Tdata, it, pd.GetPub().Get()) {
3167         if ((*it)->IsPmid()) {
3168             pmids.push_back((*it)->GetPmid());
3169             is_published = true;
3170         } else if ((*it)->IsMuid()) {
3171             muids.push_back((*it)->GetMuid());
3172             is_published = true;
3173         } else if ((*it)->IsGen()) {
3174             if ((*it)->GetGen().IsSetCit()
3175                 && NStr::StartsWith((*it)->GetGen().GetCit(), "BackBone id_pub", NStr::eNocase)) {
3176                 need_label = true;
3177             }
3178             if ((*it)->GetGen().IsSetSerial_number()) {
3179                 serials.push_back((*it)->GetGen().GetSerial_number());
3180                 if ((*it)->GetGen().IsSetCit()
3181                     || (*it)->GetGen().IsSetJournal()
3182                     || (*it)->GetGen().IsSetDate()) {
3183                     need_label = true;
3184                 }
3185             } else {
3186                 need_label = true;
3187             }
3188         } else if ((*it)->IsArticle() && (*it)->GetArticle().IsSetIds()) {
3189             is_published = true;
3190             ITERATE(CArticleIdSet::Tdata, id, (*it)->GetArticle().GetIds().Get()) {
3191                 if ((*id)->IsPubmed()) {
3192                     pmids.push_back((*id)->GetPubmed());
3193                     is_published = true;
3194                 } else if ((*id)->IsMedline()) {
3195                     muids.push_back((*id)->GetMedline());
3196                 }
3197             }
3198             need_label = true;
3199         } else {
3200             need_label = true;
3201         }
3202         if (need_label && NStr::IsBlank(label)) {
3203             // create unique label
3204             (*it)->GetLabel(&label, CPub::eContent, true);
3205             string auth_str;
3206             s_GetAuthorsString(&auth_str, pd);
3207             label += "; ";
3208             label += auth_str;
3209         }
3210     }
3211     if (!NStr::IsBlank(label)) {
3212         if (is_published) {
3213             published_labels.push_back(label);
3214         } else {
3215             unpublished_labels.push_back(label);
3216         }
3217     }
3218 }
3219 
3220 
GetCitationList(CBioseq_Handle bsh)3221 vector<CConstRef<CPub> > CCleanup::GetCitationList(CBioseq_Handle bsh)
3222 {
3223     vector<CConstRef<CPub> > pub_list;
3224 
3225     // first get descriptor pubs
3226     CSeqdesc_CI di(bsh, CSeqdesc::e_Pub);
3227     while (di) {
3228         vector<TEntrezId> pmids;
3229         vector<TEntrezId> muids;
3230         vector<int> serials;
3231         vector<string> published_labels;
3232         vector<string> unpublished_labels;
3233         GetPubdescLabels(di->GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
3234         if (pmids.size() > 0) {
3235             CRef<CPub> pub(new CPub());
3236             pub->SetPmid().Set(pmids[0]);
3237             pub_list.push_back(pub);
3238         } else if (muids.size() > 0) {
3239             CRef<CPub> pub(new CPub());
3240             pub->SetMuid(muids[0]);
3241             pub_list.push_back(pub);
3242         } else if (serials.size() > 0) {
3243             CRef<CPub> pub(new CPub());
3244             pub->SetGen().SetSerial_number(serials[0]);
3245             pub_list.push_back(pub);
3246         } else if (published_labels.size() > 0) {
3247             CRef<CPub> pub(new CPub());
3248             pub->SetGen().SetCit(published_labels[0]);
3249             pub_list.push_back(pub);
3250         } else if (unpublished_labels.size() > 0) {
3251             CRef<CPub> pub(new CPub());
3252             pub->SetGen().SetCit(unpublished_labels[0]);
3253             pub_list.push_back(pub);
3254         }
3255 
3256         ++di;
3257     }
3258     // now get pub features
3259     CFeat_CI fi(bsh, SAnnotSelector(CSeqFeatData::e_Pub));
3260     while (fi) {
3261         vector<TEntrezId> pmids;
3262         vector<TEntrezId> muids;
3263         vector<int> serials;
3264         vector<string> published_labels;
3265         vector<string> unpublished_labels;
3266         GetPubdescLabels(fi->GetData().GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
3267         if (pmids.size() > 0) {
3268             CRef<CPub> pub(new CPub());
3269             pub->SetPmid().Set(pmids[0]);
3270             pub_list.push_back(pub);
3271         } else if (muids.size() > 0) {
3272             CRef<CPub> pub(new CPub());
3273             pub->SetMuid(muids[0]);
3274             pub_list.push_back(pub);
3275         } else if (serials.size() > 0) {
3276             CRef<CPub> pub(new CPub());
3277             pub->SetGen().SetSerial_number(serials[0]);
3278             pub_list.push_back(pub);
3279         } else if (published_labels.size() > 0) {
3280             CRef<CPub> pub(new CPub());
3281             pub->SetGen().SetCit(published_labels[0]);
3282             pub_list.push_back(pub);
3283         } else if (unpublished_labels.size() > 0) {
3284             CRef<CPub> pub(new CPub());
3285             pub->SetGen().SetCit(unpublished_labels[0]);
3286             pub_list.push_back(pub);
3287         }
3288 
3289         ++fi;
3290     }
3291     return pub_list;
3292 }
3293 
3294 
RemoveDuplicatePubs(CSeq_descr & descr)3295 bool CCleanup::RemoveDuplicatePubs(CSeq_descr& descr)
3296 {
3297     bool any_change = false;
3298     CSeq_descr::Tdata::iterator it1 = descr.Set().begin();
3299     while (it1 != descr.Set().end()) {
3300         if ((*it1)->IsPub()) {
3301             CSeq_descr::Tdata::iterator it2 = it1;
3302             ++it2;
3303             while (it2 != descr.Set().end()) {
3304                 if ((*it2)->IsPub() && (*it1)->GetPub().Equals((*it2)->GetPub())) {
3305                     it2 = descr.Set().erase(it2);
3306                     any_change = true;
3307                 } else {
3308                     ++it2;
3309                 }
3310             }
3311         }
3312         ++it1;
3313     }
3314     return any_change;
3315 }
3316 
3317 
s_FirstPubMatchesSecond(const CPubdesc & pd1,const CPubdesc & pd2)3318 bool s_FirstPubMatchesSecond(const CPubdesc& pd1, const CPubdesc& pd2)
3319 {
3320     if (pd1.Equals(pd2)) {
3321         return true;
3322     } else if (pd1.IsSetPub() && pd2.IsSetPub() && pd1.GetPub().Get().size() == 1) {
3323         ITERATE(CPubdesc::TPub::Tdata, it, pd2.GetPub().Get()) {
3324             if (pd1.GetPub().Get().front()->Equals(**it)) {
3325                 return true;
3326             }
3327         }
3328     }
3329     return false;
3330 }
3331 
3332 
PubAlreadyInSet(const CPubdesc & pd,const CSeq_descr & descr)3333 bool CCleanup::PubAlreadyInSet(const CPubdesc& pd, const CSeq_descr& descr)
3334 {
3335     ITERATE(CSeq_descr::Tdata, d, descr.Get()) {
3336         if ((*d)->IsPub() && s_FirstPubMatchesSecond(pd, (*d)->GetPub())) {
3337             return true;
3338         }
3339     }
3340     return false;
3341 }
3342 
3343 
OkToPromoteNpPub(const CBioseq & b)3344 bool CCleanup::OkToPromoteNpPub(const CBioseq& b)
3345 {
3346     bool is_embl_or_ddbj = false;
3347     ITERATE(CBioseq::TId, id, b.GetId()) {
3348         if ((*id)->IsEmbl() || (*id)->IsDdbj()) {
3349             is_embl_or_ddbj = true;
3350             break;
3351         }
3352     }
3353     return !is_embl_or_ddbj;
3354 }
3355 
3356 
OkToPromoteNpPub(const CPubdesc & pd)3357 bool CCleanup::OkToPromoteNpPub(const CPubdesc& pd)
3358 {
3359     if (pd.IsSetNum() || pd.IsSetName() || pd.IsSetFig() || pd.IsSetComment()) {
3360         return false;
3361     } else {
3362         return true;
3363     }
3364 }
3365 
3366 
MoveOneFeatToPubdesc(CSeq_feat_Handle feat,CRef<CSeqdesc> d,CBioseq_Handle b,bool remove_feat)3367 void CCleanup::MoveOneFeatToPubdesc(CSeq_feat_Handle feat, CRef<CSeqdesc> d, CBioseq_Handle b, bool remove_feat)
3368 {
3369     // add descriptor to nuc-prot parent or sequence itself
3370     CBioseq_set_Handle parent = b.GetParentBioseq_set();
3371     if (!CCleanup::OkToPromoteNpPub(*(b.GetCompleteBioseq()))) {
3372         // add to sequence
3373         CBioseq_EditHandle eh(b);
3374         eh.AddSeqdesc(*d);
3375         RemoveDuplicatePubs(eh.SetDescr());
3376         NormalizeDescriptorOrder(eh.SetDescr());
3377     } else if (parent && parent.IsSetClass() &&
3378         parent.GetClass() == CBioseq_set::eClass_nuc_prot &&
3379         parent.IsSetDescr() && PubAlreadyInSet(d->GetPub(), parent.GetDescr())) {
3380         // don't add descriptor, just delete feature
3381     } else if (OkToPromoteNpPub((d)->GetPub()) &&
3382         parent && parent.IsSetClass() &&
3383         parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
3384         CBioseq_set_EditHandle eh(parent);
3385         eh.AddSeqdesc(*d);
3386         RemoveDuplicatePubs(eh.SetDescr());
3387         NormalizeDescriptorOrder(eh.SetDescr());
3388     } else {
3389         CBioseq_EditHandle eh(b);
3390         eh.AddSeqdesc(*d);
3391         RemoveDuplicatePubs(eh.SetDescr());
3392         NormalizeDescriptorOrder(eh.SetDescr());
3393     }
3394     if (remove_feat) {
3395         // remove feature
3396         CSeq_feat_EditHandle feh(feat);
3397         feh.Remove();
3398     }
3399 }
3400 
3401 
ConvertPubFeatsToPubDescs(CSeq_entry_Handle seh)3402 bool CCleanup::ConvertPubFeatsToPubDescs(CSeq_entry_Handle seh)
3403 {
3404     bool any_change = false;
3405     for (CBioseq_CI b(seh); b; ++b) {
3406         for (CFeat_CI p(*b, CSeqFeatData::e_Pub); p; ++p) {
3407             if (p->GetLocation().IsInt() &&
3408                 p->GetLocation().GetStart(eExtreme_Biological) == 0 &&
3409                 p->GetLocation().GetStop(eExtreme_Biological) == b->GetBioseqLength() - 1) {
3410                 CRef<CSeqdesc> d(new CSeqdesc());
3411                 d->SetPub().Assign(p->GetData().GetPub());
3412                 if (p->IsSetComment()) {
3413                     if (d->GetPub().IsSetComment() && !NStr::IsBlank(d->GetPub().GetComment())) {
3414                         d->SetPub().SetComment(d->GetPub().GetComment() + "; " + p->GetComment());
3415                     } else {
3416                         d->SetPub().SetComment();
3417                     }
3418                 }
3419                 MoveOneFeatToPubdesc(*p, d, *b);
3420                 any_change = true;
3421             }
3422         }
3423     }
3424     return any_change;
3425 }
3426 
3427 
IsSiteRef(const CSeq_feat & sf)3428 bool IsSiteRef(const CSeq_feat& sf)
3429 {
3430     if (sf.GetData().IsImp() &&
3431         sf.GetData().GetImp().IsSetKey() &&
3432         NStr::Equal(sf.GetData().GetImp().GetKey(), "Site-ref")) {
3433         return true;
3434     } else {
3435         return false;
3436     }
3437 }
3438 
3439 
IsMinPub(const CPubdesc & pd,bool is_refseq_prot)3440 bool CCleanup::IsMinPub(const CPubdesc& pd, bool is_refseq_prot)
3441 {
3442     if (!pd.IsSetPub()) {
3443         return true;
3444     }
3445     bool found_non_minimal = false;
3446     ITERATE(CPubdesc::TPub::Tdata, it, pd.GetPub().Get()) {
3447         if ((*it)->IsMuid() || (*it)->IsPmid()) {
3448             if (is_refseq_prot) {
3449                 found_non_minimal = true;
3450                 break;
3451             }
3452         } else if ((*it)->IsGen()) {
3453             const CCit_gen& gen = (*it)->GetGen();
3454             if (gen.IsSetCit() && !gen.IsSetJournal() &&
3455                 !gen.IsSetAuthors() && !gen.IsSetVolume() &&
3456                 !gen.IsSetPages()) {
3457                 //minimalish, keep looking
3458             } else {
3459                 found_non_minimal = true;
3460             }
3461         } else {
3462             found_non_minimal = true;
3463             break;
3464         }
3465     }
3466 
3467     return !found_non_minimal;
3468 }
3469 
3470 
RescueSiteRefPubs(CSeq_entry_Handle seh)3471 bool CCleanup::RescueSiteRefPubs(CSeq_entry_Handle seh)
3472 {
3473     bool found_site_ref = false;
3474     CFeat_CI f(seh, CSeqFeatData::e_Imp);
3475     while (f && !found_site_ref) {
3476         if (IsSiteRef(*(f->GetSeq_feat()))) {
3477             found_site_ref = true;
3478         }
3479         ++f;
3480     }
3481     if (!found_site_ref) {
3482         return false;
3483     }
3484 
3485     bool any_change = false;
3486     for (CBioseq_CI b(seh); b; ++b) {
3487         bool is_refseq_prot = false;
3488         if (b->IsAa()) {
3489             ITERATE(CBioseq::TId, id_it, b->GetCompleteBioseq()->GetId()) {
3490                 if ((*id_it)->IsOther()) {
3491                     is_refseq_prot = true;
3492                     break;
3493                 }
3494             }
3495         }
3496 
3497         for (CFeat_CI p(*b); p; ++p) {
3498             if (!p->IsSetCit() || p->GetCit().Which() != CPub_set::e_Pub) {
3499                 continue;
3500             }
3501 
3502             bool is_site_ref = IsSiteRef(*(p->GetSeq_feat()));
3503             ITERATE(CSeq_feat::TCit::TPub, c, p->GetCit().GetPub()) {
3504                 CRef<CSeqdesc> d(new CSeqdesc());
3505                 if ((*c)->IsEquiv()) {
3506                     ITERATE(CPub_equiv::Tdata, t, (*c)->GetEquiv().Get()) {
3507                         CRef<CPub> pub_copy(new CPub());
3508                         pub_copy->Assign(**t);
3509                         d->SetPub().SetPub().Set().push_back(pub_copy);
3510                     }
3511 
3512                 } else {
3513                     CRef<CPub> pub_copy(new CPub());
3514                     pub_copy->Assign(**c);
3515                     d->SetPub().SetPub().Set().push_back(pub_copy);
3516                 }
3517                 if (is_site_ref) {
3518                     d->SetPub().SetReftype(CPubdesc::eReftype_sites);
3519                 } else {
3520                     d->SetPub().SetReftype(CPubdesc::eReftype_feats);
3521                 }
3522                 CRef<CCleanupChange> changes(makeCleanupChange(0));
3523                 CNewCleanup_imp pubclean(changes, 0);
3524                 pubclean.BasicCleanup(d->SetPub(), ShouldStripPubSerial(*(b->GetCompleteBioseq())));
3525                 if (!IsMinPub(d->SetPub(), is_refseq_prot)) {
3526                     MoveOneFeatToPubdesc(*p, d, *b, false);
3527                 }
3528             }
3529             if (is_site_ref) {
3530 
3531                 CSeq_feat_EditHandle feh(*p);
3532                 CSeq_annot_Handle annot = feh.GetAnnot();
3533 
3534                 feh.Remove();
3535 
3536                 // remove old annot if now empty
3537                 if (CNewCleanup_imp::ShouldRemoveAnnot(*(annot.GetCompleteSeq_annot()))) {
3538                     CSeq_annot_EditHandle annot_edit(annot);
3539                     annot_edit.Remove();
3540                 }
3541 
3542             }
3543             any_change = true;
3544         }
3545     }
3546     return any_change;
3547 }
3548 
3549 
AreBioSourcesMergeable(const CBioSource & src1,const CBioSource & src2)3550 bool CCleanup::AreBioSourcesMergeable(const CBioSource& src1, const CBioSource& src2)
3551 {
3552     if (src1.IsSetOrg() && src1.GetOrg().IsSetTaxname() &&
3553         src2.IsSetOrg() && src2.GetOrg().IsSetTaxname() &&
3554         NStr::Equal(src1.GetOrg().GetTaxname(), src2.GetOrg().GetTaxname())) {
3555         return true;
3556     } else {
3557         return false;
3558     }
3559 }
3560 
3561 
s_SubsourceCompareC(const CRef<CSubSource> & st1,const CRef<CSubSource> & st2)3562 static bool s_SubsourceCompareC (
3563     const CRef<CSubSource>& st1,
3564     const CRef<CSubSource>& st2
3565 )
3566 
3567 {
3568     const CSubSource& sbs1 = *(st1);
3569     const CSubSource& sbs2 = *(st2);
3570 
3571     TSUBSOURCE_SUBTYPE chs1 = GET_FIELD (sbs1, Subtype);
3572     TSUBSOURCE_SUBTYPE chs2 = GET_FIELD (sbs2, Subtype);
3573 
3574     if (chs1 < chs2) return true;
3575     if (chs1 > chs2) return false;
3576 
3577     if (FIELD_IS_SET (sbs2, Name)) {
3578         if (! FIELD_IS_SET (sbs1, Name)) return true;
3579         if (NStr::CompareNocase(GET_FIELD (sbs1, Name), GET_FIELD (sbs2, Name)) < 0) return true;
3580     }
3581 
3582     return false;
3583 }
3584 
s_SameSubtypeC(const CSubSource & s1,const CSubSource & s2)3585 static bool s_SameSubtypeC(const CSubSource& s1, const CSubSource& s2)
3586 {
3587     if (!s1.IsSetSubtype() && !s2.IsSetSubtype()) {
3588         return true;
3589     } else if (!s1.IsSetSubtype() || !s2.IsSetSubtype()) {
3590         return false;
3591     } else {
3592         return s1.GetSubtype() == s2.GetSubtype();
3593     }
3594 }
3595 
3596 // close enough if second name contains the first
s_NameCloseEnoughC(const CSubSource & s1,const CSubSource & s2)3597 static bool s_NameCloseEnoughC(const CSubSource& s1, const CSubSource& s2)
3598 {
3599     if (!s1.IsSetName() && !s2.IsSetName()) {
3600         return true;
3601     } else if (!s1.IsSetName() || !s2.IsSetName()) {
3602         return false;
3603     }
3604     const string& n1 = s1.GetName();
3605     const string& n2 = s2.GetName();
3606 
3607     if (NStr::Equal(n1, n2)) {
3608         return true;
3609     } else {
3610         return false;
3611     }
3612 }
3613 
3614 
s_SubSourceListUniqued(CBioSource & biosrc)3615 bool s_SubSourceListUniqued(CBioSource& biosrc)
3616 {
3617     bool res = false;
3618 
3619     // sort and remove duplicates.
3620     if (biosrc.IsSetSubtype() && biosrc.GetSubtype().size() > 1) {
3621         if (!SUBSOURCE_ON_BIOSOURCE_IS_SORTED(biosrc, s_SubsourceCompareC)) {
3622             SORT_SUBSOURCE_ON_BIOSOURCE(biosrc, s_SubsourceCompareC);
3623         }
3624 
3625         // remove duplicates and subsources that contain previous values
3626         CBioSource::TSubtype::iterator s = biosrc.SetSubtype().begin();
3627         CBioSource::TSubtype::iterator s_next = s;
3628         ++s_next;
3629         while (s_next != biosrc.SetSubtype().end()) {
3630             if (s_SameSubtypeC(**s, **s_next) && s_NameCloseEnoughC(**s, **s_next)) {
3631                 s = biosrc.SetSubtype().erase(s);
3632                 res = true;
3633             } else {
3634                 ++s;
3635             }
3636             ++s_next;
3637         }
3638     }
3639 
3640     return res;
3641 }
3642 
MergeDupBioSources(CBioSource & src1,const CBioSource & add)3643 bool CCleanup::MergeDupBioSources(CBioSource& src1, const CBioSource& add)
3644 {
3645     bool any_change = false;
3646     // genome
3647     if ((!src1.IsSetGenome() || src1.GetGenome() == CBioSource::eGenome_unknown) &&
3648         add.IsSetGenome() && add.GetGenome() != CBioSource::eGenome_unknown) {
3649         src1.SetGenome(add.GetGenome());
3650         any_change = true;
3651     }
3652     // origin
3653     if ((!src1.IsSetOrigin() || src1.GetOrigin() == CBioSource::eOrigin_unknown) &&
3654         add.IsSetOrigin() && add.GetOrigin() != CBioSource::eOrigin_unknown) {
3655         src1.SetOrigin(add.GetOrigin());
3656         any_change = true;
3657     }
3658     // focus
3659     if (!src1.IsSetIs_focus() && add.IsSetIs_focus()) {
3660         src1.SetIs_focus();
3661         any_change = true;
3662     }
3663 
3664     // merge subtypes
3665     if (add.IsSetSubtype()) {
3666         ITERATE(CBioSource::TSubtype, it, add.GetSubtype()) {
3667             CRef<CSubSource> a(new CSubSource());
3668             a->Assign(**it);
3669             src1.SetSubtype().push_back(a);
3670         }
3671         any_change = true;
3672     }
3673 
3674     x_MergeDupOrgRefs(src1.SetOrg(), add.GetOrg());
3675 
3676     if (s_SubSourceListUniqued(src1)) {
3677         any_change = true;
3678     }
3679 
3680     return any_change;
3681 }
3682 
3683 
x_MergeDupOrgNames(COrgName & on1,const COrgName & add)3684 bool CCleanup::x_MergeDupOrgNames(COrgName& on1, const COrgName& add)
3685 {
3686     bool any_change = false;
3687 
3688     // OrgMods
3689     if (add.IsSetMod()) {
3690         ITERATE(COrgName::TMod, it, add.GetMod()) {
3691             CRef<COrgMod> a(new COrgMod());
3692             a->Assign(**it);
3693             on1.SetMod().push_back(a);
3694         }
3695         any_change = true;
3696     }
3697 
3698     // gcode
3699     if ((!on1.IsSetGcode() || on1.GetGcode() == 0) && add.IsSetGcode() && add.GetGcode() != 0) {
3700         on1.SetGcode(add.GetGcode());
3701         any_change = true;
3702     }
3703 
3704     // mgcode
3705     if ((!on1.IsSetMgcode() || on1.GetMgcode() == 0) && add.IsSetMgcode() && add.GetMgcode() != 0) {
3706         on1.SetMgcode(add.GetMgcode());
3707         any_change = true;
3708     }
3709 
3710     // lineage
3711     if (!on1.IsSetLineage() && add.IsSetLineage()) {
3712         on1.SetLineage(add.GetLineage());
3713         any_change = true;
3714     }
3715 
3716     // div
3717     if (!on1.IsSetDiv() && add.IsSetDiv()) {
3718         on1.SetDiv(add.GetDiv());
3719         any_change = true;
3720     }
3721 
3722     return any_change;
3723 }
3724 
3725 
HasMod(const COrg_ref & org,const string & mod)3726 bool HasMod(const COrg_ref& org, const string& mod)
3727 {
3728     if (!org.IsSetMod()) {
3729         return false;
3730     }
3731     ITERATE(COrg_ref::TMod, it, org.GetMod()) {
3732         if (NStr::Equal(*it, mod)) {
3733             return true;
3734         }
3735     }
3736     return false;
3737 }
3738 
3739 
x_MergeDupOrgRefs(COrg_ref & org1,const COrg_ref & add)3740 bool CCleanup::x_MergeDupOrgRefs(COrg_ref& org1, const COrg_ref& add)
3741 {
3742     bool any_change = false;
3743     // mods
3744     if (add.IsSetMod()) {
3745         ITERATE(COrg_ref::TMod, it, add.GetMod()) {
3746             if (!HasMod(org1, *it)) {
3747                 org1.SetMod().push_back(*it);
3748                 any_change = true;
3749             }
3750         }
3751     }
3752 
3753     // dbxrefs
3754     if (add.IsSetDb()) {
3755         ITERATE(COrg_ref::TDb, it, add.GetDb()) {
3756             CRef<CDbtag> a(new CDbtag());
3757             a->Assign(**it);
3758             org1.SetDb().push_back(a);
3759         }
3760         any_change = true;
3761     }
3762 
3763     // synonyms
3764     if (add.IsSetSyn()) {
3765         ITERATE(COrg_ref::TSyn, it, add.GetSyn()) {
3766             org1.SetSyn().push_back(*it);
3767         }
3768         any_change = true;
3769     }
3770 
3771     if (add.IsSetOrgname()) {
3772         any_change |= x_MergeDupOrgNames(org1.SetOrgname(), add.GetOrgname());
3773     }
3774 
3775     return any_change;
3776 }
3777 
3778 
MergeDupBioSources(CSeq_descr & seq_descr)3779 bool CCleanup::MergeDupBioSources(CSeq_descr & seq_descr)
3780 {
3781     bool any_change = false;
3782     CSeq_descr::Tdata::iterator src1 = seq_descr.Set().begin();
3783     while (src1 != seq_descr.Set().end()) {
3784         if ((*src1)->IsSource() && (*src1)->GetSource().IsSetOrg() && (*src1)->GetSource().GetOrg().IsSetTaxname()) {
3785             CSeq_descr::Tdata::iterator src2 = src1;
3786             ++src2;
3787             while (src2 != seq_descr.Set().end()) {
3788                 if ((*src2)->IsSource() &&
3789                     AreBioSourcesMergeable((*src1)->GetSource(), (*src2)->GetSource())) {
3790                     MergeDupBioSources((*src1)->SetSource(), (*src2)->GetSource());
3791 
3792                     CRef<CCleanupChange> changes(makeCleanupChange(0));
3793                     CNewCleanup_imp srcclean(changes, 0);
3794                     srcclean.ExtendedCleanup((*src1)->SetSource());
3795                     src2 = seq_descr.Set().erase(src2);
3796                     any_change = true;
3797                 } else {
3798                     ++src2;
3799                 }
3800             }
3801         }
3802         ++src1;
3803     }
3804     return any_change;
3805 }
3806 
3807 /// Remove duplicate biosource descriptors
RemoveDupBioSource(CSeq_descr & descr)3808 bool CCleanup::RemoveDupBioSource(CSeq_descr& descr)
3809 {
3810     bool any_change = false;
3811     vector<CConstRef<CBioSource> > src_list;
3812     CSeq_descr::Tdata::iterator d = descr.Set().begin();
3813     while (d != descr.Set().end()) {
3814         if ((*d)->IsSource()) {
3815             bool found = false;
3816             ITERATE(vector<CConstRef<CBioSource> >, s, src_list) {
3817                 if ((*d)->GetSource().Equals(**s)) {
3818                     found = true;
3819                     break;
3820                 }
3821             }
3822             if (found) {
3823                 d = descr.Set().erase(d);
3824                 any_change = true;
3825             } else {
3826                 CConstRef<CBioSource> src(&((*d)->GetSource()));
3827                 src_list.push_back(src);
3828                 ++d;
3829             }
3830         } else {
3831             ++d;
3832         }
3833     }
3834     return any_change;
3835 }
3836 
3837 
BioSrcFromFeat(const CSeq_feat & f)3838 CRef<CBioSource> CCleanup::BioSrcFromFeat(const CSeq_feat& f)
3839 {
3840     if (!f.IsSetData() || !f.GetData().IsBiosrc()) {
3841         return CRef<CBioSource>(NULL);
3842     }
3843     CRef<CBioSource> src(new CBioSource());
3844     src->Assign(f.GetData().GetBiosrc());
3845 
3846     // move comment to subsource note
3847     if (f.IsSetComment()) {
3848         CRef<CSubSource> s(new CSubSource());
3849         s->SetSubtype(CSubSource::eSubtype_other);
3850         s->SetName(f.GetComment());
3851         src->SetSubtype().push_back(s);
3852 
3853     }
3854 
3855     // move dbxrefs on feature to source
3856     if (f.IsSetDbxref()) {
3857         ITERATE(CSeq_feat::TDbxref, it, f.GetDbxref()) {
3858             CRef<CDbtag> a(new CDbtag());
3859             a->Assign(**it);
3860             src->SetOrg().SetDb().push_back(a);
3861         }
3862     }
3863     CRef<CCleanupChange> changes(makeCleanupChange(0));
3864     CNewCleanup_imp srcclean(changes, 0);
3865     srcclean.ExtendedCleanup(*src);
3866 
3867     return src;
3868 }
3869 
3870 
ConvertSrcFeatsToSrcDescs(CSeq_entry_Handle seh)3871 bool CCleanup::ConvertSrcFeatsToSrcDescs(CSeq_entry_Handle seh)
3872 {
3873     bool any_change = false;
3874     for (CBioseq_CI b(seh); b; ++b) {
3875         bool transgenic_or_focus = false;
3876         CSeqdesc_CI existing_src(*b, CSeqdesc::e_Source);
3877         while (existing_src && !transgenic_or_focus) {
3878             if (existing_src->GetSource().IsSetIs_focus() ||
3879                 existing_src->GetSource().HasSubtype(CSubSource::eSubtype_transgenic)) {
3880                 transgenic_or_focus = true;
3881             }
3882             ++existing_src;
3883         }
3884         if (transgenic_or_focus) {
3885             continue;
3886         }
3887         for (CFeat_CI p(*b, CSeqFeatData::e_Biosrc); p; ++p) {
3888             if (p->GetLocation().IsInt() &&
3889                 p->GetLocation().GetStart(eExtreme_Biological) == 0 &&
3890                 p->GetLocation().GetStop(eExtreme_Biological) == b->GetBioseqLength() - 1) {
3891                 CRef<CSeqdesc> d(new CSeqdesc());
3892                 d->SetSource().Assign(*(BioSrcFromFeat(*(p->GetSeq_feat()))));
3893 
3894                 // add descriptor to nuc-prot parent or sequence itself
3895                 CBioseq_set_Handle parent = b->GetParentBioseq_set();
3896                 if (parent && parent.IsSetClass() &&
3897                     parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
3898                     CBioseq_set_EditHandle eh(parent);
3899                     eh.AddSeqdesc(*d);
3900                     MergeDupBioSources(eh.SetDescr());
3901                     RemoveDupBioSource(eh.SetDescr());
3902                     NormalizeDescriptorOrder(eh.SetDescr());
3903                 } else {
3904                     CBioseq_EditHandle eh(*b);
3905                     eh.AddSeqdesc(*d);
3906                     MergeDupBioSources(eh.SetDescr());
3907                     RemoveDupBioSource(eh.SetDescr());
3908                     NormalizeDescriptorOrder(eh.SetDescr());
3909                 }
3910 
3911                 // remove feature
3912                 CSeq_feat_EditHandle feh(*p);
3913                 feh.Remove();
3914 
3915                 any_change = true;
3916             }
3917         }
3918     }
3919     return any_change;
3920 }
3921 
3922 
3923 
FixGeneXrefSkew(CSeq_entry_Handle seh)3924 bool CCleanup::FixGeneXrefSkew(CSeq_entry_Handle seh)
3925 {
3926     CFeat_CI fi(seh);
3927     size_t num_gene_locus = 0;
3928     size_t num_gene_locus_tag = 0;
3929     size_t num_gene_xref_locus = 0;
3930     size_t num_gene_xref_locus_tag = 0;
3931 
3932     while (fi) {
3933         if (fi->GetData().IsGene()) {
3934             if (fi->GetData().GetGene().IsSetLocus()) {
3935                 num_gene_locus++;
3936             }
3937             if (fi->GetData().GetGene().IsSetLocus_tag()) {
3938                 num_gene_locus_tag++;
3939             }
3940         } else if (fi->IsSetXref()) {
3941             const CGene_ref* g = fi->GetGeneXref();
3942             if (g) {
3943                 if (g->IsSetLocus()) {
3944                     num_gene_xref_locus++;
3945                 }
3946                 if (g->IsSetLocus_tag()) {
3947                     num_gene_xref_locus_tag++;
3948                 }
3949             }
3950         }
3951         if (num_gene_locus > 0) {
3952             if (num_gene_locus_tag > 0) {
3953                 return false;
3954             }
3955             if (num_gene_xref_locus > 0) {
3956                 return false;
3957             }
3958         }
3959         if (num_gene_locus_tag > 0) {
3960             if (num_gene_locus > 0) {
3961                 return false;
3962             }
3963             if (num_gene_xref_locus_tag > 0) {
3964                 return false;
3965             }
3966         }
3967         ++fi;
3968     }
3969 
3970     bool any_change = false;
3971     if (num_gene_locus == 0 && num_gene_locus_tag > 0) {
3972         if (num_gene_xref_locus > 0 && num_gene_xref_locus_tag == 0) {
3973             fi.Rewind();
3974             while (fi) {
3975                 if (!fi->GetData().IsGene() && fi->GetGeneXref() != NULL) {
3976                     bool this_change = false;
3977                     CRef<CSeq_feat> new_f(new CSeq_feat());
3978                     new_f->Assign(*(fi->GetSeq_feat()));
3979                     NON_CONST_ITERATE(CSeq_feat::TXref, it, new_f->SetXref()) {
3980                         if ((*it)->IsSetData() && (*it)->GetData().IsGene()
3981                             && (*it)->GetData().GetGene().IsSetLocus()) {
3982                             (*it)->SetData().SetGene().SetLocus_tag((*it)->GetData().GetGene().GetLocus());
3983                             (*it)->SetData().SetGene().ResetLocus();
3984                             this_change = true;
3985                         }
3986                     }
3987                     if (this_change) {
3988                         CSeq_feat_EditHandle eh(*fi);
3989                         eh.Replace(*new_f);
3990                     }
3991                 }
3992                 ++fi;
3993             }
3994         }
3995     } else if (num_gene_locus > 0 && num_gene_locus_tag == 0) {
3996         if (num_gene_xref_locus == 0 && num_gene_xref_locus_tag > 0) {
3997             fi.Rewind();
3998             while (fi) {
3999                 if (!fi->GetData().IsGene() && fi->GetGeneXref() != NULL) {
4000                     bool this_change = false;
4001                     CRef<CSeq_feat> new_f(new CSeq_feat());
4002                     new_f->Assign(*(fi->GetSeq_feat()));
4003                     NON_CONST_ITERATE(CSeq_feat::TXref, it, new_f->SetXref()) {
4004                         if ((*it)->IsSetData() && (*it)->GetData().IsGene()
4005                             && (*it)->GetData().GetGene().IsSetLocus_tag()) {
4006                             (*it)->SetData().SetGene().SetLocus((*it)->GetData().GetGene().GetLocus_tag());
4007                             (*it)->SetData().SetGene().ResetLocus_tag();
4008                             this_change = true;
4009                         }
4010                     }
4011                     if (this_change) {
4012                         CSeq_feat_EditHandle eh(*fi);
4013                         eh.Replace(*new_f);
4014                         any_change = true;
4015                     }
4016                 }
4017                 ++fi;
4018             }
4019         }
4020     }
4021     return any_change;
4022 }
4023 
4024 
ShouldStripPubSerial(const CBioseq & bs)4025 bool CCleanup::ShouldStripPubSerial(const CBioseq& bs)
4026 {
4027     bool strip_serial = true;
4028     ITERATE(CBioseq::TId, id, bs.GetId()) {
4029         const CSeq_id& sid = **id;
4030         switch (sid.Which()) {
4031         case NCBI_SEQID(Genbank):
4032         case NCBI_SEQID(Tpg):
4033         {
4034             const CTextseq_id& tsid = *GET_FIELD(sid, Textseq_Id);
4035             if (FIELD_IS_SET(tsid, Accession)) {
4036                 const string& acc = GET_FIELD(tsid, Accession);
4037                 if (acc.length() == 6) {
4038                     strip_serial = false;
4039                 }
4040             }
4041         }
4042         break;
4043         case NCBI_SEQID(Embl):
4044         case NCBI_SEQID(Ddbj):
4045             strip_serial = false;
4046             break;
4047         case NCBI_SEQID(not_set):
4048         case NCBI_SEQID(Local):
4049         case NCBI_SEQID(Other):
4050         case NCBI_SEQID(General):
4051             break;
4052         case NCBI_SEQID(Gibbsq):
4053         case NCBI_SEQID(Gibbmt):
4054         case NCBI_SEQID(Pir):
4055         case NCBI_SEQID(Swissprot):
4056         case NCBI_SEQID(Patent):
4057         case NCBI_SEQID(Prf):
4058         case NCBI_SEQID(Pdb):
4059         case NCBI_SEQID(Gpipe):
4060         case NCBI_SEQID(Tpe):
4061         case NCBI_SEQID(Tpd):
4062             strip_serial = false;
4063             break;
4064         default:
4065             break;
4066         }
4067     }
4068     return strip_serial;
4069 }
4070 
4071 
RenormalizeNucProtSets(CSeq_entry_Handle seh)4072 bool CCleanup::RenormalizeNucProtSets(CSeq_entry_Handle seh)
4073 {
4074     bool change_made = false;
4075     CConstRef<CSeq_entry> entry = seh.GetCompleteSeq_entry();
4076     if (seh.IsSet() && seh.GetSet().IsSetClass() &&
4077         entry->GetSet().IsSetSeq_set()) {
4078         CBioseq_set::TClass set_class = seh.GetSet().GetClass();
4079         if (set_class == CBioseq_set::eClass_nuc_prot) {
4080             if (entry->GetSet().GetSeq_set().size() == 1 &&
4081                 entry->GetSet().GetSeq_set().front()->IsSeq()) {
4082                 CSeq_entry_EditHandle eh = seh.GetEditHandle();
4083                 eh.ConvertSetToSeq();
4084                 if (eh.GetSeq().IsSetDescr()) {
4085                     RemoveUnseenTitles(eh.SetSeq());
4086                     NormalizeDescriptorOrder(eh.SetSeq().SetDescr());
4087                 }
4088                 change_made = true;
4089             }
4090         } else if (set_class == CBioseq_set::eClass_genbank ||
4091             set_class == CBioseq_set::eClass_mut_set ||
4092             set_class == CBioseq_set::eClass_pop_set ||
4093             set_class == CBioseq_set::eClass_phy_set ||
4094             set_class == CBioseq_set::eClass_eco_set ||
4095             set_class == CBioseq_set::eClass_wgs_set ||
4096             set_class == CBioseq_set::eClass_gen_prod_set ||
4097             set_class == CBioseq_set::eClass_small_genome_set) {
4098             ITERATE(CBioseq_set::TSeq_set, s, entry->GetSet().GetSeq_set()) {
4099                 CSeq_entry_Handle ch = seh.GetScope().GetSeq_entryHandle(**s);
4100                 change_made |= RenormalizeNucProtSets(ch);
4101             }
4102         }
4103     }
4104     return change_made;
4105 }
4106 
4107 
DecodeXMLMarkChanged(std::string & str)4108 bool CCleanup::DecodeXMLMarkChanged(std::string & str)
4109 {
4110 // return false;
4111     bool change_made = false;
4112 
4113     // This is more complex than you might initially think is necessary
4114     // because this needs to be as efficient as possible since it's
4115     // called on every single string in an object.
4116 
4117     SIZE_TYPE amp = str.find('&');
4118     if( NPOS == amp ) {
4119         // Check for the common case of no replacements required
4120         return change_made;
4121     }
4122 
4123     // transformations done by this function:
4124     const static struct {
4125         string src_word;
4126         string result_word;
4127     } transformations[] = {
4128         // all start with an implicit ampersand
4129         // and end with an implicit semi-colon
4130         { "amp",      "&"      },
4131         { "apos",     "\'"     },
4132         { "gt",       ">"      },
4133         { "lt",       "<"      },
4134         { "quot",     "\""     },
4135         { "#13&#10",  ""       },
4136         { "#13;&#10", ""       },
4137         { "#916",     "Delta"  },
4138         { "#945",     "alpha"  },
4139         { "#946",     "beta"   },
4140         { "#947",     "gamma"  },
4141         { "#952",     "theta"  },
4142         { "#955",     "lambda" },
4143         { "#956",     "mu"     },
4144         { "#957",     "nu"     },
4145         { "#8201",    ""       },
4146         { "#8206",    ""       },
4147         { "#8242",    "'"      },
4148         { "#8594",    "->"     },
4149         { "#8722",    "-"      },
4150         { "#8710",    "delta"  },
4151         { "#64257",   "fi"     },
4152         { "#64258",   "fl"     },
4153         { "#65292",   ","      }
4154     };
4155 
4156     // Collisions should be rare enough that the CFastMutex is
4157     // faster than recreating the searcher each time this function is called
4158     static CTextFsm<int> searcher;
4159     // set searcher's state, if not already done
4160     {
4161         // just in case of the tiny chance that two threads try to prime
4162         // the searcher at the same time.
4163         static CFastMutex searcher_mtx;
4164         CFastMutexGuard searcher_mtx_guard( searcher_mtx );
4165         if( ! searcher.IsPrimed() ) {
4166             for( int idx = 0;
4167                 idx < sizeof(transformations)/sizeof(transformations[0]);
4168                 ++idx )
4169             {
4170                 // match type is index into transformations array
4171                 searcher.AddWord( transformations[idx].src_word, idx );
4172             }
4173             searcher.Prime();
4174         }
4175     }
4176 
4177     // a smart compiler probably won't need this manual optimization,
4178     // but just in case.
4179     const SIZE_TYPE str_len = str.length();
4180 
4181     // fill result up to the first '&'
4182     string result;
4183     result.reserve( str_len );
4184     copy( str.begin(), str.begin() + amp,
4185         back_inserter(result) );
4186 
4187     // at the start of each loop, the result is filled in
4188     // up to the ampersand (amp)
4189     while( amp != NPOS && amp < str_len ) {
4190 
4191         // find out what the ampersand code represents
4192         // (if it represents anything)
4193         int state = searcher.GetInitialState();
4194         SIZE_TYPE search_pos = (amp + 1);
4195         if (str[search_pos] == ' ') {
4196             break;
4197         }
4198         for( ; search_pos < str_len ; ++search_pos ) {
4199             const char ch = str[search_pos];
4200             if( ch == ';' ) {
4201                 break;
4202             }
4203             if( ch == '&' && state == 0 ) {
4204                 --search_pos; // so we don't skip over the '&'
4205                 state = searcher.GetInitialState(); // force "no-match"
4206                 break;
4207             }
4208             state = searcher.GetNextState(state, ch);
4209         }
4210 
4211         if( search_pos == str_len && searcher.IsMatchFound(state) ) {
4212             // copy the translation of the XML code:
4213             _ASSERT( searcher.GetMatches(state).size() == 1 );
4214             const int match_idx = searcher.GetMatches(state)[0];
4215             const string & result_word = transformations[match_idx].result_word;
4216             copy( result_word.begin(), result_word.end(),
4217                 back_inserter(result) );
4218             change_made = true;
4219             break;
4220         }
4221 
4222         if( search_pos >= str_len ) {
4223             // we reached the end without finding anything, so
4224             // copy the rest and break
4225             copy( str.begin() + amp, str.end(),
4226                 back_inserter(result) );
4227             break;
4228         }
4229 
4230         if( searcher.IsMatchFound(state) ) {
4231             // copy the translation of the XML code:
4232             _ASSERT( searcher.GetMatches(state).size() == 1 );
4233             const int match_idx = searcher.GetMatches(state)[0];
4234             const string & result_word = transformations[match_idx].result_word;
4235             copy( result_word.begin(), result_word.end(),
4236                 back_inserter(result) );
4237             change_made = true;
4238         } else {
4239             // no match found, so copy the text we looked at
4240             // as-is
4241             copy( str.begin() + amp, str.begin() + search_pos + 1,
4242                 back_inserter(result) );
4243         }
4244 
4245         // find next_amp
4246         if( str[search_pos] == '&' ) {
4247             // special case that occurs when there are multiple '&' together
4248             ++search_pos;
4249             result += '&';
4250         }
4251         SIZE_TYPE next_amp = str.find('&', search_pos );
4252         if( NPOS == next_amp ) {
4253             // no more amps; copy the rest and break
4254             copy( str.begin() + search_pos + 1, str.end(),
4255                 back_inserter(result) );
4256             break;
4257         }
4258 
4259         // copy up to the next amp
4260         if( (search_pos + 1) < next_amp ) {
4261             copy( str.begin() + search_pos + 1, str.begin() + next_amp,
4262                 back_inserter(result) );
4263         }
4264         amp = next_amp;
4265     }
4266 
4267     if (change_made) {
4268       str = result;
4269     }
4270 
4271     return change_made;
4272 }
4273 
4274 
GetProteinLocationFromNucleotideLocation(const CSeq_loc & nuc_loc,const CSeq_feat & cds,CScope & scope,bool require_inframe)4275 CRef<CSeq_loc> CCleanup::GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, const CSeq_feat& cds, CScope& scope, bool require_inframe)
4276 {
4277     if (require_inframe) {
4278         feature::ELocationInFrame is_in_frame = feature::IsLocationInFrame(scope.GetSeq_featHandle(cds), nuc_loc);
4279         bool is_ok = false;
4280         switch (is_in_frame) {
4281             case feature::eLocationInFrame_InFrame:
4282                 is_ok = true;
4283                 break;
4284             case feature::eLocationInFrame_BadStart:
4285                 if (cds.GetLocation().GetStart(eExtreme_Biological) == nuc_loc.GetStart(eExtreme_Biological)) {
4286                     is_ok = true;
4287                 }
4288                 break;
4289             case feature::eLocationInFrame_BadStop:
4290                 if (cds.GetLocation().GetStop(eExtreme_Biological) == nuc_loc.GetStop(eExtreme_Biological)) {
4291                     is_ok = true;
4292                 }
4293                 break;
4294             case feature::eLocationInFrame_BadStartAndStop:
4295                 if (cds.GetLocation().GetStart(eExtreme_Biological) == nuc_loc.GetStart(eExtreme_Biological) &&
4296                     cds.GetLocation().GetStop(eExtreme_Biological) == nuc_loc.GetStop(eExtreme_Biological)) {
4297                     is_ok = true;
4298                 }
4299                 break;
4300             case feature::eLocationInFrame_NotIn:
4301                 break;
4302         }
4303         if (!is_ok) {
4304             return CRef<CSeq_loc>(NULL);
4305         }
4306     }
4307     CRef<CSeq_loc> new_loc;
4308     CRef<CSeq_loc_Mapper> nuc2prot_mapper(
4309         new CSeq_loc_Mapper(cds, CSeq_loc_Mapper::eLocationToProduct, &scope));
4310     new_loc = nuc2prot_mapper->Map(nuc_loc);
4311     if (!new_loc) {
4312         return CRef<CSeq_loc>(NULL);
4313     }
4314 
4315     const CSeq_id* sid = new_loc->GetId();
4316     const CSeq_id* orig_id = nuc_loc.GetId();
4317     if (!sid || (orig_id && sid->Equals(*orig_id))) {
4318         // unable to map to protein location
4319         return CRef<CSeq_loc>(NULL);
4320     }
4321 
4322     new_loc->ResetStrand();
4323 
4324     // if location includes stop codon, remove it
4325     CBioseq_Handle prot = scope.GetBioseqHandle(*sid);
4326     if (prot && new_loc->GetStop(objects::eExtreme_Positional) >= prot.GetBioseqLength())
4327     {
4328         CRef<CSeq_id> sub_id(new CSeq_id());
4329         sub_id->Assign(*sid);
4330         CSeq_loc sub(*sub_id, prot.GetBioseqLength(), new_loc->GetStop(objects::eExtreme_Positional), new_loc->GetStrand());
4331         new_loc = sequence::Seq_loc_Subtract(*new_loc, sub, CSeq_loc::fMerge_All | CSeq_loc::fSort, &scope);
4332         if (nuc_loc.IsPartialStop(eExtreme_Biological)) {
4333             new_loc->SetPartialStop(true, eExtreme_Biological);
4334         }
4335     }
4336 
4337     if (!new_loc->IsInt() && !new_loc->IsPnt()) {
4338         CRef<CSeq_loc> tmp = sequence::Seq_loc_Merge(*new_loc, CSeq_loc::fMerge_All, &scope);
4339         new_loc = tmp;
4340     }
4341 
4342     // fix partials if protein feature starts or ends at beginning or end of protein sequence
4343     if (!cds.GetLocation().IsPartialStart(eExtreme_Biological) &&
4344         new_loc->GetStart(eExtreme_Biological) == 0) {
4345         if (new_loc->IsPartialStart(eExtreme_Biological)) {
4346             new_loc->SetPartialStart(false, eExtreme_Biological);
4347         }
4348     }
4349     if (!cds.GetLocation().IsPartialStop(eExtreme_Biological) &&
4350         new_loc->GetStop(eExtreme_Biological) == prot.GetBioseqLength() - 1) {
4351         if (new_loc->IsPartialStop(eExtreme_Biological)) {
4352             new_loc->SetPartialStop(false, eExtreme_Biological);
4353         }
4354     }
4355 
4356     return new_loc;
4357 }
4358 
4359 
GetProteinLocationFromNucleotideLocation(const CSeq_loc & nuc_loc,CScope & scope)4360 CRef<CSeq_loc> CCleanup::GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, CScope& scope)
4361 {
4362     CConstRef<CSeq_feat> cds = sequence::GetOverlappingCDS(nuc_loc, scope);
4363     if (!cds || !cds->IsSetProduct()) {
4364         // there is no overlapping coding region feature, so there is no appropriate
4365         // protein sequence to move to
4366         return CRef<CSeq_loc>(NULL);
4367     }
4368 
4369     return GetProteinLocationFromNucleotideLocation(nuc_loc, *cds, scope);
4370 }
4371 
4372 
4373 
RepackageProteins(const CSeq_feat & cds,CBioseq_set_Handle np)4374 bool CCleanup::RepackageProteins(const CSeq_feat& cds, CBioseq_set_Handle np)
4375 {
4376     if (!cds.IsSetProduct() || !cds.GetProduct().IsWhole()) {
4377         // no product, or product is specified weirdly
4378         return false;
4379     }
4380     CBioseq_Handle protein = np.GetTSE_Handle().GetBioseqHandle(cds.GetProduct().GetWhole());
4381     if (!protein) {
4382         // protein is not in the same TSE
4383         return false;
4384     }
4385     if (protein.GetParentBioseq_set() == np) {
4386         // already in the right set
4387         return false;
4388     }
4389     CBioseq_set_EditHandle eh(np);
4390     CSeq_entry_Handle ph = protein.GetSeq_entry_Handle();
4391     CSeq_entry_EditHandle peh(ph);
4392     eh.TakeEntry(peh);
4393     return true;
4394 }
4395 
4396 
RepackageProteins(CSeq_entry_Handle seh)4397 bool CCleanup::RepackageProteins(CSeq_entry_Handle seh)
4398 {
4399     bool changed = false;
4400     CSeq_entry_CI si(seh, CSeq_entry_CI::fRecursive | CSeq_entry_CI::fIncludeGivenEntry, CSeq_entry::e_Set);
4401     while (si) {
4402         CBioseq_set_Handle set = si->GetSet();
4403         if (set.IsSetClass() && set.GetClass() == CBioseq_set::eClass_nuc_prot && set.HasAnnots()) {
4404             ITERATE(CBioseq_set::TAnnot, annot_it, set.GetCompleteBioseq_set()->GetAnnot()) {
4405                 if ((*annot_it)->IsSetData() && (*annot_it)->IsFtable()) {
4406                     ITERATE(CSeq_annot::TData::TFtable, feat_it, (*annot_it)->GetData().GetFtable()) {
4407                         if ((*feat_it)->IsSetData() && (*feat_it)->GetData().IsCdregion()) {
4408                             changed |= RepackageProteins(**feat_it, set);
4409                         }
4410                     }
4411                 }
4412             }
4413         }
4414         ++si;
4415     }
4416     return changed;
4417 }
4418 
4419 
ConvertDeltaSeqToRaw(CSeq_entry_Handle seh,CSeq_inst::EMol filter)4420 bool CCleanup::ConvertDeltaSeqToRaw(CSeq_entry_Handle seh, CSeq_inst::EMol filter)
4421 {
4422     bool any_change = false;
4423     for (CBioseq_CI bi(seh, filter); bi; ++bi) {
4424         CBioseq_Handle bsh = *bi;
4425         CRef<CSeq_inst> inst(new CSeq_inst());
4426         inst->Assign(bsh.GetInst());
4427         if (inst->ConvertDeltaToRaw()) {
4428             CBioseq_EditHandle beh(bsh);
4429             beh.SetInst(*inst);
4430             any_change = true;
4431         }
4432     }
4433     return any_change;
4434 }
4435 
4436 
ParseCodeBreak(const CSeq_feat & feat,CCdregion & cds,const CTempString & str,CScope & scope,IObjtoolsListener * pMessageListener)4437 bool CCleanup::ParseCodeBreak(const CSeq_feat& feat,
4438         CCdregion& cds,
4439         const CTempString& str,
4440         CScope& scope,
4441         IObjtoolsListener* pMessageListener)
4442 {
4443     if (str.empty() || !feat.IsSetLocation()) {
4444         return false;
4445     }
4446 
4447     const CSeq_id* feat_loc_seq_id = feat.GetLocation().GetId();
4448     if (!feat_loc_seq_id) {
4449         return false;
4450     }
4451 
4452     string::size_type aa_pos = NStr::Find(str, "aa:");
4453     string::size_type len = 0;
4454     string::size_type loc_pos, end_pos;
4455     char protein_letter = 'X';
4456     CRef<CSeq_loc> break_loc;
4457 
4458     if (aa_pos == string::npos) {
4459         aa_pos = NStr::Find(str, ",");
4460         if (aa_pos != string::npos) {
4461             aa_pos = NStr::Find(str, ":", aa_pos);
4462         }
4463         if (aa_pos != string::npos) {
4464             aa_pos++;
4465         }
4466     } else {
4467         aa_pos += 3;
4468     }
4469 
4470     if (aa_pos != string::npos) {
4471         while (aa_pos < str.length() && isspace(str[aa_pos])) {
4472             aa_pos++;
4473         }
4474         while (aa_pos + len < str.length() && isalpha(str[aa_pos + len])) {
4475             len++;
4476         }
4477         if (len != 0) {
4478             protein_letter = ValidAminoAcid(str.substr(aa_pos, len));
4479         }
4480     }
4481 
4482     loc_pos = NStr::Find(str, "(pos:");
4483 
4484     using TSubcode = CCleanupMessage::ESubcode;
4485     auto postMessage =
4486         [pMessageListener](string msg, TSubcode subcode) {
4487             pMessageListener->PutMessage(
4488                     CCleanupMessage(msg, eDiag_Error, CCleanupMessage::ECode::eCodeBreak, subcode));
4489         };
4490 
4491     if (loc_pos == string::npos) {
4492         if (pMessageListener) {
4493             string msg = "Unable to identify code-break location in '" + str + "'";
4494             postMessage(msg, TSubcode::eParseError);
4495         }
4496         return false;
4497     }
4498     loc_pos += 5;
4499     while (loc_pos < str.length() && isspace(str[loc_pos])) {
4500         loc_pos++;
4501     }
4502 
4503     end_pos = NStr::Find(str, ",aa:", loc_pos);
4504     if (end_pos == NPOS) {
4505         end_pos = NStr::Find(str, ",", loc_pos);
4506         if (end_pos == NPOS) {
4507             end_pos = str.length();
4508         }
4509     }
4510 
4511     string pos = NStr::TruncateSpaces_Unsafe(str.substr(loc_pos, end_pos - loc_pos));
4512 
4513     // handle multi-interval positions by adding a join() around them
4514     if (pos.find_first_of(",") != string::npos) {
4515         pos = "join(" + pos + ")";
4516     }
4517 
4518     break_loc = ReadLocFromText(pos, feat_loc_seq_id, &scope);
4519 
4520     if (break_loc == NULL) {
4521         if (pMessageListener) {
4522             string msg = "Unable to extract code-break location from '" + str + "'";
4523             postMessage(msg, TSubcode::eParseError);
4524         }
4525         return false;
4526     }
4527 
4528     if (break_loc->IsInt() && sequence::GetLength(*break_loc, &scope) > 3) {
4529         if (pMessageListener) {
4530             string msg = "code-break location exceeds 3 bases";
4531             postMessage(msg, TSubcode::eBadLocation);
4532         }
4533         return false;
4534     }
4535     if ((break_loc->IsInt() || break_loc->IsPnt()) &&
4536          sequence::Compare(*break_loc, feat.GetLocation(), &scope, sequence::fCompareOverlapping) != sequence::eContained) {
4537         if (pMessageListener) {
4538             string msg = "code-break location lies outside of coding region";
4539             postMessage(msg, TSubcode::eBadLocation);
4540         }
4541         return false;
4542     }
4543 
4544     if (FIELD_IS_SET(feat.GetLocation(), Strand) && GET_FIELD(feat.GetLocation(), Strand) == eNa_strand_minus) {
4545         break_loc->SetStrand(GET_FIELD(feat.GetLocation(), Strand));
4546     } else {
4547         RESET_FIELD(*break_loc, Strand);
4548     }
4549 
4550     // need to build code break object and add it to coding region
4551     CRef<CCode_break> newCodeBreak(new CCode_break());
4552     CCode_break::TAa& aa = newCodeBreak->SetAa();
4553     aa.SetNcbieaa(protein_letter);
4554     newCodeBreak->SetLoc(*break_loc);
4555 
4556     CCdregion::TCode_break& orig_list = cds.SetCode_break();
4557     orig_list.push_back(newCodeBreak);
4558 
4559     return true;
4560 }
4561 
4562 
ParseCodeBreaks(CSeq_feat & feat,CScope & scope)4563 bool CCleanup::ParseCodeBreaks(CSeq_feat& feat, CScope& scope)
4564 {
4565     if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
4566         !feat.IsSetQual() || !feat.IsSetLocation()) {
4567         return false;
4568     }
4569 
4570     bool any_removed = false;
4571     CSeq_feat::TQual::iterator it = feat.SetQual().begin();
4572     while (it != feat.SetQual().end()) {
4573         if ((*it)->IsSetQual() &&
4574             NStr::EqualNocase((*it)->GetQual(), "transl_except") &&
4575             (*it)->IsSetVal() &&
4576             ParseCodeBreak(feat, feat.SetData().SetCdregion(), (*it)->GetVal(), scope)) {
4577             it = feat.SetQual().erase(it);
4578             any_removed = true;
4579         } else {
4580             ++it;
4581         }
4582     }
4583     if (feat.GetQual().size() == 0) {
4584         feat.ResetQual();
4585     }
4586     return any_removed;
4587 }
4588 
4589 
4590 // From SQD-4297
4591 // Influenza is a multi-segmented virus. We would like to create
4592 // small-genome sets when all segments of a particular viral strain
// are submitted together. This is made more difficult due to the fact
4594 // that submitters often have large submissions with multiple strains
4595 // at one time.
4596 // This function will segregate sequences with the same taxname
4597 // plus additional qualifiers into small-genome sets, if there are enough
4598 // sequences for that type of Influenza *AND* all CDS and gene features
4599 // on the sequences are complete.
4600 // * Influenza A virus: 8 or more nucleotide sequences with same strain and serotype
4601 // * Influenza B virus: 8 or more nucleotide sequences with same strain
4602 // * Influenza C virus: 7 or more nucleotide sequences with same strain
4603 // * Influenza D virus: 7 or more records with same strain
4604 // Note that as long as we are making strain-specific organism names,
4605 // the taxname must only start with the Influenza designation, not match it.
4606 // Can only make a set if at least one instance of each segment value is represented.
4607 class CInfluenzaSet : public CObject {
4608 public:
4609     CInfluenzaSet(const string& key);
~CInfluenzaSet()4610     ~CInfluenzaSet() {}
4611 
4612     static string GetKey(const COrg_ref& org);
4613     bool OkToMakeSet() const;
4614     void MakeSet();
4615 
4616     typedef enum {
4617         eNotInfluenza = 0,
4618         eInfluenzaA,
4619         eInfluenzaB,
4620         eInfluenzaC,
4621         eInfluenzaD
4622     } EInfluenzaType;
4623 
4624     static EInfluenzaType GetInfluenzaType(const string& taxname);
4625 
4626     void AddBioseq(CBioseq_Handle bsh);
4627 
4628 protected:
4629     typedef vector<CBioseq_Handle> TMembers;
4630     TMembers m_Members;
4631     const string m_Key;
4632     EInfluenzaType m_FluType;
4633     size_t m_Required;
4634 };
4635 
4636 
CInfluenzaSet(const string & key)4637 CInfluenzaSet::CInfluenzaSet(const string& key) : m_Key(key)
4638 {
4639     m_FluType = GetInfluenzaType(key);
4640     m_Required = 7;
4641     if (m_FluType == eInfluenzaA || m_FluType == eInfluenzaB) {
4642         m_Required = 8;
4643     }
4644 }
4645 
4646 
GetInfluenzaType(const string & taxname)4647 CInfluenzaSet::EInfluenzaType CInfluenzaSet::GetInfluenzaType(const string& taxname)
4648 {
4649     if (NStr::StartsWith(taxname, "Influenza A virus", NStr::eNocase)) {
4650         return eInfluenzaA;
4651     } else if (NStr::StartsWith(taxname, "Influenza B virus", NStr::eNocase)) {
4652         return eInfluenzaB;
4653     } else if (NStr::StartsWith(taxname, "Influenza C virus", NStr::eNocase)) {
4654         return eInfluenzaC;
4655     } else if (NStr::StartsWith(taxname, "Influenza D virus", NStr::eNocase)) {
4656         return eInfluenzaD;
4657     } else {
4658         return eNotInfluenza;
4659     }
4660 }
4661 
4662 
GetKey(const COrg_ref & org)4663 string CInfluenzaSet::GetKey(const COrg_ref& org)
4664 {
4665     if (!org.IsSetTaxname() || !org.IsSetOrgname() || !org.GetOrgname().IsSetMod()) {
4666         return kEmptyStr;
4667     }
4668     EInfluenzaType flu_type = GetInfluenzaType(org.GetTaxname());
4669     if (flu_type == eNotInfluenza) {
4670         return kEmptyStr;
4671     }
4672 
4673     CTempString strain = kEmptyStr;
4674     CTempString serotype = kEmptyStr;
4675 
4676     ITERATE(COrgName::TMod, it, org.GetOrgname().GetMod()) {
4677         if ((*it)->IsSetSubtype() && (*it)->IsSetSubname()) {
4678             if ((*it)->GetSubtype() == COrgMod::eSubtype_strain) {
4679                 strain = (*it)->GetSubname();
4680             } else if ((*it)->GetSubtype() == COrgMod::eSubtype_serotype &&
4681                 flu_type == eInfluenzaA) {
4682                 serotype = (*it)->GetSubname();
4683             }
4684         }
4685     }
4686     if(NStr::IsBlank(strain)) {
4687         return kEmptyStr;
4688     }
4689     if (flu_type == eInfluenzaA) {
4690         if (NStr::IsBlank(serotype)) {
4691             return kEmptyStr;
4692         } else {
4693             return org.GetTaxname() + ":" + strain + ":" + serotype;
4694         }
4695     } else {
4696         return org.GetTaxname() + ":" + strain;
4697     }
4698 }
4699 
4700 
AddBioseq(CBioseq_Handle bsh)4701 void CInfluenzaSet::AddBioseq(CBioseq_Handle bsh)
4702 {
4703     m_Members.push_back(bsh);
4704 }
4705 
4706 
OkToMakeSet() const4707 bool CInfluenzaSet::OkToMakeSet() const
4708 {
4709     if (m_Members.size() < m_Required) {
4710         return false;
4711     }
4712 
4713     bool ok = true;
4714     bool* seg_found = new bool[m_Required];
4715     for (size_t i = 0; i < m_Required; i++) {
4716         seg_found[i] = false;
4717     }
4718 
4719     ITERATE(TMembers, it, m_Members) {
4720         // check to make sure one of each segment is represented
4721         CSeqdesc_CI src(*it, CSeqdesc::e_Source);
4722         if (src->GetSource().IsSetSubtype()) {
4723             bool found_seg = false;
4724             ITERATE(CBioSource::TSubtype, s, src->GetSource().GetSubtype()) {
4725                 if ((*s)->IsSetSubtype() && (*s)->IsSetName() &&
4726                     (*s)->GetSubtype() == CSubSource::eSubtype_segment) {
4727                     try {
4728                         size_t seg = NStr::StringToSizet((*s)->GetName());
4729                         if (seg < 1 || seg > m_Required) {
4730                             ok = false;
4731                             break;
4732                         }
4733                         seg_found[seg - 1] = true;
4734                         found_seg = true;
4735                     } catch (CException&) {
4736                         ok = false;
4737                         break;
4738                     }
4739                 }
4740             }
4741             if (!found_seg) {
4742                 ok = false;
4743             }
4744         } else {
4745             ok = false;
4746         }
4747         if (!ok) {
4748             break;
4749         }
4750 
4751         // make sure all coding regions and genes are complete
4752         SAnnotSelector sel;
4753         sel.IncludeFeatType(CSeqFeatData::e_Cdregion);
4754         sel.IncludeFeatType(CSeqFeatData::e_Gene);
4755         CFeat_CI f(*it, sel);
4756         while (f) {
4757             if (f->GetLocation().IsPartialStart(eExtreme_Biological) ||
4758                 f->GetLocation().IsPartialStop(eExtreme_Biological)) {
4759                 ok = false;
4760                 break;
4761             }
4762             ++f;
4763         }
4764         if (!ok) break;
4765     }
4766 
4767     if (ok) {
4768         for (size_t i = 0; i < m_Required; i++) {
4769             if (!seg_found[i]) {
4770                 ok = false;
4771                 break;
4772             }
4773         }
4774     }
4775     delete[] seg_found;
4776 
4777     return ok;
4778 }
4779 
4780 
MakeSet()4781 void CInfluenzaSet::MakeSet()
4782 {
4783     if (m_Members.size() == 0) {
4784         return;
4785     }
4786     CBioseq_set_Handle parent = m_Members[0].GetParentBioseq_set();
4787     if (!parent) {
4788         return;
4789     }
4790     if (parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
4791         parent = parent.GetParentBioseq_set();
4792     }
4793     if (!parent) {
4794         return;
4795     }
4796     CSeq_entry_Handle peh = parent.GetParentEntry();
4797     CSeq_entry_EditHandle peeh(peh);
4798     CBioseq_set_EditHandle parent_edit(parent);
4799     CRef<CSeq_entry> ns(new CSeq_entry());
4800     ns->SetSet().SetClass(CBioseq_set::eClass_small_genome_set);
4801     CSeq_entry_EditHandle new_set = parent_edit.AttachEntry(*ns, -1);
4802     ITERATE(TMembers, it, m_Members) {
4803         CBioseq_set_Handle np = it->GetParentBioseq_set();
4804         if (np && np.IsSetClass() && np.GetClass() == CBioseq_set::eClass_nuc_prot) {
4805             CSeq_entry_Handle nps = np.GetParentEntry();
4806             CSeq_entry_EditHandle npse(nps);
4807             npse.Remove();
4808             new_set.AttachEntry(npse);
4809         } else {
4810             CSeq_entry_Handle s = it->GetParentEntry();
4811             CSeq_entry_EditHandle se(s);
4812             se.Remove();
4813             new_set.AttachEntry(se);
4814         }
4815     }
4816 }
4817 
4818 
4819 typedef map<string, CRef<CInfluenzaSet> > TInfluenzaSetMap;
4820 
MakeSmallGenomeSet(CSeq_entry_Handle entry)4821 size_t CCleanup::MakeSmallGenomeSet(CSeq_entry_Handle entry)
4822 {
4823     TInfluenzaSetMap flu_map;
4824 
4825     CBioseq_CI bi(entry, CSeq_inst::eMol_na);
4826     while (bi) {
4827         CSeqdesc_CI src(*bi, CSeqdesc::e_Source);
4828         if (src && src->GetSource().IsSetOrg()) {
4829             string key = CInfluenzaSet::GetKey(src->GetSource().GetOrg());
4830             if (!NStr::IsBlank(key)) {
4831                 // add to set
4832                 TInfluenzaSetMap::iterator it = flu_map.find(key);
4833                 if (it == flu_map.end()) {
4834                     CRef<CInfluenzaSet> new_set(new CInfluenzaSet(key));
4835                     new_set->AddBioseq(*bi);
4836                     flu_map[key] = new_set;
4837                 } else {
4838                     it->second->AddBioseq(*bi);
4839                 }
4840             }
4841         }
4842         ++bi;
4843     }
4844     // now create sets
4845     size_t added = 0;
4846     NON_CONST_ITERATE(TInfluenzaSetMap, it, flu_map) {
4847         if (it->second->OkToMakeSet()) {
4848             it->second->MakeSet();
4849             added++;
4850         }
4851     }
4852 
4853     return added;
4854 }
4855 
4856 
AddIRDMiscFeature(CBioseq_Handle bh,const CDbtag & tag)4857 void AddIRDMiscFeature(CBioseq_Handle bh, const CDbtag& tag)
4858 {
4859     CSeq_annot_Handle ftable;
4860 
4861     CSeq_annot_CI annot_ci(bh);
4862     for (; annot_ci; ++annot_ci) {
4863         if ((*annot_ci).IsFtable()) {
4864             ftable = *annot_ci;
4865             break;
4866         }
4867     }
4868 
4869     if (!ftable) {
4870         CBioseq_EditHandle beh = bh.GetEditHandle();
4871         CRef<CSeq_annot> new_annot(new CSeq_annot());
4872         ftable = beh.AttachAnnot(*new_annot);
4873     }
4874 
4875     CSeq_annot_EditHandle aeh(ftable);
4876 
4877     CRef<CSeq_feat> f(new CSeq_feat());
4878     f->SetData().SetImp().SetKey("misc_feature");
4879     f->SetLocation().SetInt().SetFrom(0);
4880     f->SetLocation().SetInt().SetTo(bh.GetBioseqLength() - 1);
4881     f->SetLocation().SetInt().SetId().Assign(*(bh.GetSeqId()));
4882     CRef<CDbtag> xref(new CDbtag());
4883     xref->Assign(tag);
4884     f->SetDbxref().push_back(xref);
4885     CRef<CSeqFeatXref> suppress(new CSeqFeatXref());
4886     suppress->SetData().SetGene();
4887     f->SetXref().push_back(suppress);
4888     aeh.AddFeat(*f);
4889 }
4890 
4891 
MakeIRDFeatsFromSourceXrefs(CSeq_entry_Handle entry)4892 bool CCleanup::MakeIRDFeatsFromSourceXrefs(CSeq_entry_Handle entry)
4893 {
4894     bool any = false;
4895     CBioseq_CI bi(entry, CSeq_inst::eMol_na);
4896     while (bi) {
4897         CSeqdesc_CI src(*bi, CSeqdesc::e_Source);
4898         while (src) {
4899             if (src->GetSource().IsSetOrg() && src->GetSource().GetOrg().IsSetDb()) {
4900                 CRef<COrg_ref> org(const_cast<COrg_ref *>(&(src->GetSource().GetOrg())));
4901                 COrg_ref::TDb::iterator db = org->SetDb().begin();
4902                 while (db != org->SetDb().end()) {
4903                     if ((*db)->IsSetDb() && NStr::Equal((*db)->GetDb(), "IRD")) {
4904                         AddIRDMiscFeature(*bi, **db);
4905                         db = org->SetDb().erase(db);
4906                         any = true;
4907                     } else {
4908                         ++db;
4909                     }
4910                 }
4911                 if (org->GetDb().size() == 0) {
4912                     org->ResetDb();
4913                 }
4914             }
4915             ++src;
4916         }
4917         ++bi;
4918     }
4919     return any;
4920 }
4921 
4922 //LCOV_EXCL_START
4923 //not used by asn_cleanup but used by other applications
4924 const unsigned int methionine_encoded = 'M' - 'A';
4925 
IsMethionine(const CCode_break & cb)4926 bool CCleanup::IsMethionine(const CCode_break& cb)
4927 {
4928     if (!cb.IsSetAa()) {
4929         return false;
4930     }
4931     bool rval = false;
4932     switch (cb.GetAa().Which()) {
4933         case CCode_break::TAa::e_Ncbi8aa:
4934             if (cb.GetAa().GetNcbi8aa() == methionine_encoded) {
4935                 rval = true;
4936             }
4937             break;
4938         case CCode_break::TAa::e_Ncbieaa:
4939             if (cb.GetAa().GetNcbieaa() == 'M') {
4940                 rval = true;
4941             }
4942             break;
4943         case CCode_break::TAa::e_Ncbistdaa:
4944             if (cb.GetAa().GetNcbistdaa() == methionine_encoded) {
4945                 rval = true;
4946             }
4947             break;
4948         default:
4949             break;
4950     }
4951     return rval;
4952 }
4953 //LCOV_EXCL_STOP
4954 
4955 
4956 //LCOV_EXCL_START
4957 //not used by asn_cleanup but used by other applications
GetCodeBreakForLocation(size_t pos,const CSeq_feat & cds)4958 CConstRef<CCode_break> CCleanup::GetCodeBreakForLocation(size_t pos, const CSeq_feat& cds)
4959 {
4960     if (!cds.IsSetData() || !cds.GetData().IsCdregion() ||
4961         !cds.IsSetLocation() ||
4962         !cds.GetData().GetCdregion().IsSetCode_break()) {
4963         return CConstRef<CCode_break>(NULL);
4964     }
4965 
4966     TSeqPos frame = 0;
4967     if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetFrame())
4968     {
4969         switch(cds.GetData().GetCdregion().GetFrame())
4970         {
4971         case CCdregion::eFrame_not_set :
4972         case CCdregion::eFrame_one : frame = 0; break;
4973         case CCdregion::eFrame_two : frame = 1; break;
4974         case  CCdregion::eFrame_three : frame = 2; break;
4975         default : frame = 0; break;
4976         }
4977     }
4978 
4979     for (auto cb : cds.GetData().GetCdregion().GetCode_break()) {
4980         if (cb->IsSetLoc()) {
4981             TSeqPos offset = sequence::LocationOffset(cds.GetLocation(),
4982                             cb->GetLoc());
4983             if (offset >= frame &&
4984                 ((offset - frame) / 3 ) + 1 == pos) {
4985                 return cb;
4986             }
4987         }
4988     }
4989     return CConstRef<CCode_break>(NULL);
4990 }
4991 //LCOV_EXCL_STOP
4992 
4993 //LCOV_EXCL_START
4994 //appears not to be used
SetCodeBreakLocation(CCode_break & cb,size_t pos,const CSeq_feat & cds)4995 void CCleanup::SetCodeBreakLocation(CCode_break& cb, size_t pos, const CSeq_feat& cds)
4996 {
4997     int start = static_cast<int>((pos-1)*3);
4998     //start -= 1;
4999     //start *= 3;
5000     int frame = 0;
5001     if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetFrame())
5002     {
5003         switch(cds.GetData().GetCdregion().GetFrame())
5004         {
5005         case CCdregion::eFrame_not_set :
5006         case CCdregion::eFrame_one : frame = 0; break;
5007         case CCdregion::eFrame_two : frame = 1; break;
5008         case  CCdregion::eFrame_three : frame = 2; break;
5009         default : frame = 0; break;
5010         }
5011     }
5012     int frame_shift = (start - frame) % 3;
5013     if (frame_shift < 0) {
5014         frame_shift += 3;
5015     }
5016     if (frame_shift == 1)
5017     start += 2;
5018     else if (frame_shift == 2)
5019     start += 1;
5020 
5021     int offset = 0;
5022     CRef<CSeq_loc> packed (new CSeq_loc());
5023     for (CSeq_loc_CI loc_iter(cds.GetLocation());  loc_iter;  ++loc_iter) {
5024         int len = loc_iter.GetRange().GetLength();
5025         if (offset <= start && offset + len > start) {
5026             CRef<CSeq_interval> tmp(new CSeq_interval());
5027             tmp->SetId().Assign(loc_iter.GetSeq_id());
5028             if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
5029                 tmp->SetStrand(eNa_strand_minus);
5030                 tmp->SetTo(loc_iter.GetRange().GetTo() - (start - offset) );
5031             } else {
5032                 tmp->SetFrom(loc_iter.GetRange().GetFrom() + start - offset);
5033             }
5034             if (offset <= start + 2 && offset + len > start + 2) {
5035                 if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
5036                     tmp->SetFrom(loc_iter.GetRange().GetTo() - (start - offset + 2) );
5037                 } else {
5038                     tmp->SetTo(loc_iter.GetRange().GetFrom() + start - offset + 2);
5039                 }
5040             } else {
5041                 if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
5042                     tmp->SetFrom(loc_iter.GetRange().GetFrom());
5043                 } else {
5044                     tmp->SetTo(loc_iter.GetRange().GetTo());
5045                 }
5046             }
5047             packed->SetPacked_int().Set().push_back(tmp);
5048         } else if (offset > start && offset <= start + 2) {
5049             // add new interval
5050             CRef<CSeq_interval> tmp (new CSeq_interval());
5051             tmp->SetId().Assign(loc_iter.GetSeq_id());
5052             if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
5053                 tmp->SetStrand(eNa_strand_minus);
5054                 tmp->SetTo(loc_iter.GetRange().GetTo());
5055                 if (offset + len >= start + 2) {
5056                     tmp->SetFrom(loc_iter.GetRange().GetTo() - (start - offset + 2) );
5057                 } else {
5058                     tmp->SetFrom(loc_iter.GetRange().GetFrom());
5059                 }
5060             } else {
5061                 tmp->SetFrom(loc_iter.GetRange().GetFrom());
5062                 if (offset + len >= start + 2) {
5063                     tmp->SetTo(loc_iter.GetRange().GetFrom() + start - offset + 2);
5064                 } else {
5065                     tmp->SetTo(loc_iter.GetRange().GetTo());
5066                 }
5067             }
5068 
5069             packed->SetPacked_int().Set().push_back(tmp);
5070         }
5071         offset += len;
5072     }
5073     if (packed->Which() != CSeq_loc::e_Packed_int || packed->GetPacked_int().Get().size() == 0) {
5074         cb.ResetLoc();
5075     }
5076     if (packed->GetPacked_int().Get().size() == 1) {
5077         cb.SetLoc().SetInt().Assign(*(packed->GetPacked_int().Get().front()));
5078     } else {
5079         cb.SetLoc(*packed);
5080     }
5081 }
5082 //LCOV_EXCL_STOP
5083 
5084 
5085 //LCOV_EXCL_START
5086 //not used by asn_cleanup but used by other applications
FixRNAEditingCodingRegion(CSeq_feat & cds)5087 bool CCleanup::FixRNAEditingCodingRegion(CSeq_feat& cds)
5088 {
5089     if (!cds.IsSetData() || !cds.GetData().IsCdregion()) {
5090         return false;
5091     }
5092     if (!cds.IsSetLocation() ||
5093         cds.GetLocation().IsPartialStart(eExtreme_Biological)) {
5094         return false;
5095     }
5096     CConstRef<CCode_break> cbstart = GetCodeBreakForLocation(1, cds);
5097     if (cbstart && !CCleanup::IsMethionine(*cbstart)) {
5098         // already have a start translation exception AND it is not methionine
5099         return false;
5100     }
5101 
5102     bool any_change = false;
5103     if (!cds.IsSetExcept_text() || NStr::IsBlank(cds.GetExcept_text())) {
5104         cds.SetExcept_text("RNA editing");
5105         any_change = true;
5106     } else if (NStr::Find(cds.GetExcept_text(), "RNA editing") == string::npos) {
5107         cds.SetExcept_text(cds.GetExcept_text() + "; RNA editing");
5108         any_change = true;
5109     }
5110     if (!cds.IsSetExcept() || !cds.GetExcept()) {
5111         cds.SetExcept(true);
5112         any_change = true;
5113     }
5114     return any_change;
5115 }
5116 //LCOV_EXCL_STOP
5117 
5118 
5119 //LCOV_EXCL_START
5120 //not used by asn_cleanup but used by other applications
CleanupCollectionDates(CSeq_entry_Handle seh,bool month_first)5121 bool CCleanup::CleanupCollectionDates(CSeq_entry_Handle seh, bool month_first)
5122 {
5123     bool any_changes = false;
5124 
5125     vector<CRef<COrg_ref> > rq_list;
5126     vector<const CSeqdesc* > src_descs;
5127     vector<CConstRef<CSeq_feat> > src_feats;
5128 
5129     GetSourceDescriptors(*(seh.GetCompleteSeq_entry()), src_descs);
5130     vector<const CSeqdesc* >::iterator desc_it = src_descs.begin();
5131     while (desc_it != src_descs.end()) {
5132         if ((*desc_it)->GetSource().IsSetSubtype()) {
5133             CSeqdesc* desc = const_cast<CSeqdesc*>(*desc_it);
5134             for (auto s : desc->SetSource().SetSubtype()) {
5135                 if (s->IsSetSubtype() && s->GetSubtype() == CSubSource::eSubtype_collection_date
5136                     && s->IsSetName()) {
5137                     bool month_ambiguous = false;
5138                     string new_date = CSubSource::FixDateFormat(s->GetName(), month_first, month_ambiguous);
5139                     if (!NStr::Equal(new_date, s->GetName())) {
5140                         s->SetName(new_date);
5141                         any_changes = true;
5142                     }
5143                 }
5144             }
5145         }
5146         ++desc_it;
5147     }
5148 
5149     CFeat_CI feat(seh, SAnnotSelector(CSeqFeatData::e_Biosrc));
5150     while (feat) {
5151         if (feat->GetData().GetBiosrc().IsSetSubtype()) {
5152             CRef<CSeq_feat> new_feat(new CSeq_feat());
5153             new_feat->Assign(*(feat->GetOriginalSeq_feat()));
5154             bool local_change = false;
5155             for (auto s : new_feat->SetData().SetBiosrc().SetSubtype()) {
5156                 if (s->IsSetSubtype() && s->GetSubtype() == CSubSource::eSubtype_collection_date
5157                     && s->IsSetName()) {
5158                     bool month_ambiguous = false;
5159                     string new_date = CSubSource::FixDateFormat(s->GetName(), month_first, month_ambiguous);
5160                     if (!NStr::Equal(new_date, s->GetName())) {
5161                         s->SetName(new_date);
5162                         local_change = true;
5163                     }
5164                 }
5165             }
5166             if (local_change) {
5167                 any_changes = true;
5168                 CSeq_feat_EditHandle efh(*feat);
5169                 efh.Replace(*new_feat);
5170             }
5171             ++feat;
5172         }
5173     }
5174 
5175     return any_changes;
5176 }
5177 //LCOV_EXCL_STOP
5178 
5179 
AutodefId(CSeq_entry_Handle seh)5180 void CCleanup::AutodefId(CSeq_entry_Handle seh)
5181 {
5182     // remove existing options (TODO)
5183     for (CBioseq_CI b(seh); b; ++b) {
5184         bool removed = true;
5185         while (removed) {
5186             removed = false;
5187             CSeqdesc_CI ud(*b, CSeqdesc::e_User);
5188             while (ud) {
5189                 if (ud->GetUser().IsAutodefOptions()) {
5190                     CSeq_entry_Handle s = ud.GetSeq_entry_Handle();
5191                     CSeq_entry_EditHandle se = s.GetEditHandle();
5192                     se.RemoveSeqdesc(*ud);
5193                     removed = true;
5194                     break;
5195                 }
5196                 ++ud;
5197             }
5198         }
5199     }
5200 
5201     // create new options
5202     CRef<CUser_object> auto_user = CAutoDef::CreateIDOptions(seh);
5203     CRef<CSeqdesc> d(new CSeqdesc());
5204     d->SetUser().Assign(*auto_user);
5205     CSeq_entry_EditHandle eh = seh.GetEditHandle();
5206     eh.AddSeqdesc(*d);
5207 
5208     CAutoDef::RegenerateSequenceDefLines(seh);
5209 }
5210 
5211 END_SCOPE(objects)
5212 END_NCBI_SCOPE
5213