1 #ifndef CLEANUP___CLEANUP__HPP
2 #define CLEANUP___CLEANUP__HPP
3 
4 /*  $Id: cleanup.hpp 632626 2021-06-03 17:38:42Z ivanov $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Robert Smith, Michael Kornbluh
30  *
31  * File Description:
32  *   Basic Cleanup of CSeq_entries.
33  *   .......
34  *
35  */
36 #include <objmgr/scope.hpp>
37 #include <objtools/cleanup/cleanup_change.hpp>
38 #include <objects/seq/MolInfo.hpp>
39 #include <objects/pub/Pub.hpp>
40 #include <objects/biblio/Auth_list.hpp>
41 #include <objects/seqfeat/Cdregion.hpp>
42 
43 
44 BEGIN_NCBI_SCOPE
45 BEGIN_SCOPE(objects)
46 
47 class CSeq_entry;  // Forward declarations: data-object classes that appear in the CCleanup signatures below
48 class CBioseq;
49 class CBioseq_set;
50 class CSeq_annot;
51 class CSeq_feat;
52 class CSeq_submit;
53 class COrgName;
54 class CSubmit_block;
55 class CAuthor;
56 class CAuth_list;
57 class CName_std;
58 // Handle classes (forward-declared; used as parameter types by the CCleanup methods below)
59 class CSeq_entry_Handle;
60 class CBioseq_Handle;
61 class CBioseq_set_Handle;
62 class CSeq_annot_Handle;
63 class CSeq_feat_Handle;
64 
65 class CCleanupChange;     // wrapped in CConstRef<> as the return type of BasicCleanup/ExtendedCleanup
66 class IObjtoolsListener;  // accepted as an optional pointer by CCleanup::ParseCodeBreak
67 
68 class NCBI_CLEANUP_EXPORT CCleanup : public CObject
69 {
70 public:
71     /// Option bit flags (each enumerator is a distinct power of two); presumably OR-ed into the Uint4 `options` arguments below -- confirm.
72     enum EValidOptions {
73         eClean_NoReporting       = 0x1,
74         eClean_GpipeMode         = 0x2,
75         eClean_NoNcbiUserObjects = 0x4,
76         eClean_SyncGenCodes      = 0x8,
77         eClean_NoProteinTitles   = 0x10,
78         eClean_KeepTopSet        = 0x20
79     };
80     /// Value type of the constructor's `scope_handling` argument; the exact effect of each value is defined in the implementation, not in this header.
81     enum EScopeOptions {
82         eScope_Copy,
83         eScope_UseInPlace
84     };
85 
86     // Constructor / Destructor
87     CCleanup(CScope* scope = NULL, EScopeOptions scope_handling = eScope_Copy);
88     ~CCleanup();
89 
90     void SetScope(CScope* scope);
91 
92     // BASIC CLEANUP
93     /// Cleanup a Seq-entry.
94     CConstRef<CCleanupChange> BasicCleanup(CSeq_entry& se,  Uint4 options = 0);
95     /// Cleanup a Seq-submit.
96     CConstRef<CCleanupChange> BasicCleanup(CSeq_submit& ss,  Uint4 options = 0);
97     /// Cleanup a Bioseq.
98     CConstRef<CCleanupChange> BasicCleanup(CBioseq& bs,     Uint4 ooptions = 0); // NOTE(review): `ooptions` looks like a typo for `options` (the siblings all use `options`)
99     /// Cleanup a Bioseq_set.
100     CConstRef<CCleanupChange> BasicCleanup(CBioseq_set& bss, Uint4 options = 0);
101     /// Cleanup a Seq-Annot.
102     CConstRef<CCleanupChange> BasicCleanup(CSeq_annot& sa,  Uint4 options = 0);
103     /// Cleanup a Seq-feat.
104     CConstRef<CCleanupChange> BasicCleanup(CSeq_feat& sf,   Uint4 options = 0);
105     /// Cleanup a BioSource.
106     CConstRef<CCleanupChange> BasicCleanup(CBioSource& src,   Uint4 options = 0);
107     /// Cleanup a Submit-block.
108     CConstRef<CCleanupChange> BasicCleanup(CSubmit_block& block, Uint4 options = 0);
109     /// Cleanup descriptors (a single Seqdesc, or a whole Seq-descr).
110     CConstRef<CCleanupChange> BasicCleanup(CSeqdesc& desc, Uint4 options = 0);
111     CConstRef<CCleanupChange> BasicCleanup(CSeq_descr & desc, Uint4 options = 0);
112 
113     // Handle versions of BasicCleanup.
114     CConstRef<CCleanupChange> BasicCleanup(CSeq_entry_Handle& seh, Uint4 options = 0);
115     CConstRef<CCleanupChange> BasicCleanup(CBioseq_Handle& bsh,    Uint4 options = 0);
116     CConstRef<CCleanupChange> BasicCleanup(CBioseq_set_Handle& bssh, Uint4 options = 0);
117     CConstRef<CCleanupChange> BasicCleanup(CSeq_annot_Handle& sak, Uint4 options = 0);
118     CConstRef<CCleanupChange> BasicCleanup(CSeq_feat_Handle& sfh,  Uint4 options = 0);
119 
120     // EXTENDED CLEANUP
121     /// Cleanup a Seq-entry.
122     CConstRef<CCleanupChange> ExtendedCleanup(CSeq_entry& se,  Uint4 options = 0);
123     /// Cleanup a Seq-submit.
124     CConstRef<CCleanupChange> ExtendedCleanup(CSeq_submit& ss, Uint4 options = 0);
125     /// Cleanup a Seq-Annot.
126     CConstRef<CCleanupChange> ExtendedCleanup(CSeq_annot& sa,  Uint4 options = 0);
127 
128     // Handle version of ExtendedCleanup (note: this overload is a static member function)
129     static CConstRef<CCleanupChange> ExtendedCleanup(CSeq_entry_Handle& seh, Uint4 options = 0);
130 
131     // Useful cleanup functions (all static)
132 
133     static bool ShouldStripPubSerial(const CBioseq& bs);
134 
135 
136 /// Moves protein-specific features from nucleotide sequences in the Seq-entry to
137 /// the appropriate protein sequence.
138 /// @param seh Seq-entry Handle to edit [in]
139 /// @return Boolean return value indicates whether any changes were made
140     static bool MoveProteinSpecificFeats(CSeq_entry_Handle seh);
141 
142     /// Moves one feature from nucleotide bioseq to
143     /// the appropriate protein sequence.
144     /// @param fh Feature to edit
145     /// @return Boolean return value indicates whether any changes were made
146     static bool MoveFeatToProtein(CSeq_feat_Handle fh);
147 
148 /// Calculates whether a Gene-xref is unnecessary (because it refers to the
149 /// same gene as would be calculated using overlap)
150 /// @param sf Seq-feat with the xref [in]
151 /// @param scope Scope in which to search for location [in]
152 /// @param gene_xref Gene-ref of gene-xref [in]
153 /// @return Boolean return value indicates whether gene-xref is unnecessary
154     static bool IsGeneXrefUnnecessary(const CSeq_feat& sf, CScope& scope, const CGene_ref& gene_xref);
155 
156 /// Removes unnecessary Gene-xrefs
157 /// @param f Seq-feat to edit [in]
158 /// @param scope Scope in which to search for locations [in]
159 /// @return Boolean return value indicates whether gene-xrefs were removed
160     static bool RemoveUnnecessaryGeneXrefs(CSeq_feat& f, CScope& scope);
161 
162 /// Removes unnecessary Gene-xrefs on features in Seq-entry
163 /// @param seh Seq-entry-Handle to edit [in]
164 /// @return Boolean return value indicates whether gene-xrefs were removed
165     static bool RemoveUnnecessaryGeneXrefs(CSeq_entry_Handle seh);
166 
167 /// Removes non-suppressing Gene-xrefs
168 /// @param f Seq-feat to edit [in]
169 /// @return Boolean return value indicates whether gene-xrefs were removed
170     static bool RemoveNonsuppressingGeneXrefs(CSeq_feat& f);
171 
172 
173 /// Repairs non-reciprocal xref pairs for specified feature if xrefs between
174 /// subtypes are permitted and feature with missing xref does not have an
175 /// xref to a different feature of the same subtype
176 /// @param f Seq-feat to edit [in]
177 /// @param tse top-level Seq-entry in which to search for the other half of the xref pair
178 /// @return Boolean return value indicates whether xrefs were created
179     static bool RepairXrefs(const CSeq_feat& f, const CTSE_Handle& tse);
180 
181 /// Repairs non-reciprocal xref pairs for the specified feature pair, if xrefs between subtypes are
182 /// permitted and the feature with the missing xref does not have an xref to a different feature of the same subtype
183 /// @param src first Seq-feat of the pair [in]; NOTE(review): presumably the feature that already holds the xref -- confirm
184 /// @param dst handle to the second Seq-feat of the pair; NOTE(review): presumably the feature that receives the missing xref -- confirm
185 /// @param tse top-level Seq-entry in which to search for the other half of the xref pair
186 /// @return Boolean return value indicates whether xrefs were created
187     static bool RepairXrefs(const CSeq_feat& src, CSeq_feat_Handle& dst, const CTSE_Handle& tse);
188 
189 /// Repairs non-reciprocal xref pairs in specified seq-entry
190 /// @param seh Seq-entry to edit [in]
191 /// @return Boolean return value indicates whether xrefs were created
192     static bool RepairXrefs(CSeq_entry_Handle seh);
193 
194 /// Detects gene features with matching locus
195 /// @param f Seq-feat parent feature of gene_xref [in]
196 /// @param gene_xref Gene-ref of gene-xref [in]
197 /// @param bsh CBioseq_Handle parent bioseq in which to search for genes [in]
198 /// @return Boolean return value indicates whether a gene feature with matching locus has been found
199     static bool FindMatchingLocusGene(CSeq_feat& f, const CGene_ref& gene_xref, CBioseq_Handle bsh);
200 
201 /// Removes orphaned locus Gene-xrefs
202 /// @param f Seq-feat to edit [in]
203 /// @param bsh CBioseq_Handle in which to search for gene features [in]
204 /// @return Boolean return value indicates whether gene-xrefs were removed
205     static bool RemoveOrphanLocusGeneXrefs(CSeq_feat& f, CBioseq_Handle bsh);
206 
207 /// Detects gene features with matching locus_tag
208 /// @param f Seq-feat parent feature of gene_xref [in]
209 /// @param gene_xref Gene-ref of gene-xref [in]
210 /// @param bsh CBioseq_Handle parent bioseq in which to search for genes [in]
211 /// @return Boolean return value indicates whether a gene feature with matching locus_tag has been found
212     static bool FindMatchingLocus_tagGene(CSeq_feat& f, const CGene_ref& gene_xref, CBioseq_Handle bsh);
213 
214 /// Removes orphaned locus_tag Gene-xrefs
215 /// @param f Seq-feat to edit [in]
216 /// @param bsh CBioseq_Handle in which to search for gene features [in]
217 /// @return Boolean return value indicates whether gene-xrefs were removed
218     static bool RemoveOrphanLocus_tagGeneXrefs(CSeq_feat& f, CBioseq_Handle bsh);
219 
220 /// Extends a location to the specified position.
221 /// @param loc Seq-loc to extend
222 /// @param pos position of new end of location
223 /// @param scope Scope in which to look for sequences
224 /// @return Boolean return value indicates whether the location was extended
225     static bool SeqLocExtend(CSeq_loc& loc, size_t pos, CScope& scope);
226 
227 /// Extends a coding region up to 50 nt. if the coding region:
228 /// 1. does not end with a stop codon
229 /// 2. is adjacent to a stop codon
230 /// 3. is not pseudo
231 /// @param f Seq-feat to edit
232 /// @param bsh CBioseq_Handle on which the feature is located
233 /// @param check_for_stop defaults to true; NOTE(review): presumably toggles the stop-codon check of condition 1 -- confirm in the implementation
234 /// @return Boolean return value indicates whether the feature was extended
235     static bool ExtendToStopIfShortAndNotPartial(CSeq_feat& f, CBioseq_Handle bsh, bool check_for_stop = true);
236 
237 /// Checks whether it is possible to extend the original location up to improved one. It is possible only if
238 /// the original location is less than improved
239 /// @param orig Seq-loc to check
240 /// @param improved Seq-loc original location may be extended to
241 /// @return Boolean return value indicates whether the extension is possible
242     static bool LocationMayBeExtendedToMatch(const CSeq_loc& orig, const CSeq_loc& improved);
243 
244 /// Extends a feature up to limit nt to a stop codon, or to the end of the sequence
245 /// if limit == 0 (partial will be set if location extends to end of sequence but
246 /// no stop codon is found)
247 /// @param f Seq-feat to edit
248 /// @param bsh CBioseq_Handle on which the feature is located
249 /// @param limit maximum number of nt to extend, or 0 if unlimited
250 /// @return Boolean return value indicates whether the feature was extended
251     static bool ExtendToStopCodon(CSeq_feat& f, CBioseq_Handle bsh, size_t limit);
252     static bool ExtendStopPosition(CSeq_feat& f, const CSeq_feat* cdregion, size_t extension = 0);
253 
254 /// Translates coding region and selects best frame (without stops, or longest)
255 /// @param cds Coding region Seq-feat to edit
256 /// @param scope Scope in which to find coding region
257 /// @return Boolean return value indicates whether the coding region was changed
258     static bool SetBestFrame(CSeq_feat& cds, CScope& scope);
259 
260 /// Chooses best frame based on location
261 /// 1.  If the location is 5' complete, then the frame must be one.
262 /// 2.  If the location is 5' partial and 3' complete, select a frame using the
263 ///      value of the location length modulo 3.
264 /// @param cdregion Coding Region in which to set frame
265 /// @param loc      Location to use for setting frame
266 /// @param scope    Scope in which to find location sequence(s)
267 /// @return Boolean return value indicates whether the frame was changed
268     static bool SetFrameFromLoc(CCdregion &cdregion, const CSeq_loc& loc, CScope& scope);
269     static bool SetFrameFromLoc(CCdregion::EFrame &frame, const CSeq_loc& loc, CScope& scope);
270 
271 /// 1. Set the partial flags when the CDS is partial and codon_start is 2 or 3
272 /// 2. Make the CDS partial at the 5' end if there is no start codon
273 /// 3. Make the CDS partial at the 3' end if there is no stop codon
274 /// @param cds Coding region Seq-feat to edit
275 /// @param scope Scope in which to find coding region and coding region's protein
276 ///        product sequence
277 /// @return Boolean return value indicates whether the coding region changed
278     static bool SetCDSPartialsByFrameAndTranslation(CSeq_feat& cds, CScope& scope);
279 
280 
281 /// Clear internal partials
282     static bool ClearInternalPartials(CSeq_loc& loc, bool is_first = true, bool is_last = true);
283     static bool ClearInternalPartials(CSeq_loc_mix& mix, bool is_first = true, bool is_last = true);
284     static bool ClearInternalPartials(CPacked_seqint& pint, bool is_first = true, bool is_last = true);
285     static bool ClearInternalPartials(CSeq_entry_Handle seh);
286 
287 /// Set feature partial based on feature location
288     static bool SetFeaturePartial(CSeq_feat& f);
289 
290 /// Update EC numbers
291 /// @param ec_num_list Prot-ref ec number list to clean
292 /// @return Boolean value indicates whether any changes were made
293     static bool UpdateECNumbers(CProt_ref::TEc & ec_num_list);
294 
295 /// Delete EC numbers
296 /// @param ec_num_list Prot-ref ec number list to clean
297 /// @return Boolean value indicates whether any changes were made
298     static bool RemoveBadECNumbers(CProt_ref::TEc & ec_num_list);
299 
300 /// Fix EC numbers
301 /// @param entry Seq-entry-handle to clean
302 /// @return Boolean value indicates whether any changes were made
303     static bool FixECNumbers(CSeq_entry_Handle entry);
304 
305 /// Set partialness of gene to match longest feature contained in gene
306 /// @param gene  Seq-feat to edit
307 /// @param scope Scope in which to find gene
308 /// @return Boolean return value indicates whether the gene changed
309     static bool SetGenePartialByLongestContainedFeature(CSeq_feat& gene, CScope& scope);
310 
311     static void SetProteinName(CProt_ref& prot, const string& protein_name, bool append);
312     static void SetProteinName(CSeq_feat& cds, const string& protein_name, bool append, CScope& scope);
313     static void SetMrnaName(CSeq_feat& mrna, const string& protein_name);
314     static const string& GetProteinName(const CProt_ref& prot);
315     static const string& GetProteinName(const CSeq_feat& cds, CScope& scope);
316 
317 /// Sets MolInfo::tech for a sequence
318 /// @param seq Bioseq to edit
319 /// @param tech tech value to set
320 /// @return Boolean return value indicates whether tech was changed
321     static bool SetMolinfoTech(CBioseq_Handle seq, CMolInfo::ETech tech);
322 
323 /// Sets MolInfo::biomol for a sequence
324 /// @param seq Bioseq to edit
325 /// @param biomol biomol value to set
326 /// @return Boolean return value indicates whether biomol was changed
327     static bool SetMolinfoBiomol(CBioseq_Handle seq, CMolInfo::EBiomol biomol);
328 
329 /// Adds missing MolInfo descriptor to sequence
330 /// @param seq Bioseq to edit
331 /// @param is_product NOTE(review): meaning is not stated in this header -- presumably whether seq is a feature product; confirm in the implementation
332 /// @return Boolean return value indicates whether descriptor was added
333     static bool AddMissingMolInfo(CBioseq& seq, bool is_product);
334 
335 /// Creates missing protein title descriptor
336 /// @param bsh Bioseq-Handle to edit
337 /// @return Boolean return value indicates whether title was added
338     static bool AddProteinTitle(CBioseq_Handle bsh);
339 
340 /// Removes NcbiCleanup User Objects in the Seq-entry
341 /// @param seq_entry Seq-entry to edit
342 /// @return Boolean return value indicates whether object was removed
343     static bool RemoveNcbiCleanupObject(CSeq_entry &seq_entry);
344 
345 /// Looks up Org-refs in the Seq-entry
346 /// @param seh Seq-entry to edit
347 /// @return Boolean return value indicates whether object was updated
348     static bool TaxonomyLookup(CSeq_entry_Handle seh);
349 
350 
351 /// Sets genetic codes for coding regions on Bioseq-Handle
352 /// @param bsh Bioseq-Handle to examine
353 /// @return Boolean indicates whether any coding regions were updated
354     static bool SetGeneticCodes(CBioseq_Handle bsh);
355 
356 /// Adjusts protein title to reflect partialness
357 /// @param bioseq Bioseq to adjust
358 /// @return Boolean indicates whether title was updated
359     static bool AddPartialToProteinTitle(CBioseq &bioseq);
360 
361 /// Removes protein product from pseudo coding region
362 /// @param cds Seq-feat to adjust
363 /// @param scope Scope in which to find protein sequence and remove it
364 /// @return Boolean indicates whether anything changed
365     static bool RemovePseudoProduct(CSeq_feat& cds, CScope& scope);
366 
367     static CRef<CSeq_entry> AddProtein(const CSeq_feat& cds, CScope& scope);
368 
369 /// Expands gene to include features it cross-references
370 /// @param gene Seq-feat to adjust
371 /// @param tse Top-level Seq-entry in which to find other features
372 /// @return Boolean indicates whether anything changed
373     static bool ExpandGeneToIncludeChildren(CSeq_feat& gene, CTSE_Handle& tse);
374 
375 /// Performs WGS specific cleanup
376 /// @param entry Seq-entry to edit
377 /// @return Boolean return value indicates whether object was updated
378     static bool WGSCleanup(CSeq_entry_Handle entry, bool instantiate_missing_proteins = true, Uint4 options = 0,
379         bool run_extended_cleanup = true);
380 
381 /// For table2asn -c s
382 /// Adds an exception of "low-quality sequence region" to coding regions
383 /// and mRNAs that are not pseudo and have an intron <11bp in length
384 /// @param entry Seq-entry to edit
385 /// @return Boolean return value indicates whether object was updated
386     static bool AddLowQualityException(CSeq_entry_Handle entry);
387 
388 /// Normalize Descriptor Order on a specific Seq-descr
389 /// @param descr Seq-descr to edit
390 /// @return Boolean return value indicates whether object was updated
391     static bool NormalizeDescriptorOrder(CSeq_descr& descr);
392 
393 /// Normalize Descriptor Order on a specific Seq-entry
394 /// @param seh Seq-entry-Handle to edit
395 /// @return Boolean return value indicates whether object was updated
396     static bool NormalizeDescriptorOrder(CSeq_entry_Handle seh);
397 
398 /// Remove all titles in Seqdescr except the last, because it is the
399 /// only one that would be displayed in the flatfile
400 /// @param seq Bioseq-Handle to edit
401 /// @return Boolean return value indicates whether any titles were removed
402     static bool RemoveUnseenTitles(CSeq_entry_EditHandle::TSeq seq);
403 
404 /// Remove all titles in Seqdescr except the last, because it is the
405 /// only one that would be displayed in the flatfile
406 /// @param set Bioseq-set-Handle to edit
407 /// @return Boolean return value indicates whether any titles were removed
408     static bool RemoveUnseenTitles(CSeq_entry_EditHandle::TSet set);
409 
410 /// Add GenBank Wrapper Set
411 /// @param seh Seq-entry-Handle to edit
412 /// @return Boolean return value indicates whether object changed
413     static bool AddGenBankWrapper(CSeq_entry_Handle seh);
414 
415 
416 /// For Publication Citations
417 /// Get labels for a pubdesc. To be used in citations.
418     static void GetPubdescLabels
419         (const CPubdesc& pd,
420         vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
421         vector<string>& published_labels, vector<string>& unpublished_labels);
422 
423 /// Get list of pubs that can be used for citations for Seq-feat on a Bioseq-handle
424 /// @param bsh Bioseq-handle to search
425 /// @return vector<CConstRef<CPub> > ordered list of pubs
426 /// Note that Seq-feat.cit appear in the flatfile using the position
427 /// in the list
428     static vector<CConstRef<CPub> > GetCitationList(CBioseq_Handle bsh);
429 
430 /// Remove duplicate publications
431     static bool RemoveDuplicatePubs(CSeq_descr& descr);
432 
433     /// Some pubs should not be promoted to nuc-prot set from sequence
434     static bool OkToPromoteNpPub(const CPubdesc& pd);
435 
436     /// For some sequences, pubs should not be promoted to nuc-prot set from sequence
437     static bool OkToPromoteNpPub(const CBioseq& b);
438 
439     static bool PubAlreadyInSet(const CPubdesc& pd, const CSeq_descr& descr);
440 
441 /// Convert full-length publication features to publication descriptors.
442 /// @param seh Seq-entry to edit
443 /// @return bool indicates whether any changes were made
444     static bool ConvertPubFeatsToPubDescs(CSeq_entry_Handle seh);
445 
446 /// Rescue pubs from Site-ref features
447 /// @param seh Seq-entry to edit
448 /// @return bool indicates whether any changes were made
449     static bool RescueSiteRefPubs(CSeq_entry_Handle seh);
450 
451 /// Is this a "minimal" pub? (If yes, do not rescue from a Seq-feat.cit)
452     static bool IsMinPub(const CPubdesc& pd, bool is_refseq_prot);
453 
454     /// Helper function for moving a feature to a pubdesc descriptor
455     static void MoveOneFeatToPubdesc(CSeq_feat_Handle feat, CRef<CSeqdesc> d, CBioseq_Handle b, bool remove_feat = true);
456 
457 /// Remove duplicate biosource descriptors
458     static bool RemoveDupBioSource(CSeq_descr& descr);
459 
460 /// Get BioSource from feature to use for source descriptor
461     static CRef<CBioSource> BioSrcFromFeat(const CSeq_feat& f);
462 
463     static bool AreBioSourcesMergeable(const CBioSource& src1, const CBioSource& src2);
464     static bool MergeDupBioSources(CSeq_descr& descr);
465     static bool MergeDupBioSources(CBioSource& src1, const CBioSource& add);
466 
467 
468 /// Convert full-length source features to source descriptors
469 /// @param seh Seq-entry to edit
470 /// @return bool indicates whether any changes were made
471     static bool ConvertSrcFeatsToSrcDescs(CSeq_entry_Handle seh);
472 
473 /// Examine all genes and gene xrefs in the Seq-entry.
474 /// If no genes have locus and some have locus tag AND no gene xrefs have locus-tag
475 /// and some gene xrefs have locus, change all gene xrefs to use locus tag.
476 /// If no genes have locus tag and some have locus AND no gene xrefs have locus
477 /// and some gene xrefs have locus tag, change all gene xrefs to use locus.
478 /// @param seh Seq-entry to edit
479 /// @return bool indicates whether any changes were made
480     static bool FixGeneXrefSkew(CSeq_entry_Handle seh);
481 
482 /// Convert nuc-prot sets with just one sequence to just the sequence
483 /// can't be done during the explore phase because it changes a seq to a set
484 /// @param seh Seq-entry to edit
485 /// @return bool indicates whether any changes were made
486     static bool RenormalizeNucProtSets(CSeq_entry_Handle seh);
487 
488 /// decodes various tags, including carriage-return-line-feed constructs
489     static bool DecodeXMLMarkChanged(std::string & str);
490 
491     static CRef<CSeq_loc> GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, CScope& scope);
492     static CRef<CSeq_loc> GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, const CSeq_feat& cds, CScope& scope, bool require_inframe = false);
493 
494 /// Find proteins that are not packaged in the same nuc-prot set as the
495 /// coding region for which they are a product, and move them to that
496 /// nuc-prot set. Ignore coding regions that are in gen-prod-sets.
497 /// @param seh Seq-entry to edit
498 /// @return bool indicates whether any changes were made
499     static bool RepackageProteins(CSeq_entry_Handle seh);
500     static bool RepackageProteins(const CSeq_feat& cds, CBioseq_set_Handle np);
501 
502     static bool ConvertDeltaSeqToRaw(CSeq_entry_Handle seh, CSeq_inst::EMol filter = CSeq_inst::eMol_not_set);
503 /// Parse string into code break and add to coding region.
504 /// @param feat feature that contains coding region - necessary to determine codon boundaries
505 /// @param cds  coding region to which code breaks will be added
506 /// @param str  string from which to parse code break
507 /// @param scope scope in which to find sequences referenced (used for location comparisons)
508 /// @param pMessageListener optional, defaults to nullptr; NOTE(review): presumably receives parse diagnostics -- confirm in the implementation
509 /// @return bool indicates string was successfully parsed and code break was added
510     static bool ParseCodeBreak(const CSeq_feat& feat,
511             CCdregion& cds,
512             const CTempString& str,
513             CScope& scope,
514             IObjtoolsListener* pMessageListener=nullptr);
515 
516 /// Parses all valid transl_except Gb-quals into code-breaks for cdregion,
517 /// then removes the transl_except Gb-quals that were successfully parsed
518 /// @param feat feature that contains coding region
519 /// @param scope scope in which to find sequences referenced (used for location comparisons)
520 /// @return bool indicates changes were made
521     static bool ParseCodeBreaks(CSeq_feat& feat, CScope& scope);
522 
523     static size_t MakeSmallGenomeSet(CSeq_entry_Handle entry);
524 
525 /// From SQD-4329
526 /// For each sequence with a source that has an IRD db_xref, create a misc_feature
527 /// across the entire span and move the IRD db_xref from the source to the misc_feature.
528 /// Create a suppressing gene xref for the misc_feature.
529 /// @param entry Seq-entry on which to search for sources and create features
530 /// @return bool indicates changes were made
531     static bool MakeIRDFeatsFromSourceXrefs(CSeq_entry_Handle entry);
532 
533 /// From GB-7563
534 /// An action has been requested that will do the following:
535 ///    1. This action should be limited to protein sequences where the product
536 ///       is an exact match to a specified text (the usual string constraint
537 ///       is not needed).
538 ///    2. Protein sequences for which the coding region is 5' partial should
539 ///       not be affected.
540 ///    3. When the protein name matches, the following actions should be taken
541 ///       if and only if the first amino acid of the protein sequence is not
542 ///       M (methionine):
543 ///     a. The first amino acid of the protein sequence should be changed to
544 ///       methionine.
545 ///     b. The coding region should have the text "RNA editing" added to
546 ///       Seq-feat.except_text (separated from any existing text by a semicolon).
547 ///       If Seq-feat.except is not already true, it should be set to true.
548 ///     c. A code-break should be added to Cdregion.code-break where the
549 ///       Code-break.loc is the location of the first codon of the coding region
550 ///       and Code-break.aa is ncbieaa 'M' (Indexers will refer to "code-breaks"
551 ///       as "translation exceptions" because these appear in the flatfile as a
552 ///       /transl_except qualifier).
553 ///
554 /// It will be the responsibility of the caller to only invoke this function
555 /// for coding regions where the product name is a match, and the protein sequence
556 /// does not already start with an M.
557 /// @param cds coding region Seq-feat to fix
558     static bool FixRNAEditingCodingRegion(CSeq_feat& cds);
559 
560     /// utility function for setting code break location given offset
561     /// pos is the position of the amino acid where the translation exception
562     /// occurs (starts with 1)
563     static void SetCodeBreakLocation(CCode_break& cb, size_t pos, const CSeq_feat& cds);
564 
565     static bool IsMethionine(const CCode_break& cb);
566 
567     /// utility function for finding the code break for a given amino acid position
568     /// pos is the position of the amino acid where the translation exception
569     /// occurs (starts with 1)
570     static CConstRef<CCode_break> GetCodeBreakForLocation(size_t pos, const CSeq_feat& cds);
571 
572     // From the request in GB-7166, we want to be able to move /gene
573     // qualifiers that have been added to the coding region but not the
574     // parent gene to the parent gene.
575     // If the coding region also has /locus_tag qualifier which is different
576     // from the one on the parent gene features, do not move the qualifier.
577     // If there are two coding regions that are mapped to the same gene,
578     // do not move the qualifier.
579     static bool NormalizeGeneQuals(CSeq_feat& cds, CSeq_feat& gene);
580     static bool NormalizeGeneQuals(CBioseq_Handle bsh);
581     static bool NormalizeGeneQuals(CSeq_entry_Handle seh);
582     typedef pair<CSeq_feat_Handle, CSeq_feat_Handle> TFeatGenePair; // by convention, cds first, gene second
583     static vector<TFeatGenePair> GetNormalizableGeneQualPairs(CBioseq_Handle bsh);
584 
585     // This function is used to do generic string cleanup on User-object string fields
586     // and apply specific cleanups to known types of User-object
587     static bool CleanupUserObject(CUser_object& object);
588 
589     // for cleaning up authors, lists of authors, and affiliation
590     static bool CleanupAuthor(CAuthor& author, bool fix_initials = true);
591     static bool CleanupAuthList(CAuth_list& al, bool fix_initials = true);
592     static void ResetAuthorNames(CAuth_list::TNames& names);
593     static bool CleanupAffil(CAffil& af);
594     static bool IsEmpty(const CAuth_list::TAffil& affil);
595 
596     // for cleaning up collection-date subsource qualifiers
597     static bool CleanupCollectionDates(CSeq_entry_Handle seh, bool month_first);
598 
599     static void AutodefId(CSeq_entry_Handle seh);
600 
601 private:
602     // Prohibit copy constructor & assignment operator (declared private; no definition appears in this header)
603     CCleanup(const CCleanup&);
604     CCleanup& operator= (const CCleanup&);
605 
606     CRef<CScope>            m_Scope;
607 
608     static bool x_CleanupUserField(CUser_field& field);
609 
610     static bool x_MergeDupOrgNames(COrgName& on1, const COrgName& add);
611     static bool x_MergeDupOrgRefs(COrg_ref& org1, const COrg_ref& add);
612 
613     static bool x_HasShortIntron(const CSeq_loc& loc, size_t min_len = 11);
614     static bool x_AddLowQualityException(CSeq_feat& feat);
615     static bool x_AddLowQualityException(CSeq_entry_Handle entry, CSeqFeatData::ESubtype subtype);
616 
617     static bool s_IsProductOnFeat(const CSeq_feat& cds);
618     static void s_SetProductOnFeat(CSeq_feat& feat, const string& protein_name, bool append);
619 
620     static bool s_CleanupGeneOntology(CUser_object& obj);
621     static bool s_CleanupStructuredComment(CUser_object& obj);
622     static bool s_RemoveEmptyFields(CUser_object& obj);
623     static bool s_CleanupGenomeAssembly(CUser_object& obj);
624     static bool s_CleanupDBLink(CUser_object& obj);
625     static bool s_AddNumToUserField(CUser_field &field);
626 
627     static bool s_CleanupNameStdBC(CName_std& name, bool fix_initials);
628     static void s_ExtractSuffixFromInitials(CName_std& name);
629     static void s_FixEtAl(CName_std& name);
630 
631     // for cleaning pubdesc
632     static bool s_Flatten(CPub_equiv& pub_equiv);
633 };
634 
635 
636 
637 END_SCOPE(objects)
638 END_NCBI_SCOPE
639 
640 #endif  /* CLEANUP___CLEANUP__HPP */
641