1 #ifndef CLEANUP___CLEANUP__HPP 2 #define CLEANUP___CLEANUP__HPP 3 4 /* $Id: cleanup.hpp 632626 2021-06-03 17:38:42Z ivanov $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Robert Smith, Michael Kornbluh 30 * 31 * File Description: 32 * Basic Cleanup of CSeq_entries. 33 * ....... 34 * 35 */ 36 #include <objmgr/scope.hpp> 37 #include <objtools/cleanup/cleanup_change.hpp> 38 #include <objects/seq/MolInfo.hpp> 39 #include <objects/pub/Pub.hpp> 40 #include <objects/biblio/Auth_list.hpp> 41 #include <objects/seqfeat/Cdregion.hpp> 42 43 44 BEGIN_NCBI_SCOPE 45 BEGIN_SCOPE(objects) 46 47 class CSeq_entry; 48 class CBioseq; 49 class CBioseq_set; 50 class CSeq_annot; 51 class CSeq_feat; 52 class CSeq_submit; 53 class COrgName; 54 class CSubmit_block; 55 class CAuthor; 56 class CAuth_list; 57 class CName_std; 58 59 class CSeq_entry_Handle; 60 class CBioseq_Handle; 61 class CBioseq_set_Handle; 62 class CSeq_annot_Handle; 63 class CSeq_feat_Handle; 64 65 class CCleanupChange; 66 class IObjtoolsListener; 67 68 class NCBI_CLEANUP_EXPORT CCleanup : public CObject 69 { 70 public: 71 72 enum EValidOptions { 73 eClean_NoReporting = 0x1, 74 eClean_GpipeMode = 0x2, 75 eClean_NoNcbiUserObjects = 0x4, 76 eClean_SyncGenCodes = 0x8, 77 eClean_NoProteinTitles = 0x10, 78 eClean_KeepTopSet = 0x20 79 }; 80 81 enum EScopeOptions { 82 eScope_Copy, 83 eScope_UseInPlace 84 }; 85 86 // Construtor / Destructor 87 CCleanup(CScope* scope = NULL, EScopeOptions scope_handling = eScope_Copy); 88 ~CCleanup(); 89 90 void SetScope(CScope* scope); 91 92 // BASIC CLEANUP 93 94 CConstRef<CCleanupChange> BasicCleanup(CSeq_entry& se, Uint4 options = 0); 95 /// Cleanup a Seq-submit. 96 CConstRef<CCleanupChange> BasicCleanup(CSeq_submit& ss, Uint4 options = 0); 97 /// Cleanup a Bioseq. 98 CConstRef<CCleanupChange> BasicCleanup(CBioseq& bs, Uint4 ooptions = 0); 99 /// Cleanup a Bioseq_set. 100 CConstRef<CCleanupChange> BasicCleanup(CBioseq_set& bss, Uint4 options = 0); 101 /// Cleanup a Seq-Annot. 102 CConstRef<CCleanupChange> BasicCleanup(CSeq_annot& sa, Uint4 options = 0); 103 /// Cleanup a Seq-feat. 104 CConstRef<CCleanupChange> BasicCleanup(CSeq_feat& sf, Uint4 options = 0); 105 /// Cleanup a BioSource. 106 CConstRef<CCleanupChange> BasicCleanup(CBioSource& src, Uint4 options = 0); 107 // Cleanup a Submit-block 108 CConstRef<CCleanupChange> BasicCleanup(CSubmit_block& block, Uint4 options = 0); 109 // Cleanup descriptors 110 CConstRef<CCleanupChange> BasicCleanup(CSeqdesc& desc, Uint4 options = 0); 111 CConstRef<CCleanupChange> BasicCleanup(CSeq_descr & desc, Uint4 options = 0); 112 113 // Handle versions. 114 CConstRef<CCleanupChange> BasicCleanup(CSeq_entry_Handle& seh, Uint4 options = 0); 115 CConstRef<CCleanupChange> BasicCleanup(CBioseq_Handle& bsh, Uint4 options = 0); 116 CConstRef<CCleanupChange> BasicCleanup(CBioseq_set_Handle& bssh, Uint4 options = 0); 117 CConstRef<CCleanupChange> BasicCleanup(CSeq_annot_Handle& sak, Uint4 options = 0); 118 CConstRef<CCleanupChange> BasicCleanup(CSeq_feat_Handle& sfh, Uint4 options = 0); 119 120 // Extended Cleanup 121 /// Cleanup a Seq-entry. 122 CConstRef<CCleanupChange> ExtendedCleanup(CSeq_entry& se, Uint4 options = 0); 123 /// Cleanup a Seq-submit. 124 CConstRef<CCleanupChange> ExtendedCleanup(CSeq_submit& ss, Uint4 options = 0); 125 /// Cleanup a Seq-Annot. 126 CConstRef<CCleanupChange> ExtendedCleanup(CSeq_annot& sa, Uint4 options = 0); 127 128 // Handle versions 129 static CConstRef<CCleanupChange> ExtendedCleanup(CSeq_entry_Handle& seh, Uint4 options = 0); 130 131 // Useful cleanup functions 132 133 static bool ShouldStripPubSerial(const CBioseq& bs); 134 135 136 /// Moves protein-specific features from nucleotide sequences in the Seq-entry to 137 /// the appropriate protein sequence. 138 /// @param seh Seq-entry Handle to edit [in] 139 /// @return Boolean return value indicates whether any changes were made 140 static bool MoveProteinSpecificFeats(CSeq_entry_Handle seh); 141 142 /// Moves one feature from nucleotide bioseq to 143 /// the appropriate protein sequence. 144 /// @param fh Feature to edit 145 /// @return Boolean return value indicates whether any changes were made 146 static bool MoveFeatToProtein(CSeq_feat_Handle fh); 147 148 /// Calculates whether a Gene-xref is unnecessary (because it refers to the 149 /// same gene as would be calculated using overlap) 150 /// @param sf Seq-feat with the xref [in] 151 /// @param scope Scope in which to search for location [in] 152 /// @param gene_xref Gene-ref of gene-xref [in] 153 /// @return Boolean return value indicates whether gene-xref is unnecessary 154 static bool IsGeneXrefUnnecessary(const CSeq_feat& sf, CScope& scope, const CGene_ref& gene_xref); 155 156 /// Removes unnecessary Gene-xrefs 157 /// @param f Seq-feat to edit [in] 158 /// @param scope Scope in which to search for locations [in] 159 /// @return Boolean return value indicates whether gene-xrefs were removed 160 static bool RemoveUnnecessaryGeneXrefs(CSeq_feat& f, CScope& scope); 161 162 /// Removes unnecessary Gene-xrefs on features in Seq-entry 163 /// @param seh Seq-entry-Handle to edit [in] 164 /// @return Boolean return value indicates whether gene-xrefs were removed 165 static bool RemoveUnnecessaryGeneXrefs(CSeq_entry_Handle seh); 166 167 /// Removes non-suppressing Gene-xrefs 168 /// @param f Seq-feat to edit [in] 169 /// @return Boolean return value indicates whether gene-xrefs were removed 170 static bool RemoveNonsuppressingGeneXrefs(CSeq_feat& f); 171 172 173 /// Repairs non-reciprocal xref pairs for specified feature if xrefs between 174 /// subtypes are permitted and feature with missing xref does not have an 175 /// xref to a different feature of the same subtype 176 /// @param f Seq-feat to edit [in] 177 /// @param tse top-level Seq-entry in which to search for the other half of the xref pair 178 /// @return Boolean return value indicates whether xrefs were created 179 static bool RepairXrefs(const CSeq_feat& f, const CTSE_Handle& tse); 180 181 /// Repairs non-reciprocal xref pairs for specified feature pair if xrefs between 182 /// subtypes are permitted and feature with missing xref does not have an 183 /// xref to a different feature of the same subtype 184 /// @param f Seq-feat to edit [in] 185 /// @param tse top-level Seq-entry in which to search for the other half of the xref pair 186 /// @return Boolean return value indicates whether xrefs were created 187 static bool RepairXrefs(const CSeq_feat& src, CSeq_feat_Handle& dst, const CTSE_Handle& tse); 188 189 /// Repairs non-reciprocal xref pairs in specified seq-entry 190 /// @param seh Seq-entry to edit [in] 191 /// @return Boolean return value indicates whether xrefs were created 192 static bool RepairXrefs(CSeq_entry_Handle seh); 193 194 /// Detects gene features with matching locus 195 /// @param f Seq-feat parent feature of gene_xref [in] 196 /// @param gene_xref Gene-ref of gene-xref [in] 197 /// @param bsh CBioseq_Handle parent bioseq in which to search for genes [in] 198 /// @return Boolean return value indicates whether a gene feature with matching locus has been found 199 static bool FindMatchingLocusGene(CSeq_feat& f, const CGene_ref& gene_xref, CBioseq_Handle bsh); 200 201 /// Removes orphaned locus Gene-xrefs 202 /// @param f Seq-feat to edit [in] 203 /// @param bsh CBioseq_Handle in which to search for gene features [in] 204 /// @return Boolean return value indicates whether gene-xrefs were removed 205 static bool RemoveOrphanLocusGeneXrefs(CSeq_feat& f, CBioseq_Handle bsh); 206 207 /// Detects gene features with matching locus_tag 208 /// @param f Seq-feat parent feature of gene_xref [in] 209 /// @param gene_xref Gene-ref of gene-xref [in] 210 /// @param bsh CBioseq_Handle parent bioseq in which to search for genes [in] 211 /// @return Boolean return value indicates whether a gene feature with matching locus_tag has been found 212 static bool FindMatchingLocus_tagGene(CSeq_feat& f, const CGene_ref& gene_xref, CBioseq_Handle bsh); 213 214 /// Removes orphaned locus_tag Gene-xrefs 215 /// @param f Seq-feat to edit [in] 216 /// @param bsh CBioseq_Handle in which to search for gene features [in] 217 /// @return Boolean return value indicates whether gene-xrefs were removed 218 static bool RemoveOrphanLocus_tagGeneXrefs(CSeq_feat& f, CBioseq_Handle bsh); 219 220 /// Extends a location to the specificed position. 221 /// @param loc Seq-loc to extend 222 /// @param pos position of new end of location 223 /// @param scope Scope in which to look for sequences 224 /// @return Boolean return value indicates whether the location was extended 225 static bool SeqLocExtend(CSeq_loc& loc, size_t pos, CScope& scope); 226 227 228 /// Extends a coding region up to 50 nt. if the coding region: 229 /// 1. does not end with a stop codon 230 /// 2. is adjacent to a stop codon 231 /// 3. is not pseudo 232 /// @param f Seq-feat to edit 233 /// @param bsh CBioseq_Handle on which the feature is located 234 /// @return Boolean return value indicates whether the feature was extended 235 static bool ExtendToStopIfShortAndNotPartial(CSeq_feat& f, CBioseq_Handle bsh, bool check_for_stop = true); 236 237 /// Checks whether it is possible to extend the original location up to improved one. It is possible only if 238 /// the original location is less than improved 239 /// @param orig Seq-loc to check 240 /// @param improved Seq-loc original location may be extended to 241 /// @return Boolean return value indicates whether the extention is possible 242 static bool LocationMayBeExtendedToMatch(const CSeq_loc& orig, const CSeq_loc& improved); 243 244 /// Extends a feature up to limit nt to a stop codon, or to the end of the sequence 245 /// if limit == 0 (partial will be set if location extends to end of sequence but 246 /// no stop codon is found) 247 /// @param f Seq-feat to edit 248 /// @param bsh CBioseq_Handle on which the feature is located 249 /// @param limit maximum number of nt to extend, or 0 if unlimited 250 /// @return Boolean return value indicates whether the feature was extended 251 static bool ExtendToStopCodon(CSeq_feat& f, CBioseq_Handle bsh, size_t limit); 252 static bool ExtendStopPosition(CSeq_feat& f, const CSeq_feat* cdregion, size_t extension = 0); 253 254 /// Translates coding region and selects best frame (without stops, or longest) 255 /// @param cds Coding region Seq-feat to edit 256 /// @param scope Scope in which to find coding region 257 /// @return Boolean return value indicates whether the coding region was changed 258 static bool SetBestFrame(CSeq_feat& cds, CScope& scope); 259 260 /// Chooses best frame based on location 261 /// 1. If the location is 5' complete, then the frame must be one. 262 /// 2. If the location is 5' partial and 3' complete, select a frame using the 263 /// value of the location length modulo 3. 264 /// @param cdregion Coding Region in which to set frame 265 /// @param loc Location to use for setting frame 266 /// @param scope Scope in which to find location sequence(s) 267 /// @return Boolean return value indicates whether the frame was changed 268 static bool SetFrameFromLoc(CCdregion &cdregion, const CSeq_loc& loc, CScope& scope); 269 static bool SetFrameFromLoc(CCdregion::EFrame &frame, const CSeq_loc& loc, CScope& scope); 270 271 /// 1. Set the partial flags when the CDS is partial and codon_start is 2 or 3 272 /// 2. Make the CDS partial at the 5' end if there is no start codon 273 /// 3. Make the CDS partial at the 3' end if there is no stop codon 274 /// @param cds Coding region Seq-feat to edit 275 /// @param scope Scope in which to find coding region and coding region's protein 276 /// product sequence 277 /// @return Boolean return value indicates whether the coding region changed 278 static bool SetCDSPartialsByFrameAndTranslation(CSeq_feat& cds, CScope& scope); 279 280 281 /// Clear internal partials 282 static bool ClearInternalPartials(CSeq_loc& loc, bool is_first = true, bool is_last = true); 283 static bool ClearInternalPartials(CSeq_loc_mix& mix, bool is_first = true, bool is_last = true); 284 static bool ClearInternalPartials(CPacked_seqint& pint, bool is_first = true, bool is_last = true); 285 static bool ClearInternalPartials(CSeq_entry_Handle seh); 286 287 /// Set feature partial based on feature location 288 static bool SetFeaturePartial(CSeq_feat& f); 289 290 /// Update EC numbers 291 /// @param ec_num_list Prot-ref ec number list to clean 292 /// @return Boolean value indicates whether any changes were made 293 static bool UpdateECNumbers(CProt_ref::TEc & ec_num_list); 294 295 /// Delete EC numbers 296 /// @param ec_num_list Prot-ref ec number list to clean 297 /// @return Boolean value indicates whether any changes were made 298 static bool RemoveBadECNumbers(CProt_ref::TEc & ec_num_list); 299 300 /// Fix EC numbers 301 /// @param entry Seq-entry-handle to clean 302 /// @return Boolean value indicates whether any changes were made 303 static bool FixECNumbers(CSeq_entry_Handle entry); 304 305 /// Set partialness of gene to match longest feature contained in gene 306 /// @param gene Seq-feat to edit 307 /// @param scope Scope in which to find gene 308 /// @return Boolean return value indicates whether the gene changed 309 static bool SetGenePartialByLongestContainedFeature(CSeq_feat& gene, CScope& scope); 310 311 static void SetProteinName(CProt_ref& prot, const string& protein_name, bool append); 312 static void SetProteinName(CSeq_feat& cds, const string& protein_name, bool append, CScope& scope); 313 static void SetMrnaName(CSeq_feat& mrna, const string& protein_name); 314 static const string& GetProteinName(const CProt_ref& prot); 315 static const string& GetProteinName(const CSeq_feat& cds, CScope& scope); 316 317 /// Sets MolInfo::tech for a sequence 318 /// @param seq Bioseq to edit 319 /// @param tech tech value to set 320 /// @return Boolean tech was changed 321 static bool SetMolinfoTech(CBioseq_Handle seq, CMolInfo::ETech tech); 322 323 /// Sets MolInfo::biomol for a sequence 324 /// @param seq Bioseq to edit 325 /// @param biomol biomol value to set 326 /// @return Boolean biomol was changed 327 static bool SetMolinfoBiomol(CBioseq_Handle seq, CMolInfo::EBiomol biomol); 328 329 330 /// Adds missing MolInfo descriptor to sequence 331 /// @param seq Bioseq to edit 332 /// @return Boolean return value indicates whether descriptor was added 333 static bool AddMissingMolInfo(CBioseq& seq, bool is_product); 334 335 /// Creates missing protein title descriptor 336 /// @param seq Bioseq to edit 337 /// @return Boolean return value indicates whether title was added 338 static bool AddProteinTitle(CBioseq_Handle bsh); 339 340 /// Removes NcbiCleanup User Objects in the Seq-entry 341 /// @param seq_entry Seq-entry to edit 342 /// @return Boolean return value indicates whether object was removed 343 static bool RemoveNcbiCleanupObject(CSeq_entry &seq_entry); 344 345 /// Looks up Org-refs in the Seq-entry 346 /// @param seh Seq-entry to edit 347 /// @return Boolean return value indicates whether object was updated 348 static bool TaxonomyLookup(CSeq_entry_Handle seh); 349 350 351 /// Sets genetic codes for coding regions on Bioseq-Handle 352 /// @param Bioseq-Handle to examine 353 /// @return Boolean indicates whether any coding regions were updated 354 static bool SetGeneticCodes(CBioseq_Handle bsh); 355 356 /// Adjusts protein title to reflect partialness 357 /// @param Bioseq to adjust 358 /// @return Boolean indicates whether title was updated 359 static bool AddPartialToProteinTitle(CBioseq &bioseq); 360 361 /// Removes protein product from pseudo coding region 362 /// @param cds Seq-feat to adjust 363 /// @param scope Scope in which to find protein sequence and remove it 364 /// @return Boolean indicates whether anything changed 365 static bool RemovePseudoProduct(CSeq_feat& cds, CScope& scope); 366 367 static CRef<CSeq_entry> AddProtein(const CSeq_feat& cds, CScope& scope); 368 369 /// Expands gene to include features it cross-references 370 /// @param gene Seq-feat to adjust 371 /// @param tse Top-level Seq-entry in which to find other features 372 /// @return Boolean indicates whether anything changed 373 static bool ExpandGeneToIncludeChildren(CSeq_feat& gene, CTSE_Handle& tse); 374 375 /// Performs WGS specific cleanup 376 /// @param entry Seq-entry to edit 377 /// @return Boolean return value indicates whether object was updated 378 static bool WGSCleanup(CSeq_entry_Handle entry, bool instantiate_missing_proteins = true, Uint4 options = 0, 379 bool run_extended_cleanup = true); 380 381 /// For table2asn -c s 382 /// Adds an exception of "low-quality sequence region" to coding regions 383 /// and mRNAs that are not pseudo and have an intron <11bp in length 384 /// @param entry Seq-entry to edit 385 /// @return Boolean return value indicates whether object was updated 386 static bool AddLowQualityException(CSeq_entry_Handle entry); 387 388 /// Normalize Descriptor Order on a specific Seq-entry 389 /// @param entry Seq-entry to edit 390 /// @return Boolean return value indicates whether object was updated 391 static bool NormalizeDescriptorOrder(CSeq_descr& descr); 392 393 /// Normalize Descriptor Order on a specific Seq-entry 394 /// @param seh Seq-entry-Handle to edit 395 /// @return Boolean return value indicates whether object was updated 396 static bool NormalizeDescriptorOrder(CSeq_entry_Handle seh); 397 398 /// Remove all titles in Seqdescr except the last, because it is the 399 /// only one that would be displayed in the flatfile 400 /// @param seq Bioseq-Handle to edit 401 /// @return Boolean return value indicates whether any titles were removed 402 static bool RemoveUnseenTitles(CSeq_entry_EditHandle::TSeq seq); 403 404 /// Remove all titles in Seqdescr except the last, because it is the 405 /// only one that would be displayed in the flatfile 406 /// @param set Bioseq-set-Handle to edit 407 /// @return Boolean return value indicates whether any titles were removed 408 static bool RemoveUnseenTitles(CSeq_entry_EditHandle::TSet set); 409 410 /// Add GenBank Wrapper Set 411 /// @param entry Seq-entry to edit 412 /// @return Boolean return value indicates whether object changed 413 static bool AddGenBankWrapper(CSeq_entry_Handle seh); 414 415 416 /// For Publication Citations 417 /// Get labels for a pubdesc. To be used in citations. 418 static void GetPubdescLabels 419 (const CPubdesc& pd, 420 vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials, 421 vector<string>& published_labels, vector<string>& unpublished_labels); 422 423 /// Get list of pubs that can be used for citations for Seq-feat on a Bioseq-handle 424 /// @param bsh Bioseq-handle to search 425 /// @return vector<CConstRef<CPub> > ordered list of pubs 426 /// Note that Seq-feat.cit appear in the flatfile using the position 427 /// in the list 428 static vector<CConstRef<CPub> > GetCitationList(CBioseq_Handle bsh); 429 430 /// Remove duplicate publications 431 static bool RemoveDuplicatePubs(CSeq_descr& descr); 432 433 /// Some pubs should not be promoted to nuc-prot set from sequence 434 static bool OkToPromoteNpPub(const CPubdesc& pd); 435 436 /// For some sequences, pubs should not be promoted to nuc-prot set from sequence 437 static bool OkToPromoteNpPub(const CBioseq& b); 438 439 static bool PubAlreadyInSet(const CPubdesc& pd, const CSeq_descr& descr); 440 441 /// Convert full-length publication features to publication descriptors. 442 /// @param seh Seq-entry to edit 443 /// @return bool indicates whether any changes were made 444 static bool ConvertPubFeatsToPubDescs(CSeq_entry_Handle seh); 445 446 /// Rescue pubs from Site-ref features 447 /// @param seh Seq-entry to edit 448 /// @return bool indicates whether any changes were made 449 static bool RescueSiteRefPubs(CSeq_entry_Handle seh); 450 451 /// Is this a "minimal" pub? (If yes, do not rescue from a Seq-feat.cit) 452 static bool IsMinPub(const CPubdesc& pd, bool is_refseq_prot); 453 454 //helper function for moving feature to pubdesc descriptor 455 static void MoveOneFeatToPubdesc(CSeq_feat_Handle feat, CRef<CSeqdesc> d, CBioseq_Handle b, bool remove_feat = true); 456 457 /// Remove duplicate biosource descriptors 458 static bool RemoveDupBioSource(CSeq_descr& descr); 459 460 /// Get BioSource from feature to use for source descriptor 461 static CRef<CBioSource> BioSrcFromFeat(const CSeq_feat& f); 462 463 static bool AreBioSourcesMergeable(const CBioSource& src1, const CBioSource& src2); 464 static bool MergeDupBioSources(CSeq_descr& descr); 465 static bool MergeDupBioSources(CBioSource& src1, const CBioSource& add); 466 467 468 /// Convert full-length source features to source descriptors 469 /// @param seh Seq-entry to edit 470 /// @return bool indicates whether any changes were made 471 static bool ConvertSrcFeatsToSrcDescs(CSeq_entry_Handle seh); 472 473 /// Examine all genes and gene xrefs in the Seq-entry. 474 /// If no genes have locus and some have locus tag AND no gene xrefs have locus-tag 475 /// and some gene xrefs have locus, change all gene xrefs to use locus tag. 476 /// If no genes have locus tag and some have locus AND no gene xrefs have locus 477 /// and some gene xrefs have locus tag, change all gene xrefs to use locus. 478 /// @param seh Seq-entry to edit 479 /// @return bool indicates whether any changes were made 480 static bool FixGeneXrefSkew(CSeq_entry_Handle seh); 481 482 /// Convert nuc-prot sets with just one sequence to just the sequence 483 /// can't be done during the explore phase because it changes a seq to a set 484 /// @param seh Seq-entry to edit 485 /// @return bool indicates whether any changes were made 486 static bool RenormalizeNucProtSets(CSeq_entry_Handle seh); 487 488 /// decodes various tags, including carriage-return-line-feed constructs 489 static bool DecodeXMLMarkChanged(std::string & str); 490 491 static CRef<CSeq_loc> GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, CScope& scope); 492 static CRef<CSeq_loc> GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, const CSeq_feat& cds, CScope& scope, bool require_inframe = false); 493 494 /// Find proteins that are not packaged in the same nuc-prot set as the 495 /// coding region for which they are a product, and move them to that 496 /// nuc-prot set. Ignore coding regions that are in gen-prod-sets. 497 /// @param seh Seq-entry to edit 498 /// @return bool indicates whether any changes were made 499 static bool RepackageProteins(CSeq_entry_Handle seh); 500 static bool RepackageProteins(const CSeq_feat& cds, CBioseq_set_Handle np); 501 502 static bool ConvertDeltaSeqToRaw(CSeq_entry_Handle seh, CSeq_inst::EMol filter = CSeq_inst::eMol_not_set); 503 504 /// Parse string into code break and add to coding region. 505 /// @param feat feature that contains coding region - necessary to determine codon boundaries 506 /// @param cds coding region to which code breaks will be added 507 /// @param str string from which to parse code break 508 /// @param scope scope in which to find sequences referenced (used for location comparisons) 509 /// @return bool indicates string was successfully parsed and code break was added 510 static bool ParseCodeBreak(const CSeq_feat& feat, 511 CCdregion& cds, 512 const CTempString& str, 513 CScope& scope, 514 IObjtoolsListener* pMessageListener=nullptr); 515 516 /// Parses all valid transl_except Gb-quals into code-breaks for cdregion, 517 /// then removes the transl_except Gb-quals that were successfully parsed 518 /// @param feat feature that contains coding region 519 /// @param scope scope in which to find sequences referenced (used for location comparisons) 520 /// @return bool indicates changes were made 521 static bool ParseCodeBreaks(CSeq_feat& feat, CScope& scope); 522 523 static size_t MakeSmallGenomeSet(CSeq_entry_Handle entry); 524 525 /// From SQD-4329 526 /// For each sequence with a source that has an IRD db_xref, create a misc_feature 527 /// across the entire span and move the IRD db_xref from the source to the misc_feature. 528 /// Create a suppressing gene xref for the misc_feature. 529 /// @param entry Seq-entry on which to search for sources and create features 530 /// @return bool indicates changes were made 531 static bool MakeIRDFeatsFromSourceXrefs(CSeq_entry_Handle entry); 532 533 /// From GB-7563 534 /// An action has been requested that will do the following: 535 /// 1. This action should be limited to protein sequences where the product 536 /// is an exact match to a specified text (the usual string constraint 537 /// is not needed). 538 /// 2. Protein sequences for which the coding region is 5' partial should 539 /// not be affected. 540 /// 3. When the protein name matches, the following actions should be taken 541 /// if and only if the first amino acid of the protein sequence is not 542 /// M (methionine): 543 /// a. The first amino acid of the protein sequence should be changed to 544 /// methionine. 545 /// b. The coding region should have the text "RNA editing" added to 546 /// Seq-feat.except_text (separated from any existing text by a semicolon). 547 /// If Seq-feat.except is not already true, it should be set to true. 548 /// c. A code-break should be added to Cdregion.code-break where the 549 /// Code-break.loc is the location of the first codon of the coding region 550 /// and Code-break.aa is ncbieaa 'M' (Indexers will refer to "code-breaks" 551 /// as "translation exceptions" because these appear in the flatfile as a 552 /// /transl_except qualifier. 553 /// 554 /// It will be the responsibility of the caller to only invoke this function 555 /// for coding regions where the product name is a match, and the protein sequence 556 /// does not already start with an M. 557 558 static bool FixRNAEditingCodingRegion(CSeq_feat& cds); 559 560 /// utility function for setting code break location given offset 561 /// pos is the position of the amino acid where the translation exception 562 /// occurs (starts with 1) 563 static void SetCodeBreakLocation(CCode_break& cb, size_t pos, const CSeq_feat& cds); 564 565 static bool IsMethionine(const CCode_break& cb); 566 567 /// utility function for finding the code break for a given amino acid position 568 /// pos is the position of the amino acid where the translation exception 569 /// occurs (starts with 1) 570 static CConstRef<CCode_break> GetCodeBreakForLocation(size_t pos, const CSeq_feat& cds); 571 572 // From the request in GB-7166, we want to be able to move /gene 573 // qualifiers that have been added to the coding region but not the 574 // parent gene to the parent gene. 575 // If the coding region also has /locus_tag qualifier which is different 576 // from the one on the parent gene features, do not move the qualifier. 577 // If there are two coding regions that are mapped to the same gene, 578 // do not move the qualifier. 579 static bool NormalizeGeneQuals(CSeq_feat& cds, CSeq_feat& gene); 580 static bool NormalizeGeneQuals(CBioseq_Handle bsh); 581 static bool NormalizeGeneQuals(CSeq_entry_Handle seh); 582 typedef pair<CSeq_feat_Handle, CSeq_feat_Handle> TFeatGenePair; // by convention, cds first, gene second 583 static vector<TFeatGenePair> GetNormalizableGeneQualPairs(CBioseq_Handle bsh); 584 585 // This function is used to do generic string cleanup on User-object string fields 586 // and apply specific cleanups to known types of User-object 587 static bool CleanupUserObject(CUser_object& object); 588 589 // for cleaning up authors, lists of authors, and affiliation 590 static bool CleanupAuthor(CAuthor& author, bool fix_initials = true); 591 static bool CleanupAuthList(CAuth_list& al, bool fix_initials = true); 592 static void ResetAuthorNames(CAuth_list::TNames& names); 593 static bool CleanupAffil(CAffil& af); 594 static bool IsEmpty(const CAuth_list::TAffil& affil); 595 596 // for cleaning up collection-date subsource qualifiers 597 static bool CleanupCollectionDates(CSeq_entry_Handle seh, bool month_first); 598 599 static void AutodefId(CSeq_entry_Handle seh); 600 601 private: 602 // Prohibit copy constructor & assignment operator 603 CCleanup(const CCleanup&); 604 CCleanup& operator= (const CCleanup&); 605 606 CRef<CScope> m_Scope; 607 608 static bool x_CleanupUserField(CUser_field& field); 609 610 static bool x_MergeDupOrgNames(COrgName& on1, const COrgName& add); 611 static bool x_MergeDupOrgRefs(COrg_ref& org1, const COrg_ref& add); 612 613 static bool x_HasShortIntron(const CSeq_loc& loc, size_t min_len = 11); 614 static bool x_AddLowQualityException(CSeq_feat& feat); 615 static bool x_AddLowQualityException(CSeq_entry_Handle entry, CSeqFeatData::ESubtype subtype); 616 617 static bool s_IsProductOnFeat(const CSeq_feat& cds); 618 static void s_SetProductOnFeat(CSeq_feat& feat, const string& protein_name, bool append); 619 620 static bool s_CleanupGeneOntology(CUser_object& obj); 621 static bool s_CleanupStructuredComment(CUser_object& obj); 622 static bool s_RemoveEmptyFields(CUser_object& obj); 623 static bool s_CleanupGenomeAssembly(CUser_object& obj); 624 static bool s_CleanupDBLink(CUser_object& obj); 625 static bool s_AddNumToUserField(CUser_field &field); 626 627 static bool s_CleanupNameStdBC(CName_std& name, bool fix_initials); 628 static void s_ExtractSuffixFromInitials(CName_std& name); 629 static void s_FixEtAl(CName_std& name); 630 631 // for cleaning pubdesc 632 static bool s_Flatten(CPub_equiv& pub_equiv); 633 }; 634 635 636 637 END_SCOPE(objects) 638 END_NCBI_SCOPE 639 640 #endif /* CLEANUP___CLEANUP__HPP */ 641