/* $Id: tax_validation_and_cleanup.hpp 632625 2021-06-03 17:38:33Z ivanov $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  Colleen Bollin
 *
 * File Description:
 *      Tools for batch processing taxonomy-related validation and cleanup
 *      .......
 *
 */

#ifndef VALIDATOR___TAX_VALIDATION_AND_CLEANUP__HPP
#define VALIDATOR___TAX_VALIDATION_AND_CLEANUP__HPP

#include <corelib/ncbistd.hpp>
#include <objects/seqfeat/OrgMod.hpp>
#include <objects/seqfeat/Org_ref.hpp>
#include <objects/taxon3/T3Reply.hpp>
#include <objects/taxon3/Taxon3_reply.hpp>
#include <objects/valerr/ValidErrItem.hpp>

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)
BEGIN_SCOPE(validator)

class CValidError_imp;


// For Taxonomy Lookups and Fixups
//
// For validation, we need to be able to look up an Org-ref and determine
// whether the tax ID in the record is the same as what is returned by
// the taxonomy service.
// For cleanup, we want to look up an Org-ref and replace the existing Org-ref
// in the record with what is returned by the taxonomy service.
//
// Several qualifiers other than Org-ref.taxname may also contain scientific names.
// It is possible that the scientific name is merely a portion of the string.
//
// In the case of specific host, we want to be able to identify names that are
// misspelled or unrecognized. Unfortunately, common names are also
// acceptable for specific host, and it can be difficult to detect whether a
// value is a scientific name or a common name. The current method looks for
// the string to contain at least two words, the first of which must be capitalized.
// Unfortunately, this fails for "Rhesus monkey", "Atlantic white-sided dolphin",
// and others, and fails to catch the obvious miscapitalization "homo sapiens".
// See SQD-4325 for ongoing discussion.
// For validation, these values are reported. For cleanup, we replace the
// original value with a corrected value where possible.
//
// In the case of strain, scientific names should *not* be present in certain
// situations. For validation, these values will be reported, once TM-725 is
// resolved.
76 // 77 // Often the same value will occur many, many times in the same record, and we 78 // would like to avoid redundant lookups. 79 // Taxonomy requests should be separated into manageable chunks. 80 // In order for the undo commands to work correctly in Genome Workbench, we need 81 // a method that allows Genome Workbench to control when the updates are made. 82 // 83 // Note that Org-refs can be found in both features and source descriptors. 84 // It is necessary to record the parents of the Org-refs for which lookups are 85 // made and for which lookups of qualifiers are made, in order to report 86 // and/or clean them. 87 // 88 89 typedef struct { 90 EDiagSev severity; 91 EErrType err_type; 92 string err_msg; 93 } TTaxError; 94 95 96 // This base class represents a request for a qualifier value. 97 // The same qualifier value will be found in multiple Org-refs, which will 98 // be represented in the parents (m_Descs and m_Feats). 99 // A single qualifier could have multiple strings to be sent to taxonomy 100 // (try the whole value, try just the first two tokens, etc.). These will be 101 // represented in m_ValuesToTry. 
102 class NCBI_VALIDATOR_EXPORT CQualifierRequest : public CObject 103 { 104 public: 105 CQualifierRequest(); ~CQualifierRequest()106 virtual ~CQualifierRequest() {}; 107 108 void AddParent(CConstRef<CSeqdesc> desc, CConstRef<CSeq_entry> ctx); 109 void AddParent(CConstRef<CSeq_feat> feat); 110 111 void AddRequests(vector<CRef<COrg_ref> >& request_list) const; 112 bool MatchTryValue(const string& val) const; NumRemainingReplies() const113 size_t NumRemainingReplies() const { return m_ValuesToTry.size() - m_RepliesProcessed; } 114 115 virtual void AddReply(const CT3Reply& reply) = 0; 116 void PostErrors(CValidError_imp& imp); 117 virtual void ListErrors(vector<TTaxError>& errs) const = 0; 118 119 protected: 120 void x_Init(); 121 122 vector<string> m_ValuesToTry; 123 size_t m_RepliesProcessed; 124 125 typedef pair<CConstRef<CSeqdesc>, CConstRef<CSeq_entry> > TDescPair; 126 vector<TDescPair> m_Descs; 127 vector<CConstRef<CSeq_feat> > m_Feats; 128 }; 129 130 // Specific host values can be classified as normal, ambiguous, or unrecognized. 131 // We can also suggest a better value to use instead. 
132 class NCBI_VALIDATOR_EXPORT CSpecificHostRequest : public CQualifierRequest 133 { 134 public: 135 CSpecificHostRequest(const string& orig_val, const COrg_ref& org, bool for_fix = false); ~CSpecificHostRequest()136 ~CSpecificHostRequest() {}; 137 138 enum EHostResponseFlags{ 139 eNormal = 0, 140 eAmbiguous, 141 eUnrecognized, 142 eAlternateName 143 }; 144 typedef int TResponseFlags; 145 146 virtual void AddReply(const CT3Reply& reply); 147 virtual void ListErrors(vector<TTaxError>& errs) const; 148 149 const string& SuggestFix() const; 150 151 private: 152 string m_Host; 153 TResponseFlags m_Response; 154 string m_SuggestedFix; 155 string m_Error; 156 string m_HostLineage; 157 string m_OrgLineage; 158 }; 159 160 161 class NCBI_VALIDATOR_EXPORT CStrainRequest : public CQualifierRequest 162 { 163 public: 164 CStrainRequest(const string& strain, const COrg_ref& org); ~CStrainRequest()165 ~CStrainRequest() {}; 166 167 virtual void AddReply(const CT3Reply& reply); 168 virtual void ListErrors(vector<TTaxError>& errs) const; 169 170 static string MakeKey(const string& strain, const string& taxname); 171 static bool RequireTaxname(const string& taxname); 172 static bool Check(const COrg_ref& org); 173 174 private: 175 string m_Strain; 176 string m_Taxname; 177 bool m_IsInvalid; 178 static bool x_IsUnwanted(const string& str); 179 static bool x_IgnoreStrain(const string& str); 180 }; 181 182 183 // The map is used to eliminate duplicate taxonomy requests. 184 // The keys used may depend on just the qualifier value or may 185 // be a combination of the qualifier value and other values from 186 // the Org-ref (in the case of strain, this is sometimes taxname). 
187 class NCBI_VALIDATOR_EXPORT CQualLookupMap 188 { 189 public: CQualLookupMap(COrgMod::ESubtype subtype)190 CQualLookupMap(COrgMod::ESubtype subtype) : m_Subtype(subtype), m_Populated(false) {}; ~CQualLookupMap()191 virtual ~CQualLookupMap() {}; 192 IsPopulated() const193 bool IsPopulated() const { return m_Populated; }; 194 195 void Clear(); 196 197 // GetKey gets a string key that is used to determine whether the lookup for two Org-refs 198 // will be the same. 199 // * For validating specific hosts, this would be the original value. 200 // * For fixing specific hosts, this would be the original value after default 201 // fixes have been applied 202 // * For validating strain, this might be the original value or it might be the original 203 // value plus the organism name. 204 virtual string GetKey(const string& orig_val, const COrg_ref& org) const = 0; 205 206 // Check indicates whether this Org-ref should be examined or ignored. 207 // strain values are ignored for some values of lineage or taxname Check(const COrg_ref &) const208 virtual bool Check(const COrg_ref& /*org*/) const { return true; } 209 210 // used to add items to be looked up, when appropriate for this 211 // descriptor or feature 212 void AddDesc(CConstRef<CSeqdesc> desc, CConstRef<CSeq_entry> ctx); 213 void AddFeat(CConstRef<CSeq_feat> feat); 214 void AddOrg(const COrg_ref& org); 215 216 // add an item to be looked up independently of a feature or descriptor 217 void AddString(const string& val); 218 219 // GetRequestList returns a list of Org-refs to be sent to taxonomy. 220 // Note that the number of requests may be greater than the number of 221 // values being checked. 
222 vector<CRef<COrg_ref> > GetRequestList(); 223 224 // It is the responsibility of the calling program to chunk the request 225 // list and pass the input and reply to the map until all requests 226 // have responses 227 string IncrementalUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply); 228 229 // Indicates whether the map is waiting for more responses 230 bool IsUpdateComplete() const; 231 232 // Posts errors to the validator based on responses 233 void PostErrors(CValidError_imp& imp); 234 235 virtual void ListErrors(vector<TTaxError>& errs) const; 236 237 // Applies the change to an Org-ref. Note that there might be multiple 238 // qualifiers of the same subtype on the Org-ref, and we need to be sure 239 // to apply the change to the correct qualifier 240 virtual bool ApplyToOrg(COrg_ref& org) const = 0; 241 242 protected: 243 typedef map<string, CRef<CQualifierRequest> > TQualifierRequests; 244 245 TQualifierRequests m_Map; 246 COrgMod::ESubtype m_Subtype; 247 bool m_Populated; 248 249 TQualifierRequests::iterator x_FindRequest(const string& val); 250 251 // x_MakeNewRequest creates a new CQualifierRequest object for the given pair of orig_val and org 252 virtual CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org) = 0; 253 }; 254 255 256 class NCBI_VALIDATOR_EXPORT CSpecificHostMap : public CQualLookupMap 257 { 258 public: CSpecificHostMap()259 CSpecificHostMap() : CQualLookupMap(COrgMod::eSubtype_nat_host) {}; ~CSpecificHostMap()260 ~CSpecificHostMap() {}; 261 GetKey(const string & orig_val,const COrg_ref &) const262 virtual string GetKey(const string& orig_val, const COrg_ref& /*org*/) const { return orig_val; }; ApplyToOrg(COrg_ref &) const263 virtual bool ApplyToOrg(COrg_ref& /*org*/) const { return false; }; 264 265 protected: 266 virtual CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org); 267 }; 268 269 class NCBI_VALIDATOR_EXPORT CSpecificHostMapForFix : 
public CQualLookupMap 270 { 271 public: CSpecificHostMapForFix()272 CSpecificHostMapForFix() : CQualLookupMap(COrgMod::eSubtype_nat_host) {}; ~CSpecificHostMapForFix()273 ~CSpecificHostMapForFix() {}; 274 GetKey(const string & orig_val,const COrg_ref &) const275 virtual string GetKey(const string& orig_val, const COrg_ref& /*org*/) const { return x_DefaultSpecificHostAdjustments(orig_val); }; 276 virtual bool ApplyToOrg(COrg_ref& org) const; 277 278 protected: 279 static string x_DefaultSpecificHostAdjustments(const string& host_val); 280 virtual CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org); 281 }; 282 283 284 class NCBI_VALIDATOR_EXPORT CStrainMap : public CQualLookupMap 285 { 286 public: CStrainMap()287 CStrainMap() : CQualLookupMap(COrgMod::eSubtype_strain) {}; ~CStrainMap()288 ~CStrainMap() {}; 289 GetKey(const string & orig_val,const COrg_ref & org) const290 virtual string GetKey(const string& orig_val, const COrg_ref& org) const { return CStrainRequest::MakeKey(orig_val, org.IsSetTaxname() ? org.GetTaxname() : kEmptyStr); }; Check(const COrg_ref & org) const291 virtual bool Check(const COrg_ref& org) const { return CStrainRequest::Check(org); }; ApplyToOrg(COrg_ref &) const292 virtual bool ApplyToOrg(COrg_ref& /*org*/) const { return false; }; 293 294 protected: 295 virtual CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org); 296 297 }; 298 299 typedef map<string, CSpecificHostRequest> TSpecificHostRequests; 300 301 // This class handles complete org-ref lookups, specific-host lookups, 302 // and strain lookups. 303 // These activities are bundled together in order to avoid doing a scan 304 // of the record looking for source features and source descriptors 305 // multiple times. 
306 class NCBI_VALIDATOR_EXPORT CTaxValidationAndCleanup 307 { 308 public: 309 CTaxValidationAndCleanup(); ~CTaxValidationAndCleanup()310 ~CTaxValidationAndCleanup() {}; 311 312 void Init(const CSeq_entry& se); 313 314 // for complete Org-ref validation/replacement 315 vector< CRef<COrg_ref> > GetTaxonomyLookupRequest() const; 316 void ListTaxLookupErrors(const CT3Reply& reply, const COrg_ref& org, CBioSource::TGenome genome, bool is_insd_patent, bool is_wp, vector<TTaxError>& errs) const; 317 void ReportTaxLookupErrors(const CTaxon3_reply& reply, CValidError_imp& imp, bool is_insd_patent) const; 318 void ReportIncrementalTaxLookupErrors(const CTaxon3_reply& reply, CValidError_imp& imp, bool is_insd_patent, size_t offset) const; 319 bool AdjustOrgRefsWithTaxLookupReply(const CTaxon3_reply& reply, 320 vector<CRef<COrg_ref> > org_refs, 321 string& error_message, 322 bool use_error_orgrefs = false) const; 323 324 // for specific host validation/replacement 325 vector<CRef<COrg_ref> > GetSpecificHostLookupRequest(bool for_fix); 326 327 string IncrementalSpecificHostMapUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply); 328 bool IsSpecificHostMapUpdateComplete() const; 329 void ReportSpecificHostErrors(const CTaxon3_reply& reply, CValidError_imp& imp); 330 void ReportSpecificHostErrors(CValidError_imp& imp); 331 bool AdjustOrgRefsWithSpecificHostReply(vector<CRef<COrg_ref> > requests, 332 const CTaxon3_reply& reply, 333 vector<CRef<COrg_ref> > org_refs, 334 string& error_message); 335 bool AdjustOrgRefsForSpecificHosts(vector<CRef<COrg_ref> > org_refs); 336 337 // for strain validation 338 vector<CRef<COrg_ref> > GetStrainLookupRequest(); 339 string IncrementalStrainMapUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply); 340 bool IsStrainMapUpdateComplete() const; 341 void ReportStrainErrors(CValidError_imp& imp); 342 343 // Used when reporting a problem contacting the taxonomy service 344 CConstRef<CSeq_entry> 
GetTopReportObject() const; 345 346 // Genome Workbench uses these methods to update individual descriptors and features NumDescs() const347 size_t NumDescs() const { return m_SrcDescs.size(); } NumFeats() const348 size_t NumFeats() const { return m_SrcFeats.size(); } 349 GetDesc(size_t num) const350 CConstRef<CSeqdesc> GetDesc(size_t num) const { return m_SrcDescs[num]; }; GetFeat(size_t num) const351 CConstRef<CSeq_feat> GetFeat(size_t num) const { return m_SrcFeats[num]; }; 352 CConstRef<CSeq_entry> GetSeqContext(size_t num) const; 353 354 bool DoTaxonomyUpdate(CSeq_entry_Handle seh, bool with_host); 355 356 void FixOneSpecificHost(string& val); 357 bool IsOneSpecificHostValid(const string& val, string& err_msg); 358 359 void CheckOneOrg(const COrg_ref& org, int genome, CValidError_imp& imp); 360 361 protected: 362 void x_InterpretTaxonomyError(const CT3Error& error, const COrg_ref& org, const EErrType type, vector<TTaxError>& errs) const; 363 void x_GatherSources(const CSeq_entry& se); 364 void x_CreateSpecificHostMap(bool for_fix); 365 void x_UpdateSpecificHostMapWithReply(const CTaxon3_reply& reply, string& error_message); 366 bool x_ApplySpecificHostMap(COrg_ref& org_ref) const; 367 static string x_DefaultSpecificHostAdjustments(const string& host_val); 368 TSpecificHostRequests::iterator x_FindHostFixRequest(const string& val); 369 370 void x_CreateStrainMap(); 371 void x_CreateQualifierMap(CQualLookupMap& lookup); 372 x_ClearMaps()373 void x_ClearMaps() { m_HostMap.Clear(); m_HostMapForFix.Clear(); m_StrainMap.Clear(); } 374 375 vector<CConstRef<CSeqdesc> > m_SrcDescs; 376 vector<CConstRef<CSeq_entry> > m_DescCtxs; 377 vector<CConstRef<CSeq_feat> > m_SrcFeats; 378 379 TSpecificHostRequests m_SpecificHostRequests; 380 bool m_SpecificHostRequestsBuilt; 381 bool m_SpecificHostRequestsUpdated; 382 383 bool m_StrainRequestsBuilt; 384 385 CSpecificHostMap m_HostMap; 386 CSpecificHostMapForFix m_HostMapForFix; 387 CStrainMap m_StrainMap; 388 }; 389 390 391 392 
END_SCOPE(validator)
END_SCOPE(objects)
END_NCBI_SCOPE

#endif  /* VALIDATOR___TAX_VALIDATION_AND_CLEANUP__HPP */