1 /*  $Id: tax_validation_and_cleanup.hpp 632625 2021-06-03 17:38:33Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
 *
26  * Author:  Colleen Bollin
27  *
28  * File Description:
29  *   Tools for batch processing taxonomy-related validation and cleanup
30  *   .......
31  *
32  */
33 
34 #ifndef VALIDATOR___TAX_VALIDATION_AND_CLEANUP__HPP
35 #define VALIDATOR___TAX_VALIDATION_AND_CLEANUP__HPP
36 
37 #include <corelib/ncbistd.hpp>
38 #include <objects/seqfeat/OrgMod.hpp>
39 #include <objects/seqfeat/Org_ref.hpp>
40 #include <objects/taxon3/T3Reply.hpp>
41 #include <objects/taxon3/Taxon3_reply.hpp>
42 #include <objects/valerr/ValidErrItem.hpp>
43 
44 BEGIN_NCBI_SCOPE
45 BEGIN_SCOPE(objects)
46 BEGIN_SCOPE(validator)
47 
48 class CValidError_imp;
49 
50 
51 // For Taxonomy Lookups and Fixups
52 //
53 // For validation, we need to be able to look up an Org-ref and determine
54 // whether the tax ID in the record is the same as what is returned by
55 // the taxonomy service.
56 // For cleanup, we want to look up an Org-ref and replace the existing Org-ref
57 // in the record with what is returned by the taxonomy service.
58 //
59 // Several qualifiers other than Org-ref.taxname may also contain scientific names.
60 // It is possible that the scientific name is merely a portion of the string.
61 //
62 // In the case of specific host, we want to be able to identify names that are
63 // mis-spelled or unrecognized. Unfortunately, common names are also
64 // acceptable for specific host, and it can be difficult to detect whether a
65 // value is a scientific name or a common name. The current method looks for
66 // the string to contain at least two words, the first of which must be capitalized.
67 // Unfortunately, this fails for "Rhesus monkey", "Atlantic white-sided dolphin",
68 // and others, and fails to catch the obvious miscapitalization "homo sapiens".
69 // See SQD-4325 for ongoing discussion.
70 // For validation, these values are reported. For cleanup, we replace the
71 // original value with a corrected value where possible.
72 //
73 // In the case of strain, scientific names should *not* be present in certain
74 // situations. For validation, these values will be reported, once TM-725 is
75 // resolved.
76 //
77 // Often the same value will occur many, many times in the same record, and we
78 // would like to avoid redundant lookups.
79 // Taxonomy requests should be separated into manageable chunks.
80 // In order for the undo commands to work correctly in Genome Workbench, we need
81 // a method that allows Genome Workbench to control when the updates are made.
82 //
83 // Note that Org-refs can be found in both features and source descriptors.
84 // It is necessary to record the parents of the Org-refs for which lookups are
85 // made and for which lookups of qualifiers are made, in order to report
86 // and/or clean them.
87 //
88 
89 typedef struct {
90     EDiagSev severity;
91     EErrType err_type;
92     string err_msg;
93 } TTaxError;
94 
95 
96 // This base class represents a request for a qualifier value.
97 // The same qualifier value will be found in multiple Org-refs, which will
98 // be represented in the parents (m_Descs and m_Feats).
99 // A single qualifier could have multiple strings to be sent to taxonomy
100 // (try the whole value, try just the first two tokens, etc.). These will be
101 // represented in m_ValuesToTry.
102 class NCBI_VALIDATOR_EXPORT CQualifierRequest : public CObject
103 {
104 public:
105     CQualifierRequest();
~CQualifierRequest()106     virtual ~CQualifierRequest() {};
107 
108     void AddParent(CConstRef<CSeqdesc> desc, CConstRef<CSeq_entry> ctx);
109     void AddParent(CConstRef<CSeq_feat> feat);
110 
111     void AddRequests(vector<CRef<COrg_ref> >& request_list) const;
112     bool MatchTryValue(const string& val) const;
NumRemainingReplies() const113     size_t NumRemainingReplies() const { return m_ValuesToTry.size() - m_RepliesProcessed; }
114 
115     virtual void AddReply(const CT3Reply& reply) = 0;
116     void PostErrors(CValidError_imp& imp);
117     virtual void ListErrors(vector<TTaxError>& errs) const = 0;
118 
119 protected:
120     void x_Init();
121 
122     vector<string> m_ValuesToTry;
123     size_t m_RepliesProcessed;
124 
125     typedef pair<CConstRef<CSeqdesc>, CConstRef<CSeq_entry> > TDescPair;
126     vector<TDescPair> m_Descs;
127     vector<CConstRef<CSeq_feat> > m_Feats;
128 };
129 
130 // Specific host values can be classified as normal, ambiguous, or unrecognized.
131 // We can also suggest a better value to use instead.
132 class NCBI_VALIDATOR_EXPORT CSpecificHostRequest : public CQualifierRequest
133 {
134 public:
135     CSpecificHostRequest(const string& orig_val, const COrg_ref& org, bool for_fix = false);
~CSpecificHostRequest()136     ~CSpecificHostRequest() {};
137 
138     enum EHostResponseFlags{
139         eNormal = 0,
140         eAmbiguous,
141         eUnrecognized,
142         eAlternateName
143     };
144     typedef int TResponseFlags;
145 
146     virtual void AddReply(const CT3Reply& reply);
147     virtual void ListErrors(vector<TTaxError>& errs) const;
148 
149     const string& SuggestFix() const;
150 
151 private:
152     string m_Host;
153     TResponseFlags m_Response;
154     string m_SuggestedFix;
155     string m_Error;
156     string m_HostLineage;
157     string m_OrgLineage;
158 };
159 
160 
161 class NCBI_VALIDATOR_EXPORT CStrainRequest : public CQualifierRequest
162 {
163 public:
164     CStrainRequest(const string& strain, const COrg_ref& org);
~CStrainRequest()165     ~CStrainRequest() {};
166 
167     virtual void AddReply(const CT3Reply& reply);
168     virtual void ListErrors(vector<TTaxError>& errs) const;
169 
170     static string MakeKey(const string& strain, const string& taxname);
171     static bool RequireTaxname(const string& taxname);
172     static bool Check(const COrg_ref& org);
173 
174 private:
175     string m_Strain;
176     string m_Taxname;
177     bool m_IsInvalid;
178     static bool x_IsUnwanted(const string& str);
179     static bool x_IgnoreStrain(const string& str);
180 };
181 
182 
183 // The map is used to eliminate duplicate taxonomy requests.
184 // The keys used may depend on just the qualifier value or may
185 // be a combination of the qualifier value and other values from
186 // the Org-ref (in the case of strain, this is sometimes taxname).
187 class NCBI_VALIDATOR_EXPORT CQualLookupMap
188 {
189 public:
CQualLookupMap(COrgMod::ESubtype subtype)190     CQualLookupMap(COrgMod::ESubtype subtype) : m_Subtype(subtype), m_Populated(false) {};
~CQualLookupMap()191     virtual ~CQualLookupMap() {};
192 
IsPopulated() const193     bool IsPopulated() const { return m_Populated; };
194 
195     void Clear();
196 
197     // GetKey gets a string key that is used to determine whether the lookup for two Org-refs
198     // will be the same.
199     // * For validating specific hosts, this would be the original value.
200     // * For fixing specific hosts, this would be the original value after default
201     //   fixes have been applied
202     // * For validating strain, this might be the original value or it might be the original
203     //   value plus the organism name.
204     virtual string GetKey(const string& orig_val, const COrg_ref& org) const = 0;
205 
206     // Check indicates whether this Org-ref should be examined or ignored.
207     // strain values are ignored for some values of lineage or taxname
Check(const COrg_ref &) const208     virtual bool Check(const COrg_ref& /*org*/) const { return true; }
209 
210     // used to add items to be looked up, when appropriate for this
211     // descriptor or feature
212     void AddDesc(CConstRef<CSeqdesc> desc, CConstRef<CSeq_entry> ctx);
213     void AddFeat(CConstRef<CSeq_feat> feat);
214     void AddOrg(const COrg_ref& org);
215 
216     // add an item to be looked up independently of a feature or descriptor
217     void AddString(const string& val);
218 
219     // GetRequestList returns a list of Org-refs to be sent to taxonomy.
220     // Note that the number of requests may be greater than the number of
221     // values being checked.
222     vector<CRef<COrg_ref> > GetRequestList();
223 
224     // It is the responsibility of the calling program to chunk the request
225     // list and pass the input and reply to the map until all requests
226     // have responses
227     string IncrementalUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply);
228 
229     // Indicates whether the map is waiting for more responses
230     bool IsUpdateComplete() const;
231 
232     // Posts errors to the validator based on responses
233     void PostErrors(CValidError_imp& imp);
234 
235     virtual void ListErrors(vector<TTaxError>& errs) const;
236 
237     // Applies the change to an Org-ref. Note that there might be multiple
238     // qualifiers of the same subtype on the Org-ref, and we need to be sure
239     // to apply the change to the correct qualifier
240     virtual bool ApplyToOrg(COrg_ref& org) const = 0;
241 
242 protected:
243     typedef map<string, CRef<CQualifierRequest> > TQualifierRequests;
244 
245     TQualifierRequests m_Map;
246     COrgMod::ESubtype m_Subtype;
247     bool m_Populated;
248 
249     TQualifierRequests::iterator x_FindRequest(const string& val);
250 
251     // x_MakeNewRequest creates a new CQualifierRequest object for the given pair of orig_val and org
252     virtual CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org) = 0;
253 };
254 
255 
256 class NCBI_VALIDATOR_EXPORT CSpecificHostMap : public CQualLookupMap
257 {
258 public:
CSpecificHostMap()259     CSpecificHostMap() : CQualLookupMap(COrgMod::eSubtype_nat_host) {};
~CSpecificHostMap()260     ~CSpecificHostMap() {};
261 
GetKey(const string & orig_val,const COrg_ref &) const262     virtual string GetKey(const string& orig_val, const COrg_ref& /*org*/) const { return orig_val; };
ApplyToOrg(COrg_ref &) const263     virtual bool ApplyToOrg(COrg_ref& /*org*/) const { return false; };
264 
265 protected:
266     virtual CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org);
267 };
268 
269 class NCBI_VALIDATOR_EXPORT CSpecificHostMapForFix : public CQualLookupMap
270 {
271 public:
CSpecificHostMapForFix()272     CSpecificHostMapForFix() : CQualLookupMap(COrgMod::eSubtype_nat_host) {};
~CSpecificHostMapForFix()273     ~CSpecificHostMapForFix() {};
274 
GetKey(const string & orig_val,const COrg_ref &) const275     virtual string GetKey(const string& orig_val, const COrg_ref& /*org*/) const { return x_DefaultSpecificHostAdjustments(orig_val); };
276     virtual bool ApplyToOrg(COrg_ref& org) const;
277 
278 protected:
279     static string x_DefaultSpecificHostAdjustments(const string& host_val);
280     virtual CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org);
281 };
282 
283 
284 class NCBI_VALIDATOR_EXPORT CStrainMap : public CQualLookupMap
285 {
286 public:
CStrainMap()287     CStrainMap() : CQualLookupMap(COrgMod::eSubtype_strain) {};
~CStrainMap()288     ~CStrainMap() {};
289 
GetKey(const string & orig_val,const COrg_ref & org) const290     virtual string GetKey(const string& orig_val, const COrg_ref& org) const { return CStrainRequest::MakeKey(orig_val, org.IsSetTaxname() ? org.GetTaxname() : kEmptyStr); };
Check(const COrg_ref & org) const291     virtual bool Check(const COrg_ref& org) const { return CStrainRequest::Check(org); };
ApplyToOrg(COrg_ref &) const292     virtual bool ApplyToOrg(COrg_ref& /*org*/) const { return false; };
293 
294 protected:
295     virtual CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org);
296 
297 };
298 
299 typedef map<string, CSpecificHostRequest> TSpecificHostRequests;
300 
301 // This class handles complete org-ref lookups, specific-host lookups,
302 // and strain lookups.
303 // These activities are bundled together in order to avoid doing a scan
304 // of the record looking for source features and source descriptors
305 // multiple times.
306 class NCBI_VALIDATOR_EXPORT CTaxValidationAndCleanup
307 {
308 public:
309     CTaxValidationAndCleanup();
~CTaxValidationAndCleanup()310     ~CTaxValidationAndCleanup() {};
311 
312     void Init(const CSeq_entry& se);
313 
314     // for complete Org-ref validation/replacement
315     vector< CRef<COrg_ref> > GetTaxonomyLookupRequest() const;
316     void ListTaxLookupErrors(const CT3Reply& reply, const COrg_ref& org, CBioSource::TGenome genome, bool is_insd_patent, bool is_wp, vector<TTaxError>& errs) const;
317     void ReportTaxLookupErrors(const CTaxon3_reply& reply, CValidError_imp& imp, bool is_insd_patent) const;
318     void ReportIncrementalTaxLookupErrors(const CTaxon3_reply& reply, CValidError_imp& imp, bool is_insd_patent, size_t offset) const;
319     bool AdjustOrgRefsWithTaxLookupReply(const CTaxon3_reply& reply,
320                                          vector<CRef<COrg_ref> > org_refs,
321                                          string& error_message,
322                                          bool use_error_orgrefs = false) const;
323 
324     // for specific host validation/replacement
325     vector<CRef<COrg_ref> > GetSpecificHostLookupRequest(bool for_fix);
326 
327     string IncrementalSpecificHostMapUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply);
328     bool IsSpecificHostMapUpdateComplete() const;
329     void ReportSpecificHostErrors(const CTaxon3_reply& reply, CValidError_imp& imp);
330     void ReportSpecificHostErrors(CValidError_imp& imp);
331     bool AdjustOrgRefsWithSpecificHostReply(vector<CRef<COrg_ref> > requests,
332                                             const CTaxon3_reply& reply,
333                                             vector<CRef<COrg_ref> > org_refs,
334                                             string& error_message);
335     bool AdjustOrgRefsForSpecificHosts(vector<CRef<COrg_ref> > org_refs);
336 
337     // for strain validation
338     vector<CRef<COrg_ref> > GetStrainLookupRequest();
339     string IncrementalStrainMapUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply);
340     bool IsStrainMapUpdateComplete() const;
341     void ReportStrainErrors(CValidError_imp& imp);
342 
343     // Used when reporting a problem contacting the taxonomy service
344     CConstRef<CSeq_entry> GetTopReportObject() const;
345 
346     // Genome Workbench uses these methods to update individual descriptors and features
NumDescs() const347     size_t NumDescs() const { return m_SrcDescs.size(); }
NumFeats() const348     size_t NumFeats() const { return m_SrcFeats.size(); }
349 
GetDesc(size_t num) const350     CConstRef<CSeqdesc> GetDesc(size_t num) const { return m_SrcDescs[num]; };
GetFeat(size_t num) const351     CConstRef<CSeq_feat> GetFeat(size_t num) const { return m_SrcFeats[num]; };
352     CConstRef<CSeq_entry> GetSeqContext(size_t num) const;
353 
354     bool DoTaxonomyUpdate(CSeq_entry_Handle seh, bool with_host);
355 
356     void FixOneSpecificHost(string& val);
357     bool IsOneSpecificHostValid(const string& val, string& err_msg);
358 
359     void CheckOneOrg(const COrg_ref& org, int genome, CValidError_imp& imp);
360 
361 protected:
362     void x_InterpretTaxonomyError(const CT3Error& error, const COrg_ref& org, const EErrType type, vector<TTaxError>& errs) const;
363     void x_GatherSources(const CSeq_entry& se);
364     void x_CreateSpecificHostMap(bool for_fix);
365     void x_UpdateSpecificHostMapWithReply(const CTaxon3_reply& reply, string& error_message);
366     bool x_ApplySpecificHostMap(COrg_ref& org_ref) const;
367     static string x_DefaultSpecificHostAdjustments(const string& host_val);
368     TSpecificHostRequests::iterator x_FindHostFixRequest(const string& val);
369 
370     void x_CreateStrainMap();
371     void x_CreateQualifierMap(CQualLookupMap& lookup);
372 
x_ClearMaps()373     void x_ClearMaps() { m_HostMap.Clear(); m_HostMapForFix.Clear(); m_StrainMap.Clear(); }
374 
375     vector<CConstRef<CSeqdesc> > m_SrcDescs;
376     vector<CConstRef<CSeq_entry> > m_DescCtxs;
377     vector<CConstRef<CSeq_feat> > m_SrcFeats;
378 
379     TSpecificHostRequests m_SpecificHostRequests;
380     bool m_SpecificHostRequestsBuilt;
381     bool m_SpecificHostRequestsUpdated;
382 
383     bool m_StrainRequestsBuilt;
384 
385     CSpecificHostMap m_HostMap;
386     CSpecificHostMapForFix m_HostMapForFix;
387     CStrainMap m_StrainMap;
388 };
389 
390 
391 
392 END_SCOPE(validator)
393 END_SCOPE(objects)
394 END_NCBI_SCOPE
395 
#endif  /* VALIDATOR___TAX_VALIDATION_AND_CLEANUP__HPP */
397