1 /*  $Id: biosample_util.hpp 603438 2020-03-11 15:14:32Z ludwigf $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Colleen Bollin
27  *
28  * File Description:
29  *   check biosource and structured comment descriptors against biosample database
30  *
31  */
32 
33 #ifndef BIOSAMPLE_CHK__UTIL__HPP
34 #define BIOSAMPLE_CHK__UTIL__HPP
35 
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbistr.hpp>
38 
39 #include <objects/seq/Seq_descr.hpp>
40 #include <objects/seq/Seqdesc.hpp>
41 #include <objects/seqfeat/BioSource.hpp>
42 #include <objects/seqtable/Seq_table.hpp>
43 #include <objects/seqtable/SeqTable_column.hpp>
44 
45 #include <objmgr/bioseq_handle.hpp>
46 
47 
48 BEGIN_NCBI_SCOPE
49 BEGIN_SCOPE(objects)
50 BEGIN_SCOPE(biosample_util)
51 
52 typedef map< string, CRef< CSeq_descr > > TBioSamples;
53 typedef map< string, CRef< CSeq_descr > >::iterator TBioSamplesIterator;
54 
55 CRef< CSeq_descr > GetBiosampleData(const string& accession, bool use_dev_server = false, TBioSamples *cache = NULL);
56 
// Status values for a BioSample, numbered consecutively from 0.
// eStatus_Unknown is the zero value.
// NOTE(review): the meaning of each status is defined by the code that
// produces it, not by this header - confirm against the implementation.
enum EStatus {
    eStatus_Unknown     = 0,
    eStatus_Live        = 1,
    eStatus_Hup         = 2,
    eStatus_Withdrawn   = 3,
    eStatus_Suppressed  = 4,
    eStatus_ToBeCurated = 5,
    eStatus_Replaced    = 6
};
66 
67 typedef map<string, EStatus> TStatuses;
68 typedef map<string, EStatus>::iterator TStatusesIterator;
69 typedef pair<string, biosample_util::EStatus> TStatus;
70 EStatus GetBiosampleStatus(const string& accession, bool use_dev_server = false, TStatuses *cache = NULL);
71 void GetBiosampleStatus(TStatuses& status, bool use_dev_server = false);
72 string GetBiosampleStatusName(EStatus status);
73 
74 
// Return lists of identifier strings for the bioseq behind the handle `bh`.
// NOTE(review): judging by the names, these collect the BioSample and
// BioProject IDs referenced by the bioseq; where they are read from is
// not visible in this header - confirm in the implementation.
vector<string> GetBiosampleIDs(CBioseq_Handle bh);
vector<string> GetBioProjectIDs(CBioseq_Handle bh);
77 
78 
79 class CBiosampleFieldDiff : public CObject
80 {
81 public:
CBiosampleFieldDiff()82     CBiosampleFieldDiff() {};
CBiosampleFieldDiff(const string & sequence_id,const string & biosample_id,const string & field_name,const string & src_val,const string & sample_val)83     CBiosampleFieldDiff(const string& sequence_id, const string& biosample_id, const string& field_name, const string& src_val, const string& sample_val) :
84         m_SequenceID(sequence_id), m_BiosampleID(biosample_id), m_FieldName(field_name), m_SrcVal(src_val), m_SampleVal(sample_val)
85         {};
CBiosampleFieldDiff(const string & sequence_id,const string & biosample_id,const CFieldDiff & diff)86     CBiosampleFieldDiff(const string& sequence_id, const string& biosample_id, const CFieldDiff& diff) :
87         m_SequenceID(sequence_id), m_BiosampleID(biosample_id),
88         m_FieldName(diff.GetFieldName()),
89         m_SrcVal(diff.GetSrcVal()),
90         m_SampleVal(diff.GetSampleVal())
91         {};
92 
~CBiosampleFieldDiff(void)93     ~CBiosampleFieldDiff(void) {};
94 
95     static void PrintHeader(ncbi::CNcbiOstream & stream, bool show_seq_id = true);
96     void Print(ncbi::CNcbiOstream & stream, bool show_seq_id = true) const;
97     void Print(ncbi::CNcbiOstream & stream, const CBiosampleFieldDiff& prev);
98     void PrettyPrint(ncbi::CNcbiOstream & stream, size_t keyWidth=20, size_t valueWidth=40) const;
GetSequenceId() const99     const string& GetSequenceId() const { return m_SequenceID; };
SetSequenceId(const string & id)100     void SetSequenceId(const string& id) { m_SequenceID = id; };
GetFieldName() const101     const string& GetFieldName() const { return m_FieldName; };
GetSrcVal() const102     string GetSrcVal() const { return CBioSource::IsStopWord(m_SrcVal) ? string("") : m_SrcVal; };
GetSampleVal() const103     string GetSampleVal() const { return CBioSource::IsStopWord(m_SampleVal) ? string("") : m_SampleVal; };
GetPureSrcVal() const104     string GetPureSrcVal() const { return m_SrcVal; };
GetPureSampleVal() const105     string GetPureSampleVal() const { return m_SampleVal; };
GetBioSample() const106     const string& GetBioSample() const { return m_BiosampleID; };
107 
108     int CompareAllButSequenceID(const CBiosampleFieldDiff& other);
109     int Compare(const CBiosampleFieldDiff& other);
110 
111 private:
112     string m_SequenceID;
113     string m_BiosampleID;
114     string m_FieldName;
115     string m_SrcVal;
116     string m_SampleVal;
117 };
118 
// Vector of reference-counted CBiosampleFieldDiff objects; this is the list
// type returned by GetBioseqDiffs and GetFieldDiffs.
typedef vector< CRef<CBiosampleFieldDiff> > TBiosampleFieldDiffList;
120 
121 TBiosampleFieldDiffList
122 GetBioseqDiffs(CBioseq_Handle bh,
123                const string& biosample_accession,
124                size_t& num_processed,
125                vector<string>& unprocessed_ids,
126                bool use_dev_server = false,
127                bool compare_structured_comments = false,
128                const string& expected_prefix = "",
129                TBioSamples *cache = NULL);
130 
131 TBiosampleFieldDiffList GetFieldDiffs(const string& sequence_id, const string& biosample_id, const CBioSource& src, const CBioSource& sample);
132 TBiosampleFieldDiffList GetFieldDiffs(const string& sequence_id, const string& biosample_id, CConstRef<CUser_object> src, CConstRef<CUser_object> sample);
133 
// Takes a caller-supplied accession string and a list of BioSample IDs passed
// by non-const reference; returns bool.
// NOTE(review): presumably reconciles the supplied accession with
// `biosample_ids` and reports success - confirm in the implementation.
bool ResolveSuppliedBioSampleAccession(const string& biosample_accession, vector<string>& biosample_ids);
135 
// Examines a list of field differences and returns bool.
// `log` is a pointer to an output stream.
// NOTE(review): presumably returns true when at least one diff is a real
// conflict, and `log` may be allowed to be NULL - confirm both in the
// implementation.
bool DoDiffsContainConflicts(const TBiosampleFieldDiffList& diffs, CNcbiOstream* log);
137 
138 // This function is for generating a table of biosample values for a bioseq
139 // // that does not currently have a biosample ID
140 void AddBioseqToTable(CBioseq_Handle bh, CSeq_table& table, bool with_id,
141                       bool include_comments = false, const string& expected_prefix = "");
142 
143 
// Returns a label string for the bioseq behind `bsh`.
// NOTE(review): how the "best" label is chosen is not visible here - confirm.
string GetBestBioseqLabel(CBioseq_Handle bsh);
// Reports whether two attribute names are considered equivalent. Both names
// are taken by value.
// NOTE(review): the equivalence rule is defined in the implementation.
bool AttributeNamesAreEquivalent(string name1, string name2);
146 
147 void PrintBioseqXML(CBioseq_Handle bh,
148                     const string& id_prefix,
149                     CNcbiOstream* report_stream,
150                     const string& bioproject_accession,
151                     const string& default_owner,
152                     const string& hup_date,
153                     const string& comment,
154                     bool first_seq_only,
155                     bool report_structured_comments,
156                     const string& expected_prefix);
157 
// Returns an owner string derived from a CAffil.
// NOTE(review): which affiliation fields are used is not visible here.
string OwnerFromAffil(const CAffil& affil);
159 
//  rw-905 >>
// Fills `diffs` with the differences between a biosample descriptor set and
// a CBioSource.
void
GenerateDiffListFromBioSource(
    const CSeq_descr& bioSample,        // as retrieved from /biosample/fetch
    const CBioSource& bioSource,        // as plugged from a bioseq or seq-entry
    TBiosampleFieldDiffList& diffs);    // where to put list of "relevant" differences

// Overload that identifies the biosample by accession string instead of by
// descriptor set, and additionally takes a CBioSource output argument.
// Returns bool.
// NOTE(review): the meaning of the return value is not visible here -
// confirm in the implementation.
bool
GenerateDiffListFromBioSource(
    const string& bioSampleAcc,         // as retrieved from /biosample/fetch
    const CBioSource& bioSource,        // as plugged from a bioseq or seq-entry
    CBioSource& bioSampleSource,        // assigned from biosample, if there are relevant diffs
    TBiosampleFieldDiffList& diffs);    // where to put list of "relevant" diffs
//  << rw-905
174 
// Takes an existing CBioSource (read-only) and a second CBioSource passed by
// non-const reference; returns bool.
// NOTE(review): presumably writes the updated source into `newBioource` and
// returns whether the update succeeded - confirm in the implementation.
// NOTE(review): `newBioource` looks like a typo for `newBiosource`; the name
// is left unchanged here.
bool
UpdateBiosourceFromBiosample(
    const CBioSource& existingBioSource,
    CBioSource& newBioource);

// Overload that additionally takes a list of field differences.
// NOTE(review): presumably applies `diffs` when building `newBioource` -
// confirm in the implementation.
bool
UpdateBiosourceFromBiosample(
    const TBiosampleFieldDiffList& diffs,
    const CBioSource& existingBioSource,
    CBioSource& newBioource);
185 
// Prints a whole list of field differences to `ostr`.
// keyWidth and valueWidth default to 20 and 40, the same defaults as
// CBiosampleFieldDiff::PrettyPrint.
// NOTE(review): presumably calls that member for each entry - confirm.
void PrettyPrint(
    const TBiosampleFieldDiffList& diffList,
    CNcbiOstream& ostr,
    size_t keyWidth = 20,
    size_t valueWidth = 40);
191 
192 END_SCOPE(biosample_util)
193 END_SCOPE(objects)
194 END_NCBI_SCOPE
195 
196 #endif //BIOSAMPLE_CHK__UTIL__HPP
197