1 /* $Id: biosample_util.hpp 603438 2020-03-11 15:14:32Z ludwigf $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Author: Colleen Bollin 27 * 28 * File Description: 29 * check biosource and structured comment descriptors against biosample database 30 * 31 */ 32 33 #ifndef BIOSAMPLE_CHK__UTIL__HPP 34 #define BIOSAMPLE_CHK__UTIL__HPP 35 36 #include <corelib/ncbistd.hpp> 37 #include <corelib/ncbistr.hpp> 38 39 #include <objects/seq/Seq_descr.hpp> 40 #include <objects/seq/Seqdesc.hpp> 41 #include <objects/seqfeat/BioSource.hpp> 42 #include <objects/seqtable/Seq_table.hpp> 43 #include <objects/seqtable/SeqTable_column.hpp> 44 45 #include <objmgr/bioseq_handle.hpp> 46 47 48 BEGIN_NCBI_SCOPE 49 BEGIN_SCOPE(objects) 50 BEGIN_SCOPE(biosample_util) 51 52 typedef map< string, CRef< CSeq_descr > > TBioSamples; 53 typedef map< string, CRef< CSeq_descr > >::iterator TBioSamplesIterator; 54 55 CRef< CSeq_descr > GetBiosampleData(const string& accession, bool use_dev_server = false, TBioSamples *cache = NULL); 56 57 enum EStatus { 58 eStatus_Unknown = 0, 59 eStatus_Live, 60 eStatus_Hup, 61 eStatus_Withdrawn, 62 eStatus_Suppressed, 63 eStatus_ToBeCurated, 64 eStatus_Replaced 65 }; 66 67 typedef map<string, EStatus> TStatuses; 68 typedef map<string, EStatus>::iterator TStatusesIterator; 69 typedef pair<string, biosample_util::EStatus> TStatus; 70 EStatus GetBiosampleStatus(const string& accession, bool use_dev_server = false, TStatuses *cache = NULL); 71 void GetBiosampleStatus(TStatuses& status, bool use_dev_server = false); 72 string GetBiosampleStatusName(EStatus status); 73 74 75 vector<string> GetBiosampleIDs(CBioseq_Handle bh); 76 vector<string> GetBioProjectIDs(CBioseq_Handle bh); 77 78 79 class CBiosampleFieldDiff : public CObject 80 { 81 public: CBiosampleFieldDiff()82 CBiosampleFieldDiff() {}; CBiosampleFieldDiff(const string & sequence_id,const string & biosample_id,const string & field_name,const string & src_val,const string & sample_val)83 CBiosampleFieldDiff(const string& sequence_id, const string& biosample_id, const string& field_name, const string& src_val, const string& sample_val) : 84 m_SequenceID(sequence_id), m_BiosampleID(biosample_id), m_FieldName(field_name), m_SrcVal(src_val), m_SampleVal(sample_val) 85 {}; CBiosampleFieldDiff(const string & sequence_id,const string & biosample_id,const CFieldDiff & diff)86 CBiosampleFieldDiff(const string& sequence_id, const string& biosample_id, const CFieldDiff& diff) : 87 m_SequenceID(sequence_id), m_BiosampleID(biosample_id), 88 m_FieldName(diff.GetFieldName()), 89 m_SrcVal(diff.GetSrcVal()), 90 m_SampleVal(diff.GetSampleVal()) 91 {}; 92 ~CBiosampleFieldDiff(void)93 ~CBiosampleFieldDiff(void) {}; 94 95 static void PrintHeader(ncbi::CNcbiOstream & stream, bool show_seq_id = true); 96 void Print(ncbi::CNcbiOstream & stream, bool show_seq_id = true) const; 97 void Print(ncbi::CNcbiOstream & stream, const CBiosampleFieldDiff& prev); 98 void PrettyPrint(ncbi::CNcbiOstream & stream, size_t keyWidth=20, size_t valueWidth=40) const; GetSequenceId() const99 const string& GetSequenceId() const { return m_SequenceID; }; SetSequenceId(const string & id)100 void SetSequenceId(const string& id) { m_SequenceID = id; }; GetFieldName() const101 const string& GetFieldName() const { return m_FieldName; }; GetSrcVal() const102 string GetSrcVal() const { return CBioSource::IsStopWord(m_SrcVal) ? string("") : m_SrcVal; }; GetSampleVal() const103 string GetSampleVal() const { return CBioSource::IsStopWord(m_SampleVal) ? string("") : m_SampleVal; }; GetPureSrcVal() const104 string GetPureSrcVal() const { return m_SrcVal; }; GetPureSampleVal() const105 string GetPureSampleVal() const { return m_SampleVal; }; GetBioSample() const106 const string& GetBioSample() const { return m_BiosampleID; }; 107 108 int CompareAllButSequenceID(const CBiosampleFieldDiff& other); 109 int Compare(const CBiosampleFieldDiff& other); 110 111 private: 112 string m_SequenceID; 113 string m_BiosampleID; 114 string m_FieldName; 115 string m_SrcVal; 116 string m_SampleVal; 117 }; 118 119 typedef vector< CRef<CBiosampleFieldDiff> > TBiosampleFieldDiffList; 120 121 TBiosampleFieldDiffList 122 GetBioseqDiffs(CBioseq_Handle bh, 123 const string& biosample_accession, 124 size_t& num_processed, 125 vector<string>& unprocessed_ids, 126 bool use_dev_server = false, 127 bool compare_structured_comments = false, 128 const string& expected_prefix = "", 129 TBioSamples *cache = NULL); 130 131 TBiosampleFieldDiffList GetFieldDiffs(const string& sequence_id, const string& biosample_id, const CBioSource& src, const CBioSource& sample); 132 TBiosampleFieldDiffList GetFieldDiffs(const string& sequence_id, const string& biosample_id, CConstRef<CUser_object> src, CConstRef<CUser_object> sample); 133 134 bool ResolveSuppliedBioSampleAccession(const string& biosample_accession, vector<string>& biosample_ids); 135 136 bool DoDiffsContainConflicts(const TBiosampleFieldDiffList& diffs, CNcbiOstream* log); 137 138 // This function is for generating a table of biosample values for a bioseq 139 // // that does not currently have a biosample ID 140 void AddBioseqToTable(CBioseq_Handle bh, CSeq_table& table, bool with_id, 141 bool include_comments = false, const string& expected_prefix = ""); 142 143 144 string GetBestBioseqLabel(CBioseq_Handle bsh); 145 bool AttributeNamesAreEquivalent(string name1, string name2); 146 147 void PrintBioseqXML(CBioseq_Handle bh, 148 const string& id_prefix, 149 CNcbiOstream* report_stream, 150 const string& bioproject_accession, 151 const string& default_owner, 152 const string& hup_date, 153 const string& comment, 154 bool first_seq_only, 155 bool report_structured_comments, 156 const string& expected_prefix); 157 158 string OwnerFromAffil(const CAffil& affil); 159 160 // rw-905 >> 161 void 162 GenerateDiffListFromBioSource( 163 const CSeq_descr& bioSample, // as retrieved from /biosample/fetch 164 const CBioSource& bioSource, // as plugged from a bioseq or seq-entry 165 TBiosampleFieldDiffList& diffs); // where to put list of "relevent" differences 166 167 bool 168 GenerateDiffListFromBioSource( 169 const string& bioSampleAcc, // as retrieved from /biosample/fetch 170 const CBioSource& bioSource, // as plugged from a bioseq or seq-entry 171 CBioSource& bioSampleSource, // assigned from biosample, if there are relevant diffs 172 TBiosampleFieldDiffList& diffs); // where to put list of "relevent" diffs 173 // << rw-905 174 175 bool 176 UpdateBiosourceFromBiosample( 177 const CBioSource& existingBioSource, 178 CBioSource& newBioource); 179 180 bool 181 UpdateBiosourceFromBiosample( 182 const TBiosampleFieldDiffList& diffs, 183 const CBioSource& existingBioSource, 184 CBioSource& newBioource); 185 186 void PrettyPrint( 187 const TBiosampleFieldDiffList& diffList, 188 CNcbiOstream& ostr, 189 size_t keyWidth = 20, 190 size_t valueWidth = 40); 191 192 END_SCOPE(biosample_util) 193 END_SCOPE(objects) 194 END_NCBI_SCOPE 195 196 #endif //BIOSAMPLE_CHK__UTIL__HPP 197