1 /* $Id: cuTaxClient.cpp 609836 2020-06-08 15:56:03Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Adapted from CDTree-1 files by Chris Lanczycki
27 *
28 * File Description:
29 *
30 * Various utilities and classes for obtaining taxonomy information
31 * from ASN objects and NCBI taxonomy services.
32 * Also maintain lists of preferred and model tax nodes.
33 *
34 * ===========================================================================
35 */
36
37
38 #include <ncbi_pch.hpp>
39 #include <objects/taxon1/taxon1.hpp>
40 #include <objects/taxon1/Taxon2_data.hpp>
41 #include <objects/seq/Bioseq.hpp>
42 #include <objects/seq/Seq_descr.hpp>
43 #include <objects/seq/Seqdesc.hpp>
44 #include <objects/seqfeat/Org_ref.hpp>
45 #include <objects/seqloc/Seq_id.hpp>
46 #include <objects/seqloc/Seq_loc.hpp>
47 #include <objects/general/Dbtag.hpp>
48 #include <objects/general/Object_id.hpp>
49 #include <objects/seqfeat/BioSource.hpp>
50 #include <math.h>
51
52 #include <algo/structure/cd_utils/cuTaxClient.hpp>
53 #include <objects/id1/id1_client.hpp>
54
55 BEGIN_NCBI_SCOPE
56 BEGIN_SCOPE(cd_utils)
57
// Definition of the static class constant REFRESH_DEFAULT.
// NOTE(review): presumably the default for the constructor's 'refresh'
// argument as declared in the header -- confirm against cuTaxClient.hpp.
const bool TaxClient::REFRESH_DEFAULT = false;
59
TaxClient(bool refresh)60 TaxClient::TaxClient(bool refresh) : m_taxonomyClient(0), m_id1(0)
61 {
62 }
63
64
~TaxClient()65 TaxClient::~TaxClient() {
66 if(m_taxonomyClient){
67 m_taxonomyClient->Fini();
68 delete m_taxonomyClient;
69 m_taxonomyClient = 0;
70 }
71 if (m_id1)
72 delete m_id1;
73 }
74
init()75 bool TaxClient::init()
76 {
77 return ConnectToTaxServer();
78 }
79
ConnectToTaxServer()80 bool TaxClient::ConnectToTaxServer()
81 {
82 if(!m_taxonomyClient)
83 m_taxonomyClient= new CTaxon1();
84 m_taxonomyClient->Init();
85
86 if (m_taxonomyClient->IsAlive())
87 return true;
88 else
89 return false;
90 }
91
92
IsAlive()93 bool TaxClient::IsAlive() {
94 if(!m_taxonomyClient)return false;
95 return m_taxonomyClient->IsAlive();
96 }
97
98 // try to get "official" tax info from seq_id's gi
GetTaxIDForSeqId(CConstRef<CSeq_id> sid)99 TTaxId TaxClient::GetTaxIDForSeqId(CConstRef< CSeq_id > sid)
100 {
101 TGi gi = ZERO_GI;
102 if (sid->IsGi())
103 {
104 gi = sid->GetGi();
105 }
106 else
107 {
108 if (!m_id1)
109 m_id1= new CID1Client;
110 gi = m_id1->AskGetgi(*sid);
111 }
112 return GetTaxIDForGI(gi);
113 }
114
115
GetTaxIDForGI(TGi gi)116 TTaxId TaxClient::GetTaxIDForGI(TGi gi) {
117 TTaxId taxid = ZERO_TAX_ID;
118 if (IsAlive()) {
119 return (m_taxonomyClient->GetTaxId4GI(gi, taxid)) ? taxid : ZERO_TAX_ID;
120 }
121 return taxid;
122 }
123
GetOrgRef(TTaxId taxId,CRef<COrg_ref> & orgRef)124 bool TaxClient::GetOrgRef(TTaxId taxId, CRef< COrg_ref >& orgRef) {
125 bool result = false;
126
127 if (IsAlive() && orgRef.NotEmpty() && taxId > ZERO_TAX_ID) {
128 bool is_species, is_uncultured;
129 string blast_name;
130 CConstRef< COrg_ref > constOrgRef = m_taxonomyClient->GetOrgRef(taxId, is_species, is_uncultured, blast_name);
131 orgRef->Assign(*constOrgRef);
132 result = true;
133 } else {
134 orgRef.Reset();
135 }
136 return result;
137 }
138
139
140 // Look through the bioseq for a COrg object, and use it to get taxid.
141 // Use tax server by default, unless server fails and lookInBioseq is true.
GetTaxIDFromBioseq(const CBioseq & bioseq,bool lookInBioseq)142 TTaxId TaxClient::GetTaxIDFromBioseq(const CBioseq& bioseq, bool lookInBioseq) {
143
144 TTaxId taxid = ZERO_TAX_ID;
145 list< CRef< CSeqdesc > >::const_iterator j, jend;
146
147 if (bioseq.IsSetDescr())
148 {
149 jend = bioseq.GetDescr().Get().end();
150
151 // look through the sequence descriptions
152 for (j=bioseq.GetDescr().Get().begin(); j!=jend; j++)
153 {
154 const COrg_ref *org = NULL;
155 if ((*j)->IsOrg())
156 org = &((*j)->GetOrg());
157 else if ((*j)->IsSource())
158 org = &((*j)->GetSource().GetOrg());
159 if (org)
160 {
161 // Use tax server
162 if (IsAlive())
163 {
164 if ((taxid = m_taxonomyClient->GetTaxIdByOrgRef(*org)) != ZERO_TAX_ID)
165 {
166 if (taxid < ZERO_TAX_ID)
167 { // multiple tax nodes; -taxid is one of them
168 taxid = -taxid;
169 }
170 break;
171 }
172 }
173 // Use bioseq, which may be obsolete, only when requested and tax server failed.
174 if (taxid == ZERO_TAX_ID && lookInBioseq)
175 { // is there an ID in the bioseq itself if fails
176 vector < CRef< CDbtag > >::const_iterator k, kend = org->GetDb().end();
177 for (k=org->GetDb().begin(); k != kend; ++k) {
178 if ((*k)->GetDb() == "taxon") {
179 if ((*k)->GetTag().IsId()) {
180 taxid = TAX_ID_FROM(CObject_id::TId, (*k)->GetTag().GetId());
181 break;
182 }
183 }
184 }
185 }
186 } //end if (org)
187 }//end for
188 }
189 return taxid;
190 }
191
GetRankID(TTaxId taxId,string & rankName)192 short TaxClient::GetRankID(TTaxId taxId, string& rankName)
193 {
194 short rankId = -1;
195 if(IsAlive())
196 {
197 CRef< ITreeIterator > nodeIt = m_taxonomyClient->GetTreeIterator( taxId );
198 rankId = nodeIt->GetNode()->GetRank();
199 m_taxonomyClient->GetRankName(rankId, rankName);
200 }
201
202 return rankId;
203 }
204 // get info for taxid
GetTaxNameForTaxID(TTaxId taxid)205 std::string TaxClient::GetTaxNameForTaxID(TTaxId taxid)
206 {
207 std::string taxName = kEmptyStr;
208 if (taxid <= ZERO_TAX_ID)
209 return taxName;
210 if (taxid == TAX_ID_CONST(1))
211 {
212 taxName = "Root";
213 return taxName;
214 }
215 if (IsAlive())
216 {
217 CRef < CTaxon2_data > data = m_taxonomyClient->GetById(taxid);
218 if (data->IsSetOrg() && data->GetOrg().IsSetTaxname()) {
219 taxName = data->GetOrg().GetTaxname();
220 }
221 }
222
223 return taxName;
224 }
225
226 // get parent for taxid
GetParentTaxID(TTaxId taxid)227 TTaxId TaxClient::GetParentTaxID(TTaxId taxid)
228 {
229 TTaxId parent = ZERO_TAX_ID;
230 if (IsAlive())
231 parent = m_taxonomyClient->GetParent(taxid);
232 return parent;
233 }
234
GetSuperKingdom(TTaxId taxid)235 string TaxClient::GetSuperKingdom(TTaxId taxid) {
236
237 TTaxId skId = m_taxonomyClient->GetSuperkingdom(taxid);
238 return (skId == INVALID_TAX_ID) ? kEmptyStr : GetTaxNameForTaxID(skId);
239 }
240
GetDisplayCommonName(TTaxId taxid,string & displayCommonName)241 bool TaxClient::GetDisplayCommonName(TTaxId taxid, string& displayCommonName)
242 {
243 if (IsAlive()) {
244 return m_taxonomyClient->GetDisplayCommonName(taxid, displayCommonName);
245 } else {
246 return false;
247 }
248 }
249
250 //is taxid2 the descendant of taxid1?
IsTaxDescendant(TTaxId taxid1,TTaxId taxid2)251 bool TaxClient::IsTaxDescendant(TTaxId taxid1, TTaxId taxid2)
252 {
253 if (IsAlive()) {
254 TTaxId ancestor = m_taxonomyClient->Join(taxid1, taxid2);
255 return (ancestor == taxid1);
256 } else
257 return false;
258 }
259
GetOrgRef(TTaxId tax_id,bool & is_species,bool & is_uncultured,string & blast_name)260 CConstRef< COrg_ref > TaxClient::GetOrgRef(TTaxId tax_id, bool& is_species, bool& is_uncultured, string& blast_name)
261 {
262 if (IsAlive()) {
263 return m_taxonomyClient->GetOrgRef(tax_id, is_species, is_uncultured, blast_name);
264 } else {
265 // following the behavior of CTaxon1::GetOrgRef on failure.
266 return ncbi::null;
267 }
268 }
269
Join(TTaxId taxid1,TTaxId taxid2)270 TTaxId TaxClient::Join(TTaxId taxid1, TTaxId taxid2)
271 {
272 if (IsAlive()) {
273 return m_taxonomyClient->Join(taxid1, taxid2);
274 } else {
275 return ZERO_TAX_ID;
276 }
277 }
278
GetFullLineage(TTaxId taxid,vector<TTaxId> & lineageFromRoot)279 bool TaxClient::GetFullLineage(TTaxId taxid, vector<TTaxId>& lineageFromRoot)
280 {
281 TTaxId rootTaxid = TAX_ID_CONST(1);
282 vector<TTaxId> v;
283 v.push_back(rootTaxid);
284 v.push_back(taxid);
285 lineageFromRoot.clear();
286
287 if (IsAlive() && m_taxonomyClient->GetPopsetJoin(v, lineageFromRoot) && lineageFromRoot.size() > 0) {
288 lineageFromRoot.push_back(taxid); // GetPopsetJoin does not put taxid at the end of the lineage
289 return true;
290 }
291 return false;
292 }
293
GetFullLineage(TTaxId taxid,vector<pair<TTaxId,string>> & lineageFromRoot,bool useCommonName)294 bool TaxClient::GetFullLineage(TTaxId taxid, vector< pair<TTaxId, string> >& lineageFromRoot, bool useCommonName)
295 {
296 string commonName;
297 string root("root");
298 vector<TTaxId> lineageAsTaxids;
299 pair<TTaxId, string> p;
300
301 lineageFromRoot.clear();
302 if (GetFullLineage(taxid, lineageAsTaxids)) {
303 ITERATE(vector<TTaxId>, vit, lineageAsTaxids) {
304 p.first = *vit;
305 if (!useCommonName) { // formal name via the COrg_ref
306 commonName = GetTaxNameForTaxID(*vit);
307 if (commonName.length() > 0) {
308 p.second = (p.first > TAX_ID_CONST(1)) ? commonName : root;
309 } else {
310 p.second = kEmptyStr;
311 }
312 } else if (useCommonName && GetDisplayCommonName(*vit, commonName) && commonName.length() > 0) {
313 p.second = (p.first > TAX_ID_CONST(1)) ? commonName : root;
314 } else {
315 p.second = kEmptyStr;
316 }
317 lineageFromRoot.push_back(p);
318 }
319 return true;
320 }
321 return false;
322 }
323
324
325 END_SCOPE(cd_utils)
326 END_NCBI_SCOPE
327