1 /* $Id: cuTaxTree.cpp 609836 2020-06-08 15:56:03Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Charlie Liu
27 *
28 * File Description:
29 * part of CDTree app
30 */
31 #include <ncbi_pch.hpp>
32 #include <algo/structure/cd_utils/cuTaxTree.hpp>
33 #include <algo/structure/cd_utils/cuTaxClient.hpp>
34 #include <fstream>
35 #include <stdio.h>
36
37 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(cd_utils)38 BEGIN_SCOPE(cd_utils)
39
40 //TaxNode
41 TaxNode::TaxNode()
42 {
43 init();
44 }
45
TaxNode(const TaxNode & rhs)46 TaxNode::TaxNode(const TaxNode& rhs): taxId(rhs.taxId),
47 orgName(rhs.orgName), rankId(rhs.rankId), rowId(rhs.rowId), cd(rhs.cd), seqName(rhs.seqName),
48 numLeaves(rhs.numLeaves),selectedLeaves(rhs.selectedLeaves)
49 {
50 }
51
init()52 void TaxNode::init()
53 {
54 taxId= INVALID_TAX_ID;
55 rankId =-1;
56
57 rowId = -1;
58 cd = 0;
59
60 numLeaves = 0;
61 selectedLeaves = 0;
62 }
63
makeTaxNode(TTaxId taxID,std::string taxName,short rankId)64 TaxNode* TaxNode::makeTaxNode(TTaxId taxID, std::string taxName, short rankId)
65 {
66 TaxNode* node = new TaxNode();
67 node->taxId = taxID;
68 node->rankId = rankId;
69 node->orgName = taxName;
70 return node;
71 }
72
makeSeqLeaf(int rowID,std::string sequenceName)73 TaxNode* TaxNode::makeSeqLeaf(int rowID, std::string sequenceName)
74 {
75 TaxNode* node = new TaxNode();
76 node->rowId = rowID;
77 node->seqName = sequenceName;
78 node->numLeaves = 1;
79 return node;
80 }
81
makeSubSeqLeaf(int rowID,CCdCore * cd,int rowInCd)82 TaxNode* TaxNode::makeSubSeqLeaf(int rowID, CCdCore* cd, int rowInCd)
83 {
84 TaxNode* node = new TaxNode();
85 node->rowId = rowID;
86 node->cd = cd;
87 char name[500];
88 sprintf(name,"row_%d_of_%s", rowInCd, cd->GetAccession().c_str());
89 node->seqName = name;
90 node->numLeaves = 1;
91 return node;
92 }
93
isSeqLeaf(const TaxNode & node)94 bool TaxNode::isSeqLeaf(const TaxNode& node)
95 {
96 return node.rowId >= 0;
97 }
98
isSubSeqLeaf(const TaxNode & node)99 bool TaxNode::isSubSeqLeaf(const TaxNode& node)
100 {
101 return node.cd != NULL;
102 }
103
TaxTreeData(const AlignmentCollection & ac)104 TaxTreeData :: TaxTreeData(const AlignmentCollection& ac)
105 : TaxonomyTree(), m_ac(ac),
106 m_rowToTaxNode(), m_rankNameToId(), m_failedRows()
107 {
108 m_taxDataSource = new TaxClient();
109 if (m_taxDataSource->init())
110 makeTaxonomyTree();
111 }
112
113
~TaxTreeData()114 TaxTreeData :: ~TaxTreeData()
115 {
116 delete m_taxDataSource;
117 }
118
119 //select all seqs under this tax node
selectTaxNode(TaxTreeIterator & taxNode,bool select)120 void TaxTreeData::selectTaxNode(TaxTreeIterator& taxNode, bool select)
121 {
122 //deselectAllTaxNodes();
123 vector<TaxTreeIterator> nodes;
124 getAllLeafNodes(taxNode, nodes);
125 for (unsigned int i = 0; i < nodes.size(); i++)
126 {
127 selectTaxTreeLeaf(nodes[i], select);
128 }
129 }
130
getAllLeafNodes(const TaxTreeIterator & taxNode,vector<TaxTreeIterator> & nodes) const131 int TaxTreeData::getAllLeafNodes(const TaxTreeIterator& taxNode, vector<TaxTreeIterator>& nodes) const
132 {
133 if (TaxNode::isSeqLeaf(*taxNode))
134 {
135 nodes.push_back(taxNode);
136 return nodes.size();
137 }
138 TaxonomyTree::sibling_iterator sib = taxNode.begin();
139 while (sib != taxNode.end())
140 {
141 getAllLeafNodes(sib, nodes); //recursive
142 ++sib;
143 }
144 return nodes.size();
145 }
146
setSelections(const vector<int> & rowIDs,CCdCore * cd)147 void TaxTreeData::setSelections(const vector<int>& rowIDs, CCdCore* cd)
148 {
149 for (unsigned int i = 0; i < rowIDs.size(); i ++)
150 {
151 RowToTaxNode::iterator mapIt = m_rowToTaxNode.find(rowIDs[i]);
152 if (mapIt != m_rowToTaxNode.end())
153 {
154 TaxonomyTree::iterator taxIt = mapIt->second;
155 selectTaxTreeLeaf(taxIt, true, cd);
156 }
157 }
158 }
159
160
getSelections(vector<int> & rows)161 int TaxTreeData::getSelections(vector<int>& rows)
162 {
163 for (RowToTaxNode::iterator i = m_rowToTaxNode.begin();
164 i != m_rowToTaxNode.end(); i++)
165 {
166 TaxonomyTree::iterator taxIt = i->second;
167 if (taxIt->selectedLeaves > 0)
168 rows.push_back(taxIt->rowId);
169 }
170 return rows.size();
171 }
172
173
selectTaxTreeLeaf(const TaxonomyTree::iterator & cursor,bool select,CCdCore * cd)174 void TaxTreeData::selectTaxTreeLeaf(const TaxonomyTree::iterator& cursor, bool select, CCdCore* cd)
175 {
176 if (TaxNode::isSeqLeaf(*cursor)) //make sure cursor points to a leaf
177 {
178 if (cursor.number_of_children() > 0) //multiple instances for the seqLoc
179 {
180 TaxonomyTree::sibling_iterator sib = cursor.begin();
181 while (sib != cursor.end())
182 {
183 selectTaxTreeLeaf(sib, select, cd);
184 ++sib;
185 }
186 }
187 else if (TaxNode::isSubSeqLeaf(*cursor))
188 {
189 if (cd == 0)
190 {
191 if (select)
192 cursor->selectedLeaves = 1;
193 else
194 cursor->selectedLeaves = 0;
195 }
196 else if (cursor->cd == cd)
197 {
198 if (select)
199 cursor->selectedLeaves = 1;
200 else
201 cursor->selectedLeaves = 0;
202 }
203 }
204 else //seq node with no children
205 {
206 if (select)
207 cursor->selectedLeaves = 1;
208 else
209 cursor->selectedLeaves = 0;
210 }
211 }
212 }
213
clearSelection()214 void TaxTreeData::clearSelection()
215 {
216 deselectAllTaxNodes();
217 fillLeafCount(begin());
218 }
219
deselectAllTaxNodes()220 void TaxTreeData::deselectAllTaxNodes()
221 {
222 for (RowToTaxNode::iterator i = m_rowToTaxNode.begin();
223 i != m_rowToTaxNode.end(); i++)
224 {
225 TaxonomyTree::iterator taxIt = i->second;
226 selectTaxTreeLeaf(taxIt, false);
227 }
228 }
229
230 /*
231 void TaxTreeData::deselectTaxNodes(CCd* cd)
232 {
233 for (RowToTaxNode::iterator i = m_rowToTaxNode.begin();
234 i != m_rowToTaxNode.end(); i++)
235 {
236 TaxonomyTree::iterator taxIt = i->second;
237 if (taxIt->cd == cd)
238 taxIt->selectedLeaves = 0;
239 }
240 }*/
241
makeTaxonomyTree()242 bool TaxTreeData::makeTaxonomyTree()
243 {
244 //connect
245 if (!m_taxDataSource->IsAlive())
246 return false;
247 //add root
248 TaxNode* root = TaxNode::makeTaxNode(TAX_ID_CONST(1),"Root");
249 insert(begin(), *root);
250 delete root;
251 //retrieve tax lineage for each sequence and add it to the tree
252 addRows(m_ac);
253 fillLeafCount(begin());
254 return true;
255 }
256
addRows(const AlignmentCollection & ac)257 void TaxTreeData::addRows(const AlignmentCollection& ac)
258 {
259 int num = ac.GetNumRows();
260 m_failedRows.clear();
261 for (int i = 0; i < num; i++)
262 {
263 std::string seqName;
264 ac.Get_GI_or_PDB_String_FromAlignment(i, seqName);
265 TTaxId taxid = GetTaxIDForSequence(ac, i);
266 if (taxid <= ZERO_TAX_ID)
267 m_failedRows.push_back(i);
268 else
269 addSeqTax(i, seqName, taxid);
270 }
271 }
272
fillLeafCount(const TaxTreeIterator & cursor)273 void TaxTreeData::fillLeafCount(const TaxTreeIterator& cursor)
274 {
275 //reset my count if I am not a leaf
276 if (cursor.number_of_children() > 0)
277 {
278 cursor->numLeaves = 0;
279 cursor->selectedLeaves = 0;
280 }
281 // fill each child's count
282 TaxonomyTree::sibling_iterator sib = cursor.begin();
283 while (sib != cursor.end())
284 {
285 fillLeafCount(sib); //recursive
286 ++sib;
287 }
288 //add my count to parent
289 if (cursor != begin())
290 {
291 TaxonomyTree::iterator parentIt = parent(cursor);
292 parentIt->numLeaves += cursor->numLeaves;
293 parentIt->selectedLeaves += cursor->selectedLeaves;
294 }
295 }
296
addSeqTax(int rowID,string seqName,TTaxId taxid)297 void TaxTreeData::addSeqTax(int rowID, string seqName, TTaxId taxid)
298 {
299 stack<TaxNode*> lineage;
300 if (taxid <= ZERO_TAX_ID) //invalid tax node, ignore this sequence
301 return;
302 TaxNode *seq = TaxNode::makeSeqLeaf(rowID, seqName);
303 string rankName;
304 short rank= m_taxDataSource->GetRankID(taxid, rankName);
305 TaxNode *tax = TaxNode::makeTaxNode(taxid, m_taxDataSource->GetTaxNameForTaxID(taxid), rank);
306 cacheRank(rank, rankName);
307 lineage.push(seq);
308 lineage.push(tax);
309 growAndInsertLineage(lineage);
310 }
311
growAndInsertLineage(stack<TaxNode * > & lineage)312 void TaxTreeData::growAndInsertLineage(stack<TaxNode*>& lineage)
313 {
314 TaxNode* top = lineage.top();
315 TaxonomyTree::iterator pos = begin();
316 while (pos != end() && !(*pos == *top) ) {
317 ++pos;
318 }
319 if (pos != end()) //top is alreaddy in the tree
320 {
321 lineage.pop();
322 delete top;
323 insertLineage(pos, lineage);
324 return; //done
325 }
326 else //top is not in the tree, keep growing lineage
327 {
328 TTaxId parentId = m_taxDataSource->GetParentTaxID(top->taxId);
329 string rankName;
330 short rank= m_taxDataSource->GetRankID(parentId, rankName);
331 cacheRank(rank, rankName);
332 TaxNode* parentTax = TaxNode::makeTaxNode(parentId, m_taxDataSource->GetTaxNameForTaxID(parentId), rank);
333 lineage.push(parentTax);
334 growAndInsertLineage(lineage);
335 }
336 }
337
insertLineage(TaxonomyTree::iterator & pos,stack<TaxNode * > & lineage)338 void TaxTreeData::insertLineage(TaxonomyTree::iterator& pos, stack<TaxNode*>& lineage)
339 {
340 TaxonomyTree::iterator cursor = pos;
341 while(!lineage.empty())
342 {
343 TaxNode* topNode = lineage.top();
344 cursor = append_child(cursor,*topNode);
345 lineage.pop();
346 delete topNode;
347 }
348 //the last node added is a leaf node.
349 //cache it to speed up the operation of selecting a sequence/leaf
350 m_rowToTaxNode.insert(RowToTaxNode::value_type(cursor->rowId, cursor));
351 //add multiple instances if the row is duplicated in several CDs
352 vector<RowSource> rss;
353 m_ac.GetRowSourceTable().findEntries(cursor->rowId, rss, true); //true = scoped only
354 if (rss.size() <= 1)
355 return;
356 for (unsigned int i = 0; i < rss.size(); i++)
357 {
358 if (m_ac.isCDInScope(rss[i].cd))
359 {
360 TaxNode* subSeqNode =
361 TaxNode::makeSubSeqLeaf(cursor->rowId, rss[i].cd, rss[i].rowInSrc);
362 append_child(cursor,*subSeqNode);
363 delete subSeqNode;
364 }
365 }
366 }
367
cacheRank(short rank,string rankName)368 void TaxTreeData::cacheRank(short rank, string rankName)
369 {
370 if (rank >= 0)
371 m_rankNameToId.insert(RankNameToId::value_type(rankName, rank));
372 }
373
getRankId(string rankName)374 short TaxTreeData::getRankId(string rankName)
375 {
376 RankNameToId::iterator mit = m_rankNameToId.find(rankName);
377 if (mit != m_rankNameToId.end())
378 return mit->second;
379 else return -1;
380 }
381
382 // get integer taxid for a sequence
GetTaxIDForSequence(const AlignmentCollection & aligns,int rowID)383 TTaxId TaxTreeData::GetTaxIDForSequence(const AlignmentCollection& aligns, int rowID)
384 {
385 TTaxId taxid = ZERO_TAX_ID;
386 std::string err = "no gi or source info";
387 // try to get "official" tax info from gi
388 TGi gi = ZERO_GI;
389 bool gotGI = aligns.GetGI(rowID, gi, false);
390 if (gotGI)
391 {
392 taxid = m_taxDataSource->GetTaxIDForGI(gi);
393 }
394 CRef< CSeq_entry > seqEntry;
395 if (aligns.GetSeqEntryForRow(rowID, seqEntry))
396 {
397 if (seqEntry->IsSeq())
398 {
399 TTaxId localTaxid = m_taxDataSource->GetTaxIDFromBioseq(seqEntry->GetSeq(), true);
400 if(localTaxid != taxid)
401 {
402 if (taxid != ZERO_TAX_ID) {
403 string taxName = m_taxDataSource->GetTaxNameForTaxID(taxid);
404 addTaxToBioseq(seqEntry->SetSeq(), taxid, taxName);
405 } else
406 taxid = localTaxid;
407 }
408
409 }
410 }
411 return taxid;
412 }
413
addTaxToBioseq(CBioseq & bioseq,TTaxId taxid,string & taxName)414 void TaxTreeData::addTaxToBioseq(CBioseq& bioseq, TTaxId taxid, string& taxName)
415 {
416 //bool hasSource = false;
417 //get BioSource if there is none in bioseq
418 CSeq_descr& seqDescr = bioseq.SetDescr();
419 if (seqDescr.IsSet())
420 {
421 list< CRef< CSeqdesc > >& descrList = seqDescr.Set();
422 list< CRef< CSeqdesc > >::iterator cit = descrList.begin();
423 //remove old orgRef
424 while (cit != descrList.end())
425 {
426 if ((*cit)->IsSource())
427 {
428 cit = descrList.erase(cit);
429 }
430 else
431 cit++;
432 }
433 //add new orgRef
434 //create a source seedsc and add it
435 CRef< CSeqdesc > source(new CSeqdesc);
436 COrg_ref& orgRef = source->SetSource().SetOrg();
437 orgRef.SetTaxId(taxid);
438 orgRef.SetTaxname(taxName);
439 descrList.push_back(source);
440 }
441 }
442
isEmpty() const443 bool TaxTreeData::isEmpty()const
444 {
445 return m_rowToTaxNode.size() == 0;
446 }
447
writeOutRanks()448 void TaxTreeData::writeOutRanks()
449 {
450 std::ofstream fout(".\\SeqTree\\ranks");
451 if (!fout.good())
452 return;
453 for (RankNameToId::iterator mit = m_rankNameToId.begin(); mit != m_rankNameToId.end(); mit++)
454 {
455 fout<<mit->first<<' '<<mit->second<<endl;
456 }
457 fout.close();
458 }
459
writeToFile(string fname) const460 bool TaxTreeData::writeToFile(string fname)const
461 {
462 ofstream os(fname.c_str());
463 if (os.good())
464 {
465 write(os, begin());
466 return true;
467 }
468 else
469 return false;
470
471 }
472
writeToFileAsTable(string fname) const473 bool TaxTreeData::writeToFileAsTable(string fname)const
474 {
475 ofstream os(fname.c_str());
476 if (!os.good())
477 return false;
478 writeAsTable(os, begin(), begin());
479 return true;
480 }
481
writeAsTable(std::ostream & os,const iterator & cursor,const iterator & branchingNode) const482 bool TaxTreeData::writeAsTable(std::ostream&os, const iterator& cursor, const iterator& branchingNode)const
483 {
484 if (!os.good())
485 return false;
486 if (number_of_children(cursor) > 1)//new branch
487 {
488 os <<cursor->taxId<<","<<branchingNode->taxId<<"\n";
489 sibling_iterator sib = cursor.begin();
490 while (sib != cursor.end())
491 {
492 writeAsTable(os,sib, cursor); //recursive
493 ++sib;
494 }
495 }
496 else if (number_of_children(cursor) == 1)// continue on the same branch
497 {
498 sibling_iterator onlychild = child(cursor,0);
499 writeAsTable(os, onlychild, branchingNode);
500 }
501 else //reach the end of a branch
502 {
503 os<<cursor->seqName<<","<<branchingNode->taxId<<"\n";
504 }
505 return true;
506 }
507
write(std::ostream & os,const iterator & cursor) const508 bool TaxTreeData::write(std::ostream&os, const iterator& cursor)const
509 {
510 if (!os.good())
511 return false;
512 // if leaf, print leaf and return
513 if (cursor->rowId >= 0)
514 {
515 os<<cursor->seqName;
516 }
517 else
518 {
519 // print (
520 if (number_of_children(cursor) > 1)
521 os<<'(';
522 // print each child
523 sibling_iterator sib = cursor.begin();
524 while (sib != cursor.end())
525 {
526 write(os,sib); //recursive
527 ++sib;
528 }
529 // print )
530 if (number_of_children(cursor) > 1)
531 {
532 os<<") ";
533 //os<<cursor->orgName;
534 }
535 }
536 if (cursor == begin())
537 os<<";";
538 // if this node has sibling, print ","
539 else if (number_of_siblings(cursor) > 1)
540 os<<',';
541 //return true;
542
543 return true;
544 }
545
getParentAtRank(int row,string rankName)546 TaxTreeIterator TaxTreeData::getParentAtRank(int row, string rankName)
547 {
548 short rank = getRankId(rankName);
549 if (rank < 0)
550 return begin();
551 TaxTreeIterator& leaf = m_rowToTaxNode[row];
552 if (leaf->rowId < 0)
553 return begin();
554 TaxTreeIterator parentNode = parent(leaf);
555 TaxTreeIterator oneBeforeParentNode = leaf;
556 while (parentNode != begin())
557 {
558 if (parentNode->rankId == rank)
559 return parentNode;
560 else
561 {
562 oneBeforeParentNode = parentNode;
563 parentNode = parent(parentNode);
564 }
565 }
566 //can't find the specified rank, return the one node beneath the root
567 return oneBeforeParentNode;
568 }
569
570 END_SCOPE(cd_utils)
571 END_NCBI_SCOPE
572