1 /* $Id: cuTaxTree.cpp 609836 2020-06-08 15:56:03Z grichenk $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Charlie Liu
27  *
28  * File Description:
29  *   part of CDTree app
30  */
31 #include <ncbi_pch.hpp>
32 #include <algo/structure/cd_utils/cuTaxTree.hpp>
33 #include <algo/structure/cd_utils/cuTaxClient.hpp>
34 #include <fstream>
35 #include <stdio.h>
36 
37 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(cd_utils)38 BEGIN_SCOPE(cd_utils)
39 
40 	//TaxNode
41 TaxNode::TaxNode()
42 {
43 	init();
44 }
45 
TaxNode(const TaxNode & rhs)46 TaxNode::TaxNode(const TaxNode& rhs): taxId(rhs.taxId),
47 	orgName(rhs.orgName), rankId(rhs.rankId), rowId(rhs.rowId), cd(rhs.cd), seqName(rhs.seqName),
48 	numLeaves(rhs.numLeaves),selectedLeaves(rhs.selectedLeaves)
49 {
50 }
51 
init()52 void TaxNode::init()
53 {
54 	taxId= INVALID_TAX_ID;
55 	rankId =-1;
56 
57 	rowId = -1;
58 	cd = 0;
59 
60 	numLeaves = 0;
61 	selectedLeaves = 0;
62 }
63 
makeTaxNode(TTaxId taxID,std::string taxName,short rankId)64 TaxNode* TaxNode::makeTaxNode(TTaxId taxID, std::string taxName, short rankId)
65 {
66 	TaxNode*  node = new TaxNode();
67 	node->taxId = taxID;
68 	node->rankId = rankId;
69 	node->orgName = taxName;
70 	return node;
71 }
72 
makeSeqLeaf(int rowID,std::string sequenceName)73 TaxNode* TaxNode::makeSeqLeaf(int rowID, std::string sequenceName)
74 {
75 	TaxNode* node = new TaxNode();
76 	node->rowId = rowID;
77 	node->seqName = sequenceName;
78 	node->numLeaves = 1;
79 	return node;
80 }
81 
makeSubSeqLeaf(int rowID,CCdCore * cd,int rowInCd)82 TaxNode* TaxNode::makeSubSeqLeaf(int rowID, CCdCore* cd, int rowInCd)
83 {
84 	TaxNode* node = new TaxNode();
85 	node->rowId = rowID;
86 	node->cd = cd;
87 	char name[500];
88 	sprintf(name,"row_%d_of_%s", rowInCd, cd->GetAccession().c_str());
89 	node->seqName = name;
90 	node->numLeaves = 1;
91 	return node;
92 }
93 
isSeqLeaf(const TaxNode & node)94 bool TaxNode::isSeqLeaf(const TaxNode& node)
95 {
96 	return node.rowId >= 0;
97 }
98 
isSubSeqLeaf(const TaxNode & node)99 bool TaxNode::isSubSeqLeaf(const TaxNode& node)
100 {
101 	return node.cd != NULL;
102 }
103 
TaxTreeData(const AlignmentCollection & ac)104 TaxTreeData :: TaxTreeData(const AlignmentCollection& ac)
105 				: TaxonomyTree(), m_ac(ac),
106 				m_rowToTaxNode(), m_rankNameToId(), m_failedRows()
107 {
108 	m_taxDataSource = new TaxClient();
109 	if (m_taxDataSource->init())
110 		makeTaxonomyTree();
111 }
112 
113 
~TaxTreeData()114 TaxTreeData :: ~TaxTreeData()
115 {
116 	delete m_taxDataSource;
117 }
118 
119 //select all seqs under this tax node
selectTaxNode(TaxTreeIterator & taxNode,bool select)120 void TaxTreeData::selectTaxNode(TaxTreeIterator& taxNode, bool select)
121 {
122 	//deselectAllTaxNodes();
123 	vector<TaxTreeIterator> nodes;
124 	getAllLeafNodes(taxNode, nodes);
125 	for (unsigned int i = 0; i < nodes.size(); i++)
126 	{
127 		selectTaxTreeLeaf(nodes[i], select);
128 	}
129 }
130 
getAllLeafNodes(const TaxTreeIterator & taxNode,vector<TaxTreeIterator> & nodes) const131 int TaxTreeData::getAllLeafNodes(const TaxTreeIterator& taxNode, vector<TaxTreeIterator>& nodes) const
132 {
133 	if (TaxNode::isSeqLeaf(*taxNode))
134 	{
135 		nodes.push_back(taxNode);
136 		return nodes.size();
137 	}
138 	TaxonomyTree::sibling_iterator sib = taxNode.begin();
139 	while (sib != taxNode.end())
140 	{
141 		getAllLeafNodes(sib, nodes);  //recursive
142 		++sib;
143 	}
144 	return nodes.size();
145 }
146 
setSelections(const vector<int> & rowIDs,CCdCore * cd)147 void TaxTreeData::setSelections(const vector<int>& rowIDs, CCdCore* cd)
148 {
149 	for (unsigned int i = 0; i < rowIDs.size(); i ++)
150 	{
151 		RowToTaxNode::iterator mapIt = m_rowToTaxNode.find(rowIDs[i]);
152 		if (mapIt != m_rowToTaxNode.end())
153 		{
154 			TaxonomyTree::iterator taxIt = mapIt->second;
155 			selectTaxTreeLeaf(taxIt, true, cd);
156 		}
157 	}
158 }
159 
160 
getSelections(vector<int> & rows)161 int TaxTreeData::getSelections(vector<int>& rows)
162 {
163 	for (RowToTaxNode::iterator i = m_rowToTaxNode.begin();
164 		i != m_rowToTaxNode.end(); i++)
165 	{
166 		TaxonomyTree::iterator taxIt = i->second;
167 		if (taxIt->selectedLeaves > 0)
168 			rows.push_back(taxIt->rowId);
169 	}
170 	return rows.size();
171 }
172 
173 
selectTaxTreeLeaf(const TaxonomyTree::iterator & cursor,bool select,CCdCore * cd)174 void TaxTreeData::selectTaxTreeLeaf(const TaxonomyTree::iterator& cursor, bool select, CCdCore* cd)
175 {
176 	if (TaxNode::isSeqLeaf(*cursor)) //make sure cursor points to a leaf
177 	{
178 		if (cursor.number_of_children() > 0) //multiple instances for the seqLoc
179 		{
180 			TaxonomyTree::sibling_iterator sib = cursor.begin();
181 			while (sib != cursor.end())
182 			{
183 				selectTaxTreeLeaf(sib, select, cd);
184 				++sib;
185 			}
186 		}
187 		else if (TaxNode::isSubSeqLeaf(*cursor))
188 		{
189 			if (cd == 0)
190 			{
191 				if (select)
192 					cursor->selectedLeaves = 1;
193 				else
194 					cursor->selectedLeaves = 0;
195 			}
196 			else if (cursor->cd == cd)
197 			{
198 				if (select)
199 					cursor->selectedLeaves = 1;
200 				else
201 					cursor->selectedLeaves = 0;
202 			}
203 		}
204 		else //seq node with no children
205 		{
206 			if (select)
207 				cursor->selectedLeaves = 1;
208 			else
209 				cursor->selectedLeaves = 0;
210 		}
211 	}
212 }
213 
clearSelection()214 void TaxTreeData::clearSelection()
215 {
216 	deselectAllTaxNodes();
217 	fillLeafCount(begin());
218 }
219 
deselectAllTaxNodes()220 void TaxTreeData::deselectAllTaxNodes()
221 {
222 	for (RowToTaxNode::iterator i = m_rowToTaxNode.begin();
223 		i != m_rowToTaxNode.end(); i++)
224 	{
225 		TaxonomyTree::iterator taxIt = i->second;
226 		selectTaxTreeLeaf(taxIt, false);
227 	}
228 }
229 
230 /*
231 void TaxTreeData::deselectTaxNodes(CCd* cd)
232 {
233 	for (RowToTaxNode::iterator i = m_rowToTaxNode.begin();
234 		i != m_rowToTaxNode.end(); i++)
235 	{
236 		TaxonomyTree::iterator taxIt = i->second;
237 		if (taxIt->cd == cd)
238 			taxIt->selectedLeaves = 0;
239 	}
240 }*/
241 
makeTaxonomyTree()242 bool TaxTreeData::makeTaxonomyTree()
243 {
244 	//connect
245 	if (!m_taxDataSource->IsAlive())
246 		return false;
247 	//add root
248 	TaxNode* root = TaxNode::makeTaxNode(TAX_ID_CONST(1),"Root");
249 	insert(begin(), *root);
250 	delete root;
251 	//retrieve tax lineage for each sequence and add it to the tree
252 	addRows(m_ac);
253 	fillLeafCount(begin());
254 	return true;
255 }
256 
addRows(const AlignmentCollection & ac)257 void TaxTreeData::addRows(const AlignmentCollection& ac)
258 {
259 	int num = ac.GetNumRows();
260 	m_failedRows.clear();
261 	for (int i = 0; i < num; i++)
262 	{
263 		std::string seqName;
264 		ac.Get_GI_or_PDB_String_FromAlignment(i, seqName);
265 		TTaxId taxid = GetTaxIDForSequence(ac, i);
266 		if (taxid <= ZERO_TAX_ID)
267 			m_failedRows.push_back(i);
268 		else
269 			addSeqTax(i, seqName, taxid);
270 	}
271 }
272 
fillLeafCount(const TaxTreeIterator & cursor)273 void TaxTreeData::fillLeafCount(const TaxTreeIterator& cursor)
274 {
275 	//reset my count if I am not a leaf
276 	if (cursor.number_of_children() > 0)
277 	{
278 		cursor->numLeaves = 0;
279 		cursor->selectedLeaves = 0;
280 	}
281 	// fill each child's count
282 	TaxonomyTree::sibling_iterator sib = cursor.begin();
283 	while (sib != cursor.end())
284 	{
285 		fillLeafCount(sib);  //recursive
286 		++sib;
287 	}
288 	//add my count to parent
289 	if (cursor != begin())
290 	{
291 		TaxonomyTree::iterator parentIt = parent(cursor);
292 		parentIt->numLeaves += cursor->numLeaves;
293 		parentIt->selectedLeaves += cursor->selectedLeaves;
294 	}
295 }
296 
addSeqTax(int rowID,string seqName,TTaxId taxid)297 void TaxTreeData::addSeqTax(int rowID, string seqName, TTaxId taxid)
298 {
299 	stack<TaxNode*> lineage;
300 	if (taxid <= ZERO_TAX_ID) //invalid tax node, ignore this sequence
301 		return;
302 	TaxNode *seq = TaxNode::makeSeqLeaf(rowID, seqName);
303 	string rankName;
304 	short rank= m_taxDataSource->GetRankID(taxid, rankName);
305 	TaxNode *tax = TaxNode::makeTaxNode(taxid, m_taxDataSource->GetTaxNameForTaxID(taxid), rank);
306 	cacheRank(rank, rankName);
307 	lineage.push(seq);
308 	lineage.push(tax);
309 	growAndInsertLineage(lineage);
310 }
311 
growAndInsertLineage(stack<TaxNode * > & lineage)312 void TaxTreeData::growAndInsertLineage(stack<TaxNode*>& lineage)
313 {
314 	TaxNode* top = lineage.top();
315 	TaxonomyTree::iterator pos = begin();
316 	while (pos != end()  &&  !(*pos == *top) ) {
317 		++pos;
318 	}
319 	if (pos != end())  //top is alreaddy in the tree
320 	{
321 		lineage.pop();
322 		delete top;
323 		insertLineage(pos, lineage);
324 		return; //done
325 	}
326 	else //top is not in the tree, keep growing lineage
327 	{
328         TTaxId parentId = m_taxDataSource->GetParentTaxID(top->taxId);
329 		string rankName;
330 		short rank= m_taxDataSource->GetRankID(parentId, rankName);
331 		cacheRank(rank, rankName);
332 		TaxNode* parentTax = TaxNode::makeTaxNode(parentId, m_taxDataSource->GetTaxNameForTaxID(parentId), rank);
333 		lineage.push(parentTax);
334 		growAndInsertLineage(lineage);
335 	}
336 }
337 
insertLineage(TaxonomyTree::iterator & pos,stack<TaxNode * > & lineage)338 void TaxTreeData::insertLineage(TaxonomyTree::iterator& pos, stack<TaxNode*>& lineage)
339 {
340 	TaxonomyTree::iterator cursor = pos;
341 	while(!lineage.empty())
342 	{
343 		TaxNode* topNode = lineage.top();
344 		cursor = append_child(cursor,*topNode);
345 		lineage.pop();
346 		delete topNode;
347 	}
348 	//the last node added is a leaf node.
349 	//cache it to speed up the operation of selecting a sequence/leaf
350 	m_rowToTaxNode.insert(RowToTaxNode::value_type(cursor->rowId, cursor));
351 	//add multiple instances if the row is duplicated in several CDs
352 	vector<RowSource> rss;
353 	m_ac.GetRowSourceTable().findEntries(cursor->rowId, rss, true); //true = scoped only
354 	if (rss.size() <= 1)
355 		return;
356 	for (unsigned int i = 0; i < rss.size(); i++)
357 	{
358 		if (m_ac.isCDInScope(rss[i].cd))
359 		{
360 			TaxNode* subSeqNode =
361 			TaxNode::makeSubSeqLeaf(cursor->rowId, rss[i].cd, rss[i].rowInSrc);
362 			append_child(cursor,*subSeqNode);
363 			delete subSeqNode;
364 		}
365 	}
366 }
367 
cacheRank(short rank,string rankName)368 void TaxTreeData::cacheRank(short rank, string rankName)
369 {
370 	if (rank >= 0)
371 		m_rankNameToId.insert(RankNameToId::value_type(rankName, rank));
372 }
373 
getRankId(string rankName)374 short TaxTreeData::getRankId(string rankName)
375 {
376 	RankNameToId::iterator mit = m_rankNameToId.find(rankName);
377 	if (mit != m_rankNameToId.end())
378 		return mit->second;
379 	else return -1;
380 }
381 
382  // get integer taxid for a sequence
GetTaxIDForSequence(const AlignmentCollection & aligns,int rowID)383 TTaxId TaxTreeData::GetTaxIDForSequence(const AlignmentCollection& aligns, int rowID)
384 {
385     TTaxId taxid = ZERO_TAX_ID;
386     std::string err = "no gi or source info";
387 	// try to get "official" tax info from gi
388 	TGi gi = ZERO_GI;
389 	bool gotGI = aligns.GetGI(rowID, gi, false);
390 	if (gotGI)
391 	{
392 		taxid = m_taxDataSource->GetTaxIDForGI(gi);
393 	}
394 	CRef< CSeq_entry > seqEntry;
395 	if (aligns.GetSeqEntryForRow(rowID, seqEntry))
396 	{
397 		if (seqEntry->IsSeq())
398 		{
399             TTaxId localTaxid = m_taxDataSource->GetTaxIDFromBioseq(seqEntry->GetSeq(), true);
400 			if(localTaxid != taxid)
401 			{
402 				if (taxid != ZERO_TAX_ID) {
403 				    string taxName = m_taxDataSource->GetTaxNameForTaxID(taxid);
404 				    addTaxToBioseq(seqEntry->SetSeq(), taxid, taxName);
405 				} else
406 				  taxid = localTaxid;
407 			}
408 
409 		}
410 	}
411 	return taxid;
412 }
413 
addTaxToBioseq(CBioseq & bioseq,TTaxId taxid,string & taxName)414 void TaxTreeData::addTaxToBioseq(CBioseq& bioseq, TTaxId taxid, string& taxName)
415 {
416 	//bool hasSource = false;
417 	//get BioSource if there is none in bioseq
418 	CSeq_descr& seqDescr = bioseq.SetDescr();
419 	if (seqDescr.IsSet())
420 	{
421 		list< CRef< CSeqdesc > >& descrList = seqDescr.Set();
422 		list< CRef< CSeqdesc > >::iterator cit = descrList.begin();
423 		//remove old orgRef
424 		while (cit != descrList.end())
425 		{
426 			if ((*cit)->IsSource())
427 			{
428 				cit = descrList.erase(cit);
429 			}
430 			else
431 				cit++;
432 		}
433 		//add new orgRef
434 		//create a source seedsc and add it
435 		CRef< CSeqdesc > source(new CSeqdesc);
436 		COrg_ref& orgRef = source->SetSource().SetOrg();
437 		orgRef.SetTaxId(taxid);
438 		orgRef.SetTaxname(taxName);
439 		descrList.push_back(source);
440 	}
441 }
442 
isEmpty() const443 bool TaxTreeData::isEmpty()const
444 {
445 	return m_rowToTaxNode.size() == 0;
446 }
447 
writeOutRanks()448 void TaxTreeData::writeOutRanks()
449 {
450 	std::ofstream fout(".\\SeqTree\\ranks");
451 	if (!fout.good())
452 		return;
453 	for (RankNameToId::iterator mit = m_rankNameToId.begin(); mit != m_rankNameToId.end(); mit++)
454 	{
455 		fout<<mit->first<<' '<<mit->second<<endl;
456 	}
457 	fout.close();
458 }
459 
writeToFile(string fname) const460 bool TaxTreeData::writeToFile(string fname)const
461 {
462 	ofstream os(fname.c_str());
463 	if (os.good())
464 	{
465 		write(os, begin());
466 		return true;
467 	}
468 	else
469 		return false;
470 
471 }
472 
writeToFileAsTable(string fname) const473 bool TaxTreeData::writeToFileAsTable(string fname)const
474 {
475 	ofstream os(fname.c_str());
476 	if (!os.good())
477 		return false;
478 	writeAsTable(os, begin(), begin());
479 	return true;
480 }
481 
writeAsTable(std::ostream & os,const iterator & cursor,const iterator & branchingNode) const482 bool TaxTreeData::writeAsTable(std::ostream&os, const iterator& cursor, const iterator& branchingNode)const
483 {
484 	if (!os.good())
485 		return false;
486 	if (number_of_children(cursor) > 1)//new branch
487 	{
488 		os <<cursor->taxId<<","<<branchingNode->taxId<<"\n";
489 		sibling_iterator sib = cursor.begin();
490 		while (sib != cursor.end())
491 		{
492 			writeAsTable(os,sib, cursor);  //recursive
493 			++sib;
494 		}
495 	}
496 	else if (number_of_children(cursor) == 1)// continue on the same branch
497 	{
498 		sibling_iterator onlychild = child(cursor,0);
499 		writeAsTable(os, onlychild, branchingNode);
500 	}
501 	else //reach the end of a branch
502 	{
503 		os<<cursor->seqName<<","<<branchingNode->taxId<<"\n";
504 	}
505 	return true;
506 }
507 
write(std::ostream & os,const iterator & cursor) const508 bool TaxTreeData::write(std::ostream&os, const iterator& cursor)const
509 {
510 	if (!os.good())
511 		return false;
512 	// if leaf, print leaf and return
513 	if (cursor->rowId >= 0)
514 	{
515 		os<<cursor->seqName;
516 	}
517 	else
518 	{
519 		// print (
520 		if (number_of_children(cursor) > 1)
521 			os<<'(';
522 		// print each child
523 		sibling_iterator sib = cursor.begin();
524 		while (sib != cursor.end())
525 		{
526 			write(os,sib);  //recursive
527 			++sib;
528 		}
529 		// print )
530 		if (number_of_children(cursor) > 1)
531 		{
532 			os<<") ";
533 			//os<<cursor->orgName;
534 		}
535 	}
536 	if (cursor == begin())
537 		os<<";";
538 	// if this node has sibling, print ","
539 	else if (number_of_siblings(cursor) > 1)
540 		os<<',';
541 	//return true;
542 
543 	return true;
544 }
545 
getParentAtRank(int row,string rankName)546 TaxTreeIterator TaxTreeData::getParentAtRank(int row, string rankName)
547 {
548 	short rank = getRankId(rankName);
549 	if (rank < 0)
550 		return begin();
551 	TaxTreeIterator& leaf = m_rowToTaxNode[row];
552 	if (leaf->rowId < 0)
553 		return begin();
554 	TaxTreeIterator parentNode = parent(leaf);
555 	TaxTreeIterator oneBeforeParentNode = leaf;
556 	while (parentNode != begin())
557 	{
558 		if (parentNode->rankId == rank)
559 			return parentNode;
560 		else
561 		{
562 			oneBeforeParentNode = parentNode;
563 			parentNode = parent(parentNode);
564 		}
565 	}
566 	//can't find the specified rank, return the one node beneath the root
567 	return oneBeforeParentNode;
568 }
569 
570 END_SCOPE(cd_utils)
571 END_NCBI_SCOPE
572