/* $Id: cuTaxTree.hpp 607148 2020-04-30 13:05:44Z grichenk $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  Charlie Liu
 *
 * File Description: Retrieve and create a Taxonomy tree for several CDs.
 *   part of CDTree app
 */
31 
32 #ifndef CU_TAXTREE_HPP
33 #define CU_TAXTREE_HPP
34 #include <algo/structure/cd_utils/cuCppNCBI.hpp>
35 #include <algo/structure/cd_utils/tree_msvc7.hpp>
36 #include <algo/structure/cd_utils/cuTaxClient.hpp>
37 #include <algo/structure/cd_utils/cuAlignmentCollection.hpp>
38 #include <list>
39 #include <stack>
40 #include <algorithm>
41 
42 BEGIN_NCBI_SCOPE
43 USING_SCOPE(objects);
44 BEGIN_SCOPE(cd_utils)
45 
46 class NCBI_CDUTILS_EXPORT TaxNode
47 {
48 public:
49 	//fields for internal tax nodes
50     TTaxId taxId;
51 	std::string orgName;
52 	short rankId;
53 	//fields for external seq nodes
54 	int rowId;
55 	CCdCore* cd;
56 	std::string seqName;
57 	//total and selected leaf counts
58 	int numLeaves;
59 	int selectedLeaves;
60 
61 	//methods
62 	TaxNode();
63 	TaxNode(const TaxNode& rhs);
operator ==(const TaxNode & rhs)64 	bool operator==(const TaxNode& rhs) {return taxId == rhs.taxId;};
65 	static bool isSeqLeaf(const TaxNode& node);
66 	static bool isSubSeqLeaf(const TaxNode& node);
67 	static TaxNode* makeTaxNode(TTaxId taxID, std::string taxName, short rankId=-1);
68 	static TaxNode* makeSeqLeaf(int rowID, std::string sequenceName);
69 	static TaxNode* makeSubSeqLeaf(int rowID, CCdCore* cd, int rowInCd);
70 
71 private:
72 	void init();
73 };
74 
75 typedef tree<TaxNode> TaxonomyTree;
76 typedef TaxonomyTree::iterator TaxTreeIterator;
77 //typedef list<CCd*> CDList;
78 
/*  taxonomy ranking -- 8 levels in total
 Superkingdom: Eukaryota
         Kingdom: Metazoa
           Phylum: Chordata
             Class: Mammalia
               Order: Primates
                 Family: Hominidae
                   Genus: Homo
                     Species: sapiens
*/
89 
90 // define a family hiearchy of CDs
91 class NCBI_CDUTILS_EXPORT TaxTreeData : public TaxonomyTree
92 {
93 public:
94 	TaxTreeData(const AlignmentCollection& ac);
95 
getFailedRows()96 	const vector<int>& getFailedRows() { return m_failedRows;}
97 	void selectTaxNode(TaxTreeIterator& taxNode, bool select);
98 	void setSelections(const vector<int>& rowIDs, CCdCore* cd=0);
99 	int getSelections(vector<int>& rows);
100 	void clearSelection();
101 	void deselectAllTaxNodes();
102 	void fillLeafCount(const TaxTreeIterator& cursor);
103 	//bool isPreferredTaxNode(const TaxTreeIterator& taxNode);
104 	int getAllLeafNodes(const TaxTreeIterator& taxNode, vector<TaxTreeIterator>& nodes) const;
105 	short getRankId(string rankName);
106 	TaxTreeIterator getParentAtRank(int row, string rankName);
107 	bool isEmpty()const;
108 	//bool missLocalTaxFiles()const {return m_missLocalTaxFiles;}
109 	~TaxTreeData();
110 
111 	void addTaxToBioseq(CBioseq& bioseq, TTaxId taxid, string& taxName);
112 	bool writeToFile(string fname)const;
113 	bool writeToFileAsTable(string fname)const;
114 	bool write(std::ostream&os, const iterator& cursor)const;
115 	bool writeAsTable(std::ostream&os, const iterator& cursor, const iterator& branchingNode)const;
116 private:
117 	const AlignmentCollection& m_ac;
118 	typedef map<int, TaxonomyTree::iterator> RowToTaxNode;
119 	RowToTaxNode m_rowToTaxNode;
120 	typedef map<string, short> RankNameToId;
121 	RankNameToId m_rankNameToId;
122 	 // wrapper of taxonomy server class
123     TaxClient* m_taxDataSource;
124 	vector<int> m_failedRows;
125 
126 	bool makeTaxonomyTree();
127 	void addRows(const AlignmentCollection& ac);
128     // get integer taxid for a sequence
129     TTaxId GetTaxIDForSequence(const AlignmentCollection& aligns, int rowID);
130     // get info for taxid
131 	void selectTaxTreeLeaf(const TaxTreeIterator& cursor, bool select, CCdCore* cd=0);
132 	void addSeqTax(int rowID, string seqName, TTaxId taxid);
133 	void growAndInsertLineage(stack<TaxNode*>& lineage);
134 	void insertLineage(TaxTreeIterator& pos, stack<TaxNode*>& lineage);
135 	void cacheRank(short rank, string rankName);
136 	void writeOutRanks();
137 };
138 
139 END_SCOPE(cd_utils)
140 END_NCBI_SCOPE
141 #endif
142 
143