1 #ifndef PHYTREE_CALC__HPP
2 #define PHYTREE_CALC__HPP
3 /*  $Id: phytree_calc.hpp 498568 2016-04-18 01:34:13Z whlavina $
4  * ===========================================================================
5  *
6  *                            PUBLIC DOMAIN NOTICE
7  *               National Center for Biotechnology Information
8  *
9  *  This software/database is a "United States Government Work" under the
10  *  terms of the United States Copyright Act.  It was written as part of
11  *  the author's official duties as a United States Government employee and
12  *  thus cannot be copyrighted.  This software/database is freely available
13  *  to the public for use. The National Library of Medicine and the U.S.
14  *  Government have not placed any restriction on its use or reproduction.
15  *
16  *  Although all reasonable efforts have been taken to ensure the accuracy
17  *  and reliability of the software and data, the NLM and the U.S.
18  *  Government do not and cannot warrant the performance or results that
19  *  may be obtained by using this software or data. The NLM and the U.S.
20  *  Government disclaim all warranties, express or implied, including
21  *  warranties of performance, merchantability or fitness for any particular
22  *  purpose.
23  *
24  *  Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Author:  Irena Zaretskaya, Greg Boratyn
29  *
30  */
31 
32 #include <corelib/ncbiobj.hpp>
33 
34 #include <algo/phy_tree/bio_tree.hpp>
35 #include <algo/phy_tree/bio_tree_conv.hpp>
36 #include <algo/phy_tree/dist_methods.hpp>
37 
38 #include <objects/biotree/BioTreeContainer.hpp>
39 #include <objtools/alnmgr/alnmix.hpp>
40 
41 #include <util/range.hpp>
42 
43 /// Class used in unit tests
44 class CTestPhyTreeCalc;
45 
46 BEGIN_NCBI_SCOPE
47 USING_SCOPE(objects);
48 
49 
50 /// Computaion of distance-based phylognetic tree
51 class NCBI_XALGOPHYTREE_EXPORT CPhyTreeCalc : public CObject {
52 
53 public:
54 
55     /// Methods for computing evolutionary distance
56     ///
57     enum EDistMethod {
58         eJukesCantor, ePoisson, eKimura, eGrishin, eGrishinGeneral
59     };
60 
61 
62     /// Algorithms for phylogenetic tree reconstruction
63     ///
64     enum ETreeMethod {
65         eNJ,      ///< Neighbor Joining
66         eFastME,  ///< Fast Minumum Evolution
67     };
68 
69     typedef COpenRange<TSeqPos> TRange;
70     typedef vector< vector<TRange> > TSegInfo;
71 
72     /// Distance matrix (square, symmetric with zeros on diagnol)
73     class NCBI_XALGOPHYTREE_EXPORT CDistMatrix
74     {
75     public:
76 
77         /// Constructor
78         CDistMatrix(int num_elements = 0);
79 
80         /// Get number of rows/columns
81         /// @return Number of rows/columns
GetNumElements(void) const82         int GetNumElements(void) const {return m_NumElements;}
83 
84         /// Is matrix size zero
85         /// @return true if the matrx has zero elements and false otherwise
Empty(void) const86         bool Empty(void) const {return GetNumElements() <= 0;}
87 
88         /// Resize matrix
89         /// @param num_elements New number of rows/columns [in]
90         void Resize(int num_elements);
91 
92         /// Get distance value
93         /// @param i Row index [in]
94         /// @param j Column index [in]
95         /// @return Distance between i-th and j-th element
96         const double& operator()(int i, int j) const;
97 
98         /// Get distance value
99         /// @param i Row index [in]
100         /// @param j Column index [in]
101         /// @return Distance between i-th and j-th element
102         double& operator()(int i, int j);
103 
104     private:
105         /// Number of rows/columns
106         int m_NumElements;
107 
108         /// Storage for distance values
109         vector<double> m_Distances;
110 
111         /// Storage for matrix diagnol value
112         const double m_Diagnol;
113     };
114 
115 public:
116 
117     //--- Constructors ---
118 
119     /// Create CPhyTreeCalc from Seq-align
120     /// @param seq_aln Seq-align [in]
121     /// @param scope Scope [in]
122     ///
123     CPhyTreeCalc(const CSeq_align& seq_aln, CRef<CScope> scope);
124 
125     /// Create CPhyTreeCalc from CSeq_align_set
126     /// @param seq_align_set SeqAlignSet [in]
127     /// @param scope Scope [in]
128     ///
129     CPhyTreeCalc(const CSeq_align_set& seq_align_set, CRef<CScope> scope);
130 
~CPhyTreeCalc()131     ~CPhyTreeCalc() {delete m_Tree;}
132 
133 
134     //--- Setters ---
135 
136     /// Set maximum allowed divergence between sequences included in tree
137     //  reconstruction
138     /// @param div [in]
139     ///
SetMaxDivergence(double div)140     void SetMaxDivergence(double div) {m_MaxDivergence = div;}
141 
142 
143     /// Set evolutionary correction method for computing distance between
144     /// sequences
145     /// @param method Distance method [in]
146     ///
SetDistMethod(EDistMethod method)147     void SetDistMethod(EDistMethod method) {m_DistMethod = method;}
148 
149 
150     /// Set algorithm for phylogenetic tree computation
151     /// @param method Tree compuutation method [in]
152     ///
SetTreeMethod(ETreeMethod method)153     void SetTreeMethod(ETreeMethod method) {m_TreeMethod = method;}
154 
155 
156     /// Set labels for tree leaves
157     /// @return Labels
158     ///
SetLabels(void)159     vector<string>& SetLabels(void) {return m_Labels;}
160 
161     /// Set query sequence by index in alignment. Query sequence is always
162     /// included in the tree.
163     /// @param index Index of the query sequence in the alignment [in]
164     ///
SetQuery(int index)165     void SetQuery(int index) {m_QueryIdx = index;}
166 
167     /// Set query sequence by sequence id. Query sequence is always included
168     /// in the tree. Exception is thrown if sequence is not found in the
169     /// input alignment.
170     /// @param seqid Sequence id for the query sequence
171     ///
172     void SetQuery(const CSeq_id& seqid);
173 
174     /// Calculate summarized segment positions in mutliple alignment
175     /// @param s If true, calculate segment positions [in]
176     ///
SetCalcAlnSegInfo(bool s)177     void SetCalcAlnSegInfo(bool s) {m_CalcSegInfo = s;}
178 
179     //--- Getters ---
180 
181     /// Get computed tree
182     /// @return Tree
183     ///
GetTree(void)184     TPhyTreeNode* GetTree(void) {return m_Tree;}
185 
186     /// Get computed tree
187     /// @return Tree
188     ///
GetTree(void) const189     const TPhyTreeNode* GetTree(void) const {return m_Tree;}
190 
191     /// Get serial tree
192     /// @return Tree
193     ///
194     CRef<CBioTreeContainer> GetSerialTree(void) const;
195 
196     /// Get seq_align that corresponds to current tree
197     /// @return Seq_align
198     ///
199     CRef<CSeq_align> GetSeqAlign(void) const;
200 
201     /// Get seq-ids of sequences used in tree construction
202     /// @return Seq-ids
203     ///
204     const vector< CRef<CSeq_id> >& GetSeqIds(void) const;
205 
206     /// Get scope
207     /// @return Scope
208     ///
GetScope(void)209     CRef<CScope> GetScope(void) {return m_Scope;}
210 
211     /// Get divergence matrix
212     /// @return Divergence matrix
GetDivergenceMatrix(void) const213     const CDistMatrix& GetDivergenceMatrix(void) const
214     {return m_DivergenceMatrix;}
215 
216 
217     /// Get distance matrix
218     /// @return Distance matrix
GetDistanceMatrix(void) const219     const CDistMatrix& GetDistanceMatrix(void) const
220     {return m_DistMatrix;}
221 
222 
223     /// Get maximum allowed diveregence between sequences included in tree
224     /// reconstruction
225     /// @return Max divergence
226     ///
GetMaxDivergence(void) const227     double GetMaxDivergence(void) const {return m_MaxDivergence;}
228 
229 
230     /// Get evolutionary correction method for computing distance between
231     /// sequences
232     /// @return Distance method
233     ///
GetDistMethod(void) const234     EDistMethod GetDistMethod(void) const {return m_DistMethod;}
235 
236 
237     /// Get ids of sequences excluded from tree computation
238     /// @return Ids of excluded sequences
239     ///
GetRemovedSeqIds(void) const240     const vector<string>& GetRemovedSeqIds(void) const {return m_RemovedSeqIds;}
241 
242     /// Get indeces of sequences excluded from tree computation
243     /// @return Indeces of excluded sequences
244     ///
GetRemovedSeqIndeces(void) const245     const vector<int>& GetRemovedSeqIndeces(void) const
246     {return m_RemovedSeqIndeces;}
247 
248     /// Get error/warning messages
249     /// @return List of messages
250     ///
GetMessages(void) const251     const vector<string>& GetMessages(void) const {return m_Messages;}
252 
253     /// Get simplified segment positions in multiple or query anchored
254     /// alignment
255     /// @return Vector of vector of ranges for segments for each sequence
256     ///
GetAlnSegInfo(void) const257     const TSegInfo& GetAlnSegInfo(void) const {return m_SegInfo;}
258 
259     /// Check whether there are any messages
260     /// @return True if there are messages, false otherwise
261     ///
IsMessage(void) const262     bool IsMessage(void) const {return m_Messages.size() > 0;}
263 
264 
265     //--- Tree computation ---
266 
267     /// Compute bio tree for the current alignment in a black box manner
268     /// @return True is computation successful, false otherwise
269     ///
270     bool CalcBioTree(void);
271 
272 
273 protected:
274 
275     /// Initialize class attributes
276     void x_Init(void);
277 
278     /// Initialize alignment data source
279     /// @param seq_aln Seq-align [in]
280     /// @return True if success, false otherwise
281     void x_InitAlignDS(const CSeq_align& seq_aln);
282 
283     /// Initialize alignment data source
284     /// @param seq_align_set CSeq_align_set [in]
285     /// @return True if success, false otherwise
286     bool x_InitAlignDS(const CSeq_align_set& seq_align_set);
287 
288     /// Compute divergence matrix and find sequences to exclude from tree
289     /// reconstruction
290     ///
291     /// Divergence between all pairs of sequences used for phylogenetic tree
292     /// reconstruction must be smaller than a cutoff (m_Divergence). Hence
293     /// some sequences may be excluded from tree computation. The first
294     /// sequence in alignmnet is considered query/master. All sequences similar
295     /// enough to the query and each other are included in tree computation.
296     ///
297     /// @param used_indices Vector of sequence indices included in phylogenetic
298     /// tree computation [out]
299     /// @return True if tree can be computed (ie at least two sequences are
300     /// included for tree computation), false otherwise
301     ///
302     bool x_CalcDivergenceMatrix(vector<int>& used_indices);
303 
304     /// Calculate the alignment segemnts summary
305     /// @param aln Alignment as text [in]
306     /// @param seg_info Segment start and stop positions in the alignment [out]
307     ///
308     void x_CalcAlnSegInfo(const vector<string>& aln, TSegInfo& seg_info);
309 
310     /// Compute distance as evolutionary corrected dissimilarity
311     void x_CalcDistMatrix(void);
312 
313     /// Create alignment only for sequences included in tree computation
314     /// @param included_indices Indices of included sequences [in]
315     ///
316     void x_CreateValidAlign(const vector<int>& used_indices);
317 
318     /// Compute phylogenetic tree
319     /// @param correct Whether negative tree egde lengths should be set to zero
320     /// [in]
321     ///
322     void x_ComputeTree(bool correct = true);
323 
324     /// Change non-finite and negative tree branch lengths to zeros
325     /// @param node Tree root [in]
326     ///
327     void x_CorrectBranchLengths(TPhyTreeNode* node);
328 
329 
330 protected:
331 
332     /// Structure for storing divergences between sequences as list of links
333     struct SLink {
334         int index1;       //< index of sequence 1
335         int index2;       //< index of sequence 2
336         double distance;  //< distance between sequences
337 
338         /// Constructor
SLinkCPhyTreeCalc::SLink339         SLink(int ind1, int ind2, double dist)
340             : index1(ind1), index2(ind2), distance(dist) {}
341     };
342 
343 
344 protected:
345 
346     /// Scope
347     CRef<CScope> m_Scope;
348 
349     /// Alignment data source
350     CRef<CAlnVec> m_AlignDataSource;
351 
352     /// Maximum allowed divergence between sequences
353     double m_MaxDivergence;
354 
355     /// Method of calculating evolutionary distance correction
356     EDistMethod m_DistMethod;
357 
358     /// Method of calculating tree
359     ETreeMethod m_TreeMethod;
360 
361     /// Matrix of percent identities based distances
362     CDistMatrix m_DivergenceMatrix;
363 
364     /// Matrix of evolutionary distance
365     CDistMatrix m_DistMatrix;
366 
367     /// Full distance matrix, this data structure is required by CDistMethods
368     /// functions
369     CDistMethods::TMatrix m_FullDistMatrix;
370 
371     /// Sequences that are not included in the tree
372     vector<string> m_RemovedSeqIds;
373 
374     /// Indeces of sequences that are not included in the tree
375     vector<int> m_RemovedSeqIndeces;
376 
377     /// Computed tree
378     TPhyTreeNode* m_Tree;
379 
380     /// Labels for tree leaves
381     vector<string> m_Labels;
382 
383     /// Index of query sequence in the input alignment. Query sequence is
384     /// always included in the tree.
385     int m_QueryIdx;
386 
387     /// Error/warning messages
388     vector<string> m_Messages;
389 
390     /// Positions of segments in mulitple or query anchored alignment
391     TSegInfo m_SegInfo;
392 
393     /// Calculate segment positions
394     bool m_CalcSegInfo;
395 
396     friend class ::CTestPhyTreeCalc;
397 };
398 
399 
400 /// Guide tree calc exceptions
401 class NCBI_XALGOPHYTREE_EXPORT CPhyTreeCalcException : public CException
402 {
403 public:
404     enum EErrCode {
405         eInvalidOptions,
406         eTreeComputationProblem,
407         eNoTree,
408         eDistMatrixError
409     };
410 
411     NCBI_EXCEPTION_DEFAULT(CPhyTreeCalcException, CException);
412 };
413 
414 END_NCBI_SCOPE
415 
416 #endif // PHYTREE_CALC__HPP
417