1 #ifndef PHYTREE_CALC__HPP 2 #define PHYTREE_CALC__HPP 3 /* $Id: phytree_calc.hpp 498568 2016-04-18 01:34:13Z whlavina $ 4 * =========================================================================== 5 * 6 * PUBLIC DOMAIN NOTICE 7 * National Center for Biotechnology Information 8 * 9 * This software/database is a "United States Government Work" under the 10 * terms of the United States Copyright Act. It was written as part of 11 * the author's official duties as a United States Government employee and 12 * thus cannot be copyrighted. This software/database is freely available 13 * to the public for use. The National Library of Medicine and the U.S. 14 * Government have not placed any restriction on its use or reproduction. 15 * 16 * Although all reasonable efforts have been taken to ensure the accuracy 17 * and reliability of the software and data, the NLM and the U.S. 18 * Government do not and cannot warrant the performance or results that 19 * may be obtained by using this software or data. The NLM and the U.S. 20 * Government disclaim all warranties, express or implied, including 21 * warranties of performance, merchantability or fitness for any particular 22 * purpose. 23 * 24 * Please cite the author in any work or product based on this material. 25 * 26 * =========================================================================== 27 * 28 * Author: Irena Zaretskaya, Greg Boratyn 29 * 30 */ 31 32 #include <corelib/ncbiobj.hpp> 33 34 #include <algo/phy_tree/bio_tree.hpp> 35 #include <algo/phy_tree/bio_tree_conv.hpp> 36 #include <algo/phy_tree/dist_methods.hpp> 37 38 #include <objects/biotree/BioTreeContainer.hpp> 39 #include <objtools/alnmgr/alnmix.hpp> 40 41 #include <util/range.hpp> 42 43 /// Class used in unit tests 44 class CTestPhyTreeCalc; 45 46 BEGIN_NCBI_SCOPE 47 USING_SCOPE(objects); 48 49 50 /// Computaion of distance-based phylognetic tree 51 class NCBI_XALGOPHYTREE_EXPORT CPhyTreeCalc : public CObject { 52 53 public: 54 55 /// Methods for computing evolutionary distance 56 /// 57 enum EDistMethod { 58 eJukesCantor, ePoisson, eKimura, eGrishin, eGrishinGeneral 59 }; 60 61 62 /// Algorithms for phylogenetic tree reconstruction 63 /// 64 enum ETreeMethod { 65 eNJ, ///< Neighbor Joining 66 eFastME, ///< Fast Minumum Evolution 67 }; 68 69 typedef COpenRange<TSeqPos> TRange; 70 typedef vector< vector<TRange> > TSegInfo; 71 72 /// Distance matrix (square, symmetric with zeros on diagnol) 73 class NCBI_XALGOPHYTREE_EXPORT CDistMatrix 74 { 75 public: 76 77 /// Constructor 78 CDistMatrix(int num_elements = 0); 79 80 /// Get number of rows/columns 81 /// @return Number of rows/columns GetNumElements(void) const82 int GetNumElements(void) const {return m_NumElements;} 83 84 /// Is matrix size zero 85 /// @return true if the matrx has zero elements and false otherwise Empty(void) const86 bool Empty(void) const {return GetNumElements() <= 0;} 87 88 /// Resize matrix 89 /// @param num_elements New number of rows/columns [in] 90 void Resize(int num_elements); 91 92 /// Get distance value 93 /// @param i Row index [in] 94 /// @param j Column index [in] 95 /// @return Distance between i-th and j-th element 96 const double& operator()(int i, int j) const; 97 98 /// Get distance value 99 /// @param i Row index [in] 100 /// @param j Column index [in] 101 /// @return Distance between i-th and j-th element 102 double& operator()(int i, int j); 103 104 private: 105 /// Number of rows/columns 106 int m_NumElements; 107 108 /// Storage for distance values 109 vector<double> m_Distances; 110 111 /// Storage for matrix diagnol value 112 const double m_Diagnol; 113 }; 114 115 public: 116 117 //--- Constructors --- 118 119 /// Create CPhyTreeCalc from Seq-align 120 /// @param seq_aln Seq-align [in] 121 /// @param scope Scope [in] 122 /// 123 CPhyTreeCalc(const CSeq_align& seq_aln, CRef<CScope> scope); 124 125 /// Create CPhyTreeCalc from CSeq_align_set 126 /// @param seq_align_set SeqAlignSet [in] 127 /// @param scope Scope [in] 128 /// 129 CPhyTreeCalc(const CSeq_align_set& seq_align_set, CRef<CScope> scope); 130 ~CPhyTreeCalc()131 ~CPhyTreeCalc() {delete m_Tree;} 132 133 134 //--- Setters --- 135 136 /// Set maximum allowed divergence between sequences included in tree 137 // reconstruction 138 /// @param div [in] 139 /// SetMaxDivergence(double div)140 void SetMaxDivergence(double div) {m_MaxDivergence = div;} 141 142 143 /// Set evolutionary correction method for computing distance between 144 /// sequences 145 /// @param method Distance method [in] 146 /// SetDistMethod(EDistMethod method)147 void SetDistMethod(EDistMethod method) {m_DistMethod = method;} 148 149 150 /// Set algorithm for phylogenetic tree computation 151 /// @param method Tree compuutation method [in] 152 /// SetTreeMethod(ETreeMethod method)153 void SetTreeMethod(ETreeMethod method) {m_TreeMethod = method;} 154 155 156 /// Set labels for tree leaves 157 /// @return Labels 158 /// SetLabels(void)159 vector<string>& SetLabels(void) {return m_Labels;} 160 161 /// Set query sequence by index in alignment. Query sequence is always 162 /// included in the tree. 163 /// @param index Index of the query sequence in the alignment [in] 164 /// SetQuery(int index)165 void SetQuery(int index) {m_QueryIdx = index;} 166 167 /// Set query sequence by sequence id. Query sequence is always included 168 /// in the tree. Exception is thrown if sequence is not found in the 169 /// input alignment. 170 /// @param seqid Sequence id for the query sequence 171 /// 172 void SetQuery(const CSeq_id& seqid); 173 174 /// Calculate summarized segment positions in mutliple alignment 175 /// @param s If true, calculate segment positions [in] 176 /// SetCalcAlnSegInfo(bool s)177 void SetCalcAlnSegInfo(bool s) {m_CalcSegInfo = s;} 178 179 //--- Getters --- 180 181 /// Get computed tree 182 /// @return Tree 183 /// GetTree(void)184 TPhyTreeNode* GetTree(void) {return m_Tree;} 185 186 /// Get computed tree 187 /// @return Tree 188 /// GetTree(void) const189 const TPhyTreeNode* GetTree(void) const {return m_Tree;} 190 191 /// Get serial tree 192 /// @return Tree 193 /// 194 CRef<CBioTreeContainer> GetSerialTree(void) const; 195 196 /// Get seq_align that corresponds to current tree 197 /// @return Seq_align 198 /// 199 CRef<CSeq_align> GetSeqAlign(void) const; 200 201 /// Get seq-ids of sequences used in tree construction 202 /// @return Seq-ids 203 /// 204 const vector< CRef<CSeq_id> >& GetSeqIds(void) const; 205 206 /// Get scope 207 /// @return Scope 208 /// GetScope(void)209 CRef<CScope> GetScope(void) {return m_Scope;} 210 211 /// Get divergence matrix 212 /// @return Divergence matrix GetDivergenceMatrix(void) const213 const CDistMatrix& GetDivergenceMatrix(void) const 214 {return m_DivergenceMatrix;} 215 216 217 /// Get distance matrix 218 /// @return Distance matrix GetDistanceMatrix(void) const219 const CDistMatrix& GetDistanceMatrix(void) const 220 {return m_DistMatrix;} 221 222 223 /// Get maximum allowed diveregence between sequences included in tree 224 /// reconstruction 225 /// @return Max divergence 226 /// GetMaxDivergence(void) const227 double GetMaxDivergence(void) const {return m_MaxDivergence;} 228 229 230 /// Get evolutionary correction method for computing distance between 231 /// sequences 232 /// @return Distance method 233 /// GetDistMethod(void) const234 EDistMethod GetDistMethod(void) const {return m_DistMethod;} 235 236 237 /// Get ids of sequences excluded from tree computation 238 /// @return Ids of excluded sequences 239 /// GetRemovedSeqIds(void) const240 const vector<string>& GetRemovedSeqIds(void) const {return m_RemovedSeqIds;} 241 242 /// Get indeces of sequences excluded from tree computation 243 /// @return Indeces of excluded sequences 244 /// GetRemovedSeqIndeces(void) const245 const vector<int>& GetRemovedSeqIndeces(void) const 246 {return m_RemovedSeqIndeces;} 247 248 /// Get error/warning messages 249 /// @return List of messages 250 /// GetMessages(void) const251 const vector<string>& GetMessages(void) const {return m_Messages;} 252 253 /// Get simplified segment positions in multiple or query anchored 254 /// alignment 255 /// @return Vector of vector of ranges for segments for each sequence 256 /// GetAlnSegInfo(void) const257 const TSegInfo& GetAlnSegInfo(void) const {return m_SegInfo;} 258 259 /// Check whether there are any messages 260 /// @return True if there are messages, false otherwise 261 /// IsMessage(void) const262 bool IsMessage(void) const {return m_Messages.size() > 0;} 263 264 265 //--- Tree computation --- 266 267 /// Compute bio tree for the current alignment in a black box manner 268 /// @return True is computation successful, false otherwise 269 /// 270 bool CalcBioTree(void); 271 272 273 protected: 274 275 /// Initialize class attributes 276 void x_Init(void); 277 278 /// Initialize alignment data source 279 /// @param seq_aln Seq-align [in] 280 /// @return True if success, false otherwise 281 void x_InitAlignDS(const CSeq_align& seq_aln); 282 283 /// Initialize alignment data source 284 /// @param seq_align_set CSeq_align_set [in] 285 /// @return True if success, false otherwise 286 bool x_InitAlignDS(const CSeq_align_set& seq_align_set); 287 288 /// Compute divergence matrix and find sequences to exclude from tree 289 /// reconstruction 290 /// 291 /// Divergence between all pairs of sequences used for phylogenetic tree 292 /// reconstruction must be smaller than a cutoff (m_Divergence). Hence 293 /// some sequences may be excluded from tree computation. The first 294 /// sequence in alignmnet is considered query/master. All sequences similar 295 /// enough to the query and each other are included in tree computation. 296 /// 297 /// @param used_indices Vector of sequence indices included in phylogenetic 298 /// tree computation [out] 299 /// @return True if tree can be computed (ie at least two sequences are 300 /// included for tree computation), false otherwise 301 /// 302 bool x_CalcDivergenceMatrix(vector<int>& used_indices); 303 304 /// Calculate the alignment segemnts summary 305 /// @param aln Alignment as text [in] 306 /// @param seg_info Segment start and stop positions in the alignment [out] 307 /// 308 void x_CalcAlnSegInfo(const vector<string>& aln, TSegInfo& seg_info); 309 310 /// Compute distance as evolutionary corrected dissimilarity 311 void x_CalcDistMatrix(void); 312 313 /// Create alignment only for sequences included in tree computation 314 /// @param included_indices Indices of included sequences [in] 315 /// 316 void x_CreateValidAlign(const vector<int>& used_indices); 317 318 /// Compute phylogenetic tree 319 /// @param correct Whether negative tree egde lengths should be set to zero 320 /// [in] 321 /// 322 void x_ComputeTree(bool correct = true); 323 324 /// Change non-finite and negative tree branch lengths to zeros 325 /// @param node Tree root [in] 326 /// 327 void x_CorrectBranchLengths(TPhyTreeNode* node); 328 329 330 protected: 331 332 /// Structure for storing divergences between sequences as list of links 333 struct SLink { 334 int index1; //< index of sequence 1 335 int index2; //< index of sequence 2 336 double distance; //< distance between sequences 337 338 /// Constructor SLinkCPhyTreeCalc::SLink339 SLink(int ind1, int ind2, double dist) 340 : index1(ind1), index2(ind2), distance(dist) {} 341 }; 342 343 344 protected: 345 346 /// Scope 347 CRef<CScope> m_Scope; 348 349 /// Alignment data source 350 CRef<CAlnVec> m_AlignDataSource; 351 352 /// Maximum allowed divergence between sequences 353 double m_MaxDivergence; 354 355 /// Method of calculating evolutionary distance correction 356 EDistMethod m_DistMethod; 357 358 /// Method of calculating tree 359 ETreeMethod m_TreeMethod; 360 361 /// Matrix of percent identities based distances 362 CDistMatrix m_DivergenceMatrix; 363 364 /// Matrix of evolutionary distance 365 CDistMatrix m_DistMatrix; 366 367 /// Full distance matrix, this data structure is required by CDistMethods 368 /// functions 369 CDistMethods::TMatrix m_FullDistMatrix; 370 371 /// Sequences that are not included in the tree 372 vector<string> m_RemovedSeqIds; 373 374 /// Indeces of sequences that are not included in the tree 375 vector<int> m_RemovedSeqIndeces; 376 377 /// Computed tree 378 TPhyTreeNode* m_Tree; 379 380 /// Labels for tree leaves 381 vector<string> m_Labels; 382 383 /// Index of query sequence in the input alignment. Query sequence is 384 /// always included in the tree. 385 int m_QueryIdx; 386 387 /// Error/warning messages 388 vector<string> m_Messages; 389 390 /// Positions of segments in mulitple or query anchored alignment 391 TSegInfo m_SegInfo; 392 393 /// Calculate segment positions 394 bool m_CalcSegInfo; 395 396 friend class ::CTestPhyTreeCalc; 397 }; 398 399 400 /// Guide tree calc exceptions 401 class NCBI_XALGOPHYTREE_EXPORT CPhyTreeCalcException : public CException 402 { 403 public: 404 enum EErrCode { 405 eInvalidOptions, 406 eTreeComputationProblem, 407 eNoTree, 408 eDistMatrixError 409 }; 410 411 NCBI_EXCEPTION_DEFAULT(CPhyTreeCalcException, CException); 412 }; 413 414 END_NCBI_SCOPE 415 416 #endif // PHYTREE_CALC__HPP 417