1 #ifndef ALGO_COBALT___KMERCOUNTS__HPP 2 #define ALGO_COBALT___KMERCOUNTS__HPP 3 4 /* $Id: kmercounts.hpp 389681 2013-02-20 13:16:20Z kornbluh $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's offical duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * ===========================================================================*/ 28 29 /***************************************************************************** 30 31 File name: kmercounts.hpp 32 33 Author: Greg Boratyn 34 35 Contents: Interface for k-mer counting 36 37 ******************************************************************************/ 38 39 40 #include <util/math/matrix.hpp> 41 #include <objects/seqloc/Seq_loc.hpp> 42 #include <objmgr/scope.hpp> 43 #include <algo/cobalt/base.hpp> 44 #include <algo/cobalt/links.hpp> 45 #include <algo/blast/core/blast_encoding.h> 46 #include <vector> 47 #include <stack> 48 49 50 BEGIN_NCBI_SCOPE 51 BEGIN_SCOPE(cobalt) 52 53 54 55 // TO DO: Redesign K-mer counts classes 56 57 /// Kmer counts for alignment free sequence similarity computation 58 /// implemented as a sparse vector 59 /// 60 class NCBI_COBALT_EXPORT CSparseKmerCounts 61 { 62 public: 63 typedef Uint1 TCount; 64 65 66 /// Element of the sparse vector 67 struct SVectorElement { 68 Uint4 position; ///< position of non-zero element 69 TCount value; ///< value of non-zero element 70 71 /// Default constructor SVectorElementCSparseKmerCounts::SVectorElement72 SVectorElement(void) {position = 0; value = 0;} 73 74 /// Create vector element 75 /// @param pos Element position 76 /// @param val Element value SVectorElementCSparseKmerCounts::SVectorElement77 SVectorElement(Uint4 pos, TCount val) {position = pos; value = val;} 78 }; 79 80 typedef vector<SVectorElement>::const_iterator TNonZeroCounts_CI; 81 82 83 public: 84 /// Create empty counts vector 85 /// CSparseKmerCounts(void)86 CSparseKmerCounts(void) : m_SeqLength(0), m_NumCounts(0) {} 87 88 /// Create k-mer counts vector from SSeqLoc with defalut k-mer length and 89 /// alphabet size 90 /// @param seq The sequence to be represented as k-mer counts [in] 91 /// @param scope Scope 92 /// 93 CSparseKmerCounts(const objects::CSeq_loc& seq, 94 objects::CScope& scope); 95 96 /// Reset the counts vector 97 /// @param seq Sequence [in] 98 /// @param scope Scope [in] 99 /// 100 void Reset(const objects::CSeq_loc& seq, objects::CScope& scope); 101 102 /// Get sequence length 103 /// @return Sequence length 104 /// GetSeqLength(void) const105 unsigned int GetSeqLength(void) const {return m_SeqLength;} 106 107 /// Get number of all k-mers found in the sequence 108 /// @return Number of all k-mers 109 /// GetNumCounts(void) const110 unsigned int GetNumCounts(void) const {return m_NumCounts;} 111 112 /// Get default kmer length 113 /// @return Default k-mer length 114 /// GetKmerLength(void)115 static unsigned int GetKmerLength(void) 116 {return CSparseKmerCounts::sm_KmerLength;} 117 118 /// Get default alphabet size 119 /// @return Default alphabet size 120 /// GetAlphabetSize(void)121 static unsigned int GetAlphabetSize(void) {return sm_AlphabetSize;} 122 123 /// Get non-zero counts iterator 124 /// @return Non-zero counts iterator pointing to the begining 125 /// BeginNonZero(void) const126 TNonZeroCounts_CI BeginNonZero(void) const {return m_Counts.begin();} 127 128 /// Get non-zero counts iterator 129 /// @return Non-zero counts iterator pointing to the end 130 /// EndNonZero(void) const131 TNonZeroCounts_CI EndNonZero(void) const {return m_Counts.end();} 132 133 /// Print counts 134 /// @param ostr Output stream [in|out] 135 /// @return Output stream 136 /// 137 CNcbiOstream& Print(CNcbiOstream& ostr) const; 138 139 /// Set default k-mer length 140 /// @param len Default k-mer length [in] 141 /// SetKmerLength(unsigned len)142 static void SetKmerLength(unsigned len) 143 {sm_KmerLength = len; sm_ForceSmallerMem = false;} 144 145 /// Set Default alphabet size 146 /// @param size Default alphabet size [in] 147 /// SetAlphabetSize(unsigned size)148 static void SetAlphabetSize(unsigned size) 149 {sm_AlphabetSize = size; sm_ForceSmallerMem = false;} 150 151 /// Set default compressed alphabet letter translation table 152 /// @return Reference to translation table [in|out] 153 /// SetTransTable(void)154 static vector<Uint1>& SetTransTable(void) {return sm_TransTable;} 155 156 /// Set default option for using compressed alphabet 157 /// @param use_comp Will compressed alphabet be used [in] 158 /// SetUseCompressed(bool use_comp)159 static void SetUseCompressed(bool use_comp) {sm_UseCompressed = use_comp;} 160 161 /// Compute 1 - local fraction of common k-mers between two count vectors 162 /// normalized by length of shorter sequence 163 /// @param vect1 K-mer counts vector [in] 164 /// @param vect2 K-mer counts vector [in] 165 /// @return Local fraction of common k-mer as distance 166 /// 167 /// Computes 1 - F(v1, v2), where 168 /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\min \{L_x, L_y\} 169 /// - k + 1), where 170 /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x 171 /// excluding Xaa, k - k-mer length 172 /// F(x, y) is described in RC Edgar, BMC Bioinformatics 5:113, 2004 173 static double FractionCommonKmersDist(const CSparseKmerCounts& vect1, 174 const CSparseKmerCounts& vect2); 175 176 /// Compute 1 - global fraction of common k-mers between two count vectors 177 /// normalized by length of longer sequence 178 /// @param vect1 K-mer counts vector [in] 179 /// @param vect2 K-mer counts vector [in] 180 /// @return Global fraction of common k-mers as distance 181 /// 182 /// Computes 1 - F(v1, v2), where 183 /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\max \{L_x, L_y\} 184 /// - k + 1), where 185 /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x 186 /// excluding Xaa, k - k-mer length 187 /// F(x, y) is modified version of measure presented 188 /// RC Edgar, BMC Bioinformatics 5:113, 2004 189 static double FractionCommonKmersGlobalDist(const CSparseKmerCounts& v1, 190 const CSparseKmerCounts& v2); 191 192 /// Copmute number of common kmers between two count vectors 193 /// @param v1 K-mer counts vector [in] 194 /// @param v2 K-mer counts vecotr [in] 195 /// @param repetitions Should multiple copies of the same k-mer be counted 196 /// @return Number of k-mers that are present in both counts vectors 197 /// 198 static unsigned int CountCommonKmers(const CSparseKmerCounts& v1, 199 const CSparseKmerCounts& v2, 200 bool repetitions = true); 201 202 /// Perform preparations before k-mer counting common to all sequences. 203 /// Allocate buffer for storing temporary counts 204 /// 205 static void PreCount(void); 206 207 /// Perform post-kmer counting tasks. Free buffer. 208 /// 209 static void PostCount(void); 210 211 212 private: 213 static TCount* ReserveCountsMem(unsigned int num_bits); 214 GetAALetter(Uint1 letter)215 static Uint4 GetAALetter(Uint1 letter) 216 { 217 _ASSERT(!sm_UseCompressed || letter < sm_TransTable.size()); 218 return (Uint4)(sm_UseCompressed ? sm_TransTable[(int)letter] : letter); 219 } 220 221 /// Initializes element index as bit vector for first k letters, 222 /// skipping Xaa 223 /// @param sv Sequence [in] 224 /// @param pos Element index in sparse vector [out] 225 /// @param index Index of letter in the sequence where k-mer counting 226 /// starts. At exit index points to the next letter after first 227 /// k-mer [in|out] 228 /// @param num_bits Number of bits in pos per letter [in] 229 /// @param kmer_len K-mer length [in] 230 /// @return True if pos was initialized, false otherwise (if no k-mer 231 /// without X was found) 232 static bool InitPosBits(const objects::CSeqVector& sv, Uint4& pos, 233 unsigned int& index, Uint4 num_bits, 234 Uint4 kmer_len); 235 236 237 protected: 238 vector<SVectorElement> m_Counts; 239 unsigned int m_SeqLength; 240 unsigned int m_NumCounts; 241 static unsigned int sm_KmerLength; 242 static unsigned int sm_AlphabetSize; 243 static vector<Uint1> sm_TransTable; 244 static bool sm_UseCompressed; 245 static TCount* sm_Buffer; 246 static bool sm_ForceSmallerMem; 247 static const unsigned int kLengthBitsThreshold = 32; 248 }; 249 250 251 /// K-mer counts implemented as bit vectors 252 /// 253 class NCBI_COBALT_EXPORT CBinaryKmerCounts 254 { 255 public: 256 257 /// Constructor 258 /// CBinaryKmerCounts(void)259 CBinaryKmerCounts(void) : m_SeqLength(0), m_NumCounts(0) {} 260 261 262 /// Constructor 263 /// @param seq Sequence [in] 264 /// @param scop Scope [in] 265 /// 266 CBinaryKmerCounts(const objects::CSeq_loc& seq, 267 objects::CScope& scope); 268 269 /// Compute counts 270 /// @param seq Sequence [in] 271 /// @param scope Scope [in] 272 /// 273 void Reset(const objects::CSeq_loc& seq, objects::CScope& scope); 274 275 /// Get sequence length 276 /// @return Sequence length 277 /// GetSeqLength(void) const278 unsigned int GetSeqLength(void) const {return m_SeqLength;} 279 280 /// Get number of k-mers 281 /// @return Number of k-mers 282 /// GetNumCounts(void) const283 unsigned int GetNumCounts(void) const {return m_NumCounts;} 284 285 /// Get k-mer length 286 /// @return K-mer length 287 /// GetKmerLength(void)288 static unsigned int GetKmerLength(void) 289 {return CBinaryKmerCounts::sm_KmerLength;} 290 291 /// Get alphabet size 292 /// @return Alphabet size 293 /// GetAlphabetSize(void)294 static unsigned int GetAlphabetSize(void) {return sm_AlphabetSize;} 295 296 /// Set default k-mer length 297 /// @param len Default k-mer length [in] 298 /// SetKmerLength(unsigned len)299 static void SetKmerLength(unsigned len) 300 {sm_KmerLength = len;} 301 302 /// Set Default alphabet size 303 /// @param size Default alphabet size [in] 304 /// SetAlphabetSize(unsigned size)305 static void SetAlphabetSize(unsigned size) 306 {sm_AlphabetSize = size;} 307 308 /// Set default compressed alphabet letter translation table 309 /// @return Reference to translation table [in|out] 310 /// SetTransTable(void)311 static vector<Uint1>& SetTransTable(void) {return sm_TransTable;} 312 313 /// Set default option for using compressed alphabet 314 /// @param use_comp Will compressed alphabet be used [in] 315 /// SetUseCompressed(bool use_comp)316 static void SetUseCompressed(bool use_comp) {sm_UseCompressed = use_comp;} 317 318 /// Compute 1 - local fraction of common k-mers between two count vectors 319 /// normalized by length of shorter sequence 320 /// @param vect1 K-mer counts vector [in] 321 /// @param vect2 K-mer counts vector [in] 322 /// @return Local fraction of common k-mer as distance 323 /// 324 /// Computes 1 - F(v1, v2), where 325 /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\min \{L_x, L_y\} 326 /// - k + 1), where 327 /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x 328 /// excluding Xaa, k - k-mer length 329 /// F(x, y) is described in RC Edgar, BMC Bioinformatics 5:113, 2004 330 static double FractionCommonKmersDist(const CBinaryKmerCounts& vect1, 331 const CBinaryKmerCounts& vect2); 332 333 /// Compute 1 - global fraction of common k-mers between two count vectors 334 /// normalized by length of longer sequence 335 /// @param vect1 K-mer counts vector [in] 336 /// @param vect2 K-mer counts vector [in] 337 /// @return Global fraction of common k-mers as distance 338 /// 339 /// Computes 1 - F(v1, v2), where 340 /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\max \{L_x, L_y\} 341 /// - k + 1), where 342 /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x 343 /// excluding Xaa, k - k-mer length 344 /// F(x, y) is modified version of measure presented 345 /// RC Edgar, BMC Bioinformatics 5:113, 2004 346 static double FractionCommonKmersGlobalDist(const CBinaryKmerCounts& v1, 347 const CBinaryKmerCounts& v2); 348 349 /// Copmute number of common kmers between two count vectors 350 /// @param v1 K-mer counts vector [in] 351 /// @param v2 K-mer counts vecotr [in] 352 /// @param repetitions Should multiple copies of the same k-mer be counted 353 /// @return Number of k-mers that are present in both counts vectors 354 /// 355 static unsigned int CountCommonKmers(const CBinaryKmerCounts& v1, 356 const CBinaryKmerCounts& v2); 357 358 359 /// Perform preparations before k-mer counting common to all sequences. 360 /// PreCount(void)361 static void PreCount(void) {} 362 363 /// Perform post-kmer counting tasks. 364 /// PostCount(void)365 static void PostCount(void) {} 366 367 368 protected: GetAALetter(Uint1 letter)369 static Uint4 GetAALetter(Uint1 letter) 370 { 371 _ASSERT(!sm_UseCompressed || letter < sm_TransTable.size()); 372 return (Uint4)(sm_UseCompressed ? sm_TransTable[(int)letter] : letter); 373 } 374 375 /// Get number of set bits (adapted 376 /// from http://graphics.stanford.edu/~seander/bithacks.html) 377 /// @param v Bit vector [in] 378 /// @return Number of set bits 379 /// x_Popcount(Uint4 v)380 static Uint4 x_Popcount(Uint4 v) 381 { 382 if (v==0) return 0; // early bailout for sparse vectors 383 v = v - ((v >> 1) & 0x55555555); 384 v = (v & 0x33333333) + ((v >> 2) & 0x33333333); 385 v = ((v + (v >> 4)) & 0xF0F0F0F); 386 v = v*0x1010101; 387 388 return v >> 24; // count 389 } 390 391 392 protected: 393 vector<Uint4> m_Counts; 394 Uint4 m_SeqLength; 395 Uint4 m_NumCounts; 396 static unsigned int sm_KmerLength; 397 static unsigned int sm_AlphabetSize; 398 static vector<Uint1> sm_TransTable; 399 static bool sm_UseCompressed; 400 }; 401 402 403 404 /// Exception class for Kmer counts 405 class CKmerCountsException : public CException 406 { 407 public: 408 enum EErrCode { 409 eInvalid, 410 eUnsupportedSeqLoc, 411 eUnsuportedDistMethod, 412 eInvalidOptions, 413 eBadSequence, 414 eMemoryAllocation 415 }; 416 417 NCBI_EXCEPTION_DEFAULT(CKmerCountsException, CException); 418 }; 419 420 /// Interface for computing and manipulating k-mer counts vectors that allows 421 /// for different implementations of K-mer counts vectors 422 /// 423 template <class TKmerCounts> 424 class TKmerMethods 425 { 426 public: 427 enum ECompressedAlphabet { 428 eRegular = 0, eSE_V10, eSE_B15, 429 eFirstCompressed = eSE_V10, 430 eLastAlphabet = eSE_B15 431 }; 432 433 enum EDistMeasures { 434 eFractionCommonKmersGlobal, 435 eFractionCommonKmersLocal 436 }; 437 438 typedef CNcbiMatrix<double> TDistMatrix; 439 440 public: 441 442 /// Set default counts vector parameters 443 /// @param kmer_len K-mer length [in] 444 /// @param alphabet_size Alphabet size [in] 445 /// SetParams(unsigned kmer_len,unsigned alphabet_size)446 static void SetParams(unsigned kmer_len, unsigned alphabet_size) 447 { 448 TKmerCounts::SetKmerLength(kmer_len); 449 TKmerCounts::SetAlphabetSize(alphabet_size); 450 TKmerCounts::SetTransTable().clear(); 451 TKmerCounts::SetUseCompressed(false); 452 } 453 454 /// Creates translation table for compressed alphabets 455 /// @param trans_string String with groupped letters [in] 456 /// @param trans_table Translation table [out] 457 /// @param alphabet_len Number of letters in compressed alphabet 458 /// BuildCompressedTranslation(ECompressedAlphabet alph_index,vector<Uint1> & trans_table,unsigned alphabet_len)459 static void BuildCompressedTranslation(ECompressedAlphabet alph_index, 460 vector<Uint1>& trans_table, 461 unsigned alphabet_len) 462 463 { 464 // Compressed alphabets taken from 465 // Shiryev et al.(2007), Bioinformatics, 23:2949-2951 466 const char* kCompAlphabets[] = { 467 // 23-to-10 letter compressed alphabet. Based on SE-V(10) 468 "IJLMV AST BDENZ KQR G FY P H C W", 469 // 23-to-15 letter compressed alphabet. Based on SE_B(14) 470 "ST IJV LM KR EQZ A G BD P N F Y H C W" 471 }; 472 473 _ASSERT(alph_index >= eFirstCompressed && alph_index <= eLastAlphabet); 474 const char* trans_string = kCompAlphabets[alph_index 475 - (int)eFirstCompressed]; 476 477 Uint4 compressed_letter = 1; // this allows for gaps 478 trans_table.clear(); 479 trans_table.resize(alphabet_len + 1, 0); 480 for (Uint4 i = 0; i < strlen(trans_string);i++) { 481 if (isspace(trans_string[i])) { 482 compressed_letter++; 483 } 484 else if (isalpha(trans_string[i])) { 485 Uint1 aa_letter = AMINOACID_TO_NCBISTDAA[(int)trans_string[i]]; 486 487 _ASSERT(aa_letter < trans_table.size()); 488 489 trans_table[aa_letter] = compressed_letter; 490 } 491 } 492 } 493 494 /// Set default counts vector parameters for use with compressed alphabet 495 /// @param kmer_len K-mer length [in] 496 /// @param alph Compressed alphabet to use [in] 497 /// SetParams(unsigned kmer_len,ECompressedAlphabet alph)498 static void SetParams(unsigned kmer_len, ECompressedAlphabet alph) { 499 TKmerCounts::SetKmerLength(kmer_len); 500 unsigned int len; 501 unsigned int compressed_len; 502 switch (alph) { 503 case eSE_V10: 504 len = 28; 505 compressed_len = 11; //including gap 506 BuildCompressedTranslation(eSE_V10, 507 TKmerCounts::SetTransTable(), 508 len); 509 TKmerCounts::SetAlphabetSize(compressed_len); 510 TKmerCounts::SetUseCompressed(true); 511 break; 512 513 case eSE_B15: 514 len = 28; 515 compressed_len = 16; //including gap 516 BuildCompressedTranslation(eSE_B15, 517 TKmerCounts::SetTransTable(), 518 len); 519 TKmerCounts::SetAlphabetSize(compressed_len); 520 TKmerCounts::SetUseCompressed(true); 521 break; 522 523 case eRegular: 524 TKmerCounts::SetAlphabetSize(kAlphabetSize); 525 TKmerCounts::SetTransTable().clear(); 526 TKmerCounts::SetUseCompressed(false); 527 } 528 } 529 530 /// Create k-mer counts vectors for given sequences 531 /// @param seqs List of sequences [in] 532 /// @param counts List of k-mer counts vectors [out] 533 /// ComputeCounts(const vector<CRef<objects::CSeq_loc>> & seqs,objects::CScope & scope,vector<TKmerCounts> & counts)534 static void ComputeCounts(const vector< CRef<objects::CSeq_loc> >& seqs, 535 objects::CScope& scope, 536 vector<TKmerCounts>& counts) 537 { 538 if (seqs.empty()) { 539 NCBI_THROW(CKmerCountsException, eInvalidOptions, 540 "Empty list of sequences"); 541 } 542 543 counts.clear(); 544 545 TKmerCounts::PreCount(); 546 547 ITERATE(vector< CRef<objects::CSeq_loc> >, it, seqs) { 548 counts.push_back(TKmerCounts(**it, scope)); 549 } 550 551 TKmerCounts::PostCount(); 552 } 553 554 /// Compute matrix of distances between given counts vectors 555 /// @param counts List of k-mer counts vectors [in] 556 /// @param fsim Function that computes distance betwee two vectors [in] 557 /// @param dmat Distance matrix [out] 558 /// ComputeDistMatrix(const vector<TKmerCounts> & counts,double (* fsim)(const TKmerCounts &,const TKmerCounts &),TDistMatrix & dmat)559 static void ComputeDistMatrix(const vector<TKmerCounts>& counts, 560 double(*fsim)(const TKmerCounts&, const TKmerCounts&), 561 TDistMatrix& dmat) 562 563 { 564 if (counts.empty()) { 565 NCBI_THROW(CKmerCountsException, eBadSequence, 566 "The list of k-mer counts vectors is empty"); 567 } 568 569 dmat.Resize(counts.size(), counts.size(), 0.0); 570 for (int i=0;i < (int)counts.size() - 1;i++) { 571 for (int j=i+1;j < (int)counts.size();j++) { 572 dmat(i, j) = fsim(counts[i], counts[j]); 573 dmat(j, i) = dmat(i, j); 574 } 575 } 576 } 577 578 /// Compute matrix of distances between given list of counts vectors 579 /// using distance function with additional normalizing values 580 /// @param counts List of k-mer counts vectors [in] 581 /// @param dmat Distance matrix [out] 582 /// @param fsim Function that computes distance betwee two vectors [in] 583 /// @param normalizers List of normalizing arguments [in] 584 /// 585 static void ComputeDistMatrix(const vector<TKmerCounts>& counts, 586 TDistMatrix& dmat, 587 double(*fsim)(const TKmerCounts&, const TKmerCounts&, double, double), 588 const vector<double>& normalizers); 589 590 591 /// Compute distance matrix for given counts vectors and distance measure 592 /// @param counts List of k-mer counts vecotrs [in] 593 /// @param dist_method Distance measure [in] 594 /// @param dmat Distance matrix [out] 595 /// ComputeDistMatrix(const vector<TKmerCounts> & counts,EDistMeasures dist_method,TDistMatrix & dmat)596 static void ComputeDistMatrix(const vector<TKmerCounts>& counts, 597 EDistMeasures dist_method, 598 TDistMatrix& dmat) 599 { 600 switch (dist_method) { 601 case eFractionCommonKmersLocal: 602 ComputeDistMatrix(counts, TKmerCounts::FractionCommonKmersDist, 603 dmat); 604 break; 605 606 case eFractionCommonKmersGlobal: 607 ComputeDistMatrix(counts, 608 TKmerCounts::FractionCommonKmersGlobalDist, 609 dmat); 610 break; 611 612 default: 613 NCBI_THROW(CKmerCountsException, eUnsuportedDistMethod, 614 "Unrecognised distance measure"); 615 } 616 } 617 618 619 /// Compute distance matrix for given counts vectors and distance measure 620 /// and avoid copying 621 /// @param counts List of k-mer counts vecotrs [in] 622 /// @param dist_method Distance measure [in] 623 /// @return Distance matrix 624 /// ComputeDistMatrix(const vector<TKmerCounts> & counts,EDistMeasures dist_method)625 static auto_ptr<TDistMatrix> ComputeDistMatrix( 626 const vector<TKmerCounts>& counts, 627 EDistMeasures dist_method) 628 { 629 auto_ptr<TDistMatrix> dmat(new TDistMatrix(counts.size(), 630 counts.size(), 0)); 631 ComputeDistMatrix(counts, dist_method, *dmat.get()); 632 return dmat; 633 } 634 635 636 /// Compute distances between k-mer counts as graph where nodes are 637 /// sequences and edges represent distances. Distances above given 638 /// threshold will not have edges. 639 /// @param counts List of k-mer counts vectors [in] 640 /// @param dist_method Distance measure [in] 641 /// @param max_dist Maxium distance that will be represented with a graph 642 /// edge [in] 643 /// @param mark_links If true, existings links will be marked in binary 644 /// matrix [in] 645 /// @return Disatances between k-mer counts vectors represented as a graph 646 /// ComputeDistLinks(const vector<TKmerCounts> & counts,EDistMeasures dist_method,double max_dist)647 static CRef<CLinks> ComputeDistLinks(const vector<TKmerCounts>& counts, 648 EDistMeasures dist_method, 649 double max_dist) 650 { 651 if (counts.size() < 2) { 652 NCBI_THROW(CKmerCountsException, eInvalid, "Distance links can be" 653 " computed for at least two k-mer counts vectors"); 654 } 655 656 CRef<CLinks> links(new CLinks(counts.size())); 657 double dist; 658 for (int i=0;i < (int)counts.size()-1;i++) { 659 for (int j=i+1;j < (int)counts.size();j++) { 660 if (dist_method == eFractionCommonKmersLocal) { 661 dist = TKmerCounts::FractionCommonKmersDist(counts[i], 662 counts[j]); 663 } 664 else { 665 dist = TKmerCounts::FractionCommonKmersGlobalDist(counts[i], 666 counts[j]); 667 } 668 669 if (dist <= max_dist) { 670 links->AddLink(i, j, dist); 671 } 672 } 673 } 674 675 return links; 676 } 677 }; 678 679 680 681 END_SCOPE(cobalt) 682 END_NCBI_SCOPE 683 684 #endif /* ALGO_COBALT___KMERCOUNTS__HPP */ 685