1 #ifndef ALGO_COBALT___KMERCOUNTS__HPP
2 #define ALGO_COBALT___KMERCOUNTS__HPP
3 
4 /* $Id: kmercounts.hpp 389681 2013-02-20 13:16:20Z kornbluh $
5 * ===========================================================================
6 *
7 *                            PUBLIC DOMAIN NOTICE
8 *               National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's offical duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================*/
28 
29 /*****************************************************************************
30 
31 File name: kmercounts.hpp
32 
33 Author: Greg Boratyn
34 
35 Contents: Interface for k-mer counting
36 
37 ******************************************************************************/
38 
39 
40 #include <util/math/matrix.hpp>
41 #include <objects/seqloc/Seq_loc.hpp>
42 #include <objmgr/scope.hpp>
43 #include <algo/cobalt/base.hpp>
44 #include <algo/cobalt/links.hpp>
45 #include <algo/blast/core/blast_encoding.h>
46 #include <vector>
47 #include <stack>
48 
49 
50 BEGIN_NCBI_SCOPE
51 BEGIN_SCOPE(cobalt)
52 
53 
54 
55 // TO DO: Redesign K-mer counts classes
56 
57 /// Kmer counts for alignment free sequence similarity computation
58 /// implemented as a sparse vector
59 ///
60 class NCBI_COBALT_EXPORT CSparseKmerCounts
61 {
62 public:
63     typedef Uint1 TCount;
64 
65 
66     /// Element of the sparse vector
67     struct SVectorElement {
68     Uint4 position;   ///< position of non-zero element
69     TCount value;     ///< value of non-zero element
70 
71     /// Default constructor
SVectorElementCSparseKmerCounts::SVectorElement72     SVectorElement(void) {position = 0; value = 0;}
73 
74     /// Create vector element
75     /// @param pos Element position
76     /// @param val Element value
SVectorElementCSparseKmerCounts::SVectorElement77     SVectorElement(Uint4 pos, TCount val) {position = pos; value = val;}
78     };
79 
80     typedef vector<SVectorElement>::const_iterator TNonZeroCounts_CI;
81 
82 
83 public:
84     /// Create empty counts vector
85     ///
CSparseKmerCounts(void)86     CSparseKmerCounts(void) : m_SeqLength(0), m_NumCounts(0) {}
87 
88     /// Create k-mer counts vector from SSeqLoc with defalut k-mer length and
89     /// alphabet size
90     /// @param seq The sequence to be represented as k-mer counts [in]
91     /// @param scope Scope
92     ///
93     CSparseKmerCounts(const objects::CSeq_loc& seq,
94                       objects::CScope& scope);
95 
96     /// Reset the counts vector
97     /// @param seq Sequence [in]
98     /// @param scope Scope [in]
99     ///
100     void Reset(const objects::CSeq_loc& seq, objects::CScope& scope);
101 
102     /// Get sequence length
103     /// @return Sequence length
104     ///
GetSeqLength(void) const105     unsigned int GetSeqLength(void) const {return m_SeqLength;}
106 
107     /// Get number of all k-mers found in the sequence
108     /// @return Number of all k-mers
109     ///
GetNumCounts(void) const110     unsigned int GetNumCounts(void) const {return m_NumCounts;}
111 
112     /// Get default kmer length
113     /// @return Default k-mer length
114     ///
GetKmerLength(void)115     static unsigned int GetKmerLength(void)
116     {return CSparseKmerCounts::sm_KmerLength;}
117 
118     /// Get default alphabet size
119     /// @return Default alphabet size
120     ///
GetAlphabetSize(void)121     static unsigned int GetAlphabetSize(void) {return sm_AlphabetSize;}
122 
123     /// Get non-zero counts iterator
124     /// @return Non-zero counts iterator pointing to the begining
125     ///
BeginNonZero(void) const126     TNonZeroCounts_CI BeginNonZero(void) const {return m_Counts.begin();}
127 
128     /// Get non-zero counts iterator
129     /// @return Non-zero counts iterator pointing to the end
130     ///
EndNonZero(void) const131     TNonZeroCounts_CI EndNonZero(void) const {return m_Counts.end();}
132 
133     /// Print counts
134     /// @param ostr Output stream [in|out]
135     /// @return Output stream
136     ///
137     CNcbiOstream& Print(CNcbiOstream& ostr) const;
138 
139     /// Set default k-mer length
140     /// @param len Default k-mer length [in]
141     ///
SetKmerLength(unsigned len)142     static void SetKmerLength(unsigned len)
143     {sm_KmerLength = len; sm_ForceSmallerMem = false;}
144 
145     /// Set Default alphabet size
146     /// @param size Default alphabet size [in]
147     ///
SetAlphabetSize(unsigned size)148     static void SetAlphabetSize(unsigned size)
149     {sm_AlphabetSize = size; sm_ForceSmallerMem = false;}
150 
151     /// Set default compressed alphabet letter translation table
152     /// @return Reference to translation table [in|out]
153     ///
SetTransTable(void)154     static vector<Uint1>& SetTransTable(void) {return sm_TransTable;}
155 
156     /// Set default option for using compressed alphabet
157     /// @param use_comp Will compressed alphabet be used [in]
158     ///
SetUseCompressed(bool use_comp)159     static void SetUseCompressed(bool use_comp) {sm_UseCompressed = use_comp;}
160 
161     /// Compute 1 - local fraction of common k-mers between two count vectors
162     /// normalized by length of shorter sequence
163     /// @param vect1 K-mer counts vector [in]
164     /// @param vect2 K-mer counts vector [in]
165     /// @return Local fraction of common k-mer as distance
166     ///
167     /// Computes 1 - F(v1, v2), where
168     /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\min \{L_x, L_y\}
169     /// - k + 1), where
170     /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
171     ///  excluding Xaa, k - k-mer length
172     /// F(x, y) is described in RC Edgar, BMC Bioinformatics 5:113, 2004
173     static double FractionCommonKmersDist(const CSparseKmerCounts& vect1,
174                       const CSparseKmerCounts& vect2);
175 
176     /// Compute 1 - global fraction of common k-mers between two count vectors
177     /// normalized by length of longer sequence
178     /// @param vect1 K-mer counts vector [in]
179     /// @param vect2 K-mer counts vector [in]
180     /// @return Global fraction of common k-mers as distance
181     ///
182     /// Computes 1 - F(v1, v2), where
183     /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\max \{L_x, L_y\}
184     /// - k + 1), where
185     /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
186     /// excluding Xaa, k - k-mer length
187     /// F(x, y) is modified version of measure presented
188     /// RC Edgar, BMC Bioinformatics 5:113, 2004
189     static double FractionCommonKmersGlobalDist(const CSparseKmerCounts& v1,
190                         const CSparseKmerCounts& v2);
191 
192     /// Copmute number of common kmers between two count vectors
193     /// @param v1 K-mer counts vector [in]
194     /// @param v2 K-mer counts vecotr [in]
195     /// @param repetitions Should multiple copies of the same k-mer be counted
196     /// @return Number of k-mers that are present in both counts vectors
197     ///
198     static unsigned int CountCommonKmers(const CSparseKmerCounts& v1,
199                      const CSparseKmerCounts& v2,
200                      bool repetitions = true);
201 
202     /// Perform preparations before k-mer counting common to all sequences.
203     /// Allocate buffer for storing temporary counts
204     ///
205     static void PreCount(void);
206 
207     /// Perform post-kmer counting tasks. Free buffer.
208     ///
209     static void PostCount(void);
210 
211 
212 private:
213     static TCount* ReserveCountsMem(unsigned int num_bits);
214 
GetAALetter(Uint1 letter)215     static Uint4 GetAALetter(Uint1 letter)
216     {
217         _ASSERT(!sm_UseCompressed || letter < sm_TransTable.size());
218         return (Uint4)(sm_UseCompressed ? sm_TransTable[(int)letter] : letter);
219     }
220 
221     /// Initializes element index as bit vector for first k letters,
222     /// skipping Xaa
223     /// @param sv Sequence [in]
224     /// @param pos Element index in sparse vector [out]
225     /// @param index Index of letter in the sequence where k-mer counting
226     /// starts. At exit index points to the next letter after first
227     /// k-mer [in|out]
228     /// @param num_bits Number of bits in pos per letter [in]
229     /// @param kmer_len K-mer length [in]
230     /// @return True if pos was initialized, false otherwise (if no k-mer
231     /// without X was found)
232     static bool InitPosBits(const objects::CSeqVector& sv, Uint4& pos,
233                             unsigned int& index,  Uint4 num_bits,
234                             Uint4 kmer_len);
235 
236 
237 protected:
238     vector<SVectorElement> m_Counts;
239     unsigned int m_SeqLength;
240     unsigned int m_NumCounts;
241     static unsigned int sm_KmerLength;
242     static unsigned int sm_AlphabetSize;
243     static vector<Uint1> sm_TransTable;
244     static bool sm_UseCompressed;
245     static TCount* sm_Buffer;
246     static bool sm_ForceSmallerMem;
247     static const unsigned int kLengthBitsThreshold = 32;
248 };
249 
250 
251 /// K-mer counts implemented as bit vectors
252 ///
253 class NCBI_COBALT_EXPORT CBinaryKmerCounts
254 {
255 public:
256 
257     /// Constructor
258     ///
CBinaryKmerCounts(void)259     CBinaryKmerCounts(void) : m_SeqLength(0), m_NumCounts(0) {}
260 
261 
262     /// Constructor
263     /// @param seq Sequence [in]
264     /// @param scop Scope [in]
265     ///
266     CBinaryKmerCounts(const objects::CSeq_loc& seq,
267                       objects::CScope& scope);
268 
269     /// Compute counts
270     /// @param seq Sequence [in]
271     /// @param scope Scope [in]
272     ///
273     void Reset(const objects::CSeq_loc& seq, objects::CScope& scope);
274 
275     /// Get sequence length
276     /// @return Sequence length
277     ///
GetSeqLength(void) const278     unsigned int GetSeqLength(void) const {return m_SeqLength;}
279 
280     /// Get number of k-mers
281     /// @return Number of k-mers
282     ///
GetNumCounts(void) const283     unsigned int GetNumCounts(void) const {return m_NumCounts;}
284 
285     /// Get k-mer length
286     /// @return K-mer length
287     ///
GetKmerLength(void)288     static unsigned int GetKmerLength(void)
289     {return CBinaryKmerCounts::sm_KmerLength;}
290 
291     /// Get alphabet size
292     /// @return Alphabet size
293     ///
GetAlphabetSize(void)294     static unsigned int GetAlphabetSize(void) {return sm_AlphabetSize;}
295 
296     /// Set default k-mer length
297     /// @param len Default k-mer length [in]
298     ///
SetKmerLength(unsigned len)299     static void SetKmerLength(unsigned len)
300     {sm_KmerLength = len;}
301 
302     /// Set Default alphabet size
303     /// @param size Default alphabet size [in]
304     ///
SetAlphabetSize(unsigned size)305     static void SetAlphabetSize(unsigned size)
306     {sm_AlphabetSize = size;}
307 
308     /// Set default compressed alphabet letter translation table
309     /// @return Reference to translation table [in|out]
310     ///
SetTransTable(void)311     static vector<Uint1>& SetTransTable(void) {return sm_TransTable;}
312 
313     /// Set default option for using compressed alphabet
314     /// @param use_comp Will compressed alphabet be used [in]
315     ///
SetUseCompressed(bool use_comp)316     static void SetUseCompressed(bool use_comp) {sm_UseCompressed = use_comp;}
317 
318     /// Compute 1 - local fraction of common k-mers between two count vectors
319     /// normalized by length of shorter sequence
320     /// @param vect1 K-mer counts vector [in]
321     /// @param vect2 K-mer counts vector [in]
322     /// @return Local fraction of common k-mer as distance
323     ///
324     /// Computes 1 - F(v1, v2), where
325     /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\min \{L_x, L_y\}
326     /// - k + 1), where
327     /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
328     ///  excluding Xaa, k - k-mer length
329     /// F(x, y) is described in RC Edgar, BMC Bioinformatics 5:113, 2004
330     static double FractionCommonKmersDist(const CBinaryKmerCounts& vect1,
331                       const CBinaryKmerCounts& vect2);
332 
333     /// Compute 1 - global fraction of common k-mers between two count vectors
334     /// normalized by length of longer sequence
335     /// @param vect1 K-mer counts vector [in]
336     /// @param vect2 K-mer counts vector [in]
337     /// @return Global fraction of common k-mers as distance
338     ///
339     /// Computes 1 - F(v1, v2), where
340     /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\max \{L_x, L_y\}
341     /// - k + 1), where
342     /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
343     /// excluding Xaa, k - k-mer length
344     /// F(x, y) is modified version of measure presented
345     /// RC Edgar, BMC Bioinformatics 5:113, 2004
346     static double FractionCommonKmersGlobalDist(const CBinaryKmerCounts& v1,
347                         const CBinaryKmerCounts& v2);
348 
349     /// Copmute number of common kmers between two count vectors
350     /// @param v1 K-mer counts vector [in]
351     /// @param v2 K-mer counts vecotr [in]
352     /// @param repetitions Should multiple copies of the same k-mer be counted
353     /// @return Number of k-mers that are present in both counts vectors
354     ///
355     static unsigned int CountCommonKmers(const CBinaryKmerCounts& v1,
356                                          const CBinaryKmerCounts& v2);
357 
358 
359     /// Perform preparations before k-mer counting common to all sequences.
360     ///
PreCount(void)361     static void PreCount(void) {}
362 
363     /// Perform post-kmer counting tasks.
364     ///
PostCount(void)365     static void PostCount(void) {}
366 
367 
368 protected:
GetAALetter(Uint1 letter)369     static Uint4 GetAALetter(Uint1 letter)
370     {
371         _ASSERT(!sm_UseCompressed || letter < sm_TransTable.size());
372         return (Uint4)(sm_UseCompressed ? sm_TransTable[(int)letter] : letter);
373     }
374 
375     /// Get number of set bits (adapted
376     /// from http://graphics.stanford.edu/~seander/bithacks.html)
377     /// @param v Bit vector [in]
378     /// @return Number of set bits
379     ///
x_Popcount(Uint4 v)380     static Uint4 x_Popcount(Uint4 v)
381     {
382         if (v==0) return 0; // early bailout for sparse vectors
383         v = v - ((v >> 1) & 0x55555555);
384         v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
385         v = ((v + (v >> 4)) & 0xF0F0F0F);
386         v = v*0x1010101;
387 
388         return v >> 24; // count
389     }
390 
391 
392 protected:
393     vector<Uint4> m_Counts;
394     Uint4 m_SeqLength;
395     Uint4 m_NumCounts;
396     static unsigned int sm_KmerLength;
397     static unsigned int sm_AlphabetSize;
398     static vector<Uint1> sm_TransTable;
399     static bool sm_UseCompressed;
400 };
401 
402 
403 
404 /// Exception class for Kmer counts
405 class CKmerCountsException : public CException
406 {
407 public:
408     enum EErrCode {
409         eInvalid,
410         eUnsupportedSeqLoc,
411         eUnsuportedDistMethod,
412         eInvalidOptions,
413         eBadSequence,
414         eMemoryAllocation
415     };
416 
417     NCBI_EXCEPTION_DEFAULT(CKmerCountsException, CException);
418 };
419 
420 /// Interface for computing and manipulating k-mer counts vectors that allows
421 /// for different implementations of K-mer counts vectors
422 ///
423 template <class TKmerCounts>
424 class TKmerMethods
425 {
426 public:
427     enum ECompressedAlphabet {
428         eRegular = 0, eSE_V10, eSE_B15,
429         eFirstCompressed = eSE_V10,
430         eLastAlphabet = eSE_B15
431     };
432 
433     enum EDistMeasures {
434         eFractionCommonKmersGlobal,
435         eFractionCommonKmersLocal
436     };
437 
438     typedef CNcbiMatrix<double> TDistMatrix;
439 
440 public:
441 
442     /// Set default counts vector parameters
443     /// @param kmer_len K-mer length [in]
444     /// @param alphabet_size Alphabet size [in]
445     ///
SetParams(unsigned kmer_len,unsigned alphabet_size)446     static void SetParams(unsigned kmer_len, unsigned alphabet_size)
447     {
448         TKmerCounts::SetKmerLength(kmer_len);
449         TKmerCounts::SetAlphabetSize(alphabet_size);
450         TKmerCounts::SetTransTable().clear();
451         TKmerCounts::SetUseCompressed(false);
452     }
453 
454     /// Creates translation table for compressed alphabets
455     /// @param trans_string String with groupped letters [in]
456     /// @param trans_table Translation table [out]
457     /// @param alphabet_len Number of letters in compressed alphabet
458     ///
BuildCompressedTranslation(ECompressedAlphabet alph_index,vector<Uint1> & trans_table,unsigned alphabet_len)459     static void BuildCompressedTranslation(ECompressedAlphabet alph_index,
460                                            vector<Uint1>& trans_table,
461                                            unsigned alphabet_len)
462 
463     {
464         // Compressed alphabets taken from
465         // Shiryev et al.(2007),  Bioinformatics, 23:2949-2951
466         const char* kCompAlphabets[] = {
467             // 23-to-10 letter compressed alphabet. Based on SE-V(10)
468             "IJLMV AST BDENZ KQR G FY P H C W",
469             // 23-to-15 letter compressed alphabet. Based on SE_B(14)
470             "ST IJV LM KR EQZ A G BD P N F Y H C W"
471         };
472 
473         _ASSERT(alph_index >= eFirstCompressed && alph_index <= eLastAlphabet);
474         const char* trans_string = kCompAlphabets[alph_index
475                                                   - (int)eFirstCompressed];
476 
477         Uint4 compressed_letter = 1; // this allows for gaps
478         trans_table.clear();
479         trans_table.resize(alphabet_len + 1, 0);
480         for (Uint4 i = 0; i < strlen(trans_string);i++) {
481             if (isspace(trans_string[i])) {
482                 compressed_letter++;
483             }
484             else if (isalpha(trans_string[i])) {
485                 Uint1 aa_letter = AMINOACID_TO_NCBISTDAA[(int)trans_string[i]];
486 
487                 _ASSERT(aa_letter < trans_table.size());
488 
489                 trans_table[aa_letter] = compressed_letter;
490             }
491         }
492     }
493 
494     /// Set default counts vector parameters for use with compressed alphabet
495     /// @param kmer_len K-mer length [in]
496     /// @param alph Compressed alphabet to use [in]
497     ///
SetParams(unsigned kmer_len,ECompressedAlphabet alph)498     static void SetParams(unsigned kmer_len, ECompressedAlphabet alph) {
499         TKmerCounts::SetKmerLength(kmer_len);
500         unsigned int len;
501         unsigned int compressed_len;
502         switch (alph) {
503         case eSE_V10:
504             len = 28;
505             compressed_len = 11; //including gap
506             BuildCompressedTranslation(eSE_V10,
507                                        TKmerCounts::SetTransTable(),
508                                        len);
509             TKmerCounts::SetAlphabetSize(compressed_len);
510             TKmerCounts::SetUseCompressed(true);
511             break;
512 
513         case eSE_B15:
514             len = 28;
515             compressed_len = 16; //including gap
516             BuildCompressedTranslation(eSE_B15,
517                                        TKmerCounts::SetTransTable(),
518                                        len);
519             TKmerCounts::SetAlphabetSize(compressed_len);
520             TKmerCounts::SetUseCompressed(true);
521             break;
522 
523         case eRegular:
524             TKmerCounts::SetAlphabetSize(kAlphabetSize);
525             TKmerCounts::SetTransTable().clear();
526             TKmerCounts::SetUseCompressed(false);
527         }
528     }
529 
530     /// Create k-mer counts vectors for given sequences
531     /// @param seqs List of sequences [in]
532     /// @param counts List of k-mer counts vectors [out]
533     ///
ComputeCounts(const vector<CRef<objects::CSeq_loc>> & seqs,objects::CScope & scope,vector<TKmerCounts> & counts)534     static void ComputeCounts(const vector< CRef<objects::CSeq_loc> >& seqs,
535                               objects::CScope& scope,
536                               vector<TKmerCounts>& counts)
537     {
538         if (seqs.empty()) {
539             NCBI_THROW(CKmerCountsException, eInvalidOptions,
540                        "Empty list of sequences");
541         }
542 
543         counts.clear();
544 
545         TKmerCounts::PreCount();
546 
547         ITERATE(vector< CRef<objects::CSeq_loc> >, it, seqs) {
548             counts.push_back(TKmerCounts(**it, scope));
549         }
550 
551         TKmerCounts::PostCount();
552     }
553 
554     /// Compute matrix of distances between given counts vectors
555     /// @param counts List of k-mer counts vectors [in]
556     /// @param fsim Function that computes distance betwee two vectors [in]
557     /// @param dmat Distance matrix [out]
558     ///
ComputeDistMatrix(const vector<TKmerCounts> & counts,double (* fsim)(const TKmerCounts &,const TKmerCounts &),TDistMatrix & dmat)559     static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
560                   double(*fsim)(const TKmerCounts&, const TKmerCounts&),
561                   TDistMatrix& dmat)
562 
563     {
564         if (counts.empty()) {
565             NCBI_THROW(CKmerCountsException, eBadSequence,
566                        "The list of k-mer counts vectors is empty");
567         }
568 
569         dmat.Resize(counts.size(), counts.size(), 0.0);
570         for (int i=0;i < (int)counts.size() - 1;i++) {
571             for (int j=i+1;j < (int)counts.size();j++) {
572                 dmat(i, j) = fsim(counts[i], counts[j]);
573                 dmat(j, i) = dmat(i, j);
574             }
575         }
576     }
577 
578     /// Compute matrix of distances between given list of counts vectors
579     /// using distance function with additional normalizing values
580     /// @param counts List of k-mer counts vectors [in]
581     /// @param dmat Distance matrix [out]
582     /// @param fsim Function that computes distance betwee two vectors [in]
583     /// @param normalizers List of normalizing arguments [in]
584     ///
585     static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
586         TDistMatrix& dmat,
587         double(*fsim)(const TKmerCounts&, const TKmerCounts&, double, double),
588         const vector<double>& normalizers);
589 
590 
591     /// Compute distance matrix for given counts vectors and distance measure
592     /// @param counts List of k-mer counts vecotrs [in]
593     /// @param dist_method Distance measure [in]
594     /// @param dmat Distance matrix [out]
595     ///
ComputeDistMatrix(const vector<TKmerCounts> & counts,EDistMeasures dist_method,TDistMatrix & dmat)596     static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
597                                   EDistMeasures dist_method,
598                                   TDistMatrix& dmat)
599     {
600         switch (dist_method) {
601         case eFractionCommonKmersLocal:
602             ComputeDistMatrix(counts, TKmerCounts::FractionCommonKmersDist,
603                               dmat);
604             break;
605 
606         case eFractionCommonKmersGlobal:
607             ComputeDistMatrix(counts,
608                               TKmerCounts::FractionCommonKmersGlobalDist,
609                               dmat);
610             break;
611 
612         default:
613             NCBI_THROW(CKmerCountsException, eUnsuportedDistMethod,
614                        "Unrecognised distance measure");
615         }
616     }
617 
618 
619     /// Compute distance matrix for given counts vectors and distance measure
620     /// and avoid copying
621     /// @param counts List of k-mer counts vecotrs [in]
622     /// @param dist_method Distance measure [in]
623     /// @return Distance matrix
624     ///
ComputeDistMatrix(const vector<TKmerCounts> & counts,EDistMeasures dist_method)625     static auto_ptr<TDistMatrix> ComputeDistMatrix(
626                                           const vector<TKmerCounts>& counts,
627                                           EDistMeasures dist_method)
628     {
629         auto_ptr<TDistMatrix> dmat(new TDistMatrix(counts.size(),
630                                                    counts.size(), 0));
631         ComputeDistMatrix(counts, dist_method, *dmat.get());
632         return dmat;
633     }
634 
635 
636     /// Compute distances between k-mer counts as graph where nodes are
637     /// sequences and edges represent distances. Distances above given
638     /// threshold will not have edges.
639     /// @param counts List of k-mer counts vectors [in]
640     /// @param dist_method Distance measure [in]
641     /// @param max_dist Maxium distance that will be represented with a graph
642     /// edge [in]
643     /// @param mark_links If true, existings links will be marked in binary
644     /// matrix [in]
645     /// @return Disatances between k-mer counts vectors represented as a graph
646     ///
ComputeDistLinks(const vector<TKmerCounts> & counts,EDistMeasures dist_method,double max_dist)647     static CRef<CLinks> ComputeDistLinks(const vector<TKmerCounts>& counts,
648                                          EDistMeasures dist_method,
649                                          double max_dist)
650     {
651         if (counts.size() < 2) {
652             NCBI_THROW(CKmerCountsException, eInvalid, "Distance links can be"
653                        " computed for at least two k-mer counts vectors");
654         }
655 
656         CRef<CLinks> links(new CLinks(counts.size()));
657         double dist;
658         for (int i=0;i < (int)counts.size()-1;i++) {
659             for (int j=i+1;j < (int)counts.size();j++) {
660                 if (dist_method == eFractionCommonKmersLocal) {
661                     dist = TKmerCounts::FractionCommonKmersDist(counts[i],
662                                                                 counts[j]);
663                 }
664                 else {
665                     dist = TKmerCounts::FractionCommonKmersGlobalDist(counts[i],
666                                                                       counts[j]);
667                 }
668 
669                 if (dist <= max_dist) {
670                     links->AddLink(i, j, dist);
671                 }
672             }
673         }
674 
675         return links;
676     }
677 };
678 
679 
680 
681 END_SCOPE(cobalt)
682 END_NCBI_SCOPE
683 
684 #endif /* ALGO_COBALT___KMERCOUNTS__HPP */
685