1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 
33 #ifndef SEQAN_HEADER_GRAPH_ALIGN_TCOFFEE_DISTANCE_H
34 #define SEQAN_HEADER_GRAPH_ALIGN_TCOFFEE_DISTANCE_H
35 
36 namespace seqan
37 {
38 
39 //////////////////////////////////////////////////////////////////////////////
40 // Distance matrix calculation
41 //////////////////////////////////////////////////////////////////////////////
42 
43 /*!
44  * @defgroup DistanceCalculationTags
45  * @brief Tags for specifying how to calculate distance matrices.
46  *
47  *
48  * @tag DistanceCalculationTags#LibraryDistance
49  * @headerfile <seqan/graph_msa.h>
50  * @brief Using the library itself and heaviest common subsequence to determine a distance matrix.
51  *
52  * @signature typedef Tag<LibraryDistance_> const LibraryDistance;
53  *
54  *
55  * @tag DistanceCalculationTags#KmerDistance
56  * @headerfile <seqan/graph_msa.h>
57  * @brief Using a simple kmer count to determine a distance matrix.
58  *
59  * @signature typedef Tag<LibraryDistance_> const KmerDistance;
60  */
61 
62 struct LibraryDistance_;
63 typedef Tag<LibraryDistance_> const LibraryDistance;
64 
65 
66 struct KmerDistance_;
67 typedef Tag<KmerDistance_> const KmerDistance;
68 
69 
70 
71 
72 
73 
74 //////////////////////////////////////////////////////////////////////////////
75 // LibraryDistance
76 //////////////////////////////////////////////////////////////////////////////
77 
78 //////////////////////////////////////////////////////////////////////////////
79 
80 template<typename TStringSet, typename TCargo, typename TSpec, typename TMatrix>
81 inline void
getDistanceMatrix(Graph<Alignment<TStringSet,TCargo,TSpec>> & g,TMatrix & distanceMatrix,LibraryDistance)82 getDistanceMatrix(Graph<Alignment<TStringSet, TCargo, TSpec> >& g,
83                   TMatrix& distanceMatrix,
84                   LibraryDistance)
85 {
86     typedef Graph<Alignment<TStringSet, TCargo, TSpec> > TGraph;
87     typedef typename Size<TGraph>::Type TSize;
88     //typedef typename Id<TGraph>::Type TId;
89     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
90     //typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
91     //typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
92     typedef typename Value<TMatrix>::Type TValue;
93 
94     // Initialization
95     clear(distanceMatrix);
96     TStringSet& str = stringSet(g);
97     TSize nseq = length(str);
98     resize(distanceMatrix, nseq * nseq, 0);
99 
100     // All pairwise alignments
101     typedef String<String<TVertexDescriptor> > TSegmentString;
102     TValue maxScore = 0;
103     for(TSize i=0; i<nseq; ++i) {
104         TSegmentString seq1;
105         TSize len1 = length(str[i]);
106         _buildLeafString(g, i, seq1);
107         for(TSize j=i+1; j<nseq; ++j) {
108             // Align the 2 strings
109             TSegmentString seq2;
110             TSize len2 = length(str[j]);
111             _buildLeafString(g, j, seq2);
112             TSegmentString alignSeq;
113             TValue score = heaviestCommonSubsequence(g,seq1,seq2,alignSeq);
114 
115             // Normalize by distance
116             if (len1 > len2) score /= len1;
117             else score /= len2;
118             if (score > maxScore) maxScore = score;
119 
120             // Remember the value
121             distanceMatrix[i*nseq+j] = score;
122         }
123     }
124 
125     // Normalize values
126     for(TSize i=0; i<nseq; ++i)
127         for(TSize j=i+1; j<nseq; ++j)
128             distanceMatrix[i*nseq+j] = SEQAN_DISTANCE_UNITY - ((distanceMatrix[i*nseq+j] * SEQAN_DISTANCE_UNITY) / maxScore );
129 }
130 
131 
132 //////////////////////////////////////////////////////////////////////////////
133 // KmerDistance
134 //////////////////////////////////////////////////////////////////////////////
135 
136 //////////////////////////////////////////////////////////////////////////////
137 
138 
139 template<typename TStringSet, typename TCargo, typename TSpec, typename TMatrix, typename TSize, typename TAlphabet>
140 inline void
getDistanceMatrix(Graph<Alignment<TStringSet,TCargo,TSpec>> & g,TMatrix & distanceMatrix,TSize ktup,TAlphabet,KmerDistance)141 getDistanceMatrix(Graph<Alignment<TStringSet, TCargo, TSpec> >& g,
142                   TMatrix& distanceMatrix,
143                   TSize ktup,
144                   TAlphabet,
145                   KmerDistance)
146 {
147     //typedef typename Value<TMatrix>::Type TValue;
148     typedef typename Iterator<TMatrix, Standard>::Type TMatrixIterator;
149 
150     getKmerSimilarityMatrix(stringSet(g), distanceMatrix, ktup, TAlphabet());
151 
152     // Similarity to distance conversion
153     TMatrixIterator matIt = begin(distanceMatrix, Standard());
154     TMatrixIterator endMatIt = end(distanceMatrix, Standard());
155     for(;matIt != endMatIt;++matIt)
156         *matIt = SEQAN_DISTANCE_UNITY - (*matIt);
157 }
158 
159 //////////////////////////////////////////////////////////////////////////////
160 
161 template<typename TStringSet, typename TCargo, typename TSpec, typename TMatrix, typename TSize>
162 inline void
getDistanceMatrix(Graph<Alignment<TStringSet,TCargo,TSpec>> & g,TMatrix & distanceMatrix,TSize ktup,KmerDistance)163 getDistanceMatrix(Graph<Alignment<TStringSet, TCargo, TSpec> >& g,
164                   TMatrix& distanceMatrix,
165                   TSize ktup,
166                   KmerDistance)
167 {
168     getDistanceMatrix(g, distanceMatrix, ktup, typename Value<typename Value<TStringSet>::Type>::Type(), KmerDistance() );
169 }
170 
171 //////////////////////////////////////////////////////////////////////////////
172 
173 template<typename TStringSet, typename TCargo, typename TSpec, typename TMatrix>
174 inline void
getDistanceMatrix(Graph<Alignment<TStringSet,TCargo,TSpec>> & g,TMatrix & distanceMatrix,KmerDistance)175 getDistanceMatrix(Graph<Alignment<TStringSet, TCargo, TSpec> >& g,
176                   TMatrix& distanceMatrix,
177                   KmerDistance)
178 {
179     getDistanceMatrix(g, distanceMatrix, 3, KmerDistance() );
180 }
181 
182 
183 //////////////////////////////////////////////////////////////////////////////
184 
185 /*!
186  * @fn AlignmentGraph#getDistanceMatrix
187  * @headerfile <seqan/graph_msa.h>
188  * @brief Computes a pairwise distance matrix from an @link AlignmentGraph @endlink.
189  *
190  * @signature void getDistanceMtarix(graph, mat[, tag]);
191  * @signature void getDistanceMtarix(graph, mat[, kTup][, alphabet], KmerDistance);
192  *
193  * @param[in]  graph    An @link AlignmentGraph @endlink containing the sequences and possible alignment edges.
194  * @param[out] mat      Pairwise distance matrix.
195  * @param[in]  kTup     For KMerDistance, the length of the k-mers.
196  * @param[in]  alphabet For KMerDistance, the alphabet to use for k-mer counting (e.g. compressed alphabets).
197  * @param[in]  tag      See @link DistanceCalculationTags @endlink.  Default: <tt>KMerDistance</tt>.
198  */
199 
200 
201 template<typename TStringSet, typename TCargo, typename TSpec, typename TMatrix>
202 inline void
getDistanceMatrix(Graph<Alignment<TStringSet,TCargo,TSpec>> & g,TMatrix & distanceMatrix)203 getDistanceMatrix(Graph<Alignment<TStringSet, TCargo, TSpec> >& g,
204                   TMatrix& distanceMatrix)
205 {
206     getDistanceMatrix(g, distanceMatrix, KmerDistance() );
207 }
208 
209 
210 
211 }// namespace seqan
212 
213 #endif //#ifndef SEQAN_HEADER_...
214