1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32
33 #ifndef SEQAN_HEADER_GRAPH_ALIGN_TCOFFEE_GUIDETREE_H
34 #define SEQAN_HEADER_GRAPH_ALIGN_TCOFFEE_GUIDETREE_H
35
36 namespace SEQAN_NAMESPACE_MAIN
37 {
38
39 //////////////////////////////////////////////////////////////////////////////
40 // Guide Tree
41 //////////////////////////////////////////////////////////////////////////////
42
43
44
45
46 //////////////////////////////////////////////////////////////////////////////
47 // Neighbor Joining
48 //////////////////////////////////////////////////////////////////////////////
49
50
51 // Helper function for rounding to n significant digits. Ported from
52 // Java code found here: http://stackoverflow.com/questions/202302
53 inline double
_roundToSignificantFigures(double num,int n)54 _roundToSignificantFigures(double num, int n)
55 {
56 if (num == 0)
57 return 0;
58
59 const double d = ceil(log10(num < 0 ? -num : num));
60 const int power = n - (int) d;
61
62 const double magnitude = pow(10.0, power);
63 const long shifted = static_cast<long>(round(num*magnitude));
64 return shifted / magnitude;
65 }
66
67 //////////////////////////////////////////////////////////////////////////////
68
69 /*!
70 * @fn njTree
71 * @headerfile <seqan/graph_msa.h>
72 * @brief computes a guid etree from a distance matrix.
73 *
74 * @signature void njTree(mat, tree);
75 *
76 * @param[in] mat A @link String @endlink of pairwise distance values, representing a square matrix.
77 * @param[out] tree The guide tree.
78 */
79
80 template<typename TValue, typename TStringSpec, typename TCargo, typename TSpec>
81 inline void
njTree(String<TValue,TStringSpec> const & matIn,Graph<Tree<TCargo,TSpec>> & g)82 njTree(String<TValue, TStringSpec> const & matIn,
83 Graph<Tree<TCargo, TSpec> >& g)
84 {
85 typedef String<TValue, TStringSpec> TMatrix;
86 typedef typename Size<TMatrix>::Type TSize;
87 typedef Graph<Tree<TCargo, TSpec> > TGraph;
88 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
89
90 TVertexDescriptor nilVertex = getNil<TVertexDescriptor>();
91 TSize nseq = (TSize) std::sqrt((double)length(matIn));
92
93 // Assert that the input matrix has no negative values.
94 // #if SEQAN_ENABLE_DEBUG
95 // for (unsigned i = 0; i < length(matIn); ++i)
96 // SEQAN_ASSERT_GEQ_MSG(matIn[i], 0, "i = %u", i);
97 // #endif // #if SEQAN_ENABLE_DEBUG
98
99 //for(TSize i=0;i<nseq;++i) {
100 // for(TSize j=0;j<nseq;++j) {
101 // std::cout << getValue(matIn, i*nseq+j) << ",";
102 // }
103 // std::cout << std::endl;
104 //}
105
106 // Handle base cases for one and two sequences.
107 clearVertices(g);
108 if (nseq == 1)
109 {
110 g.data_root = addVertex(g);
111 return;
112 }
113 else if (nseq == 2)
114 {
115 TVertexDescriptor v1 = addVertex(g);
116 TVertexDescriptor v2 = addVertex(g);
117 TVertexDescriptor internalVertex = addVertex(g);
118 addEdge(g, internalVertex, v1, (TCargo) _roundToSignificantFigures(matIn[1] / 2.0, 5));
119 addEdge(g, internalVertex, v2, (TCargo) _roundToSignificantFigures(matIn[1] / 2.0, 5));
120 g.data_root = internalVertex;
121 return;
122 }
123
124 // Create a normalized copy of matIn with fixed point numbers, precision of 10 digits.
125 TValue normFactor = 0;
126 for (unsigned i = 0; i < length(matIn); ++i)
127 normFactor = std::max(normFactor, matIn[i]);
128 // In the case that normFactor is 0 (e.g. all sequences equal) then fix it to 1 so the normalization
129 // does not break.
130 if (!normFactor)
131 normFactor = 1;
132 String<__int64> mat;
133 resize(mat, length(matIn));
134 for (unsigned i = 0; i < length(mat); ++i)
135 mat[i] = static_cast<__int64>(10000000.0 * ((double)(matIn[i]) / (double)(normFactor)));
136
137 // First initialization
138 String<__int64> av; // Average branch length to a combined node
139 resize(av,nseq,0);
140
141 String<TVertexDescriptor> connector; // Nodes that need to be connected
142 resize(connector, nseq);
143
144 for(TSize i=0;i<nseq;++i)
145 {
146 addVertex(g); // Add all the nodes that correspond to sequences
147 connector[i] = i;
148 mat[i*nseq+i] = 0;
149 }
150
151 // Main cycle
152 __int64 fnseqs = static_cast<__int64>(nseq);
153 for(TSize nc=0; nc<(nseq-3); ++nc) {
154 __int64 sumOfBranches = 0;
155
156 // Determine the sum of all branches and
157 // copy upper triangle matrix to lower triangle
158 for(TSize col=1; col<nseq; ++col)
159 for(TSize row=0; row<col; ++row)
160 sumOfBranches += mat[col*nseq+row] = mat[row*nseq+col];
161
162 // Compute the sum of branch lengths for all possible pairs
163 bool notFound = true;
164 __int64 tmin = 0;
165 TSize mini = 0; // Next pair of seq i and j to join
166 TSize minj = 0;
167 __int64 diToAllOthers = 0;
168 __int64 djToAllOthers = 0;
169 __int64 total = 0;
170 for(TSize col=1; col<nseq; ++col) {
171 if (connector[col] != nilVertex) {
172 for(TSize row=0; row<col; ++row) {
173 if (connector[row] != nilVertex) {
174 diToAllOthers = 0;
175 djToAllOthers = 0;
176
177 for(TSize i=0; i<nseq; ++i) {
178 diToAllOthers += mat[i*nseq+row];
179 djToAllOthers += mat[i*nseq+col];
180 }
181
182 total = diToAllOthers + djToAllOthers + (fnseqs - 2) * mat[row*nseq+col] + 2 * (sumOfBranches - diToAllOthers - djToAllOthers);
183 total /= (2*(fnseqs - 2));
184
185 if ((notFound) || (total < tmin)) {
186 notFound = false;
187 tmin = total;
188 mini = row;
189 minj = col;
190 }
191 }
192 }
193 }
194 }
195
196 // Print nodes that are about to be joined
197 //std::cout << mini << std::endl;
198 //std::cout << minj << std::endl;
199 //std::cout << tmin << std::endl;
200 //std::cout << std::endl;
201
202 // Compute branch lengths
203 __int64 dMinIToOthers = 0;
204 __int64 dMinJToOthers = 0;
205 for(TSize i=0; i<nseq; ++i) {
206 dMinIToOthers += mat[i*nseq + mini];
207 dMinJToOthers += mat[i*nseq + minj];
208 }
209 __int64 dmin = mat[mini*nseq + minj];
210 dMinIToOthers = dMinIToOthers / (fnseqs - 2);
211 dMinJToOthers = dMinJToOthers / (fnseqs - 2);
212 __int64 iBranch = (dmin + dMinIToOthers - dMinJToOthers) / 2;
213 __int64 jBranch = dmin - iBranch;
214 iBranch -= av[mini];
215 jBranch -= av[minj];
216
217 // Set negative branch length to zero
218 if( iBranch < 0) iBranch = 0;
219 if( jBranch < 0) jBranch = 0;
220
221 // Print branch lengths
222 //std::cout << iBranch << std::endl;
223 //std::cout << jBranch << std::endl;
224 //std::cout << std::endl;
225
226 // Build tree
227 TVertexDescriptor internalVertex = addVertex(g);
228 addEdge(g, internalVertex, connector[mini], (TCargo) _roundToSignificantFigures((iBranch / 10000000.0) * normFactor, 5));
229 addEdge(g, internalVertex, connector[minj], (TCargo) _roundToSignificantFigures((jBranch / 10000000.0) * normFactor, 5));
230
231 // Remember the average branch length for the new combined node
232 // Must be subtracted from all branches that include this node
233 if(dmin < 0) dmin = 0;
234 av[mini] = dmin / 2;
235
236
237 // Re-initialisation
238 // mini becomes the new combined node, minj is killed
239 --fnseqs;
240 connector[minj] = nilVertex;
241 connector[mini] = internalVertex;
242
243 for(TSize j=0; j<nseq; ++j) {
244 if( connector[j] != nilVertex ) {
245 // Use upper triangle
246 if((TSize) mini < j) mat[mini*nseq+j] = (mat[mini*nseq+j] + mat[minj*nseq+j]) / 2;
247 if((TSize) mini > j) mat[j*nseq+mini] = (mat[mini*nseq+j] + mat[minj*nseq+j]) / 2;
248 }
249 }
250 for(TSize j=0; j<nseq; ++j)
251 mat[j*nseq+minj] = mat[minj*nseq+j] = 0;
252 }
253
254 // Only three nodes left
255
256 // Find the remaining nodes
257 String<TSize> l;
258 resize(l,3);
259 TSize count = 0;
260 for(TSize i=0; i<nseq; ++i) {
261 if(connector[i] != nilVertex) {
262 l[count] = i;
263 ++count;
264 }
265 }
266
267 // Remaining nodes
268 //std::cout << l[0] << std::endl;
269 //std::cout << l[1] << std::endl;
270 //std::cout << l[2] << std::endl;
271 //std::cout << std::endl;
272
273 String<__int64> branch;
274 resize(branch, 3);
275 branch[0] = (mat[l[0]*nseq+l[1]] + mat[l[0]*nseq+l[2]] - mat[l[1]*nseq+l[2]]) / 2;
276 branch[1] = (mat[l[1]*nseq+l[2]] + mat[l[0]*nseq+l[1]] - mat[l[0]*nseq+l[2]]) / 2;
277 branch[2] = (mat[l[1]*nseq+l[2]] + mat[l[0]*nseq+l[2]] - mat[l[0]*nseq+l[1]]) / 2;
278
279 branch[0] -= av[l[0]];
280 branch[1] -= av[l[1]];
281 branch[2] -= av[l[2]];
282
283 // Print branch lengths
284 //std::cout << branch[0] << std::endl;
285 //std::cout << branch[1] << std::endl;
286 //std::cout << branch[2] << std::endl;
287 //std::cout << std::endl;
288
289 // Reset negative branch lengths to zero
290 if( branch[0] < 0) branch[0] = 0;
291 if( branch[1] < 0) branch[1] = 0;
292 if( branch[2] < 0) branch[2] = 0;
293
294 // Build tree
295 TVertexDescriptor internalVertex = addVertex(g);
296 addEdge(g, internalVertex, getValue(connector, l[0]), (TCargo) _roundToSignificantFigures((branch[0] / 10000000.0)* normFactor, 5));
297 addEdge(g, internalVertex, getValue(connector, l[1]), (TCargo) _roundToSignificantFigures((branch[1] / 10000000.0) * normFactor, 5));
298 TVertexDescriptor the_root = addVertex(g);
299 addEdge(g, the_root, getValue(connector, l[2]), (TCargo) _roundToSignificantFigures((branch[2] / 20000000.0) * normFactor, 5));
300 addEdge(g, the_root, internalVertex, (TCargo) _roundToSignificantFigures((branch[2] / 20000000.0) * normFactor, 5));
301 g.data_root = the_root;
302 }
303
304
305
306
307
308
309 //////////////////////////////////////////////////////////////////////////////
310 // Unweighted Pair Group Mean Average (UPGMA)
311 //////////////////////////////////////////////////////////////////////////////
312
313 /*!
314 * @defgroup UpgmaConfiguratorTags
315 * @brief Tags for configuring the guide tree construction.
316 *
317 * @tag UpgmaConfiguratorTags#UpgmaMin
318 * @headerfile <seqan/graph_msa.h>
319 * @brief Uses the min operation in the UPGMA algorithm.
320 *
321 * @signature typedef Tag<UpgmaMin_> const UpgmaMin;
322 *
323 * @tag UpgmaConfiguratorTags#UpgmaMax
324 * @headerfile <seqan/graph_msa.h>
325 * @brief Uses the max operation in the UPGMA algorithm.
326 *
327 * @signature typedef Tag<UpgmaMax_> const UpgmaMax;
328 *
329 * @tag UpgmaConfiguratorTags#UpgmaAvg
330 * @headerfile <seqan/graph_msa.h>
331 * @brief Uses the avg operation in the UPGMA algorithm.
332 *
333 * @signature typedef Tag<UpgmaAvg_> const UpgmaAvg;
334 *
335 * @tag UpgmaConfiguratorTags#UpgmaWeightAvg
336 * @headerfile <seqan/graph_msa.h>
337 * @brief Uses the weighted avg operation in the UPGMA algorithm.
338 *
339 * @signature typedef Tag<UpgmaAvg_> const UpgmaWeightAvg;
340 */
341
342 //////////////////////////////////////////////////////////////////////////////
343
344 struct UpgmaMin_;
345 typedef Tag<UpgmaMin_> const UpgmaMin;
346
347 //////////////////////////////////////////////////////////////////////////////
348
349 struct UpgmaMax_;
350 typedef Tag<UpgmaMax_> const UpgmaMax;
351
352 //////////////////////////////////////////////////////////////////////////////
353
354 struct UpgmaAvg_;
355 typedef Tag<UpgmaAvg_> const UpgmaAvg;
356
357 //////////////////////////////////////////////////////////////////////////////
358
359 struct UpgmaWeightAvg_;
360 typedef Tag<UpgmaWeightAvg_> const UpgmaWeightAvg;
361
362
363
364 //////////////////////////////////////////////////////////////////////////////
365
366 template<typename TMatrix, typename TActive, typename TSize>
367 inline void
_upgmaTreeMerge(TMatrix & mat,TActive & active,TSize index_i,TSize index_j,TSize nseq,UpgmaWeightAvg)368 _upgmaTreeMerge(TMatrix& mat,
369 TActive& active,
370 TSize index_i,
371 TSize index_j,
372 TSize nseq,
373 UpgmaWeightAvg)
374 {
375 SEQAN_CHECKPOINT
376 typedef typename Value<TMatrix>::Type TValue;
377 // Average
378 for(TSize i=0;i<nseq;++i) {
379 if ((i != index_i) && (i != index_j) && (active[i] != 0)) {
380 if (index_i < i) {
381 mat[index_i*nseq + i] = ((TValue) active[index_i] / (TValue) (active[index_i] + active[index_j])) * mat[index_i * nseq + i];
382 if (index_j < i) mat[index_i*nseq + i] += ((TValue) active[index_j] / (TValue) (active[index_i] + active[index_j])) * mat[index_j * nseq + i];
383 else mat[index_i*nseq + i] += ((TValue) active[index_j] / (TValue) (active[index_i] + active[index_j])) * mat[i * nseq + index_j];
384 } else {
385 mat[i*nseq + index_i] = ((TValue) active[index_i] / (TValue) (active[index_i] + active[index_j])) * mat[i * nseq + index_i];
386 if (index_j < i) value(mat, i*nseq + index_i) += ((TValue) active[index_j] / (TValue) (active[index_i] + active[index_j])) * mat[index_j * nseq + i];
387 else mat[i*nseq + index_i] += ((TValue) active[index_j] / (TValue) (active[index_i] + active[index_j])) * mat[i * nseq + index_j];
388 }
389 }
390 }
391 }
392
393 //////////////////////////////////////////////////////////////////////////////
394
395
396 template<typename TMatrix, typename TActive, typename TSize>
397 inline void
_upgmaTreeMerge(TMatrix & mat,TActive & active,TSize index_i,TSize index_j,TSize nseq,UpgmaAvg)398 _upgmaTreeMerge(TMatrix& mat,
399 TActive& active,
400 TSize index_i,
401 TSize index_j,
402 TSize nseq,
403 UpgmaAvg)
404 {
405 SEQAN_CHECKPOINT
406 typedef typename Value<TMatrix>::Type TValue;
407
408 // Minimum
409 for(TSize i=0;i<nseq;++i) {
410 if ((i != index_i) && (i != index_j) && (active[i] != 0)) {
411 TValue val1 = (index_i < i) ? mat[index_i * nseq + i] : mat[i * nseq + index_i];
412 TValue val2 = (index_j < i) ? mat[index_j * nseq + i] : mat[i * nseq + index_j];
413 if (index_i < i) mat[index_i * nseq + i] = (val1 + val2) / 2;
414 else mat[i * nseq + index_i] = (val1 + val2) / 2;
415 }
416 }
417 }
418
419 //////////////////////////////////////////////////////////////////////////////
420
421
422 template<typename TMatrix, typename TActive, typename TSize>
423 inline void
_upgmaTreeMerge(TMatrix & mat,TActive & active,TSize index_i,TSize index_j,TSize nseq,UpgmaMin)424 _upgmaTreeMerge(TMatrix& mat,
425 TActive& active,
426 TSize index_i,
427 TSize index_j,
428 TSize nseq,
429 UpgmaMin)
430 {
431 SEQAN_CHECKPOINT
432 typedef typename Value<TMatrix>::Type TValue;
433
434 // Minimum
435 for(TSize i=0;i<nseq;++i) {
436 if ((i != index_i) && (i != index_j) && (active[i] != 0)) {
437 TValue newDist = (index_i < i) ? mat[index_i * nseq + i] : mat[i * nseq + index_i];
438 TValue newDist2 = (index_j < i) ? mat[index_j * nseq + i] : mat[i * nseq + index_j];
439 if (index_i < i) mat[index_i * nseq + i] = _min(newDist, newDist2);
440 else mat[i * nseq + index_i] = _min(newDist, newDist2);
441 }
442 }
443 }
444
445 //////////////////////////////////////////////////////////////////////////////
446
447 template<typename TMatrix, typename TActive, typename TSize>
448 inline void
_upgmaTreeMerge(TMatrix & mat,TActive & active,TSize index_i,TSize index_j,TSize nseq,UpgmaMax)449 _upgmaTreeMerge(TMatrix& mat,
450 TActive& active,
451 TSize index_i,
452 TSize index_j,
453 TSize nseq,
454 UpgmaMax)
455 {
456 SEQAN_CHECKPOINT
457 typedef typename Value<TMatrix>::Type TValue;
458
459 // Maximum
460 for(TSize i=0;i<nseq;++i) {
461 if ((i != index_i) && (i != index_j) && (active[i] != 0)) {
462 TValue newDist = (index_i < i) ? mat[index_i * nseq + i] : mat[i * nseq + index_i];
463 TValue newDist2 = (index_j < i) ? mat[index_j * nseq + i] : mat[i * nseq + index_j];
464 if (index_i < i) mat[index_i * nseq + i] = _max(newDist, newDist2);
465 else mat[i * nseq + index_i] = _max(newDist, newDist2);
466 }
467 }
468 }
469
470 //////////////////////////////////////////////////////////////////////////////
471
472 template<typename TCargo, typename TSpec, typename TActive, typename TEdgeDescriptor>
473 inline void
_upgmaTreeMerge(Graph<Undirected<TCargo,TSpec>> & pairGraph,TActive const &,TEdgeDescriptor best,UpgmaMax)474 _upgmaTreeMerge(Graph<Undirected<TCargo, TSpec> >& pairGraph,
475 TActive const&,
476 TEdgeDescriptor best,
477 UpgmaMax)
478 {
479 typedef Graph<Undirected<TCargo, TSpec> > TGraph;
480 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
481 //typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
482 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
483
484 TVertexDescriptor s = sourceVertex(pairGraph,best);
485 TVertexDescriptor t = targetVertex(pairGraph,best);
486 typedef String<TEdgeDescriptor> TEdgeString;
487 typedef typename Iterator<TEdgeString>::Type TEdgeIter;
488 TEdgeString removeEdges;
489 for(TOutEdgeIterator outIt(pairGraph, s);!atEnd(outIt);goNext(outIt)) {
490 if (targetVertex(outIt) == t) continue;
491 TEdgeDescriptor e = findEdge(pairGraph, targetVertex(outIt), t);
492 if (e == 0) appendValue(removeEdges, value(outIt));
493 else {
494 if (cargo(e) > cargo(value(outIt))) cargo(value(outIt)) = cargo(e);
495 }
496 }
497 TEdgeIter eIt = begin(removeEdges);
498 TEdgeIter eItEnd = end(removeEdges);
499 for(;eIt != eItEnd; goNext(eIt)) removeEdge(pairGraph, value(eIt));
500 removeVertex(pairGraph, t);
501 }
502
503 //////////////////////////////////////////////////////////////////////////////
504
505 template<typename TCargo, typename TSpec, typename TActive, typename TEdgeDescriptor>
506 inline void
_upgmaTreeMerge(Graph<Undirected<TCargo,TSpec>> & pairGraph,TActive const &,TEdgeDescriptor best,UpgmaMin)507 _upgmaTreeMerge(Graph<Undirected<TCargo, TSpec> >& pairGraph,
508 TActive const&,
509 TEdgeDescriptor best,
510 UpgmaMin)
511 {
512 typedef Graph<Undirected<TCargo, TSpec> > TGraph;
513 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
514 //typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
515 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
516
517 TVertexDescriptor s = sourceVertex(pairGraph,best);
518 TVertexDescriptor t = targetVertex(pairGraph,best);
519 for(TOutEdgeIterator outIt(pairGraph, s);!atEnd(outIt);goNext(outIt)) {
520 if (targetVertex(outIt) == t) continue;
521 TEdgeDescriptor e = findEdge(pairGraph, targetVertex(outIt), t);
522 if (e != 0) {
523 if (cargo(e) < cargo(value(outIt))) cargo(value(outIt)) = cargo(e);
524 }
525 }
526 for(TOutEdgeIterator outIt(pairGraph, t);!atEnd(outIt);goNext(outIt)) {
527 if (targetVertex(outIt) == s) continue;
528 TEdgeDescriptor e = findEdge(pairGraph, targetVertex(outIt), s);
529 if (e == 0) addEdge(pairGraph, s, targetVertex(outIt), cargo(value(outIt)));
530 }
531 removeVertex(pairGraph, t);
532 }
533
534
535 //////////////////////////////////////////////////////////////////////////////
536
537 template<typename TCargo, typename TSpec, typename TActive, typename TEdgeDescriptor>
538 inline void
_upgmaTreeMerge(Graph<Undirected<TCargo,TSpec>> & pairGraph,TActive const & active,TEdgeDescriptor best,UpgmaWeightAvg)539 _upgmaTreeMerge(Graph<Undirected<TCargo, TSpec> >& pairGraph,
540 TActive const& active,
541 TEdgeDescriptor best,
542 UpgmaWeightAvg)
543 {
544 typedef Graph<Undirected<TCargo, TSpec> > TGraph;
545 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
546 //typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
547 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
548
549 TCargo infCargo = _getInfinity<TCargo>();
550 TVertexDescriptor s = sourceVertex(pairGraph,best);
551 TVertexDescriptor t = targetVertex(pairGraph,best);
552
553 for(TOutEdgeIterator outIt(pairGraph, s);!atEnd(outIt);goNext(outIt)) {
554 if (targetVertex(outIt) == t) continue;
555 TEdgeDescriptor e1 = value(outIt);
556 TEdgeDescriptor e2 = findEdge(pairGraph, targetVertex(outIt), t);
557 if (e2 != 0) cargo(e1) = ((TCargo) value(active,s) / (TCargo) (value(active,s) + value(active,t))) * cargo(e1) + ((TCargo) value(active,t) / (TCargo) (value(active,s) + value(active,t))) * cargo(e2);
558 else cargo(e1) = ((TCargo) value(active,s) / (TCargo) (value(active,s) + value(active,t))) * cargo(e1) + ((TCargo) value(active,t) / (TCargo) (value(active,s) + value(active,t))) * infCargo;
559 }
560 for(TOutEdgeIterator outIt(pairGraph, t);!atEnd(outIt);goNext(outIt)) {
561 if (targetVertex(outIt) == s) continue;
562 TEdgeDescriptor e = findEdge(pairGraph, targetVertex(outIt), s);
563 TCargo c = ((TCargo) value(active,s) / (TCargo) (value(active,s) + value(active,t))) * infCargo + ((TCargo) value(active,t) / (TCargo) (value(active,s) + value(active,t))) * cargo(value(outIt));
564 if (e == 0) addEdge(pairGraph, s, targetVertex(outIt), c);
565 }
566 removeVertex(pairGraph, t);
567 }
568
569
570 //////////////////////////////////////////////////////////////////////////////
571
572 template<typename TCargo, typename TSpec, typename TActive, typename TEdgeDescriptor>
573 inline void
_upgmaTreeMerge(Graph<Undirected<TCargo,TSpec>> & pairGraph,TActive const &,TEdgeDescriptor best,UpgmaAvg)574 _upgmaTreeMerge(Graph<Undirected<TCargo, TSpec> >& pairGraph,
575 TActive const&,
576 TEdgeDescriptor best,
577 UpgmaAvg)
578 {
579 typedef Graph<Undirected<TCargo, TSpec> > TGraph;
580 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
581 //typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
582 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
583
584 TCargo infCargo = _getInfinity<TCargo>();
585 TVertexDescriptor s = sourceVertex(pairGraph,best);
586 TVertexDescriptor t = targetVertex(pairGraph,best);
587
588 for(TOutEdgeIterator outIt(pairGraph, s);!atEnd(outIt);goNext(outIt)) {
589 if (targetVertex(outIt) == t) continue;
590 TEdgeDescriptor e1 = value(outIt);
591 TEdgeDescriptor e2 = findEdge(pairGraph, targetVertex(outIt), t);
592 cargo(e1) = (e2 != 0) ? (cargo(e1) + cargo(e2)) / 2 : (cargo(e1) + infCargo) / 2;
593 }
594 for(TOutEdgeIterator outIt(pairGraph, t);!atEnd(outIt);goNext(outIt)) {
595 if (targetVertex(outIt) == s) continue;
596 TEdgeDescriptor e = findEdge(pairGraph, targetVertex(outIt), s);
597 if (e == 0) {
598 TCargo c = (infCargo + cargo(value(outIt))) / 2;
599 addEdge(pairGraph, s, targetVertex(outIt), c);
600 }
601 }
602 removeVertex(pairGraph, t);
603 }
604
605 //////////////////////////////////////////////////////////////////////////////
606
607 template<typename TStringValue, typename TStringSpec, typename TCargo, typename TSpec, typename TTag>
608 inline void
upgmaTree(String<TStringValue,TStringSpec> & mat,Graph<Tree<TCargo,TSpec>> & g,TTag)609 upgmaTree(String<TStringValue, TStringSpec>& mat,
610 Graph<Tree<TCargo, TSpec> >& g,
611 TTag)
612 {
613 SEQAN_CHECKPOINT
614 typedef Graph<Tree<TCargo, TSpec> > TGraph;
615 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
616 typedef typename Size<TGraph>::Type TSize;
617 typedef String<TStringValue, TStringSpec> TMatrix;
618 typedef typename Value<TMatrix>::Type TValue;
619 //typedef typename Iterator<TMatrix>::Type TMatrixIter;
620
621 // First initialization
622 TSize nseq = (TSize) std::sqrt((double)length(mat));
623 clearVertices(g);
624
625 // Is it possible to make a guide tree?
626 if (nseq == 1) {
627 g.data_root = addVertex(g);
628 return;
629 } else if (nseq == 2) {
630 TVertexDescriptor v1 = addVertex(g);
631 TVertexDescriptor v2 = addVertex(g);
632 TVertexDescriptor internalVertex = addVertex(g);
633 addEdge(g, internalVertex, v1, (TCargo) 1);
634 addEdge(g, internalVertex, v2, (TCargo) 1);
635 g.data_root = internalVertex;
636 return;
637 }
638
639 // Which entries in the matrix are still active and how many members belong to this group
640 String<TSize> active;
641 resize(active, nseq, 1);
642 // Vertex descriptor that represents that entry
643 String<TVertexDescriptor> nodes;
644 reserve(nodes, nseq);
645
646
647 // Find the minimal value
648 bool notFound = true;
649 TValue minVal = 0;
650 TSize index_i = 0;
651 TSize index_j = 1;
652 for(TSize row=0;row<nseq;++row) {
653 for(TSize col=row+1;col<nseq;++col) {
654 if ((notFound) || (minVal > mat[row*nseq + col])) {
655 notFound = false;
656 minVal = mat[row*nseq + col];
657 index_i = row;
658 index_j = col;
659 }
660 }
661 appendValue(nodes, addVertex(g)); // For each sequence one vertex
662 }
663
664 // Property map for sum of weights for each node
665 String<TCargo> weights;
666 resize(weights, nseq, (TCargo) 0);
667 reserve(weights, 2*nseq - 1);
668
669 // Merge groups
670 TSize m = nseq;
671 while (m>1) {
672 // Merge nodes
673 TVertexDescriptor internalNode = addVertex(g);
674
675 //// Debug code
676 //for(TSize i=0;i<nseq;++i) {
677 // if (value(active,i)==0) continue;
678 // for(TSize j=i+1;j<nseq;++j) {
679 // if (value(active,j)==0) continue;
680 // std::cout << value(mat, i*nseq+j) << ",";
681 // }
682 // std::cout << std::endl;
683 //}
684 //std::cout << minVal << ',' << index_i << ',' << index_j << ',' << std::endl;
685 //std::cout << nodes[index_i] << ',' << nodes[index_j] << std::endl;
686 //std::cout << std::endl;
687
688 TCargo w = (TCargo) (minVal / 2);
689 addEdge(g, internalNode, nodes[index_i], w - property(weights, nodes[index_i]));
690 addEdge(g, internalNode, nodes[index_j], w - property(weights, nodes[index_j]));
691 appendValue(weights, w);
692
693 // Get the new distance values
694 _upgmaTreeMerge(mat, active, index_i, index_j, nseq, TTag());
695
696 // Inactivate one group, adjust the member count for the other one
697 active[index_i] += active[index_j];
698 active[index_j] = 0;
699 nodes[index_i] = internalNode;
700
701 // Find new minimum
702 notFound = true;
703 for(TSize i=0;i<nseq;++i) {
704 if (active[i] == 0) continue;
705 for(TSize j=i+1;j<nseq;++j) {
706 if (active[j] == 0) continue;
707 if ((notFound) || (minVal > mat[i*nseq + j])) {
708 notFound = false;
709 minVal = mat[i*nseq + j];
710 index_i = i;
711 index_j = j;
712 }
713 }
714 }
715 --m;
716 }
717 g.data_root = numVertices(g) - 1;
718 }
719
720 //////////////////////////////////////////////////////////////////////////////
721
722 template<typename TValue, typename TSpec1, typename TCargo, typename TSpec2, typename TTag>
723 inline void
upgmaTree(Graph<Undirected<TValue,TSpec1>> & pairGraph,Graph<Tree<TCargo,TSpec2>> & g,TTag)724 upgmaTree(Graph<Undirected<TValue, TSpec1> >& pairGraph,
725 Graph<Tree<TCargo, TSpec2> >& g,
726 TTag)
727 {
728 SEQAN_CHECKPOINT
729 typedef Graph<Undirected<TValue, TSpec1> > TPairGraph;
730 typedef Graph<Tree<TCargo, TSpec2> > TGuideTree;
731 typedef typename VertexDescriptor<TGuideTree>::Type TVertexDescriptor;
732 typedef typename VertexDescriptor<TPairGraph>::Type TVD;
733 typedef typename EdgeDescriptor<TPairGraph>::Type TED;
734 typedef typename Iterator<TPairGraph, EdgeIterator>::Type TEdgeI;
735 typedef typename Iterator<TPairGraph, OutEdgeIterator>::Type TEdgeOutI;
736 typedef typename Iterator<TPairGraph, VertexIterator>::Type TVertexI;
737 typedef typename Size<TPairGraph>::Type TSize;
738
739 // First initialization
740 TCargo const maxVal = maxValue<TCargo>();
741 TSize nseq = numVertices(pairGraph);
742 TCargo infCargo = _getInfinity<TCargo>();
743 clearVertices(g);
744
745 // Is it possible to make a guide tree?
746 if (nseq == 1) {
747 g.data_root = addVertex(g);
748 return;
749 } else if (nseq == 2) {
750 TVertexDescriptor v1 = addVertex(g);
751 TVertexDescriptor v2 = addVertex(g);
752 TVertexDescriptor internalVertex = addVertex(g);
753 addEdge(g, internalVertex, v1, (TCargo) 1);
754 addEdge(g, internalVertex, v2, (TCargo) 1);
755 g.data_root = internalVertex;
756 return;
757 }
758
759 // Which entries in the matrix are still active and how many members belong to this group
760 String<TSize> active;
761 resize(active, nseq, 1);
762 // Vertex descriptor that represents that entry
763 typedef String<TVertexDescriptor> TNodeString;
764 typedef typename Iterator<TNodeString, Standard>::Type TNodeIter;
765 TNodeString nodes;
766 resize(nodes, nseq);
767 TNodeIter nodeIt = begin(nodes, Standard() );
768 TNodeIter nodeItEnd = end(nodes, Standard() );
769 for(;nodeIt<nodeItEnd;goNext(nodeIt))
770 *nodeIt = addVertex(g); // For each sequence one vertex
771
772 // Find the minimal value for all vertices (with respect to all greater vertices)
773 typedef Pair<TValue, TVD> TWeightEdgePair;
774 typedef String<TWeightEdgePair> TMinValues;
775 TMinValues minValues;
776 resize(minValues, nseq, TWeightEdgePair(maxVal, 0));
777 TEdgeI itE(pairGraph);
778 for(;!atEnd(itE);goNext(itE)) {
779 TVD s = sourceVertex(itE);
780 TVD t = targetVertex(itE);
781 if (cargo(*itE) < (minValues[s].i1))
782 minValues[s] = TWeightEdgePair(cargo(*itE), t);
783 }
784 // Find the overall minimum
785 typedef typename Iterator<TMinValues, Standard>::Type TMinIter;
786 TMinIter itMin = begin(minValues, Standard() );
787 TMinIter itMinEnd = end(minValues, Standard() );
788 TValue minVal = maxVal;
789 TVD sourceBest = 0;
790 TVD targetBest = 0;
791 for(TVD index=0;itMin != itMinEnd; goNext(itMin), ++index) {
792 if ((*itMin).i1 < minVal) {
793 minVal = (*itMin).i1;
794 sourceBest = index;
795 targetBest = (*itMin).i2;
796 }
797 }
798 TED best = 0;
799 if (sourceBest == targetBest) { // If none is found we have to insert a new edge
800 TVertexI itV(pairGraph);
801 sourceBest = value(itV); goNext(itV);
802 targetBest = value(itV);
803 best = addEdge(pairGraph, sourceBest, targetBest, infCargo);
804 } else best = findEdge(pairGraph, sourceBest, targetBest);
805
806 // Property map for sum of weights for each node
807 String<TCargo> weights;
808 resize(weights, nseq, (TCargo) 0);
809 reserve(weights, 2*nseq - 1);
810
811 // Merge groups
812 TSize m = nseq;
813 while (m>1) {
814 // Merge nodes
815 TVertexDescriptor internalNode = addVertex(g);
816
817 // Set the weights
818 TCargo w = (TCargo) (minVal / 2);
819 addEdge(g, internalNode, nodes[sourceBest], w - property(weights, nodes[sourceBest]));
820 addEdge(g, internalNode, nodes[targetBest], w - property(weights, nodes[targetBest]));
821 appendValue(weights, w);
822
823 // Get the new distance values
824 _upgmaTreeMerge(pairGraph, active, best, TTag());
825
826 // Inactivate one group, adjust the member count for the other one
827 active[sourceBest] += active[targetBest];
828 active[targetBest] = 0;
829 nodes[sourceBest] = internalNode;
830
831 // Update the minimum values
832 minValues[sourceBest] = TWeightEdgePair(maxVal, 0);
833 for(TEdgeOutI itOutE(pairGraph, sourceBest);!atEnd(itOutE);goNext(itOutE)) {
834 TVD localTVD = targetVertex(itOutE);
835 if (sourceBest < localTVD) {
836 if (cargo(value(itOutE)) < (minValues[sourceBest].i1)) minValues[sourceBest] = TWeightEdgePair(cargo(value(itOutE)), localTVD);
837 }
838 }
839 // Find the new minimum value
840 itMin = begin(minValues, Standard() );
841 itMinEnd = end(minValues, Standard() );
842 minVal = maxVal;
843 TVD oldSourceBest = sourceBest;
844 sourceBest = 0;
845 targetBest = 0;
846 for(TVD index= 0;itMin != itMinEnd; goNext(itMin), ++index) {
847 if (active[index] == 0) continue;
848 if (((*itMin).i2 == oldSourceBest) || (active[(*itMin).i2] == 0)) {
849 // Update the values
850 (*itMin).i1 = maxVal;
851 TEdgeOutI itOutLocal(pairGraph, index);
852 for(;!atEnd(itOutLocal);goNext(itOutLocal)) {
853 TVD targ = targetVertex(itOutLocal);
854 if (targ < index) continue;
855 if (cargo(value(itOutLocal)) < (*itMin).i1) *itMin = TWeightEdgePair(cargo(value(itOutLocal)), targ);
856 }
857 }
858 if ((*itMin).i1 < minVal) {
859 minVal = (*itMin).i1;
860 sourceBest = index;
861 targetBest = (*itMin).i2;
862 }
863 }
864 // If none is found we have to insert a new edge
865 if ((sourceBest == targetBest) && (m>2)) {
866 TVertexI itV(pairGraph);
867 sourceBest = value(itV); goNext(itV);
868 targetBest = value(itV);
869 best = addEdge(pairGraph, sourceBest, targetBest, infCargo);
870 } else best = findEdge(pairGraph, sourceBest, targetBest);
871 --m;
872 }
873 g.data_root = numVertices(g) - 1;
874 }
875
876 //////////////////////////////////////////////////////////////////////////////
877
878 /*!
879 * @fn upgmaTree
880 * @headerfile <seqan/graph_msa.h>
881 * @brief Computes a guide tree from a distance matrix.
882 *
883 * @signature void upgmaTree(mat, tree[, tag]);
884 *
885 * @param[in] mat A matrix with pairwise distance values. Can be a @link String @endlink representing a (dense)
886 * square matrix or a @link UndirectedGraph @endlink where each edge weight corresponds to the
887 * distance between sequence i and j (if the edge is present).
888 * @param[out] tree An undirected guide tre.
889 * @param[in] tag Tag indicating how to calcualte the cluster distances. See: @link UpgmaConfiguratorTags @endlink.
890 * Default: <tt>UpgmaWeightAvg</tt>.
891 */
892
893 template<typename TDistance, typename TCargo, typename TSpec>
894 inline void
upgmaTree(TDistance & dist,Graph<Tree<TCargo,TSpec>> & g)895 upgmaTree(TDistance& dist,
896 Graph<Tree<TCargo, TSpec> >& g)
897 {
898 SEQAN_CHECKPOINT
899 upgmaTree(dist, g, UpgmaWeightAvg());
900 }
901
902
903 }// namespace SEQAN_NAMESPACE_MAIN
904
905 #endif //#ifndef SEQAN_HEADER_...
906