1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 
33 #ifndef SEQAN_HEADER_GRAPH_ALIGN_TCOFFEE_GUIDETREE_H
34 #define SEQAN_HEADER_GRAPH_ALIGN_TCOFFEE_GUIDETREE_H
35 
36 namespace SEQAN_NAMESPACE_MAIN
37 {
38 
39 //////////////////////////////////////////////////////////////////////////////
40 // Guide Tree
41 //////////////////////////////////////////////////////////////////////////////
42 
43 
44 
45 
46 //////////////////////////////////////////////////////////////////////////////
47 // Neighbor Joining
48 //////////////////////////////////////////////////////////////////////////////
49 
50 
51 // Helper function for rounding to n significant digits.  Ported from
52 // Java code found here: http://stackoverflow.com/questions/202302
53 inline double
_roundToSignificantFigures(double num,int n)54 _roundToSignificantFigures(double num, int n)
55 {
56     if (num == 0)
57         return 0;
58 
59     const double d = ceil(log10(num < 0 ? -num : num));
60     const int power = n - (int) d;
61 
62     const double magnitude = pow(10.0, power);
63     const long shifted = static_cast<long>(round(num*magnitude));
64     return shifted / magnitude;
65 }
66 
67 //////////////////////////////////////////////////////////////////////////////
68 
69 /*!
70  * @fn njTree
71  * @headerfile <seqan/graph_msa.h>
72  * @brief computes a guid etree from a distance matrix.
73  *
74  * @signature void njTree(mat, tree);
75  *
76  * @param[in]  mat  A @link String @endlink of pairwise distance values, representing a square matrix.
77  * @param[out] tree The guide tree.
78  */
79 
80 template<typename TValue, typename TStringSpec, typename TCargo, typename TSpec>
81 inline void
njTree(String<TValue,TStringSpec> const & matIn,Graph<Tree<TCargo,TSpec>> & g)82 njTree(String<TValue, TStringSpec> const & matIn,
83        Graph<Tree<TCargo, TSpec> >& g)
84 {
85     typedef String<TValue, TStringSpec> TMatrix;
86     typedef typename Size<TMatrix>::Type TSize;
87     typedef Graph<Tree<TCargo, TSpec> > TGraph;
88     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
89 
90     TVertexDescriptor nilVertex = getNil<TVertexDescriptor>();
91     TSize nseq = (TSize) std::sqrt((double)length(matIn));
92 
93     // Assert that the input matrix has no negative values.
94 // #if SEQAN_ENABLE_DEBUG
95 //     for (unsigned i = 0; i < length(matIn); ++i)
96 //         SEQAN_ASSERT_GEQ_MSG(matIn[i], 0, "i = %u", i);
97 // #endif  // #if SEQAN_ENABLE_DEBUG
98 
99     //for(TSize i=0;i<nseq;++i) {
100     //    for(TSize j=0;j<nseq;++j) {
101     //        std::cout << getValue(matIn, i*nseq+j) << ",";
102     //    }
103     //    std::cout << std::endl;
104     //}
105 
106     // Handle base cases for one and two sequences.
107     clearVertices(g);
108     if (nseq == 1)
109     {
110         g.data_root = addVertex(g);
111         return;
112     }
113     else if (nseq == 2)
114     {
115         TVertexDescriptor v1 = addVertex(g);
116         TVertexDescriptor v2 = addVertex(g);
117         TVertexDescriptor internalVertex = addVertex(g);
118         addEdge(g, internalVertex, v1, (TCargo) _roundToSignificantFigures(matIn[1] / 2.0, 5));
119         addEdge(g, internalVertex, v2, (TCargo) _roundToSignificantFigures(matIn[1] / 2.0, 5));
120         g.data_root = internalVertex;
121         return;
122     }
123 
124     // Create a normalized copy of matIn with fixed point numbers, precision of 10 digits.
125     TValue normFactor = 0;
126     for (unsigned i = 0; i < length(matIn); ++i)
127         normFactor = std::max(normFactor, matIn[i]);
128     // In the case that normFactor is 0 (e.g. all sequences equal) then fix it to 1 so the normalization
129     // does not break.
130     if (!normFactor)
131         normFactor = 1;
132     String<__int64> mat;
133     resize(mat, length(matIn));
134     for (unsigned i = 0; i < length(mat); ++i)
135         mat[i] = static_cast<__int64>(10000000.0 * ((double)(matIn[i]) / (double)(normFactor)));
136 
137     // First initialization
138     String<__int64> av;    // Average branch length to a combined node
139     resize(av,nseq,0);
140 
141     String<TVertexDescriptor> connector;   // Nodes that need to be connected
142     resize(connector, nseq);
143 
144     for(TSize i=0;i<nseq;++i)
145     {
146         addVertex(g);  // Add all the nodes that correspond to sequences
147         connector[i] = i;
148         mat[i*nseq+i] = 0;
149     }
150 
151     // Main cycle
152     __int64 fnseqs = static_cast<__int64>(nseq);
153     for(TSize nc=0; nc<(nseq-3); ++nc) {
154         __int64 sumOfBranches = 0;
155 
156         // Determine the sum of all branches and
157         // copy upper triangle matrix to lower triangle
158         for(TSize col=1; col<nseq; ++col)
159             for(TSize row=0; row<col; ++row)
160                 sumOfBranches += mat[col*nseq+row] = mat[row*nseq+col];
161 
162         // Compute the sum of branch lengths for all possible pairs
163         bool notFound = true;
164         __int64 tmin = 0;
165         TSize mini = 0;  // Next pair of seq i and j to join
166         TSize minj = 0;
167         __int64 diToAllOthers = 0;
168         __int64 djToAllOthers = 0;
169         __int64 total = 0;
170         for(TSize col=1; col<nseq; ++col)  {
171             if (connector[col] != nilVertex) {
172                 for(TSize row=0; row<col; ++row) {
173                     if (connector[row] != nilVertex) {
174                         diToAllOthers = 0;
175                         djToAllOthers = 0;
176 
177                         for(TSize i=0; i<nseq; ++i) {
178                             diToAllOthers += mat[i*nseq+row];
179                             djToAllOthers += mat[i*nseq+col];
180                         }
181 
182                         total = diToAllOthers + djToAllOthers + (fnseqs - 2) * mat[row*nseq+col] + 2 * (sumOfBranches - diToAllOthers - djToAllOthers);
183                         total /= (2*(fnseqs - 2));
184 
185                         if ((notFound) || (total < tmin)) {
186                             notFound = false;
187                             tmin = total;
188                             mini = row;
189                             minj = col;
190                         }
191                     }
192                 }
193             }
194         }
195 
196         // Print nodes that are about to be joined
197         //std::cout << mini << std::endl;
198         //std::cout << minj << std::endl;
199         //std::cout << tmin << std::endl;
200         //std::cout << std::endl;
201 
202         // Compute branch lengths
203         __int64 dMinIToOthers = 0;
204         __int64 dMinJToOthers = 0;
205         for(TSize i=0; i<nseq; ++i) {
206             dMinIToOthers += mat[i*nseq + mini];
207             dMinJToOthers += mat[i*nseq + minj];
208         }
209         __int64 dmin = mat[mini*nseq + minj];
210         dMinIToOthers = dMinIToOthers / (fnseqs - 2);
211         dMinJToOthers = dMinJToOthers / (fnseqs - 2);
212         __int64 iBranch = (dmin + dMinIToOthers - dMinJToOthers) / 2;
213         __int64 jBranch = dmin - iBranch;
214         iBranch -= av[mini];
215         jBranch -= av[minj];
216 
217         // Set negative branch length to zero
218         if( iBranch < 0) iBranch = 0;
219         if( jBranch < 0) jBranch = 0;
220 
221         // Print branch lengths
222         //std::cout << iBranch << std::endl;
223         //std::cout << jBranch << std::endl;
224         //std::cout << std::endl;
225 
226         // Build tree
227         TVertexDescriptor internalVertex = addVertex(g);
228         addEdge(g, internalVertex, connector[mini], (TCargo) _roundToSignificantFigures((iBranch / 10000000.0) * normFactor, 5));
229         addEdge(g, internalVertex, connector[minj], (TCargo) _roundToSignificantFigures((jBranch / 10000000.0) * normFactor, 5));
230 
231         // Remember the average branch length for the new combined node
232         // Must be subtracted from all branches that include this node
233         if(dmin < 0) dmin = 0;
234         av[mini] = dmin / 2;
235 
236 
237         // Re-initialisation
238         // mini becomes the new combined node, minj is killed
239         --fnseqs;
240         connector[minj] = nilVertex;
241         connector[mini] = internalVertex;
242 
243         for(TSize j=0; j<nseq; ++j) {
244             if( connector[j] != nilVertex ) {
245                 // Use upper triangle
246                 if((TSize) mini < j) mat[mini*nseq+j] = (mat[mini*nseq+j] + mat[minj*nseq+j]) / 2;
247                 if((TSize) mini > j) mat[j*nseq+mini] = (mat[mini*nseq+j] + mat[minj*nseq+j]) / 2;
248             }
249         }
250         for(TSize j=0; j<nseq; ++j)
251             mat[j*nseq+minj] = mat[minj*nseq+j] = 0;
252     }
253 
254     // Only three nodes left
255 
256     // Find the remaining nodes
257     String<TSize> l;
258     resize(l,3);
259     TSize count = 0;
260     for(TSize i=0; i<nseq; ++i) {
261         if(connector[i] != nilVertex) {
262             l[count] = i;
263             ++count;
264         }
265     }
266 
267     // Remaining nodes
268     //std::cout << l[0] << std::endl;
269     //std::cout << l[1] << std::endl;
270     //std::cout << l[2] << std::endl;
271     //std::cout << std::endl;
272 
273     String<__int64> branch;
274     resize(branch, 3);
275     branch[0] = (mat[l[0]*nseq+l[1]] + mat[l[0]*nseq+l[2]] - mat[l[1]*nseq+l[2]]) / 2;
276     branch[1] = (mat[l[1]*nseq+l[2]] + mat[l[0]*nseq+l[1]] - mat[l[0]*nseq+l[2]]) / 2;
277     branch[2] = (mat[l[1]*nseq+l[2]] + mat[l[0]*nseq+l[2]] - mat[l[0]*nseq+l[1]]) / 2;
278 
279     branch[0] -= av[l[0]];
280     branch[1] -= av[l[1]];
281     branch[2] -= av[l[2]];
282 
283     // Print branch lengths
284     //std::cout << branch[0] << std::endl;
285     //std::cout << branch[1] << std::endl;
286     //std::cout << branch[2] << std::endl;
287     //std::cout << std::endl;
288 
289     // Reset negative branch lengths to zero
290     if( branch[0] < 0) branch[0] = 0;
291     if( branch[1] < 0) branch[1] = 0;
292     if( branch[2] < 0) branch[2] = 0;
293 
294     // Build tree
295     TVertexDescriptor internalVertex = addVertex(g);
296     addEdge(g, internalVertex, getValue(connector, l[0]), (TCargo) _roundToSignificantFigures((branch[0] / 10000000.0)* normFactor, 5));
297     addEdge(g, internalVertex, getValue(connector, l[1]), (TCargo) _roundToSignificantFigures((branch[1] / 10000000.0) * normFactor, 5));
298     TVertexDescriptor the_root = addVertex(g);
299     addEdge(g, the_root, getValue(connector, l[2]), (TCargo) _roundToSignificantFigures((branch[2] / 20000000.0) * normFactor, 5));
300     addEdge(g, the_root, internalVertex, (TCargo) _roundToSignificantFigures((branch[2] / 20000000.0) * normFactor, 5));
301     g.data_root = the_root;
302 }
303 
304 
305 
306 
307 
308 
309 //////////////////////////////////////////////////////////////////////////////
310 // Unweighted Pair Group Mean Average (UPGMA)
311 //////////////////////////////////////////////////////////////////////////////
312 
313 /*!
314  * @defgroup UpgmaConfiguratorTags
315  * @brief Tags for configuring the guide tree construction.
316  *
317  * @tag UpgmaConfiguratorTags#UpgmaMin
318  * @headerfile <seqan/graph_msa.h>
319  * @brief Uses the min operation in the UPGMA algorithm.
320  *
321  * @signature typedef Tag<UpgmaMin_> const UpgmaMin;
322  *
323  * @tag UpgmaConfiguratorTags#UpgmaMax
324  * @headerfile <seqan/graph_msa.h>
325  * @brief Uses the max operation in the UPGMA algorithm.
326  *
327  * @signature typedef Tag<UpgmaMax_> const UpgmaMax;
328  *
329  * @tag UpgmaConfiguratorTags#UpgmaAvg
330  * @headerfile <seqan/graph_msa.h>
331  * @brief Uses the avg operation in the UPGMA algorithm.
332  *
333  * @signature typedef Tag<UpgmaAvg_> const UpgmaAvg;
334  *
335  * @tag UpgmaConfiguratorTags#UpgmaWeightAvg
336  * @headerfile <seqan/graph_msa.h>
337  * @brief Uses the weighted avg operation in the UPGMA algorithm.
338  *
339  * @signature typedef Tag<UpgmaAvg_> const UpgmaWeightAvg;
340  */
341 
342 //////////////////////////////////////////////////////////////////////////////
343 
344 struct UpgmaMin_;
345 typedef Tag<UpgmaMin_> const UpgmaMin;
346 
347 //////////////////////////////////////////////////////////////////////////////
348 
349 struct UpgmaMax_;
350 typedef Tag<UpgmaMax_> const UpgmaMax;
351 
352 //////////////////////////////////////////////////////////////////////////////
353 
354 struct UpgmaAvg_;
355 typedef Tag<UpgmaAvg_> const UpgmaAvg;
356 
357 //////////////////////////////////////////////////////////////////////////////
358 
359 struct UpgmaWeightAvg_;
360 typedef Tag<UpgmaWeightAvg_> const UpgmaWeightAvg;
361 
362 
363 
364 //////////////////////////////////////////////////////////////////////////////
365 
366 template<typename TMatrix, typename TActive, typename TSize>
367 inline void
_upgmaTreeMerge(TMatrix & mat,TActive & active,TSize index_i,TSize index_j,TSize nseq,UpgmaWeightAvg)368 _upgmaTreeMerge(TMatrix& mat,
369                 TActive& active,
370                 TSize index_i,
371                 TSize index_j,
372                 TSize nseq,
373                 UpgmaWeightAvg)
374 {
375     SEQAN_CHECKPOINT
376     typedef typename Value<TMatrix>::Type TValue;
377     // Average
378     for(TSize i=0;i<nseq;++i) {
379         if ((i != index_i) && (i != index_j) && (active[i] != 0)) {
380             if (index_i < i) {
381                 mat[index_i*nseq + i] = ((TValue) active[index_i] / (TValue) (active[index_i] + active[index_j])) * mat[index_i * nseq + i];
382                 if (index_j < i) mat[index_i*nseq + i] += ((TValue) active[index_j] / (TValue) (active[index_i] + active[index_j])) * mat[index_j * nseq + i];
383                 else mat[index_i*nseq + i] += ((TValue) active[index_j] / (TValue) (active[index_i] + active[index_j])) * mat[i * nseq + index_j];
384             } else {
385                 mat[i*nseq + index_i] = ((TValue) active[index_i] / (TValue) (active[index_i] + active[index_j])) * mat[i * nseq + index_i];
386                 if (index_j < i) value(mat, i*nseq + index_i) += ((TValue) active[index_j] / (TValue) (active[index_i] + active[index_j])) * mat[index_j * nseq + i];
387                 else mat[i*nseq + index_i] += ((TValue) active[index_j] / (TValue) (active[index_i] + active[index_j])) * mat[i * nseq + index_j];
388             }
389         }
390     }
391 }
392 
393 //////////////////////////////////////////////////////////////////////////////
394 
395 
396 template<typename TMatrix, typename TActive, typename TSize>
397 inline void
_upgmaTreeMerge(TMatrix & mat,TActive & active,TSize index_i,TSize index_j,TSize nseq,UpgmaAvg)398 _upgmaTreeMerge(TMatrix& mat,
399                 TActive& active,
400                 TSize index_i,
401                 TSize index_j,
402                 TSize nseq,
403                 UpgmaAvg)
404 {
405     SEQAN_CHECKPOINT
406     typedef typename Value<TMatrix>::Type TValue;
407 
408     // Minimum
409     for(TSize i=0;i<nseq;++i) {
410         if ((i != index_i) && (i != index_j) && (active[i] != 0)) {
411             TValue val1 = (index_i < i) ? mat[index_i * nseq + i] : mat[i * nseq + index_i];
412             TValue val2 = (index_j < i) ? mat[index_j * nseq + i] : mat[i * nseq + index_j];
413             if (index_i < i) mat[index_i * nseq + i] = (val1 + val2) / 2;
414             else mat[i * nseq + index_i] = (val1 + val2) / 2;
415         }
416     }
417 }
418 
419 //////////////////////////////////////////////////////////////////////////////
420 
421 
422 template<typename TMatrix, typename TActive, typename TSize>
423 inline void
_upgmaTreeMerge(TMatrix & mat,TActive & active,TSize index_i,TSize index_j,TSize nseq,UpgmaMin)424 _upgmaTreeMerge(TMatrix& mat,
425                 TActive& active,
426                 TSize index_i,
427                 TSize index_j,
428                 TSize nseq,
429                 UpgmaMin)
430 {
431     SEQAN_CHECKPOINT
432     typedef typename Value<TMatrix>::Type TValue;
433 
434     // Minimum
435     for(TSize i=0;i<nseq;++i) {
436         if ((i != index_i) && (i != index_j) && (active[i] != 0)) {
437             TValue newDist = (index_i < i) ? mat[index_i * nseq + i] : mat[i * nseq + index_i];
438             TValue newDist2 = (index_j < i) ? mat[index_j * nseq + i] : mat[i * nseq + index_j];
439             if (index_i < i) mat[index_i * nseq + i] = _min(newDist, newDist2);
440             else mat[i * nseq + index_i] = _min(newDist, newDist2);
441         }
442     }
443 }
444 
445 //////////////////////////////////////////////////////////////////////////////
446 
447 template<typename TMatrix, typename TActive, typename TSize>
448 inline void
_upgmaTreeMerge(TMatrix & mat,TActive & active,TSize index_i,TSize index_j,TSize nseq,UpgmaMax)449 _upgmaTreeMerge(TMatrix& mat,
450                 TActive& active,
451                 TSize index_i,
452                 TSize index_j,
453                 TSize nseq,
454                 UpgmaMax)
455 {
456     SEQAN_CHECKPOINT
457     typedef typename Value<TMatrix>::Type TValue;
458 
459     // Maximum
460     for(TSize i=0;i<nseq;++i) {
461         if ((i != index_i) && (i != index_j) && (active[i] != 0)) {
462             TValue newDist = (index_i < i) ? mat[index_i * nseq + i] : mat[i * nseq + index_i];
463             TValue newDist2 = (index_j < i) ? mat[index_j * nseq + i] : mat[i * nseq + index_j];
464             if (index_i < i) mat[index_i * nseq + i] = _max(newDist, newDist2);
465             else mat[i * nseq + index_i] = _max(newDist, newDist2);
466         }
467     }
468 }
469 
470 //////////////////////////////////////////////////////////////////////////////
471 
472 template<typename TCargo, typename TSpec, typename TActive, typename TEdgeDescriptor>
473 inline void
_upgmaTreeMerge(Graph<Undirected<TCargo,TSpec>> & pairGraph,TActive const &,TEdgeDescriptor best,UpgmaMax)474 _upgmaTreeMerge(Graph<Undirected<TCargo, TSpec> >& pairGraph,
475                 TActive const&,
476                 TEdgeDescriptor best,
477                 UpgmaMax)
478 {
479     typedef Graph<Undirected<TCargo, TSpec> > TGraph;
480     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
481     //typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
482     typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
483 
484     TVertexDescriptor s = sourceVertex(pairGraph,best);
485     TVertexDescriptor t = targetVertex(pairGraph,best);
486     typedef String<TEdgeDescriptor> TEdgeString;
487     typedef typename Iterator<TEdgeString>::Type TEdgeIter;
488     TEdgeString removeEdges;
489     for(TOutEdgeIterator outIt(pairGraph, s);!atEnd(outIt);goNext(outIt)) {
490         if (targetVertex(outIt) == t) continue;
491         TEdgeDescriptor e = findEdge(pairGraph, targetVertex(outIt), t);
492         if (e == 0) appendValue(removeEdges, value(outIt));
493         else {
494             if (cargo(e) > cargo(value(outIt))) cargo(value(outIt)) = cargo(e);
495         }
496     }
497     TEdgeIter eIt = begin(removeEdges);
498     TEdgeIter eItEnd = end(removeEdges);
499     for(;eIt != eItEnd; goNext(eIt)) removeEdge(pairGraph, value(eIt));
500     removeVertex(pairGraph, t);
501 }
502 
503 //////////////////////////////////////////////////////////////////////////////
504 
505 template<typename TCargo, typename TSpec, typename TActive, typename TEdgeDescriptor>
506 inline void
_upgmaTreeMerge(Graph<Undirected<TCargo,TSpec>> & pairGraph,TActive const &,TEdgeDescriptor best,UpgmaMin)507 _upgmaTreeMerge(Graph<Undirected<TCargo, TSpec> >& pairGraph,
508                 TActive const&,
509                 TEdgeDescriptor best,
510                 UpgmaMin)
511 {
512     typedef Graph<Undirected<TCargo, TSpec> > TGraph;
513     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
514     //typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
515     typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
516 
517     TVertexDescriptor s = sourceVertex(pairGraph,best);
518     TVertexDescriptor t = targetVertex(pairGraph,best);
519     for(TOutEdgeIterator outIt(pairGraph, s);!atEnd(outIt);goNext(outIt)) {
520         if (targetVertex(outIt) == t) continue;
521         TEdgeDescriptor e = findEdge(pairGraph, targetVertex(outIt), t);
522         if (e != 0) {
523             if (cargo(e) < cargo(value(outIt))) cargo(value(outIt)) = cargo(e);
524         }
525     }
526     for(TOutEdgeIterator outIt(pairGraph, t);!atEnd(outIt);goNext(outIt)) {
527         if (targetVertex(outIt) == s) continue;
528         TEdgeDescriptor e = findEdge(pairGraph, targetVertex(outIt), s);
529         if (e == 0) addEdge(pairGraph, s, targetVertex(outIt), cargo(value(outIt)));
530     }
531     removeVertex(pairGraph, t);
532 }
533 
534 
535 //////////////////////////////////////////////////////////////////////////////
536 
537 template<typename TCargo, typename TSpec, typename TActive, typename TEdgeDescriptor>
538 inline void
_upgmaTreeMerge(Graph<Undirected<TCargo,TSpec>> & pairGraph,TActive const & active,TEdgeDescriptor best,UpgmaWeightAvg)539 _upgmaTreeMerge(Graph<Undirected<TCargo, TSpec> >& pairGraph,
540                 TActive const& active,
541                 TEdgeDescriptor best,
542                 UpgmaWeightAvg)
543 {
544     typedef Graph<Undirected<TCargo, TSpec> > TGraph;
545     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
546     //typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
547     typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
548 
549     TCargo infCargo = _getInfinity<TCargo>();
550     TVertexDescriptor s = sourceVertex(pairGraph,best);
551     TVertexDescriptor t = targetVertex(pairGraph,best);
552 
553     for(TOutEdgeIterator outIt(pairGraph, s);!atEnd(outIt);goNext(outIt)) {
554         if (targetVertex(outIt) == t) continue;
555         TEdgeDescriptor e1 = value(outIt);
556         TEdgeDescriptor e2 = findEdge(pairGraph, targetVertex(outIt), t);
557         if (e2 != 0) cargo(e1) = ((TCargo) value(active,s) / (TCargo) (value(active,s) + value(active,t))) * cargo(e1) + ((TCargo) value(active,t) / (TCargo) (value(active,s) + value(active,t))) * cargo(e2);
558         else cargo(e1) = ((TCargo) value(active,s) / (TCargo) (value(active,s) + value(active,t))) * cargo(e1) + ((TCargo) value(active,t) / (TCargo) (value(active,s) + value(active,t))) * infCargo;
559     }
560     for(TOutEdgeIterator outIt(pairGraph, t);!atEnd(outIt);goNext(outIt)) {
561         if (targetVertex(outIt) == s) continue;
562         TEdgeDescriptor e = findEdge(pairGraph, targetVertex(outIt), s);
563         TCargo c = ((TCargo) value(active,s) / (TCargo) (value(active,s) + value(active,t))) * infCargo + ((TCargo) value(active,t) / (TCargo) (value(active,s) + value(active,t))) * cargo(value(outIt));
564         if (e == 0)  addEdge(pairGraph, s, targetVertex(outIt), c);
565     }
566     removeVertex(pairGraph, t);
567 }
568 
569 
570 //////////////////////////////////////////////////////////////////////////////
571 
572 template<typename TCargo, typename TSpec, typename TActive, typename TEdgeDescriptor>
573 inline void
_upgmaTreeMerge(Graph<Undirected<TCargo,TSpec>> & pairGraph,TActive const &,TEdgeDescriptor best,UpgmaAvg)574 _upgmaTreeMerge(Graph<Undirected<TCargo, TSpec> >& pairGraph,
575                 TActive const&,
576                 TEdgeDescriptor best,
577                 UpgmaAvg)
578 {
579     typedef Graph<Undirected<TCargo, TSpec> > TGraph;
580     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
581     //typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
582     typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
583 
584     TCargo infCargo = _getInfinity<TCargo>();
585     TVertexDescriptor s = sourceVertex(pairGraph,best);
586     TVertexDescriptor t = targetVertex(pairGraph,best);
587 
588     for(TOutEdgeIterator outIt(pairGraph, s);!atEnd(outIt);goNext(outIt)) {
589         if (targetVertex(outIt) == t) continue;
590         TEdgeDescriptor e1 = value(outIt);
591         TEdgeDescriptor e2 = findEdge(pairGraph, targetVertex(outIt), t);
592         cargo(e1) = (e2 != 0) ? (cargo(e1) + cargo(e2)) / 2 : (cargo(e1) + infCargo) / 2;
593     }
594     for(TOutEdgeIterator outIt(pairGraph, t);!atEnd(outIt);goNext(outIt)) {
595         if (targetVertex(outIt) == s) continue;
596         TEdgeDescriptor e = findEdge(pairGraph, targetVertex(outIt), s);
597         if (e == 0) {
598             TCargo c = (infCargo + cargo(value(outIt))) / 2;
599             addEdge(pairGraph, s, targetVertex(outIt), c);
600         }
601     }
602     removeVertex(pairGraph, t);
603 }
604 
605 //////////////////////////////////////////////////////////////////////////////
606 
607 template<typename TStringValue, typename TStringSpec, typename TCargo, typename TSpec, typename TTag>
608 inline void
upgmaTree(String<TStringValue,TStringSpec> & mat,Graph<Tree<TCargo,TSpec>> & g,TTag)609 upgmaTree(String<TStringValue, TStringSpec>& mat,
610           Graph<Tree<TCargo, TSpec> >& g,
611           TTag)
612 {
613     SEQAN_CHECKPOINT
614     typedef Graph<Tree<TCargo, TSpec> > TGraph;
615     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
616     typedef typename Size<TGraph>::Type TSize;
617     typedef String<TStringValue, TStringSpec> TMatrix;
618     typedef typename Value<TMatrix>::Type TValue;
619     //typedef typename Iterator<TMatrix>::Type TMatrixIter;
620 
621     // First initialization
622     TSize nseq = (TSize) std::sqrt((double)length(mat));
623     clearVertices(g);
624 
625     // Is it possible to make a guide tree?
626     if (nseq == 1) {
627         g.data_root = addVertex(g);
628         return;
629     } else if (nseq == 2) {
630         TVertexDescriptor v1 = addVertex(g);
631         TVertexDescriptor v2 = addVertex(g);
632         TVertexDescriptor internalVertex = addVertex(g);
633         addEdge(g, internalVertex, v1, (TCargo) 1);
634         addEdge(g, internalVertex, v2, (TCargo) 1);
635         g.data_root = internalVertex;
636         return;
637     }
638 
639     // Which entries in the matrix are still active and how many members belong to this group
640     String<TSize> active;
641     resize(active, nseq, 1);
642     // Vertex descriptor that represents that entry
643     String<TVertexDescriptor> nodes;
644     reserve(nodes, nseq);
645 
646 
647     // Find the minimal value
648     bool notFound = true;
649     TValue minVal = 0;
650     TSize index_i = 0;
651     TSize index_j = 1;
652     for(TSize row=0;row<nseq;++row) {
653         for(TSize col=row+1;col<nseq;++col) {
654             if ((notFound) || (minVal > mat[row*nseq + col])) {
655                 notFound = false;
656                 minVal = mat[row*nseq + col];
657                 index_i = row;
658                 index_j = col;
659             }
660         }
661         appendValue(nodes, addVertex(g));    // For each sequence one vertex
662     }
663 
664     // Property map for sum of weights for each node
665     String<TCargo> weights;
666     resize(weights, nseq, (TCargo) 0);
667     reserve(weights, 2*nseq - 1);
668 
669     // Merge groups
670     TSize m = nseq;
671     while (m>1) {
672         // Merge nodes
673         TVertexDescriptor internalNode = addVertex(g);
674 
675         //// Debug code
676         //for(TSize i=0;i<nseq;++i) {
677         //    if (value(active,i)==0) continue;
678         //    for(TSize j=i+1;j<nseq;++j) {
679         //        if (value(active,j)==0) continue;
680         //        std::cout << value(mat, i*nseq+j) << ",";
681         //    }
682         //    std::cout << std::endl;
683         //}
684         //std::cout << minVal << ',' << index_i << ',' << index_j << ',' << std::endl;
685         //std::cout << nodes[index_i] << ',' << nodes[index_j] << std::endl;
686         //std::cout << std::endl;
687 
688         TCargo w = (TCargo) (minVal / 2);
689         addEdge(g, internalNode, nodes[index_i], w - property(weights, nodes[index_i]));
690         addEdge(g, internalNode, nodes[index_j], w - property(weights, nodes[index_j]));
691         appendValue(weights, w);
692 
693         // Get the new distance values
694         _upgmaTreeMerge(mat, active, index_i, index_j, nseq, TTag());
695 
696         // Inactivate one group, adjust the member count for the other one
697         active[index_i] += active[index_j];
698         active[index_j] = 0;
699         nodes[index_i] = internalNode;
700 
701         // Find new minimum
702         notFound = true;
703         for(TSize i=0;i<nseq;++i) {
704             if (active[i] == 0) continue;
705             for(TSize j=i+1;j<nseq;++j) {
706                 if (active[j] == 0) continue;
707                 if ((notFound) || (minVal > mat[i*nseq + j])) {
708                     notFound = false;
709                     minVal = mat[i*nseq + j];
710                     index_i = i;
711                     index_j = j;
712                 }
713             }
714         }
715         --m;
716     }
717     g.data_root = numVertices(g) - 1;
718 }
719 
720 //////////////////////////////////////////////////////////////////////////////
721 
722 template<typename TValue, typename TSpec1, typename TCargo, typename TSpec2, typename TTag>
723 inline void
upgmaTree(Graph<Undirected<TValue,TSpec1>> & pairGraph,Graph<Tree<TCargo,TSpec2>> & g,TTag)724 upgmaTree(Graph<Undirected<TValue, TSpec1> >& pairGraph,
725           Graph<Tree<TCargo, TSpec2> >& g,
726           TTag)
727 {
728     SEQAN_CHECKPOINT
729     typedef Graph<Undirected<TValue, TSpec1> > TPairGraph;
730     typedef Graph<Tree<TCargo, TSpec2> > TGuideTree;
731     typedef typename VertexDescriptor<TGuideTree>::Type TVertexDescriptor;
732     typedef typename VertexDescriptor<TPairGraph>::Type TVD;
733     typedef typename EdgeDescriptor<TPairGraph>::Type TED;
734     typedef typename Iterator<TPairGraph, EdgeIterator>::Type TEdgeI;
735     typedef typename Iterator<TPairGraph, OutEdgeIterator>::Type TEdgeOutI;
736     typedef typename Iterator<TPairGraph, VertexIterator>::Type TVertexI;
737     typedef typename Size<TPairGraph>::Type TSize;
738 
739     // First initialization
740     TCargo const maxVal = maxValue<TCargo>();
741     TSize nseq = numVertices(pairGraph);
742     TCargo infCargo = _getInfinity<TCargo>();
743     clearVertices(g);
744 
745     // Is it possible to make a guide tree?
746     if (nseq == 1) {
747         g.data_root = addVertex(g);
748         return;
749     } else if (nseq == 2) {
750         TVertexDescriptor v1 = addVertex(g);
751         TVertexDescriptor v2 = addVertex(g);
752         TVertexDescriptor internalVertex = addVertex(g);
753         addEdge(g, internalVertex, v1, (TCargo) 1);
754         addEdge(g, internalVertex, v2, (TCargo) 1);
755         g.data_root = internalVertex;
756         return;
757     }
758 
759     // Which entries in the matrix are still active and how many members belong to this group
760     String<TSize> active;
761     resize(active, nseq, 1);
762     // Vertex descriptor that represents that entry
763     typedef String<TVertexDescriptor> TNodeString;
764     typedef typename Iterator<TNodeString, Standard>::Type TNodeIter;
765     TNodeString nodes;
766     resize(nodes, nseq);
767     TNodeIter nodeIt = begin(nodes, Standard() );
768     TNodeIter nodeItEnd = end(nodes, Standard() );
769     for(;nodeIt<nodeItEnd;goNext(nodeIt))
770         *nodeIt = addVertex(g);    // For each sequence one vertex
771 
772     // Find the minimal value for all vertices (with respect to all greater vertices)
773     typedef Pair<TValue, TVD> TWeightEdgePair;
774     typedef String<TWeightEdgePair> TMinValues;
775     TMinValues minValues;
776     resize(minValues, nseq, TWeightEdgePair(maxVal, 0));
777     TEdgeI itE(pairGraph);
778     for(;!atEnd(itE);goNext(itE)) {
779         TVD s = sourceVertex(itE);
780         TVD t = targetVertex(itE);
781         if (cargo(*itE) < (minValues[s].i1))
782             minValues[s] = TWeightEdgePair(cargo(*itE), t);
783     }
784     // Find the overall minimum
785     typedef typename Iterator<TMinValues, Standard>::Type TMinIter;
786     TMinIter itMin = begin(minValues, Standard() );
787     TMinIter itMinEnd = end(minValues, Standard() );
788     TValue minVal = maxVal;
789     TVD sourceBest = 0;
790     TVD targetBest = 0;
791     for(TVD index=0;itMin != itMinEnd; goNext(itMin), ++index) {
792         if ((*itMin).i1 < minVal) {
793             minVal = (*itMin).i1;
794             sourceBest = index;
795             targetBest = (*itMin).i2;
796         }
797     }
798     TED best = 0;
799     if (sourceBest == targetBest) { // If none is found we have to insert a new edge
800         TVertexI itV(pairGraph);
801         sourceBest = value(itV); goNext(itV);
802         targetBest = value(itV);
803         best = addEdge(pairGraph, sourceBest, targetBest, infCargo);
804     } else best = findEdge(pairGraph, sourceBest, targetBest);
805 
806     // Property map for sum of weights for each node
807     String<TCargo> weights;
808     resize(weights, nseq, (TCargo) 0);
809     reserve(weights, 2*nseq - 1);
810 
811     // Merge groups
812     TSize m = nseq;
813     while (m>1) {
814         // Merge nodes
815         TVertexDescriptor internalNode = addVertex(g);
816 
817         // Set the weights
818         TCargo w = (TCargo) (minVal / 2);
819         addEdge(g, internalNode, nodes[sourceBest], w - property(weights, nodes[sourceBest]));
820         addEdge(g, internalNode, nodes[targetBest], w - property(weights, nodes[targetBest]));
821         appendValue(weights, w);
822 
823         // Get the new distance values
824         _upgmaTreeMerge(pairGraph, active, best, TTag());
825 
826         // Inactivate one group, adjust the member count for the other one
827         active[sourceBest] += active[targetBest];
828         active[targetBest] = 0;
829         nodes[sourceBest] = internalNode;
830 
831         // Update the minimum values
832         minValues[sourceBest] = TWeightEdgePair(maxVal, 0);
833         for(TEdgeOutI itOutE(pairGraph, sourceBest);!atEnd(itOutE);goNext(itOutE)) {
834             TVD localTVD = targetVertex(itOutE);
835             if (sourceBest < localTVD) {
836                 if (cargo(value(itOutE)) < (minValues[sourceBest].i1)) minValues[sourceBest] = TWeightEdgePair(cargo(value(itOutE)), localTVD);
837             }
838         }
839         // Find the new minimum value
840         itMin = begin(minValues, Standard() );
841         itMinEnd = end(minValues, Standard() );
842         minVal = maxVal;
843         TVD oldSourceBest = sourceBest;
844         sourceBest = 0;
845         targetBest = 0;
846         for(TVD index= 0;itMin != itMinEnd; goNext(itMin), ++index) {
847             if (active[index] == 0) continue;
848             if (((*itMin).i2 == oldSourceBest) || (active[(*itMin).i2] == 0)) {
849                 // Update the values
850                 (*itMin).i1 = maxVal;
851                 TEdgeOutI itOutLocal(pairGraph, index);
852                 for(;!atEnd(itOutLocal);goNext(itOutLocal)) {
853                     TVD targ = targetVertex(itOutLocal);
854                     if (targ < index) continue;
855                     if (cargo(value(itOutLocal)) < (*itMin).i1) *itMin = TWeightEdgePair(cargo(value(itOutLocal)), targ);
856                 }
857             }
858             if ((*itMin).i1 < minVal) {
859                 minVal = (*itMin).i1;
860                 sourceBest = index;
861                 targetBest = (*itMin).i2;
862             }
863         }
864         // If none is found we have to insert a new edge
865         if ((sourceBest == targetBest) && (m>2)) {
866             TVertexI itV(pairGraph);
867             sourceBest = value(itV); goNext(itV);
868             targetBest = value(itV);
869             best = addEdge(pairGraph, sourceBest, targetBest, infCargo);
870         } else best = findEdge(pairGraph, sourceBest, targetBest);
871         --m;
872     }
873     g.data_root = numVertices(g) - 1;
874 }
875 
876 //////////////////////////////////////////////////////////////////////////////
877 
878 /*!
879  * @fn upgmaTree
880  * @headerfile <seqan/graph_msa.h>
881  * @brief Computes a guide tree from a distance matrix.
882  *
883  * @signature void upgmaTree(mat, tree[, tag]);
884  *
885  * @param[in]  mat  A matrix with pairwise distance values.  Can be a @link String @endlink representing a (dense)
886  *                  square matrix or a @link UndirectedGraph @endlink where each edge weight corresponds to the
887  *                  distance between sequence i and j (if the edge is present).
888  * @param[out] tree An undirected guide tre.
889  * @param[in]  tag  Tag indicating how to calcualte the cluster distances.  See: @link UpgmaConfiguratorTags @endlink.
890  *                  Default: <tt>UpgmaWeightAvg</tt>.
891  */
892 
893 template<typename TDistance, typename TCargo, typename TSpec>
894 inline void
upgmaTree(TDistance & dist,Graph<Tree<TCargo,TSpec>> & g)895 upgmaTree(TDistance& dist,
896           Graph<Tree<TCargo, TSpec> >& g)
897 {
898     SEQAN_CHECKPOINT
899     upgmaTree(dist, g, UpgmaWeightAvg());
900 }
901 
902 
903 }// namespace SEQAN_NAMESPACE_MAIN
904 
905 #endif //#ifndef SEQAN_HEADER_...
906