1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 
33 #ifndef SEQAN_HEADER_GRAPH_ALGORITHM_HMM_H
34 #define SEQAN_HEADER_GRAPH_ALGORITHM_HMM_H
35 
36 namespace seqan
37 {
38 
39 // ============================================================================
40 // Forwards
41 // ============================================================================
42 
43 // ============================================================================
44 // Tags, Classes, Enums
45 // ============================================================================
46 
47 // ============================================================================
48 // Metafunctions
49 // ============================================================================
50 
51 // ============================================================================
52 // Functions
53 // ============================================================================
54 
55 // --------------------------------------------------------------------------
56 // Function viterbiAlgorithm()
57 // --------------------------------------------------------------------------
58 
59 /*!
60  * @defgroup HmmAlgorithms HMM Algorithms
61  * @brief Algorithms on @link HmmGraph @endlink objects.
62  */
63 
64 /*!
65  * @fn HmmAlgorithms#viterbiAlgorithm
66  * @headerfile <seqan/graph_algorithms.h>
67  * @brief Implements the Viterbi algorithm for Hidden Markov Models.
68  *
69  * @signature TCargo viterbiAlgorithm(path, hmm, seq);
70  *
71  * @param[out] path The state path;  String of vertex descriptors.
72  * @param[in]  hmm  The @link HmmGraph @endlink to use.
73  * @param[in]  seq  Input sequence.
74  *
75  * @return TCargo Probability of the path, the type parameter <tt>TCargo</tt> from type of <tt>hmm</tt>.
76  *
77  * The Viterbi algorithm computes the most likely sequence of hidden states of the Hidden Markov Model <tt>hmm</tt>
78  * given the sequence <tt>seq</tt> using dynamic programming.  The result is the most likely sequence of hidden states
79  * and returned in <tt>path</tt>.
80  *
81  * @section Remarks
82  *
83  * See the <a href="http://wikipedia.org/wiki/Viterbi_algorithm"> Wikipedia article on the Viterbi algorithm</a> for an
84  * introduction to the algorithm itself.
85  *
86  * @see HmmAlgorithms#forwardAlgorithm
87  * @see HmmAlgorithms#backwardAlgorithm
88  */
89 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TPath>
90 inline TProbability
viterbiAlgorithm(TPath & path,Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq)91 viterbiAlgorithm(TPath & path,
92                  Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
93                  TSequence const & seq)
94 {
95     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
96     typedef typename Size<TGraph>::Type TSize;
97     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
98     typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
99 
100     // Initialization
101     String<TProbability> vMat;
102     String<TSize> traceback;
103     TSize numCols = length(seq) + 2;
104     TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
105     resize(vMat, numCols * numRows, 0.0);
106     resize(traceback, numCols * numRows);
107     value(vMat, getBeginState(hmm)) = 1.0;
108     TVertexDescriptor bState = getBeginState(hmm);
109     TVertexDescriptor eState = getEndState(hmm);
110     TVertexDescriptor nilVertex = getNil<TVertexDescriptor>();
111 
112     // Distinct between silent states and real states
113     typedef String<TVertexDescriptor> TStateSet;
114     typedef typename Iterator<TStateSet>::Type TStateIter;
115     TStateSet silentStates;
116     TStateSet realStates;
117     TVertexIterator itVertex(hmm);
118     for(;!atEnd(itVertex);goNext(itVertex)) {
119         if (isSilent(hmm, value(itVertex))) {
120             appendValue(silentStates, value(itVertex));
121         } else {
122             appendValue(realStates, value(itVertex));
123         }
124     }
125 
126     // Initialization for silent states connected to the begin state
127     TStateIter itSilentStateEnd = end(silentStates);
128     for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
129         if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
130         TProbability maxValue = 0.0;
131         TVertexDescriptor maxVertex = nilVertex;
132 
133         for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
134             TProbability local = value(vMat, value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
135             if (local > maxValue) {
136                 maxValue = local;
137                 maxVertex = value(itBelow);
138             }
139         }
140 
141         // Set traceback vertex
142         if (maxVertex != nilVertex) {
143             value(vMat, value(itSilentState)) = maxValue;
144             value(traceback, value(itSilentState)) = maxVertex;
145         }
146     }
147 
148     // Recurrence
149     TSize len = length(seq);
150     for(TSize i=1; i<=len; ++i) {
151         // Iterate over real states
152         TStateIter itRealStateEnd = end(realStates);
153         for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
154             // Find maximum
155             TProbability maxValue = 0.0;
156             TVertexDescriptor maxVertex = nilVertex;
157             TVertexIterator itMax(hmm);
158             for(;!atEnd(itMax);++itMax) {
159                 TProbability local = value(vMat, (i-1) * numRows + value(itMax)) * getTransitionProbability(hmm, value(itMax), value(itRealState));
160                 if (local > maxValue) {
161                     maxValue = local;
162                     maxVertex = *itMax;
163                 }
164             }
165             // Set traceback vertex
166             if (maxVertex != nilVertex) {
167                 value(vMat, i * numRows + value(itRealState)) = maxValue * getEmissionProbability(hmm, value(itRealState), value(seq, i-1));
168                 value(traceback, i * numRows + value(itRealState)) = maxVertex;
169             }
170         }
171 
172         // Iterate over silent states
173         for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
174             if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
175             // Find maximum
176             TProbability maxValue = 0.0;
177             TVertexDescriptor maxVertex = nilVertex;
178 
179             // Iterate over real states
180             for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
181                 TProbability local = value(vMat, i * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
182                 if (local > maxValue) {
183                     maxValue = local;
184                     maxVertex = value(itRealState);
185                 }
186             }
187             // Iterate over silent states in increasing order
188             for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
189                 TProbability local = value(vMat, i * numRows + value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
190                 if (local > maxValue) {
191                     maxValue = local;
192                     maxVertex = value(itBelow);
193                 }
194             }
195             // Set traceback vertex
196             if (maxVertex != nilVertex) {
197                 value(traceback, i * numRows + value(itSilentState)) = maxVertex;
198                 value(vMat, i * numRows + value(itSilentState)) = maxValue;
199             }
200         }
201     }
202 
203     // Termination
204     TProbability maxValue = 0.0;
205     TVertexDescriptor maxVertex = 0;
206     TVertexIterator itMax(hmm);
207     for(;!atEnd(itMax);++itMax) {
208         TProbability local = value(vMat, len * numRows + *itMax) * getTransitionProbability(hmm, value(itMax), eState);
209         if (local > maxValue) {
210             maxValue = local;
211             maxVertex = value(itMax);
212         }
213     }
214     value(traceback, (len + 1) * numRows + eState) = maxVertex;
215     if (maxVertex != nilVertex) value(vMat, (len+1) * numRows + eState) = maxValue;
216 
217     // Traceback
218     if (maxValue > 0.0) {
219         clear(path);
220         TVertexDescriptor oldState = eState;
221         appendValue(path, oldState);
222         for(TSize i = len + 1; i>=1; --i) {
223             do {
224                 if ((!isSilent(hmm, oldState)) || (oldState == eState)) oldState = value(traceback, i * numRows + oldState);
225                 else oldState = value(traceback, (i - 1) * numRows + oldState);
226                 appendValue(path, oldState);
227             } while ((isSilent(hmm, oldState)) && (oldState != bState));
228         }
229         std::reverse(begin(path), end(path));
230     }
231 
232     //// Debug code
233     //for(TSize i = 0; i<numRows; ++i) {
234     //  for(TSize j=0; j<numCols; ++j) {
235     //      std::cout << value(vMat, j*numRows + i) << ',';
236     //      //std::cout << value(traceback, j*numRows + i) << ',';
237     //  }
238     //  std::cout << std::endl;
239     //}
240     //for(TSize i = 0; i<length(path); ++i) {
241     //  std::cout << path[i] << ',';
242     //}
243     //std::cout << std::endl;
244 
245     return value(vMat, (len+1) * numRows + eState);
246 }
247 
248 // --------------------------------------------------------------------------
249 // Function forwardAlgorithm()
250 // --------------------------------------------------------------------------
251 
252 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TForwardMatrix>
253 inline TProbability
_forwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq,TForwardMatrix & fMat)254 _forwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
255                    TSequence const& seq,
256                    TForwardMatrix& fMat)
257 {
258     SEQAN_CHECKPOINT
259     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
260     typedef typename Size<TGraph>::Type TSize;
261     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
262     typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
263 
264     // Initialization
265     TSize numCols = length(seq) + 2;
266     TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
267     resize(fMat, numCols * numRows, 0.0);
268     value(fMat, getBeginState(hmm)) = 1.0;
269     TVertexDescriptor bState = getBeginState(hmm);
270     TVertexDescriptor eState = getEndState(hmm);
271 
272     // Distinct between silent states and real states
273     typedef String<TVertexDescriptor> TStateSet;
274     typedef typename Iterator<TStateSet>::Type TStateIter;
275     TStateSet silentStates;
276     TStateSet realStates;
277     TVertexIterator itVertex(hmm);
278     for(;!atEnd(itVertex);goNext(itVertex)) {
279         if (isSilent(hmm, value(itVertex))) {
280             appendValue(silentStates, value(itVertex));
281         } else {
282             appendValue(realStates, value(itVertex));
283         }
284     }
285 
286     // Initialization for silent states connected to the begin state
287     TStateIter itSilentStateEnd = end(silentStates);
288     for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
289         if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
290         TProbability sumValue = 0.0;
291         for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
292             sumValue += value(fMat, value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
293         }
294         value(fMat, value(itSilentState)) = sumValue;
295     }
296 
297     // Recurrence
298     TSize len = length(seq);
299     for(TSize i=1; i<=len; ++i) {
300         // Iterate over real states
301         TStateIter itRealStateEnd = end(realStates);
302         for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
303             TProbability sum = 0.0;
304             TVertexIterator itAll(hmm);
305             for(;!atEnd(itAll);++itAll) sum += value(fMat, (i-1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll), value(itRealState));
306             value(fMat, i * numRows + value(itRealState)) = getEmissionProbability(hmm, value(itRealState), seq[i-1]) * sum;
307         }
308 
309         // Iterate over silent states
310         for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
311             if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
312             TProbability sumValue = 0.0;
313             // Iterate over real states
314             for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
315                 sumValue += value(fMat, i * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
316             }
317             // Iterate over silent states in increasing order
318             for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
319                 sumValue += value(fMat, i * numRows + value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
320             }
321             value(fMat, i * numRows + value(itSilentState)) = sumValue;
322         }
323     }
324 
325     // Termination
326     TProbability sum = 0.0;
327     TVertexIterator itAll(hmm);
328     for(;!atEnd(itAll);++itAll) {
329         sum += value(fMat, len * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll), eState);
330     }
331     value(fMat, (len+1) * numRows + eState) = sum;
332 
333     //// Debug code
334     //for(TSize i = 0; i<numRows; ++i) {
335     //  for(TSize j=0; j<numCols; ++j) {
336     //      std::cout << value(fMat, j*numRows + i) << ',';
337     //  }
338     //  std::cout << std::endl;
339     //}
340 
341     return value(fMat, (len+1) * numRows + eState);
342 }
343 
344 /*!
345  * @fn HmmAlgorithms#forwardAlgorithm
346  * @headerfile <seqan/graph_algorithms.h>
347  * @brief Given a Hidden Markov Model <tt>hmm</tt>, the forward algorithm computes the probability of the sequence
348  *        <tt>seq</tt>.
349  *
350  * @signature TCargo forwardAlgorithm(hmm, seq);
351  *
352  * @param[in] hmm The @link HmmGraph @endlink with the HMM to use.
353  * @param[in] seq Input sequence to use in the forward algorithm.
354  *
355  * @return TProbability The probability of the sequence <tt>seq</tt>.  <tt>TProbability</tt> is the type parameter
356  *                      <tt>TCargo</tt> of the type of <tt>hmm</tt>.
357  *
358  * @section Remarks
359  *
360  * See the <a href="http://en.wikipedia.org/wiki/Forward_algorithm">Wikipedia article on the Forward algorithm</a> for
361  * an introduction to the algorithm itself.
362  *
363  * @see HmmAlgorithms#viterbiAlgorithm
364  * @see HmmAlgorithms#backwardAlgorithm
365  */
366 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
367 TProbability
forwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq)368 forwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
369                  TSequence const & seq)
370 {
371     SEQAN_CHECKPOINT
372     String<TProbability> fMat;
373     return _forwardAlgorithm(hmm, seq, fMat);
374 }
375 
376 // --------------------------------------------------------------------------
377 // Function backwardAlgorithm()
378 // --------------------------------------------------------------------------
379 
380 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TBackwardMatrix>
381 inline TProbability
_backwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq,TBackwardMatrix & bMat)382 _backwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
383                     TSequence const& seq,
384                     TBackwardMatrix& bMat)
385 {
386     SEQAN_CHECKPOINT
387     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
388     typedef typename Size<TGraph>::Type TSize;
389     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
390     typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
391 
392     // Initialization
393     TSize numCols = length(seq) + 2;
394     TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
395     resize(bMat, numCols * numRows, 0.0);
396     TVertexDescriptor bState = getBeginState(hmm);
397     TVertexDescriptor eState = getEndState(hmm);
398     TSize len = length(seq);
399     value(bMat, (len + 1) * numRows + eState) = 1.0;
400 
401     // Distinct between silent states and real states
402     typedef String<TVertexDescriptor> TStateSet;
403     typedef typename Iterator<TStateSet>::Type TStateIter;
404     TStateSet silentStates;
405     TStateSet realStates;
406     TVertexIterator itVertex(hmm);
407     for(;!atEnd(itVertex);goNext(itVertex)) {
408         if (isSilent(hmm, value(itVertex))) {
409             appendValue(silentStates, value(itVertex));
410         } else {
411             appendValue(realStates, value(itVertex));
412         }
413     }
414     // Reverse the silent states order
415     std::reverse(begin(silentStates), end(silentStates));
416 
417     // Initialization for silent states connected to the end state
418     TStateIter itSilentStateEnd = end(silentStates);
419     for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
420         if (value(itSilentState) == eState) continue;
421         TProbability sumValue = getTransitionProbability(hmm, value(itSilentState), eState);
422         for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
423             sumValue += value(bMat, len * numRows + value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
424         }
425         value(bMat, len * numRows + value(itSilentState)) = sumValue;
426     }
427 
428     // Initialization for real states
429     TStateIter itRealStateEnd = end(realStates);
430     for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
431         TProbability sumValue = getTransitionProbability(hmm, value(itRealState), eState);
432         for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
433             sumValue += value(bMat, len * numRows + value(itSilentState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
434         }
435         value(bMat, len * numRows + value(itRealState)) = sumValue;
436     }
437 
438 
439     // Recurrence
440     if (len > 0) {
441         for(TSize i=len - 1; i>0; --i) {
442             // Iterate over silent states
443             for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
444                 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
445                 TProbability sumValue = 0.0;
446                 // Iterate over real states
447                 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
448                     sumValue += value(bMat, (i+1) * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itSilentState), value(itRealState))* getEmissionProbability(hmm, value(itRealState), value(seq, i));
449                 }
450                 // Iterate over silent states in decreasing order
451                 for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
452                     if ((value(itAbove) == bState) || (value(itAbove) == eState)) continue;
453                     sumValue += value(bMat, i * numRows + value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
454                 }
455                 value(bMat, i * numRows + value(itSilentState)) = sumValue;
456             }
457 
458             // Iteration over real states
459             for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
460                 TProbability sumValue = 0.0;
461                 // Iterate over silent states
462                 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
463                     if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
464                     sumValue += value(bMat, i * numRows + value(itSilentState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
465                 }
466                 // Iterate over real states
467                 for(TStateIter itR = begin(realStates); itR != itRealStateEnd; goNext(itR)) {
468                     sumValue += value(bMat, (i+1) * numRows + value(itR)) * getTransitionProbability(hmm, value(itRealState), value(itR)) * getEmissionProbability(hmm, value(itR), value(seq, i));
469                 }
470                 value(bMat, i * numRows + value(itRealState)) =  sumValue;
471             }
472         }
473 
474         // Termination
475         // Iterate over silent states
476         for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
477             if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
478             TProbability sumValue = 0.0;
479             // Iterate over real states
480             for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
481                 sumValue += value(bMat, 1 * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itSilentState), value(itRealState))* getEmissionProbability(hmm, value(itRealState), value(seq, 0));
482             }
483             // Iterate over silent states in decreasing order
484             for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
485                 if ((value(itAbove) == bState) || (value(itAbove) == eState)) continue;
486                 sumValue += value(bMat, value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
487             }
488             value(bMat, value(itSilentState)) = sumValue;
489         }
490         // Sum up all values
491         TProbability sumValue = 0.0;
492         for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
493             if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
494             sumValue += value(bMat, value(itSilentState)) * getTransitionProbability(hmm, bState, value(itSilentState));
495         }
496         for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
497             sumValue += value(bMat, 1 * numRows + value(itRealState)) * getTransitionProbability(hmm, bState, value(itRealState)) * getEmissionProbability(hmm, value(itRealState), value(seq, 0));
498         }
499         value(bMat, bState) = sumValue;
500     }
501 
502     //// Debug code
503     //for(TSize i = 0; i<numRows; ++i) {
504     //  for(TSize j=0; j<numCols; ++j) {
505     //      std::cout << value(bMat, j*numRows + i) << ',';
506     //  }
507     //  std::cout << std::endl;
508     //}
509 
510     return value(bMat, bState);
511 }
512 
513 /*!
514  * @fn HmmAlgorithms#backwardAlgorithm
515  * @headerfile <seqan/graph_algorithms.h>
516  * @brief Given a Hidden Markov Model <tt>hmm</tt>, the backward algorithm computes the probability of the sequence
517  *        <tt>seq</tt>.
518  *
519  * @signature TCargo backwardAlgorithm(hmm, seq);
520  *
521  * @param[in] hmm The @link HmmGraph @endlink with the HMM to use.
522  * @param[in] seq Input sequence to use in the backward algorithm.
523  *
524  * @return TCargo The probability of the sequence <tt>seq</tt>.  <tt>TProbability</tt> is the type parameter
525  *                <tt>TCargo</tt> of the type of <tt>hmm</tt>.
526  *
527  * See the <a href="http://en.wikipedia.org/wiki/Forward-backward_algorithm">Wikipedia article on the Forward-backward
528  * algorithm</a> for an introduction to the algorithm itself.
529  *
530  * @see HmmAlgorithms#viterbiAlgorithm
531  * @see HmmAlgorithms#forwardAlgorithm
532  */
533 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
534 TProbability
backwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq)535 backwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
536                   TSequence const & seq)
537 {
538     SEQAN_CHECKPOINT
539     String<TProbability> bMat;
540     return _backwardAlgorithm(hmm, seq, bMat);
541 }
542 
543 // --------------------------------------------------------------------------
544 // Function generateSequence()
545 // --------------------------------------------------------------------------
546 
547 /*!
548  * @fn HmmAlgorithms#generateSequence
549  * @headerfile <seqan/graph_algorithms.h>
550  * @brief Generates random state and alphabet sequence of a given HMM.
551  *
552  * @signature void generateSequence(hmm, seq, states, numSeq, maxLen);
553  *
554  * @param[out] seq    A @link StringSet @endlink of alphabet sequences.
555  * @param[out] states A @link ContainerConcept @endlink object of state sequences.
556  * @param[in]  hmm    The @link HmmGraph @endlink to use.
557  * @param[in]  numSeq The number of sequences to generate.
558  * @param[in]  maxLen The maximum length of the sequences.  The sequences might be shorter if the ends tate is reached
559  *                    before maxLen.
560  *
561  * @section Remarks
562  *
563  * Because of silent states, generated alphabet and state sequences might have different lengths.
564  */
565 template<typename TAlphabet, typename TProbability, typename TSpec,typename TSequenceSet, typename TStateSeqSet, typename TSize>
generateSequence(TSequenceSet & sequences,TStateSeqSet & states,Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSize numSeq,TSize maxLength)566 void generateSequence(TSequenceSet & sequences,
567                       TStateSeqSet & states,
568                       Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
569                       TSize numSeq,
570                       TSize maxLength)
571 {
572     SEQAN_CHECKPOINT
573     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
574     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
575     typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
576     typedef typename Value<TSequenceSet>::Type TSequence;
577     typedef typename Value<TStateSeqSet>::Type TStateSeq;
578 
579     // TODO(holtgrew): It should be possible to use a per-call RNG.
580     typedef typename GetDefaultRng<TGraph>::Type TRng;
581     typedef Pdf<Uniform<double> > TPdf;
582     TPdf pdf(0.0, 1.0);
583     TRng & rng = defaultRng(TGraph());
584 
585     // Initialization
586     clear(sequences);
587     clear(states);
588     TSize alphSize = ValueSize<TAlphabet>::VALUE;
589 
590     // Simulate sequences
591     TVertexDescriptor currentState;
592     TVertexDescriptor endState = getEndState(hmm);
593     for(TSize i=0;i<numSeq;++i){
594         currentState = getBeginState(hmm);
595         TSequence seq;
596         TStateSeq stat;
597         appendValue(stat, getBeginState(hmm));
598         bool stop = false;
599         TSize pos = 0;
600         while (pos < maxLength) {
601             TProbability prob = pickRandomNumber(rng, pdf);
602             TProbability compareProb = 0.0;
603             TOutEdgeIterator itState(hmm, currentState);
604             // Determine the next state
605             for(;!atEnd(itState);++itState) {
606                 // Probability of the next transition
607                 compareProb += getTransitionProbability(hmm, value(itState));
608                 // Compare with random probability
609                 if (prob <= compareProb){
610                     TVertexDescriptor nextState = targetVertex(hmm, value(itState));
611                     if (nextState == endState) {
612                         stop = true;
613                         break;
614                     }
615                     appendValue(stat, nextState);
616                     if (!isSilent(hmm, nextState)) {
617                         compareProb =0.0;
618                         prob = pickRandomNumber(rng, pdf);
619                         for (TSize c=0;c<alphSize;++c){
620                             compareProb += getEmissionProbability(hmm,targetVertex(hmm, value(itState)), TAlphabet(c));
621                             if (prob <= compareProb) {
622                                 appendValue(seq, TAlphabet(c));
623                                 ++pos;
624                                 break;
625                             }
626                         }
627                     }
628                     currentState = nextState;
629                     break;
630                 }
631             }
632             if (stop==true) break;
633         }
634         appendValue(stat, getEndState(hmm));
635         appendValue(sequences, seq);
636         appendValue(states, stat);
637     }
638 }
639 
640 template<typename TAlphabet, typename TProbability, typename TSpec,typename TSequenceSet, typename TSize>
641 inline void
generateSequence(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequenceSet & sequences,TSize numSeq,TSize maxLength)642 generateSequence(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
643                  TSequenceSet& sequences,
644                  TSize numSeq,
645                  TSize maxLength)
646 {
647     SEQAN_CHECKPOINT
648     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
649     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
650     StringSet<String<TVertexDescriptor> > states;
651     generateSequence(hmm, sequences, states, numSeq, maxLength);
652 }
653 
654 // --------------------------------------------------------------------------
655 // Function estimationWithStates()
656 // --------------------------------------------------------------------------
657 
658 template<typename TAlphabet, typename TProbability, typename TSpec,typename TEmissionCounter, typename TTransitionCounter>
659 inline void
_parameterEstimator(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TEmissionCounter const & emission,TTransitionCounter const & transition)660 _parameterEstimator(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
661                      TEmissionCounter const& emission,
662                      TTransitionCounter const& transition)
663 {
664     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
665     typedef typename Size<TGraph>::Type TSize;
666     typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
667     typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
668     typedef typename Value<TTransitionCounter>::Type TCounterValue;
669 
670     // Initialization
671     TSize alphSize = ValueSize<TAlphabet>::VALUE;
672     TCounterValue pseudoCount = (TCounterValue) 1.0;
673 
674     // Estimate the parameters from the counter values
675     TVertexIterator itAll(hmm);
676     for(;!atEnd(itAll);++itAll){
677         if (!isSilent(hmm, value(itAll))) {
678             TCounterValue summedCount = (TCounterValue) (0.0);
679             for(TSize i=0; i<alphSize;++i) summedCount += (pseudoCount + value(value(emission, value(itAll)),i));
680             for(TSize i=0; i<alphSize;++i) emissionProbability(hmm,value(itAll),TAlphabet(i)) = (pseudoCount + value(value(emission, value(itAll)),i)) / summedCount;
681         }
682         TCounterValue summedCount = (TCounterValue) (0.0);
683         TOutEdgeIterator itOutSum(hmm,value(itAll));
684         for(;!atEnd(itOutSum);++itOutSum) summedCount += (pseudoCount + getProperty(transition, value(itOutSum)));
685         TOutEdgeIterator itOut(hmm, value(itAll));
686         for(;!atEnd(itOut);++itOut) transitionProbability(hmm, value(itOut)) = (pseudoCount + getProperty(transition, value(itOut))) / summedCount;
687     }
688 }
689 
690 ///////////////////////////////////////////////////////////////////////////////////////
691 
692 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequenceSet, typename TStateSeqSet>
693 inline void
estimationWithStates(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TSequenceSet & sequences,TStateSeqSet & states)694 estimationWithStates(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
695                      TSequenceSet& sequences,
696                      TStateSeqSet& states)
697 {
698     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
699     typedef typename Size<TGraph>::Type TSize;
700     typedef typename Value<TStateSeqSet>::Type TStateSeq;
701     typedef typename Value<TStateSeq>::Type TState;
702     typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
703 
704     // Initialization
705     TSize alphSize = ValueSize<TAlphabet>::VALUE;
706     typedef String<TSize> TCountString;
707     TCountString transitionCounter;
708     resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(hmm)), 0);
709     StringSet<TCountString> emissionCounter;
710     TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
711     for(TSize i =0; i<numRows;++i) {
712         TCountString emisCount;
713         resize(emisCount,alphSize,0);
714         appendValue(emissionCounter,emisCount);
715     }
716 
717     // Iterate over all sequences
718     for (TSize j=0;j<length(states);++j) {
719         TSize posSeq = 0;
720         for (TSize i = 0;i<length(value(states,j));++i) {
721             TState s = value(value(states,j),i);
722             if (!isSilent(hmm, s)) {
723                 TAlphabet c = value(value(sequences,j),posSeq);
724                 value(value(emissionCounter, s), ordValue(c)) += 1;
725                 ++posSeq;
726             }
727             if (i<(length(value(states,j))-1)) {
728                 TEdgeDescriptor currEdge = findEdge(hmm, s, value(value(states,j), (i+1)));
729                 property(transitionCounter, currEdge) += 1;
730             }
731         }
732     }
733 
734     //// Debug Code
735     //typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
736     //TEdgeIterator itE(hmm);
737     //for(;!atEnd(itE);goNext(itE)) std::cout << sourceVertex(itE) << ',' << targetVertex(itE) << ':' << property(transitionCounter, value(itE)) << std::endl;
738     //for(TSize j=0; j<length(emissionCounter); ++j) {
739     //  for(TSize i=0;i<length(value(emissionCounter, j));++i) {
740     //      std::cout << j << ":" << TAlphabet(i) << '=' << value(value(emissionCounter, j), i) << std::endl;
741     //  }
742     //}
743 
744     // Estimate Parameters
745     _parameterEstimator(hmm,emissionCounter, transitionCounter);
746 }
747 
748 // --------------------------------------------------------------------------
749 // Function baumWelchAlgorithm()
750 // --------------------------------------------------------------------------
751 
752 template<typename TAlphabet, typename TProbability, typename TSpec>
753 inline void
_fillHmmUniform(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm)754 _fillHmmUniform(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm)
755 {
756     SEQAN_CHECKPOINT
757     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
758     typedef typename Size<TGraph>::Type TSize;
759     typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
760     typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
761 
762     // Initialization
763     TSize alphSize = ValueSize<TAlphabet>::VALUE;
764 
765     // Iterate over all states
766     TVertexIterator itState(hmm);
767     for(;!atEnd(itState);goNext(itState)) {     //pass through the states of the hmm
768         TSize oD = outDegree(hmm, value(itState));
769         TOutEdgeIterator itOut(hmm, value(itState));
770         for (;!atEnd(itOut);goNext(itOut)) transitionProbability(hmm, value(itOut)) = (TProbability) (1.0 / (double) (oD));
771         if (!isSilent(hmm, value(itState))) {
772             for(TSize i=0;i<alphSize;++i){
773                 emissionProbability(hmm,value(itState),TAlphabet(i)) = (TProbability) (1.0 / (double) (alphSize));
774             }
775         }
776     }
777 }
778 
779 template<typename TAlphabet, typename TProbability, typename TSpec, typename TRNG>
780 inline void
_fillHmmRandom(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TRNG & rng)781 _fillHmmRandom(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
782                 TRNG & rng)
783 {
784     SEQAN_CHECKPOINT
785     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
786     typedef typename Size<TGraph>::Type TSize;
787     typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
788     typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
789 
790     // Initialization
791     Pdf<Uniform<TSize> > pdf(20, 99);
792     TSize alphSize = ValueSize<TAlphabet>::VALUE;
793 
794     // Iterate over all states
795     TVertexIterator itState(hmm);
796     for(;!atEnd(itState);goNext(itState)) {     //pass through the states of the hmm
797         TSize oD = outDegree(hmm, value(itState));
798         if (oD > 0) {
799             String<TSize> counts;
800             TSize sum = 0;
801             for(TSize i = 0;i<oD;++i) {
802                 TSize rd = pickRandomNumber(rng, pdf);
803                 sum += rd;
804                 appendValue(counts, rd);
805             }
806             TSize pos = 0;
807             TOutEdgeIterator itOut(hmm, value(itState));
808             for (;!atEnd(itOut);goNext(itOut), ++pos) transitionProbability(hmm, value(itOut)) = (TProbability) ((double) (value(counts, pos)) / (double) (sum));
809         }
810         if (!isSilent(hmm, value(itState))) {
811             String<TSize> counts;
812             TSize sum = 0;
813             for(TSize i = 0;i<alphSize;++i) {
814                 TSize rd = pickRandomNumber(rng, pdf);
815                 sum += rd;
816                 appendValue(counts, rd);
817             }
818             for(TSize i=0;i<alphSize;++i) emissionProbability(hmm,value(itState),TAlphabet(i)) = (TProbability) ((double) (value(counts, i)) / (double) (sum));
819         }
820     }
821 }
822 
823 template<typename TAlphabet, typename TProbability, typename TSpec, typename TRNG>
824 inline void
randomizeHmm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TRNG &)825 randomizeHmm(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
826              TRNG & /*rng*/)
827 {
828     // TODO(holtgrew): Why uniform and not random?
829     //_fillHmmRandom(hmm, rng);
830     _fillHmmUniform(hmm);
831 }
832 
833 template <typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TSize>
834 inline TProbability
_baumWelchAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,StringSet<TSequence> const & seqSet,TSize maxIter,TProbability epsilon)835 _baumWelchAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec > >& hmm,
836                      StringSet<TSequence> const& seqSet,
837                      TSize maxIter,
838                      TProbability epsilon)
839 {
840     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
841     typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
842     typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
843     typedef String<TProbability> TCountString;
844 
845     // Initialization
846     TSize alphSize = ValueSize<TAlphabet>::VALUE;
847     TProbability lastTotalModelProb = 0.0;
848 
849     // Randomize current HMM so only topology is preserved
850     randomizeHmm(hmm);
851 
852     // Iterative Optimization
853     for(TSize iter=0; iter<maxIter; ++iter){
854         std::cout << "Iteration: "<< iter << std::endl;
855         std::cout << hmm << std::endl;
856         TCountString transitionCounter;
857         resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(hmm)), 0.0);
858         StringSet<TCountString> emissionCounter;
859         TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
860         for(TSize i =0; i<numRows;++i) {
861             TCountString emisCount;
862             resize(emisCount,alphSize, 0.0);
863             appendValue(emissionCounter,emisCount);
864         }
865 
866         // MaximationStep
867         // Determine new contributions
868         TProbability totalModelLogProb = 0.0;
869         for (TSize i=0; i <length(seqSet);++i){     //sequences
870             TSize len = length(value(seqSet,i));
871 
872             // Forward algorithm
873             String<TProbability> fMat;
874             TProbability modelLogProb = _forwardAlgorithm(hmm, value(seqSet,i), fMat);
875             totalModelLogProb += modelLogProb;
876 
877             // Backward algorithm
878             String<TProbability> bMat;
879             _backwardAlgorithm(hmm, value(seqSet,i), bMat);
880 
881             // Use the posterior probabilities to estimate counter values
882             for (TSize j=0;j<len;++j){
883                 TAlphabet c = value(value(seqSet,i),j);
884                 TAlphabet nextC = c;
885                 if (j < (len - 1))  nextC = value(value(seqSet,i),(j+1));
886 
887                 // Iterate over all states
888                 TVertexIterator itAll(hmm);
889                 for(;!atEnd(itAll);++itAll) {
890                     // Handle begin state
891                     if (value(itAll) == beginState(hmm)) {
892                         if (j == 0) {
893                             TOutEdgeIterator itOut(hmm, value(itAll));
894                             for(;!atEnd(itOut); goNext(itOut)) {
895                                 if (!isSilent(hmm, targetVertex(itOut))) {
896                                     property(transitionCounter, value(itOut)) += (value(fMat, beginState(hmm)) * getTransitionProbability(hmm, value(itOut)) * getEmissionProbability(hmm, targetVertex(itOut), c) * value(bMat, numRows + targetVertex(itOut)) / modelLogProb);
897                                 } else {
898                                     property(transitionCounter, value(itOut)) += (value(fMat, beginState(hmm)) * getTransitionProbability(hmm, value(itOut)) * value(bMat, targetVertex(itOut)) / modelLogProb);
899                                 }
900                             }
901 
902                         }
903                         continue;
904                     }
905                     // Ignore the end state
906                     if (value(itAll) == endState(hmm)) continue;
907 
908                     // Determine emission expectation values
909                     if (!isSilent(hmm, value(itAll))) value(value(emissionCounter, value(itAll)),ordValue(c)) += ((value(fMat, (j+1) * numRows + value(itAll)) * value(bMat, (j+1) * numRows + value(itAll))) / modelLogProb);
910 
911                     // Determine transition expectation values
912                     TOutEdgeIterator itOut(hmm, value(itAll));
913                     for(;!atEnd(itOut); goNext(itOut)) {
914                         // Handle the end state
915                         if ((j == (len - 1)) && (targetVertex(itOut)==endState(hmm))) {
916                             property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll),endState(hmm)) / modelLogProb);
917                         } else {
918                             if (!isSilent(hmm, targetVertex(itOut))) {
919                                 if (j < (len - 1)) property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itOut)) * getEmissionProbability(hmm, targetVertex(itOut), nextC) * value(bMat, (j+2) * numRows + targetVertex(itOut)) / modelLogProb);
920                             } else {
921                                 property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itOut)) * value(bMat, (j+1) * numRows + targetVertex(itOut)) / modelLogProb);
922                             }
923                         }
924                     }
925                 }
926             }
927         }
928         // Expectation step
929         _parameterEstimator(hmm,emissionCounter, transitionCounter);
930 
931 
932         // Termination?
933         if ((iter > 5) && ((totalModelLogProb - lastTotalModelProb) < epsilon)) {
934             lastTotalModelProb = totalModelLogProb;
935             break;
936         } else {
937             lastTotalModelProb = totalModelLogProb;
938         }
939     }
940     return lastTotalModelProb;
941 }
942 
943 template <typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
944 inline TProbability
baumWelchAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,StringSet<TSequence> const & seqSet)945 baumWelchAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec > >& hmm,
946                    StringSet<TSequence> const& seqSet)
947 {
948     SEQAN_CHECKPOINT
949     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
950     typedef typename Size<TGraph>::Type TSize;
951     TSize maxIter = 100;
952     TProbability epsilon = 0.00001;
953     return _baumWelchAlgorithm(hmm, seqSet, maxIter, epsilon);
954 }
955 
956 /*
957 //////////////////////////////////////////////////////////////////////////////
958 // Profile HMMs
959 //////////////////////////////////////////////////////////////////////////////
960 
961 //////////////////////////////////////////////////////////////////////////////
962 
963 
964 template < typename TAlphabet, typename TProbability, typename TSpec, typename TMat, typename TConsensus, typename TSize>
965 inline void
966 _profileHmmCounter(Graph<Hmm<TAlphabet, TProbability, TSpec> >& pHmm,
967                     TMat const& matr,
968                     TConsensus const& consensus,
969                     TSize const& numRows,
970                     TSize const& numCols)
971 {
972     SEQAN_CHECKPOINT
973     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
974     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
975     typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
976     typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
977     typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
978     typedef typename Value<TConsensus>::Type TValue;
979 
980     // Initialization
981     TSize alphSize = ValueSize<TAlphabet>::VALUE;
982     TValue gapChar = gapValue<TValue>();
983     TVertexDescriptor begState = beginState(pHmm);
984     TVertexDescriptor eState = endState(pHmm);
985     TVertexDescriptor matchState = 1;
986     TVertexDescriptor insertState = 2;
987     TVertexDescriptor deleteState = 3;
988 
989     typedef String<TProbability> TCountString;
990     TCountString transitionCounter;
991     resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(pHmm)), 0.0);
992     StringSet<TCountString> emissionCounter;
993     TSize nR = getIdUpperBound(_getVertexIdManager(pHmm));
994     for(TSize i =0; i<nR;++i) {
995         TCountString emisCount;
996         resize(emisCount,alphSize, 0.0);
997         appendValue(emissionCounter,emisCount);
998     }
999 
1000 
1001     String<TVertexDescriptor> oldCol;
1002     String<TVertexDescriptor> currCol;
1003     resize(oldCol, numRows, begState);
1004     resize(currCol, numRows, 0);
1005     TEdgeDescriptor currEdge;
1006 
1007     for(TSize i = 0; i<length(consensus); ++i) {
1008         //transitionvalues
1009 
1010         //being in insertState
1011         if (value(consensus, i)==gapChar){
1012             for(TSize j = 0; j<numRows; ++j){
1013                 if ((value(matr, j * numCols + i)!=gapChar)){
1014                     value(currCol,j) = insertState;
1015                     currEdge = findEdge(pHmm, value(oldCol,j), value(currCol,j));
1016                     property(transitionCounter, currEdge) += 1;
1017                     value(oldCol,j) = value(currCol,j);
1018                 }
1019                 else if ((value(matr, j * numCols + i)==gapChar) ){
1020                     value(currCol,j) = value(oldCol,j);
1021                 }
1022             }
1023         }
1024         //being in normal State
1025         else{
1026             for(TSize j = 0; j<numRows; ++j){
1027                 if(value(matr, j * numCols + i)!=gapChar) value(currCol,j) = matchState;
1028                 else value(currCol,j) = deleteState;
1029                 currEdge = findEdge(pHmm, value(oldCol,j), value(currCol,j));
1030                 property(transitionCounter, currEdge) += 1;
1031                 value(oldCol,j)=value(currCol,j);
1032             }
1033         }
1034         //emissionvalues
1035         if (value(consensus, i)==gapChar) {
1036             for(TSize j = 0; j<numRows; ++j)
1037                 if(value(matr, j * numCols + i)!=gapChar)
1038                     value(value(emissionCounter, insertState), ordValue( (TAlphabet)  value(matr, j * numCols + i) ) ) += 1;
1039 
1040             continue;
1041         }
1042         else
1043             for(TSize j = 0; j<numRows; ++j)
1044                 if(value(matr, j * numCols + i)!=gapChar)
1045                     value(value(emissionCounter, matchState), ordValue( (TAlphabet)  value(matr, j * numCols + i) ) ) += 1;
1046 
1047         matchState+=3;
1048         if ((insertState+3)==eState) insertState+=2;
1049         else insertState+=3;
1050         deleteState+=3;
1051     }
1052 
1053     //transition in endState
1054     for(TSize j = 0; j<numRows; ++j){
1055         currEdge = findEdge(pHmm,value(currCol,j),eState);
1056         property(transitionCounter, currEdge) += 1;
1057     }
1058 
1059     _parameterEstimator(pHmm, emissionCounter, transitionCounter);
1060 }
1061 
1062 
1063 //////////////////////////////////////////////////////////////////////////////
1064 
1065 template<typename TAlphabet, typename TCargo, typename TSpec, typename TConsensus>
1066 inline void
1067 _createProfileHmm(Graph<Hmm<TAlphabet, TCargo, TSpec> >& pHmm,
1068                    TConsensus const& consensus)
1069 {
1070     SEQAN_CHECKPOINT
1071     typedef Graph<Hmm<TAlphabet, TCargo, TSpec> > TGraph;
1072     typedef typename Size<TGraph>::Type TSize;
1073     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
1074     typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
1075     typedef typename Value<TConsensus>::Type TValue;
1076 
1077     // Initialization
1078     TSize alphSize = ValueSize<TAlphabet>::VALUE;
1079     TValue gapChar = gapValue<TValue>();
1080     clear(pHmm);
1081 
1082     // Add begin state
1083     TVertexDescriptor begState = addVertex(pHmm);
1084     assignBeginState(pHmm, begState);
1085 
1086     // Add for each consensus letter 3 states
1087     for (TSize i=0;i<length(consensus);++i){
1088         if (value(consensus,i) == gapChar) continue;
1089         addVertex(pHmm); addVertex(pHmm); addVertex(pHmm, true);
1090     }
1091 
1092     // Add last insertion state
1093     TVertexDescriptor lastIState = addVertex(pHmm);
1094 
1095     // Add end state
1096     TVertexDescriptor endState = addVertex(pHmm);
1097     assignEndState(pHmm, endState);
1098 
1099     // Is there no consensus letter?
1100     if (lastIState == 1) {
1101         clear(pHmm);
1102         return;
1103     }
1104 
1105     // Remember the kind of state
1106     TVertexDescriptor mState = 1;
1107     TVertexDescriptor iState = 2;
1108     TVertexDescriptor dState = 3;
1109 
1110     // Add tranistions from begin state
1111     addEdge(pHmm, begState, mState);
1112     addEdge(pHmm, begState, iState);
1113     addEdge(pHmm, begState, dState);
1114 
1115     // Add all remaining transitions
1116     for (TSize i=0;i<length(consensus);++i){
1117         if (value(consensus,i) == gapChar) continue;
1118         else if ((mState + 3) == lastIState) {
1119             addEdge(pHmm, iState, mState);
1120             addEdge(pHmm, iState, iState);
1121             addEdge(pHmm, iState, dState);
1122             break;
1123         }
1124         else{
1125             addEdge(pHmm, mState, (mState+3));
1126             addEdge(pHmm, iState, mState);
1127             addEdge(pHmm, dState, (mState+3));
1128             addEdge(pHmm, mState, (iState+3));
1129             addEdge(pHmm, iState, iState);
1130             addEdge(pHmm, dState, (iState+3));
1131             addEdge(pHmm, mState, (dState+3));
1132             addEdge(pHmm, iState, dState);
1133             addEdge(pHmm, dState, (dState+3));
1134             mState+=3;
1135             iState+=3;
1136             dState+=3;
1137         }
1138     }
1139 
1140     // Transitions to the endState and the last I-state
1141     addEdge(pHmm, mState, endState);
1142     addEdge(pHmm, mState, lastIState);
1143     addEdge(pHmm, lastIState, endState);
1144     addEdge(pHmm, lastIState, lastIState);
1145     addEdge(pHmm, dState, endState);
1146     addEdge(pHmm, dState, lastIState);
1147 }
1148 
1149 //////////////////////////////////////////////////////////////////////////////
1150 
1151 //e.g.
1152 //String<char> matr = "-AT---GAG-G-AG-CT-C--A--GT-G-CT---G";
1153 //msaToProfileHmm(matr, hmm, 5);
1154 
1155 template<typename TAlignmentChar, typename TAlphabet, typename TProbability, typename TSpec, typename TSize>
1156 inline void
1157 msaToProfileHmm(String<TAlignmentChar> const& matr,
1158                 Graph<Hmm<TAlphabet, TProbability, TSpec> >& pHmm,
1159                 TSize nSeq)
1160 {
1161     SEQAN_CHECKPOINT
1162     typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > THmm;
1163 
1164     // Consensus
1165     String<unsigned int> coverage;
1166     String<char> gappedConsensus;
1167     String<Dna> consensusSequence;
1168     consensusCalling(matr, consensusSequence, gappedConsensus, coverage, nSeq, MajorityVote() );
1169 
1170     // Build the HMM topology
1171     _createProfileHmm(pHmm,gappedConsensus);
1172 
1173     // Parameterize the pHmm
1174     TSize numCols = length(matr) / nSeq;
1175     _profileHmmCounter(pHmm, matr, gappedConsensus, nSeq, numCols);
1176 }
1177 
1178 */
1179 
1180 
1181 }  // namespace seqan
1182 
1183 #endif //#ifndef SEQAN_HEADER_...
1184