1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32
33 #ifndef SEQAN_HEADER_GRAPH_ALGORITHM_HMM_H
34 #define SEQAN_HEADER_GRAPH_ALGORITHM_HMM_H
35
36 namespace seqan
37 {
38
39 // ============================================================================
40 // Forwards
41 // ============================================================================
42
43 // ============================================================================
44 // Tags, Classes, Enums
45 // ============================================================================
46
47 // ============================================================================
48 // Metafunctions
49 // ============================================================================
50
51 // ============================================================================
52 // Functions
53 // ============================================================================
54
55 // --------------------------------------------------------------------------
56 // Function viterbiAlgorithm()
57 // --------------------------------------------------------------------------
58
59 /*!
60 * @defgroup HmmAlgorithms HMM Algorithms
61 * @brief Algorithms on @link HmmGraph @endlink objects.
62 */
63
64 /*!
65 * @fn HmmAlgorithms#viterbiAlgorithm
66 * @headerfile <seqan/graph_algorithms.h>
67 * @brief Implements the Viterbi algorithm for Hidden Markov Models.
68 *
69 * @signature TCargo viterbiAlgorithm(path, hmm, seq);
70 *
71 * @param[out] path The state path; String of vertex descriptors.
72 * @param[in] hmm The @link HmmGraph @endlink to use.
73 * @param[in] seq Input sequence.
74 *
75 * @return TCargo Probability of the path, the type parameter <tt>TCargo</tt> from type of <tt>hmm</tt>.
76 *
77 * The Viterbi algorithm computes the most likely sequence of hidden states of the Hidden Markov Model <tt>hmm</tt>
78 * given the sequence <tt>seq</tt> using dynamic programming. The result is the most likely sequence of hidden states
79 * and returned in <tt>path</tt>.
80 *
81 * @section Remarks
82 *
83 * See the <a href="http://wikipedia.org/wiki/Viterbi_algorithm"> Wikipedia article on the Viterbi algorithm</a> for an
84 * introduction to the algorithm itself.
85 *
86 * @see HmmAlgorithms#forwardAlgorithm
87 * @see HmmAlgorithms#backwardAlgorithm
88 */
89 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TPath>
90 inline TProbability
viterbiAlgorithm(TPath & path,Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq)91 viterbiAlgorithm(TPath & path,
92 Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
93 TSequence const & seq)
94 {
95 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
96 typedef typename Size<TGraph>::Type TSize;
97 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
98 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
99
100 // Initialization
101 String<TProbability> vMat;
102 String<TSize> traceback;
103 TSize numCols = length(seq) + 2;
104 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
105 resize(vMat, numCols * numRows, 0.0);
106 resize(traceback, numCols * numRows);
107 value(vMat, getBeginState(hmm)) = 1.0;
108 TVertexDescriptor bState = getBeginState(hmm);
109 TVertexDescriptor eState = getEndState(hmm);
110 TVertexDescriptor nilVertex = getNil<TVertexDescriptor>();
111
112 // Distinct between silent states and real states
113 typedef String<TVertexDescriptor> TStateSet;
114 typedef typename Iterator<TStateSet>::Type TStateIter;
115 TStateSet silentStates;
116 TStateSet realStates;
117 TVertexIterator itVertex(hmm);
118 for(;!atEnd(itVertex);goNext(itVertex)) {
119 if (isSilent(hmm, value(itVertex))) {
120 appendValue(silentStates, value(itVertex));
121 } else {
122 appendValue(realStates, value(itVertex));
123 }
124 }
125
126 // Initialization for silent states connected to the begin state
127 TStateIter itSilentStateEnd = end(silentStates);
128 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
129 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
130 TProbability maxValue = 0.0;
131 TVertexDescriptor maxVertex = nilVertex;
132
133 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
134 TProbability local = value(vMat, value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
135 if (local > maxValue) {
136 maxValue = local;
137 maxVertex = value(itBelow);
138 }
139 }
140
141 // Set traceback vertex
142 if (maxVertex != nilVertex) {
143 value(vMat, value(itSilentState)) = maxValue;
144 value(traceback, value(itSilentState)) = maxVertex;
145 }
146 }
147
148 // Recurrence
149 TSize len = length(seq);
150 for(TSize i=1; i<=len; ++i) {
151 // Iterate over real states
152 TStateIter itRealStateEnd = end(realStates);
153 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
154 // Find maximum
155 TProbability maxValue = 0.0;
156 TVertexDescriptor maxVertex = nilVertex;
157 TVertexIterator itMax(hmm);
158 for(;!atEnd(itMax);++itMax) {
159 TProbability local = value(vMat, (i-1) * numRows + value(itMax)) * getTransitionProbability(hmm, value(itMax), value(itRealState));
160 if (local > maxValue) {
161 maxValue = local;
162 maxVertex = *itMax;
163 }
164 }
165 // Set traceback vertex
166 if (maxVertex != nilVertex) {
167 value(vMat, i * numRows + value(itRealState)) = maxValue * getEmissionProbability(hmm, value(itRealState), value(seq, i-1));
168 value(traceback, i * numRows + value(itRealState)) = maxVertex;
169 }
170 }
171
172 // Iterate over silent states
173 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
174 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
175 // Find maximum
176 TProbability maxValue = 0.0;
177 TVertexDescriptor maxVertex = nilVertex;
178
179 // Iterate over real states
180 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
181 TProbability local = value(vMat, i * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
182 if (local > maxValue) {
183 maxValue = local;
184 maxVertex = value(itRealState);
185 }
186 }
187 // Iterate over silent states in increasing order
188 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
189 TProbability local = value(vMat, i * numRows + value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
190 if (local > maxValue) {
191 maxValue = local;
192 maxVertex = value(itBelow);
193 }
194 }
195 // Set traceback vertex
196 if (maxVertex != nilVertex) {
197 value(traceback, i * numRows + value(itSilentState)) = maxVertex;
198 value(vMat, i * numRows + value(itSilentState)) = maxValue;
199 }
200 }
201 }
202
203 // Termination
204 TProbability maxValue = 0.0;
205 TVertexDescriptor maxVertex = 0;
206 TVertexIterator itMax(hmm);
207 for(;!atEnd(itMax);++itMax) {
208 TProbability local = value(vMat, len * numRows + *itMax) * getTransitionProbability(hmm, value(itMax), eState);
209 if (local > maxValue) {
210 maxValue = local;
211 maxVertex = value(itMax);
212 }
213 }
214 value(traceback, (len + 1) * numRows + eState) = maxVertex;
215 if (maxVertex != nilVertex) value(vMat, (len+1) * numRows + eState) = maxValue;
216
217 // Traceback
218 if (maxValue > 0.0) {
219 clear(path);
220 TVertexDescriptor oldState = eState;
221 appendValue(path, oldState);
222 for(TSize i = len + 1; i>=1; --i) {
223 do {
224 if ((!isSilent(hmm, oldState)) || (oldState == eState)) oldState = value(traceback, i * numRows + oldState);
225 else oldState = value(traceback, (i - 1) * numRows + oldState);
226 appendValue(path, oldState);
227 } while ((isSilent(hmm, oldState)) && (oldState != bState));
228 }
229 std::reverse(begin(path), end(path));
230 }
231
232 //// Debug code
233 //for(TSize i = 0; i<numRows; ++i) {
234 // for(TSize j=0; j<numCols; ++j) {
235 // std::cout << value(vMat, j*numRows + i) << ',';
236 // //std::cout << value(traceback, j*numRows + i) << ',';
237 // }
238 // std::cout << std::endl;
239 //}
240 //for(TSize i = 0; i<length(path); ++i) {
241 // std::cout << path[i] << ',';
242 //}
243 //std::cout << std::endl;
244
245 return value(vMat, (len+1) * numRows + eState);
246 }
247
248 // --------------------------------------------------------------------------
249 // Function forwardAlgorithm()
250 // --------------------------------------------------------------------------
251
252 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TForwardMatrix>
253 inline TProbability
_forwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq,TForwardMatrix & fMat)254 _forwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
255 TSequence const& seq,
256 TForwardMatrix& fMat)
257 {
258 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
259 typedef typename Size<TGraph>::Type TSize;
260 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
261 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
262
263 // Initialization
264 TSize numCols = length(seq) + 2;
265 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
266 resize(fMat, numCols * numRows, 0.0);
267 value(fMat, getBeginState(hmm)) = 1.0;
268 TVertexDescriptor bState = getBeginState(hmm);
269 TVertexDescriptor eState = getEndState(hmm);
270
271 // Distinct between silent states and real states
272 typedef String<TVertexDescriptor> TStateSet;
273 typedef typename Iterator<TStateSet>::Type TStateIter;
274 TStateSet silentStates;
275 TStateSet realStates;
276 TVertexIterator itVertex(hmm);
277 for(;!atEnd(itVertex);goNext(itVertex)) {
278 if (isSilent(hmm, value(itVertex))) {
279 appendValue(silentStates, value(itVertex));
280 } else {
281 appendValue(realStates, value(itVertex));
282 }
283 }
284
285 // Initialization for silent states connected to the begin state
286 TStateIter itSilentStateEnd = end(silentStates);
287 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
288 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
289 TProbability sumValue = 0.0;
290 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
291 sumValue += value(fMat, value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
292 }
293 value(fMat, value(itSilentState)) = sumValue;
294 }
295
296 // Recurrence
297 TSize len = length(seq);
298 for(TSize i=1; i<=len; ++i) {
299 // Iterate over real states
300 TStateIter itRealStateEnd = end(realStates);
301 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
302 TProbability sum = 0.0;
303 TVertexIterator itAll(hmm);
304 for(;!atEnd(itAll);++itAll) sum += value(fMat, (i-1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll), value(itRealState));
305 value(fMat, i * numRows + value(itRealState)) = getEmissionProbability(hmm, value(itRealState), seq[i-1]) * sum;
306 }
307
308 // Iterate over silent states
309 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
310 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
311 TProbability sumValue = 0.0;
312 // Iterate over real states
313 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
314 sumValue += value(fMat, i * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
315 }
316 // Iterate over silent states in increasing order
317 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
318 sumValue += value(fMat, i * numRows + value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
319 }
320 value(fMat, i * numRows + value(itSilentState)) = sumValue;
321 }
322 }
323
324 // Termination
325 TProbability sum = 0.0;
326 TVertexIterator itAll(hmm);
327 for(;!atEnd(itAll);++itAll) {
328 sum += value(fMat, len * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll), eState);
329 }
330 value(fMat, (len+1) * numRows + eState) = sum;
331
332 //// Debug code
333 //for(TSize i = 0; i<numRows; ++i) {
334 // for(TSize j=0; j<numCols; ++j) {
335 // std::cout << value(fMat, j*numRows + i) << ',';
336 // }
337 // std::cout << std::endl;
338 //}
339
340 return value(fMat, (len+1) * numRows + eState);
341 }
342
343 /*!
344 * @fn HmmAlgorithms#forwardAlgorithm
345 * @headerfile <seqan/graph_algorithms.h>
346 * @brief Given a Hidden Markov Model <tt>hmm</tt>, the forward algorithm computes the probability of the sequence
347 * <tt>seq</tt>.
348 *
349 * @signature TCargo forwardAlgorithm(hmm, seq);
350 *
351 * @param[in] hmm The @link HmmGraph @endlink with the HMM to use.
352 * @param[in] seq Input sequence to use in the forward algorithm.
353 *
354 * @return TProbability The probability of the sequence <tt>seq</tt>. <tt>TProbability</tt> is the type parameter
355 * <tt>TCargo</tt> of the type of <tt>hmm</tt>.
356 *
357 * @section Remarks
358 *
359 * See the <a href="http://en.wikipedia.org/wiki/Forward_algorithm">Wikipedia article on the Forward algorithm</a> for
360 * an introduction to the algorithm itself.
361 *
362 * @see HmmAlgorithms#viterbiAlgorithm
363 * @see HmmAlgorithms#backwardAlgorithm
364 */
365 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
366 TProbability
forwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq)367 forwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
368 TSequence const & seq)
369 {
370 String<TProbability> fMat;
371 return _forwardAlgorithm(hmm, seq, fMat);
372 }
373
374 // --------------------------------------------------------------------------
375 // Function backwardAlgorithm()
376 // --------------------------------------------------------------------------
377
378 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TBackwardMatrix>
379 inline TProbability
_backwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq,TBackwardMatrix & bMat)380 _backwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
381 TSequence const& seq,
382 TBackwardMatrix& bMat)
383 {
384 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
385 typedef typename Size<TGraph>::Type TSize;
386 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
387 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
388
389 // Initialization
390 TSize numCols = length(seq) + 2;
391 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
392 resize(bMat, numCols * numRows, 0.0);
393 TVertexDescriptor bState = getBeginState(hmm);
394 TVertexDescriptor eState = getEndState(hmm);
395 TSize len = length(seq);
396 value(bMat, (len + 1) * numRows + eState) = 1.0;
397
398 // Distinct between silent states and real states
399 typedef String<TVertexDescriptor> TStateSet;
400 typedef typename Iterator<TStateSet>::Type TStateIter;
401 TStateSet silentStates;
402 TStateSet realStates;
403 TVertexIterator itVertex(hmm);
404 for(;!atEnd(itVertex);goNext(itVertex)) {
405 if (isSilent(hmm, value(itVertex))) {
406 appendValue(silentStates, value(itVertex));
407 } else {
408 appendValue(realStates, value(itVertex));
409 }
410 }
411 // Reverse the silent states order
412 std::reverse(begin(silentStates), end(silentStates));
413
414 // Initialization for silent states connected to the end state
415 TStateIter itSilentStateEnd = end(silentStates);
416 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
417 if (value(itSilentState) == eState) continue;
418 TProbability sumValue = getTransitionProbability(hmm, value(itSilentState), eState);
419 for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
420 sumValue += value(bMat, len * numRows + value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
421 }
422 value(bMat, len * numRows + value(itSilentState)) = sumValue;
423 }
424
425 // Initialization for real states
426 TStateIter itRealStateEnd = end(realStates);
427 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
428 TProbability sumValue = getTransitionProbability(hmm, value(itRealState), eState);
429 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
430 sumValue += value(bMat, len * numRows + value(itSilentState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
431 }
432 value(bMat, len * numRows + value(itRealState)) = sumValue;
433 }
434
435
436 // Recurrence
437 if (len > 0) {
438 for(TSize i=len - 1; i>0; --i) {
439 // Iterate over silent states
440 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
441 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
442 TProbability sumValue = 0.0;
443 // Iterate over real states
444 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
445 sumValue += value(bMat, (i+1) * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itSilentState), value(itRealState))* getEmissionProbability(hmm, value(itRealState), value(seq, i));
446 }
447 // Iterate over silent states in decreasing order
448 for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
449 if ((value(itAbove) == bState) || (value(itAbove) == eState)) continue;
450 sumValue += value(bMat, i * numRows + value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
451 }
452 value(bMat, i * numRows + value(itSilentState)) = sumValue;
453 }
454
455 // Iteration over real states
456 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
457 TProbability sumValue = 0.0;
458 // Iterate over silent states
459 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
460 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
461 sumValue += value(bMat, i * numRows + value(itSilentState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
462 }
463 // Iterate over real states
464 for(TStateIter itR = begin(realStates); itR != itRealStateEnd; goNext(itR)) {
465 sumValue += value(bMat, (i+1) * numRows + value(itR)) * getTransitionProbability(hmm, value(itRealState), value(itR)) * getEmissionProbability(hmm, value(itR), value(seq, i));
466 }
467 value(bMat, i * numRows + value(itRealState)) = sumValue;
468 }
469 }
470
471 // Termination
472 // Iterate over silent states
473 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
474 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
475 TProbability sumValue = 0.0;
476 // Iterate over real states
477 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
478 sumValue += value(bMat, 1 * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itSilentState), value(itRealState))* getEmissionProbability(hmm, value(itRealState), value(seq, 0));
479 }
480 // Iterate over silent states in decreasing order
481 for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
482 if ((value(itAbove) == bState) || (value(itAbove) == eState)) continue;
483 sumValue += value(bMat, value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
484 }
485 value(bMat, value(itSilentState)) = sumValue;
486 }
487 // Sum up all values
488 TProbability sumValue = 0.0;
489 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
490 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
491 sumValue += value(bMat, value(itSilentState)) * getTransitionProbability(hmm, bState, value(itSilentState));
492 }
493 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
494 sumValue += value(bMat, 1 * numRows + value(itRealState)) * getTransitionProbability(hmm, bState, value(itRealState)) * getEmissionProbability(hmm, value(itRealState), value(seq, 0));
495 }
496 value(bMat, bState) = sumValue;
497 }
498
499 //// Debug code
500 //for(TSize i = 0; i<numRows; ++i) {
501 // for(TSize j=0; j<numCols; ++j) {
502 // std::cout << value(bMat, j*numRows + i) << ',';
503 // }
504 // std::cout << std::endl;
505 //}
506
507 return value(bMat, bState);
508 }
509
510 /*!
511 * @fn HmmAlgorithms#backwardAlgorithm
512 * @headerfile <seqan/graph_algorithms.h>
513 * @brief Given a Hidden Markov Model <tt>hmm</tt>, the backward algorithm computes the probability of the sequence
514 * <tt>seq</tt>.
515 *
516 * @signature TCargo backwardAlgorithm(hmm, seq);
517 *
518 * @param[in] hmm The @link HmmGraph @endlink with the HMM to use.
519 * @param[in] seq Input sequence to use in the backward algorithm.
520 *
521 * @return TCargo The probability of the sequence <tt>seq</tt>. <tt>TProbability</tt> is the type parameter
522 * <tt>TCargo</tt> of the type of <tt>hmm</tt>.
523 *
524 * See the <a href="http://en.wikipedia.org/wiki/Forward-backward_algorithm">Wikipedia article on the Forward-backward
525 * algorithm</a> for an introduction to the algorithm itself.
526 *
527 * @see HmmAlgorithms#viterbiAlgorithm
528 * @see HmmAlgorithms#forwardAlgorithm
529 */
530 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
531 TProbability
backwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq)532 backwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
533 TSequence const & seq)
534 {
535 String<TProbability> bMat;
536 return _backwardAlgorithm(hmm, seq, bMat);
537 }
538
539 // --------------------------------------------------------------------------
540 // Function generateSequence()
541 // --------------------------------------------------------------------------
542
543 /*!
544 * @fn HmmAlgorithms#generateSequence
545 * @headerfile <seqan/graph_algorithms.h>
546 * @brief Generates random state and alphabet sequence of a given HMM.
547 *
548 * @signature void generateSequence(hmm, seq, states, numSeq, maxLen);
549 *
550 * @param[out] seq A @link StringSet @endlink of alphabet sequences.
551 * @param[out] states A @link ContainerConcept @endlink object of state sequences.
552 * @param[in] hmm The @link HmmGraph @endlink to use.
553 * @param[in] numSeq The number of sequences to generate.
554 * @param[in] maxLen The maximum length of the sequences. The sequences might be shorter if the ends tate is reached
555 * before maxLen.
556 *
557 * @section Remarks
558 *
559 * Because of silent states, generated alphabet and state sequences might have different lengths.
560 */
561 template<typename TAlphabet, typename TProbability, typename TSpec,typename TSequenceSet, typename TStateSeqSet, typename TSize>
generateSequence(TSequenceSet & sequences,TStateSeqSet & states,Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSize numSeq,TSize maxLength)562 void generateSequence(TSequenceSet & sequences,
563 TStateSeqSet & states,
564 Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
565 TSize numSeq,
566 TSize maxLength)
567 {
568 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
569 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
570 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
571 typedef typename Value<TSequenceSet>::Type TSequence;
572 typedef typename Value<TStateSeqSet>::Type TStateSeq;
573
574 // TODO(holtgrew): It should be possible to use a per-call RNG.
575 typedef typename GetDefaultRng<TGraph>::Type TRng;
576
577 std::uniform_real_distribution<double> pdf(0.0, 1.0);
578 TRng & rng = defaultRng(TGraph());
579
580 // Initialization
581 clear(sequences);
582 clear(states);
583 TSize alphSize = ValueSize<TAlphabet>::VALUE;
584
585 // Simulate sequences
586 TVertexDescriptor currentState;
587 TVertexDescriptor endState = getEndState(hmm);
588 for(TSize i=0;i<numSeq;++i){
589 currentState = getBeginState(hmm);
590 TSequence seq;
591 TStateSeq stat;
592 appendValue(stat, getBeginState(hmm));
593 bool stop = false;
594 TSize pos = 0;
595 while (pos < maxLength) {
596 TProbability prob = pdf(rng);
597 TProbability compareProb = 0.0;
598 TOutEdgeIterator itState(hmm, currentState);
599 // Determine the next state
600 for(;!atEnd(itState);++itState) {
601 // Probability of the next transition
602 compareProb += getTransitionProbability(hmm, value(itState));
603 // Compare with random probability
604 if (prob <= compareProb){
605 TVertexDescriptor nextState = targetVertex(hmm, value(itState));
606 if (nextState == endState) {
607 stop = true;
608 break;
609 }
610 appendValue(stat, nextState);
611 if (!isSilent(hmm, nextState)) {
612 compareProb =0.0;
613 prob = pdf(rng);
614 for (TSize c=0;c<alphSize;++c){
615 compareProb += getEmissionProbability(hmm,targetVertex(hmm, value(itState)), TAlphabet(c));
616 if (prob <= compareProb) {
617 appendValue(seq, TAlphabet(c));
618 ++pos;
619 break;
620 }
621 }
622 }
623 currentState = nextState;
624 break;
625 }
626 }
627 if (stop==true) break;
628 }
629 appendValue(stat, getEndState(hmm));
630 appendValue(sequences, seq);
631 appendValue(states, stat);
632 }
633 }
634
635 template<typename TAlphabet, typename TProbability, typename TSpec,typename TSequenceSet, typename TSize>
636 inline void
generateSequence(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequenceSet & sequences,TSize numSeq,TSize maxLength)637 generateSequence(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
638 TSequenceSet& sequences,
639 TSize numSeq,
640 TSize maxLength)
641 {
642 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
643 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
644 StringSet<String<TVertexDescriptor> > states;
645 generateSequence(hmm, sequences, states, numSeq, maxLength);
646 }
647
648 // --------------------------------------------------------------------------
649 // Function estimationWithStates()
650 // --------------------------------------------------------------------------
651
652 template<typename TAlphabet, typename TProbability, typename TSpec,typename TEmissionCounter, typename TTransitionCounter>
653 inline void
_parameterEstimator(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TEmissionCounter const & emission,TTransitionCounter const & transition)654 _parameterEstimator(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
655 TEmissionCounter const& emission,
656 TTransitionCounter const& transition)
657 {
658 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
659 typedef typename Size<TGraph>::Type TSize;
660 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
661 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
662 typedef typename Value<TTransitionCounter>::Type TCounterValue;
663
664 // Initialization
665 TSize alphSize = ValueSize<TAlphabet>::VALUE;
666 TCounterValue pseudoCount = (TCounterValue) 1.0;
667
668 // Estimate the parameters from the counter values
669 TVertexIterator itAll(hmm);
670 for(;!atEnd(itAll);++itAll){
671 if (!isSilent(hmm, value(itAll))) {
672 TCounterValue summedCount = (TCounterValue) (0.0);
673 for(TSize i=0; i<alphSize;++i) summedCount += (pseudoCount + value(value(emission, value(itAll)),i));
674 for(TSize i=0; i<alphSize;++i) emissionProbability(hmm,value(itAll),TAlphabet(i)) = (pseudoCount + value(value(emission, value(itAll)),i)) / summedCount;
675 }
676 TCounterValue summedCount = (TCounterValue) (0.0);
677 TOutEdgeIterator itOutSum(hmm,value(itAll));
678 for(;!atEnd(itOutSum);++itOutSum) summedCount += (pseudoCount + getProperty(transition, value(itOutSum)));
679 TOutEdgeIterator itOut(hmm, value(itAll));
680 for(;!atEnd(itOut);++itOut) transitionProbability(hmm, value(itOut)) = (pseudoCount + getProperty(transition, value(itOut))) / summedCount;
681 }
682 }
683
684 ///////////////////////////////////////////////////////////////////////////////////////
685
686 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequenceSet, typename TStateSeqSet>
687 inline void
estimationWithStates(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TSequenceSet & sequences,TStateSeqSet & states)688 estimationWithStates(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
689 TSequenceSet& sequences,
690 TStateSeqSet& states)
691 {
692 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
693 typedef typename Size<TGraph>::Type TSize;
694 typedef typename Value<TStateSeqSet>::Type TStateSeq;
695 typedef typename Value<TStateSeq>::Type TState;
696 typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
697
698 // Initialization
699 TSize alphSize = ValueSize<TAlphabet>::VALUE;
700 typedef String<TSize> TCountString;
701 TCountString transitionCounter;
702 resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(hmm)), 0);
703 StringSet<TCountString> emissionCounter;
704 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
705 for(TSize i =0; i<numRows;++i) {
706 TCountString emisCount;
707 resize(emisCount,alphSize,0);
708 appendValue(emissionCounter,emisCount);
709 }
710
711 // Iterate over all sequences
712 for (TSize j=0;j<length(states);++j) {
713 TSize posSeq = 0;
714 for (TSize i = 0;i<length(value(states,j));++i) {
715 TState s = value(value(states,j),i);
716 if (!isSilent(hmm, s)) {
717 TAlphabet c = value(value(sequences,j),posSeq);
718 value(value(emissionCounter, s), ordValue(c)) += 1;
719 ++posSeq;
720 }
721 if (i<(length(value(states,j))-1)) {
722 TEdgeDescriptor currEdge = findEdge(hmm, s, value(value(states,j), (i+1)));
723 property(transitionCounter, currEdge) += 1;
724 }
725 }
726 }
727
728 //// Debug Code
729 //typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
730 //TEdgeIterator itE(hmm);
731 //for(;!atEnd(itE);goNext(itE)) std::cout << sourceVertex(itE) << ',' << targetVertex(itE) << ':' << property(transitionCounter, value(itE)) << std::endl;
732 //for(TSize j=0; j<length(emissionCounter); ++j) {
733 // for(TSize i=0;i<length(value(emissionCounter, j));++i) {
734 // std::cout << j << ":" << TAlphabet(i) << '=' << value(value(emissionCounter, j), i) << std::endl;
735 // }
736 //}
737
738 // Estimate Parameters
739 _parameterEstimator(hmm,emissionCounter, transitionCounter);
740 }
741
742 // --------------------------------------------------------------------------
743 // Function baumWelchAlgorithm()
744 // --------------------------------------------------------------------------
745
746 template<typename TAlphabet, typename TProbability, typename TSpec>
747 inline void
_fillHmmUniform(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm)748 _fillHmmUniform(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm)
749 {
750 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
751 typedef typename Size<TGraph>::Type TSize;
752 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
753 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
754
755 // Initialization
756 TSize alphSize = ValueSize<TAlphabet>::VALUE;
757
758 // Iterate over all states
759 TVertexIterator itState(hmm);
760 for(;!atEnd(itState);goNext(itState)) { //pass through the states of the hmm
761 TSize oD = outDegree(hmm, value(itState));
762 TOutEdgeIterator itOut(hmm, value(itState));
763 for (;!atEnd(itOut);goNext(itOut)) transitionProbability(hmm, value(itOut)) = (TProbability) (1.0 / (double) (oD));
764 if (!isSilent(hmm, value(itState))) {
765 for(TSize i=0;i<alphSize;++i){
766 emissionProbability(hmm,value(itState),TAlphabet(i)) = (TProbability) (1.0 / (double) (alphSize));
767 }
768 }
769 }
770 }
771
772 template<typename TAlphabet, typename TProbability, typename TSpec, typename TRNG>
773 inline void
_fillHmmRandom(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TRNG & rng)774 _fillHmmRandom(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
775 TRNG & rng)
776 {
777 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
778 typedef typename Size<TGraph>::Type TSize;
779 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
780 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
781
782 // Initialization
783 std::uniform_int_distribution<TSize> pdf(20, 99);
784 TSize alphSize = ValueSize<TAlphabet>::VALUE;
785
786 // Iterate over all states
787 TVertexIterator itState(hmm);
788 for(;!atEnd(itState);goNext(itState)) { //pass through the states of the hmm
789 TSize oD = outDegree(hmm, value(itState));
790 if (oD > 0) {
791 String<TSize> counts;
792 TSize sum = 0;
793 for(TSize i = 0;i<oD;++i) {
794 TSize rd = pdf(rng);
795 sum += rd;
796 appendValue(counts, rd);
797 }
798 TSize pos = 0;
799 TOutEdgeIterator itOut(hmm, value(itState));
800 for (;!atEnd(itOut);goNext(itOut), ++pos) transitionProbability(hmm, value(itOut)) = (TProbability) ((double) (value(counts, pos)) / (double) (sum));
801 }
802 if (!isSilent(hmm, value(itState))) {
803 String<TSize> counts;
804 TSize sum = 0;
805 for(TSize i = 0;i<alphSize;++i) {
806 TSize rd = pdf(rng);
807 sum += rd;
808 appendValue(counts, rd);
809 }
810 for(TSize i=0;i<alphSize;++i) emissionProbability(hmm,value(itState),TAlphabet(i)) = (TProbability) ((double) (value(counts, i)) / (double) (sum));
811 }
812 }
813 }
814
815 template<typename TAlphabet, typename TProbability, typename TSpec, typename TRNG>
816 inline void
randomizeHmm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TRNG &)817 randomizeHmm(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
818 TRNG & /*rng*/)
819 {
820 // TODO(holtgrew): Why uniform and not random?
821 //_fillHmmRandom(hmm, rng);
822 _fillHmmUniform(hmm);
823 }
824
825 template <typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TSize>
826 inline TProbability
_baumWelchAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,StringSet<TSequence> const & seqSet,TSize maxIter,TProbability epsilon)827 _baumWelchAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec > >& hmm,
828 StringSet<TSequence> const& seqSet,
829 TSize maxIter,
830 TProbability epsilon)
831 {
832 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
833 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
834 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
835 typedef String<TProbability> TCountString;
836
837 // Initialization
838 TSize alphSize = ValueSize<TAlphabet>::VALUE;
839 TProbability lastTotalModelProb = 0.0;
840
841 // Randomize current HMM so only topology is preserved
842 randomizeHmm(hmm);
843
844 // Iterative Optimization
845 for(TSize iter=0; iter<maxIter; ++iter){
846 std::cout << "Iteration: "<< iter << std::endl;
847 std::cout << hmm << std::endl;
848 TCountString transitionCounter;
849 resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(hmm)), 0.0);
850 StringSet<TCountString> emissionCounter;
851 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
852 for(TSize i =0; i<numRows;++i) {
853 TCountString emisCount;
854 resize(emisCount,alphSize, 0.0);
855 appendValue(emissionCounter,emisCount);
856 }
857
858 // MaximationStep
859 // Determine new contributions
860 TProbability totalModelLogProb = 0.0;
861 for (TSize i=0; i <length(seqSet);++i){ //sequences
862 TSize len = length(value(seqSet,i));
863
864 // Forward algorithm
865 String<TProbability> fMat;
866 TProbability modelLogProb = _forwardAlgorithm(hmm, value(seqSet,i), fMat);
867 totalModelLogProb += modelLogProb;
868
869 // Backward algorithm
870 String<TProbability> bMat;
871 _backwardAlgorithm(hmm, value(seqSet,i), bMat);
872
873 // Use the posterior probabilities to estimate counter values
874 for (TSize j=0;j<len;++j){
875 TAlphabet c = value(value(seqSet,i),j);
876 TAlphabet nextC = c;
877 if (j < (len - 1)) nextC = value(value(seqSet,i),(j+1));
878
879 // Iterate over all states
880 TVertexIterator itAll(hmm);
881 for(;!atEnd(itAll);++itAll) {
882 // Handle begin state
883 if (value(itAll) == beginState(hmm)) {
884 if (j == 0) {
885 TOutEdgeIterator itOut(hmm, value(itAll));
886 for(;!atEnd(itOut); goNext(itOut)) {
887 if (!isSilent(hmm, targetVertex(itOut))) {
888 property(transitionCounter, value(itOut)) += (value(fMat, beginState(hmm)) * getTransitionProbability(hmm, value(itOut)) * getEmissionProbability(hmm, targetVertex(itOut), c) * value(bMat, numRows + targetVertex(itOut)) / modelLogProb);
889 } else {
890 property(transitionCounter, value(itOut)) += (value(fMat, beginState(hmm)) * getTransitionProbability(hmm, value(itOut)) * value(bMat, targetVertex(itOut)) / modelLogProb);
891 }
892 }
893
894 }
895 continue;
896 }
897 // Ignore the end state
898 if (value(itAll) == endState(hmm)) continue;
899
900 // Determine emission expectation values
901 if (!isSilent(hmm, value(itAll))) value(value(emissionCounter, value(itAll)),ordValue(c)) += ((value(fMat, (j+1) * numRows + value(itAll)) * value(bMat, (j+1) * numRows + value(itAll))) / modelLogProb);
902
903 // Determine transition expectation values
904 TOutEdgeIterator itOut(hmm, value(itAll));
905 for(;!atEnd(itOut); goNext(itOut)) {
906 // Handle the end state
907 if ((j == (len - 1)) && (targetVertex(itOut)==endState(hmm))) {
908 property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll),endState(hmm)) / modelLogProb);
909 } else {
910 if (!isSilent(hmm, targetVertex(itOut))) {
911 if (j < (len - 1)) property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itOut)) * getEmissionProbability(hmm, targetVertex(itOut), nextC) * value(bMat, (j+2) * numRows + targetVertex(itOut)) / modelLogProb);
912 } else {
913 property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itOut)) * value(bMat, (j+1) * numRows + targetVertex(itOut)) / modelLogProb);
914 }
915 }
916 }
917 }
918 }
919 }
920 // Expectation step
921 _parameterEstimator(hmm,emissionCounter, transitionCounter);
922
923
924 // Termination?
925 if ((iter > 5) && ((totalModelLogProb - lastTotalModelProb) < epsilon)) {
926 lastTotalModelProb = totalModelLogProb;
927 break;
928 } else {
929 lastTotalModelProb = totalModelLogProb;
930 }
931 }
932 return lastTotalModelProb;
933 }
934
935 template <typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
936 inline TProbability
baumWelchAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,StringSet<TSequence> const & seqSet)937 baumWelchAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec > >& hmm,
938 StringSet<TSequence> const& seqSet)
939 {
940 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
941 typedef typename Size<TGraph>::Type TSize;
942 TSize maxIter = 100;
943 TProbability epsilon = 0.00001;
944 return _baumWelchAlgorithm(hmm, seqSet, maxIter, epsilon);
945 }
946
947 /*
948 //////////////////////////////////////////////////////////////////////////////
949 // Profile HMMs
950 //////////////////////////////////////////////////////////////////////////////
951
952 //////////////////////////////////////////////////////////////////////////////
953
954
955 template < typename TAlphabet, typename TProbability, typename TSpec, typename TMat, typename TConsensus, typename TSize>
956 inline void
957 _profileHmmCounter(Graph<Hmm<TAlphabet, TProbability, TSpec> >& pHmm,
958 TMat const& matr,
959 TConsensus const& consensus,
960 TSize const& numRows,
961 TSize const& numCols)
962 {
963 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
964 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
965 typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
966 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
967 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
968 typedef typename Value<TConsensus>::Type TValue;
969
970 // Initialization
971 TSize alphSize = ValueSize<TAlphabet>::VALUE;
972 TValue gapChar = gapValue<TValue>();
973 TVertexDescriptor begState = beginState(pHmm);
974 TVertexDescriptor eState = endState(pHmm);
975 TVertexDescriptor matchState = 1;
976 TVertexDescriptor insertState = 2;
977 TVertexDescriptor deleteState = 3;
978
979 typedef String<TProbability> TCountString;
980 TCountString transitionCounter;
981 resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(pHmm)), 0.0);
982 StringSet<TCountString> emissionCounter;
983 TSize nR = getIdUpperBound(_getVertexIdManager(pHmm));
984 for(TSize i =0; i<nR;++i) {
985 TCountString emisCount;
986 resize(emisCount,alphSize, 0.0);
987 appendValue(emissionCounter,emisCount);
988 }
989
990
991 String<TVertexDescriptor> oldCol;
992 String<TVertexDescriptor> currCol;
993 resize(oldCol, numRows, begState);
994 resize(currCol, numRows, 0);
995 TEdgeDescriptor currEdge;
996
997 for(TSize i = 0; i<length(consensus); ++i) {
998 //transitionvalues
999
1000 //being in insertState
1001 if (value(consensus, i)==gapChar){
1002 for(TSize j = 0; j<numRows; ++j){
1003 if ((value(matr, j * numCols + i)!=gapChar)){
1004 value(currCol,j) = insertState;
1005 currEdge = findEdge(pHmm, value(oldCol,j), value(currCol,j));
1006 property(transitionCounter, currEdge) += 1;
1007 value(oldCol,j) = value(currCol,j);
1008 }
1009 else if ((value(matr, j * numCols + i)==gapChar) ){
1010 value(currCol,j) = value(oldCol,j);
1011 }
1012 }
1013 }
1014 //being in normal State
1015 else{
1016 for(TSize j = 0; j<numRows; ++j){
1017 if(value(matr, j * numCols + i)!=gapChar) value(currCol,j) = matchState;
1018 else value(currCol,j) = deleteState;
1019 currEdge = findEdge(pHmm, value(oldCol,j), value(currCol,j));
1020 property(transitionCounter, currEdge) += 1;
1021 value(oldCol,j)=value(currCol,j);
1022 }
1023 }
1024 //emissionvalues
1025 if (value(consensus, i)==gapChar) {
1026 for(TSize j = 0; j<numRows; ++j)
1027 if(value(matr, j * numCols + i)!=gapChar)
1028 value(value(emissionCounter, insertState), ordValue( (TAlphabet) value(matr, j * numCols + i) ) ) += 1;
1029
1030 continue;
1031 }
1032 else
1033 for(TSize j = 0; j<numRows; ++j)
1034 if(value(matr, j * numCols + i)!=gapChar)
1035 value(value(emissionCounter, matchState), ordValue( (TAlphabet) value(matr, j * numCols + i) ) ) += 1;
1036
1037 matchState+=3;
1038 if ((insertState+3)==eState) insertState+=2;
1039 else insertState+=3;
1040 deleteState+=3;
1041 }
1042
1043 //transition in endState
1044 for(TSize j = 0; j<numRows; ++j){
1045 currEdge = findEdge(pHmm,value(currCol,j),eState);
1046 property(transitionCounter, currEdge) += 1;
1047 }
1048
1049 _parameterEstimator(pHmm, emissionCounter, transitionCounter);
1050 }
1051
1052
1053 //////////////////////////////////////////////////////////////////////////////
1054
1055 template<typename TAlphabet, typename TCargo, typename TSpec, typename TConsensus>
1056 inline void
1057 _createProfileHmm(Graph<Hmm<TAlphabet, TCargo, TSpec> >& pHmm,
1058 TConsensus const& consensus)
1059 {
1060 typedef Graph<Hmm<TAlphabet, TCargo, TSpec> > TGraph;
1061 typedef typename Size<TGraph>::Type TSize;
1062 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
1063 typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
1064 typedef typename Value<TConsensus>::Type TValue;
1065
1066 // Initialization
1067 TSize alphSize = ValueSize<TAlphabet>::VALUE;
1068 TValue gapChar = gapValue<TValue>();
1069 clear(pHmm);
1070
1071 // Add begin state
1072 TVertexDescriptor begState = addVertex(pHmm);
1073 assignBeginState(pHmm, begState);
1074
1075 // Add for each consensus letter 3 states
1076 for (TSize i=0;i<length(consensus);++i){
1077 if (value(consensus,i) == gapChar) continue;
1078 addVertex(pHmm); addVertex(pHmm); addVertex(pHmm, true);
1079 }
1080
1081 // Add last insertion state
1082 TVertexDescriptor lastIState = addVertex(pHmm);
1083
1084 // Add end state
1085 TVertexDescriptor endState = addVertex(pHmm);
1086 assignEndState(pHmm, endState);
1087
1088 // Is there no consensus letter?
1089 if (lastIState == 1) {
1090 clear(pHmm);
1091 return;
1092 }
1093
1094 // Remember the kind of state
1095 TVertexDescriptor mState = 1;
1096 TVertexDescriptor iState = 2;
1097 TVertexDescriptor dState = 3;
1098
1099 // Add tranistions from begin state
1100 addEdge(pHmm, begState, mState);
1101 addEdge(pHmm, begState, iState);
1102 addEdge(pHmm, begState, dState);
1103
1104 // Add all remaining transitions
1105 for (TSize i=0;i<length(consensus);++i){
1106 if (value(consensus,i) == gapChar) continue;
1107 else if ((mState + 3) == lastIState) {
1108 addEdge(pHmm, iState, mState);
1109 addEdge(pHmm, iState, iState);
1110 addEdge(pHmm, iState, dState);
1111 break;
1112 }
1113 else{
1114 addEdge(pHmm, mState, (mState+3));
1115 addEdge(pHmm, iState, mState);
1116 addEdge(pHmm, dState, (mState+3));
1117 addEdge(pHmm, mState, (iState+3));
1118 addEdge(pHmm, iState, iState);
1119 addEdge(pHmm, dState, (iState+3));
1120 addEdge(pHmm, mState, (dState+3));
1121 addEdge(pHmm, iState, dState);
1122 addEdge(pHmm, dState, (dState+3));
1123 mState+=3;
1124 iState+=3;
1125 dState+=3;
1126 }
1127 }
1128
1129 // Transitions to the endState and the last I-state
1130 addEdge(pHmm, mState, endState);
1131 addEdge(pHmm, mState, lastIState);
1132 addEdge(pHmm, lastIState, endState);
1133 addEdge(pHmm, lastIState, lastIState);
1134 addEdge(pHmm, dState, endState);
1135 addEdge(pHmm, dState, lastIState);
1136 }
1137
1138 //////////////////////////////////////////////////////////////////////////////
1139
1140 //e.g.
1141 //String<char> matr = "-AT---GAG-G-AG-CT-C--A--GT-G-CT---G";
1142 //msaToProfileHmm(matr, hmm, 5);
1143
1144 template<typename TAlignmentChar, typename TAlphabet, typename TProbability, typename TSpec, typename TSize>
1145 inline void
1146 msaToProfileHmm(String<TAlignmentChar> const& matr,
1147 Graph<Hmm<TAlphabet, TProbability, TSpec> >& pHmm,
1148 TSize nSeq)
1149 {
1150 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > THmm;
1151
1152 // Consensus
1153 String<unsigned int> coverage;
1154 String<char> gappedConsensus;
1155 String<Dna> consensusSequence;
1156 consensusCalling(matr, consensusSequence, gappedConsensus, coverage, nSeq, MajorityVote() );
1157
1158 // Build the HMM topology
1159 _createProfileHmm(pHmm,gappedConsensus);
1160
1161 // Parameterize the pHmm
1162 TSize numCols = length(matr) / nSeq;
1163 _profileHmmCounter(pHmm, matr, gappedConsensus, nSeq, numCols);
1164 }
1165
1166 */
1167
1168
1169 } // namespace seqan
1170
1171 #endif //#ifndef SEQAN_HEADER_...
1172