1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32
33 #ifndef SEQAN_HEADER_GRAPH_ALGORITHM_HMM_H
34 #define SEQAN_HEADER_GRAPH_ALGORITHM_HMM_H
35
36 namespace seqan
37 {
38
39 // ============================================================================
40 // Forwards
41 // ============================================================================
42
43 // ============================================================================
44 // Tags, Classes, Enums
45 // ============================================================================
46
47 // ============================================================================
48 // Metafunctions
49 // ============================================================================
50
51 // ============================================================================
52 // Functions
53 // ============================================================================
54
55 // --------------------------------------------------------------------------
56 // Function viterbiAlgorithm()
57 // --------------------------------------------------------------------------
58
59 /*!
60 * @defgroup HmmAlgorithms HMM Algorithms
61 * @brief Algorithms on @link HmmGraph @endlink objects.
62 */
63
64 /*!
65 * @fn HmmAlgorithms#viterbiAlgorithm
66 * @headerfile <seqan/graph_algorithms.h>
67 * @brief Implements the Viterbi algorithm for Hidden Markov Models.
68 *
69 * @signature TCargo viterbiAlgorithm(path, hmm, seq);
70 *
71 * @param[out] path The state path; String of vertex descriptors.
72 * @param[in] hmm The @link HmmGraph @endlink to use.
73 * @param[in] seq Input sequence.
74 *
75 * @return TCargo Probability of the path, the type parameter <tt>TCargo</tt> from type of <tt>hmm</tt>.
76 *
77 * The Viterbi algorithm computes the most likely sequence of hidden states of the Hidden Markov Model <tt>hmm</tt>
78 * given the sequence <tt>seq</tt> using dynamic programming. The result is the most likely sequence of hidden states
79 * and returned in <tt>path</tt>.
80 *
81 * @section Remarks
82 *
83 * See the <a href="http://wikipedia.org/wiki/Viterbi_algorithm"> Wikipedia article on the Viterbi algorithm</a> for an
84 * introduction to the algorithm itself.
85 *
86 * @see HmmAlgorithms#forwardAlgorithm
87 * @see HmmAlgorithms#backwardAlgorithm
88 */
89 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TPath>
90 inline TProbability
viterbiAlgorithm(TPath & path,Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq)91 viterbiAlgorithm(TPath & path,
92 Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
93 TSequence const & seq)
94 {
95 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
96 typedef typename Size<TGraph>::Type TSize;
97 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
98 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
99
100 // Initialization
101 String<TProbability> vMat;
102 String<TSize> traceback;
103 TSize numCols = length(seq) + 2;
104 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
105 resize(vMat, numCols * numRows, 0.0);
106 resize(traceback, numCols * numRows);
107 value(vMat, getBeginState(hmm)) = 1.0;
108 TVertexDescriptor bState = getBeginState(hmm);
109 TVertexDescriptor eState = getEndState(hmm);
110 TVertexDescriptor nilVertex = getNil<TVertexDescriptor>();
111
112 // Distinct between silent states and real states
113 typedef String<TVertexDescriptor> TStateSet;
114 typedef typename Iterator<TStateSet>::Type TStateIter;
115 TStateSet silentStates;
116 TStateSet realStates;
117 TVertexIterator itVertex(hmm);
118 for(;!atEnd(itVertex);goNext(itVertex)) {
119 if (isSilent(hmm, value(itVertex))) {
120 appendValue(silentStates, value(itVertex));
121 } else {
122 appendValue(realStates, value(itVertex));
123 }
124 }
125
126 // Initialization for silent states connected to the begin state
127 TStateIter itSilentStateEnd = end(silentStates);
128 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
129 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
130 TProbability maxValue = 0.0;
131 TVertexDescriptor maxVertex = nilVertex;
132
133 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
134 TProbability local = value(vMat, value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
135 if (local > maxValue) {
136 maxValue = local;
137 maxVertex = value(itBelow);
138 }
139 }
140
141 // Set traceback vertex
142 if (maxVertex != nilVertex) {
143 value(vMat, value(itSilentState)) = maxValue;
144 value(traceback, value(itSilentState)) = maxVertex;
145 }
146 }
147
148 // Recurrence
149 TSize len = length(seq);
150 for(TSize i=1; i<=len; ++i) {
151 // Iterate over real states
152 TStateIter itRealStateEnd = end(realStates);
153 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
154 // Find maximum
155 TProbability maxValue = 0.0;
156 TVertexDescriptor maxVertex = nilVertex;
157 TVertexIterator itMax(hmm);
158 for(;!atEnd(itMax);++itMax) {
159 TProbability local = value(vMat, (i-1) * numRows + value(itMax)) * getTransitionProbability(hmm, value(itMax), value(itRealState));
160 if (local > maxValue) {
161 maxValue = local;
162 maxVertex = *itMax;
163 }
164 }
165 // Set traceback vertex
166 if (maxVertex != nilVertex) {
167 value(vMat, i * numRows + value(itRealState)) = maxValue * getEmissionProbability(hmm, value(itRealState), value(seq, i-1));
168 value(traceback, i * numRows + value(itRealState)) = maxVertex;
169 }
170 }
171
172 // Iterate over silent states
173 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
174 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
175 // Find maximum
176 TProbability maxValue = 0.0;
177 TVertexDescriptor maxVertex = nilVertex;
178
179 // Iterate over real states
180 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
181 TProbability local = value(vMat, i * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
182 if (local > maxValue) {
183 maxValue = local;
184 maxVertex = value(itRealState);
185 }
186 }
187 // Iterate over silent states in increasing order
188 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
189 TProbability local = value(vMat, i * numRows + value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
190 if (local > maxValue) {
191 maxValue = local;
192 maxVertex = value(itBelow);
193 }
194 }
195 // Set traceback vertex
196 if (maxVertex != nilVertex) {
197 value(traceback, i * numRows + value(itSilentState)) = maxVertex;
198 value(vMat, i * numRows + value(itSilentState)) = maxValue;
199 }
200 }
201 }
202
203 // Termination
204 TProbability maxValue = 0.0;
205 TVertexDescriptor maxVertex = 0;
206 TVertexIterator itMax(hmm);
207 for(;!atEnd(itMax);++itMax) {
208 TProbability local = value(vMat, len * numRows + *itMax) * getTransitionProbability(hmm, value(itMax), eState);
209 if (local > maxValue) {
210 maxValue = local;
211 maxVertex = value(itMax);
212 }
213 }
214 value(traceback, (len + 1) * numRows + eState) = maxVertex;
215 if (maxVertex != nilVertex) value(vMat, (len+1) * numRows + eState) = maxValue;
216
217 // Traceback
218 if (maxValue > 0.0) {
219 clear(path);
220 TVertexDescriptor oldState = eState;
221 appendValue(path, oldState);
222 for(TSize i = len + 1; i>=1; --i) {
223 do {
224 if ((!isSilent(hmm, oldState)) || (oldState == eState)) oldState = value(traceback, i * numRows + oldState);
225 else oldState = value(traceback, (i - 1) * numRows + oldState);
226 appendValue(path, oldState);
227 } while ((isSilent(hmm, oldState)) && (oldState != bState));
228 }
229 std::reverse(begin(path), end(path));
230 }
231
232 //// Debug code
233 //for(TSize i = 0; i<numRows; ++i) {
234 // for(TSize j=0; j<numCols; ++j) {
235 // std::cout << value(vMat, j*numRows + i) << ',';
236 // //std::cout << value(traceback, j*numRows + i) << ',';
237 // }
238 // std::cout << std::endl;
239 //}
240 //for(TSize i = 0; i<length(path); ++i) {
241 // std::cout << path[i] << ',';
242 //}
243 //std::cout << std::endl;
244
245 return value(vMat, (len+1) * numRows + eState);
246 }
247
248 // --------------------------------------------------------------------------
249 // Function forwardAlgorithm()
250 // --------------------------------------------------------------------------
251
252 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TForwardMatrix>
253 inline TProbability
_forwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq,TForwardMatrix & fMat)254 _forwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
255 TSequence const& seq,
256 TForwardMatrix& fMat)
257 {
258 SEQAN_CHECKPOINT
259 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
260 typedef typename Size<TGraph>::Type TSize;
261 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
262 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
263
264 // Initialization
265 TSize numCols = length(seq) + 2;
266 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
267 resize(fMat, numCols * numRows, 0.0);
268 value(fMat, getBeginState(hmm)) = 1.0;
269 TVertexDescriptor bState = getBeginState(hmm);
270 TVertexDescriptor eState = getEndState(hmm);
271
272 // Distinct between silent states and real states
273 typedef String<TVertexDescriptor> TStateSet;
274 typedef typename Iterator<TStateSet>::Type TStateIter;
275 TStateSet silentStates;
276 TStateSet realStates;
277 TVertexIterator itVertex(hmm);
278 for(;!atEnd(itVertex);goNext(itVertex)) {
279 if (isSilent(hmm, value(itVertex))) {
280 appendValue(silentStates, value(itVertex));
281 } else {
282 appendValue(realStates, value(itVertex));
283 }
284 }
285
286 // Initialization for silent states connected to the begin state
287 TStateIter itSilentStateEnd = end(silentStates);
288 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
289 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
290 TProbability sumValue = 0.0;
291 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
292 sumValue += value(fMat, value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
293 }
294 value(fMat, value(itSilentState)) = sumValue;
295 }
296
297 // Recurrence
298 TSize len = length(seq);
299 for(TSize i=1; i<=len; ++i) {
300 // Iterate over real states
301 TStateIter itRealStateEnd = end(realStates);
302 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
303 TProbability sum = 0.0;
304 TVertexIterator itAll(hmm);
305 for(;!atEnd(itAll);++itAll) sum += value(fMat, (i-1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll), value(itRealState));
306 value(fMat, i * numRows + value(itRealState)) = getEmissionProbability(hmm, value(itRealState), seq[i-1]) * sum;
307 }
308
309 // Iterate over silent states
310 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
311 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
312 TProbability sumValue = 0.0;
313 // Iterate over real states
314 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
315 sumValue += value(fMat, i * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
316 }
317 // Iterate over silent states in increasing order
318 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
319 sumValue += value(fMat, i * numRows + value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
320 }
321 value(fMat, i * numRows + value(itSilentState)) = sumValue;
322 }
323 }
324
325 // Termination
326 TProbability sum = 0.0;
327 TVertexIterator itAll(hmm);
328 for(;!atEnd(itAll);++itAll) {
329 sum += value(fMat, len * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll), eState);
330 }
331 value(fMat, (len+1) * numRows + eState) = sum;
332
333 //// Debug code
334 //for(TSize i = 0; i<numRows; ++i) {
335 // for(TSize j=0; j<numCols; ++j) {
336 // std::cout << value(fMat, j*numRows + i) << ',';
337 // }
338 // std::cout << std::endl;
339 //}
340
341 return value(fMat, (len+1) * numRows + eState);
342 }
343
344 /*!
345 * @fn HmmAlgorithms#forwardAlgorithm
346 * @headerfile <seqan/graph_algorithms.h>
347 * @brief Given a Hidden Markov Model <tt>hmm</tt>, the forward algorithm computes the probability of the sequence
348 * <tt>seq</tt>.
349 *
350 * @signature TCargo forwardAlgorithm(hmm, seq);
351 *
352 * @param[in] hmm The @link HmmGraph @endlink with the HMM to use.
353 * @param[in] seq Input sequence to use in the forward algorithm.
354 *
355 * @return TProbability The probability of the sequence <tt>seq</tt>. <tt>TProbability</tt> is the type parameter
356 * <tt>TCargo</tt> of the type of <tt>hmm</tt>.
357 *
358 * @section Remarks
359 *
360 * See the <a href="http://en.wikipedia.org/wiki/Forward_algorithm">Wikipedia article on the Forward algorithm</a> for
361 * an introduction to the algorithm itself.
362 *
363 * @see HmmAlgorithms#viterbiAlgorithm
364 * @see HmmAlgorithms#backwardAlgorithm
365 */
366 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
367 TProbability
forwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq)368 forwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
369 TSequence const & seq)
370 {
371 SEQAN_CHECKPOINT
372 String<TProbability> fMat;
373 return _forwardAlgorithm(hmm, seq, fMat);
374 }
375
376 // --------------------------------------------------------------------------
377 // Function backwardAlgorithm()
378 // --------------------------------------------------------------------------
379
380 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TBackwardMatrix>
381 inline TProbability
_backwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq,TBackwardMatrix & bMat)382 _backwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
383 TSequence const& seq,
384 TBackwardMatrix& bMat)
385 {
386 SEQAN_CHECKPOINT
387 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
388 typedef typename Size<TGraph>::Type TSize;
389 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
390 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
391
392 // Initialization
393 TSize numCols = length(seq) + 2;
394 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
395 resize(bMat, numCols * numRows, 0.0);
396 TVertexDescriptor bState = getBeginState(hmm);
397 TVertexDescriptor eState = getEndState(hmm);
398 TSize len = length(seq);
399 value(bMat, (len + 1) * numRows + eState) = 1.0;
400
401 // Distinct between silent states and real states
402 typedef String<TVertexDescriptor> TStateSet;
403 typedef typename Iterator<TStateSet>::Type TStateIter;
404 TStateSet silentStates;
405 TStateSet realStates;
406 TVertexIterator itVertex(hmm);
407 for(;!atEnd(itVertex);goNext(itVertex)) {
408 if (isSilent(hmm, value(itVertex))) {
409 appendValue(silentStates, value(itVertex));
410 } else {
411 appendValue(realStates, value(itVertex));
412 }
413 }
414 // Reverse the silent states order
415 std::reverse(begin(silentStates), end(silentStates));
416
417 // Initialization for silent states connected to the end state
418 TStateIter itSilentStateEnd = end(silentStates);
419 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
420 if (value(itSilentState) == eState) continue;
421 TProbability sumValue = getTransitionProbability(hmm, value(itSilentState), eState);
422 for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
423 sumValue += value(bMat, len * numRows + value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
424 }
425 value(bMat, len * numRows + value(itSilentState)) = sumValue;
426 }
427
428 // Initialization for real states
429 TStateIter itRealStateEnd = end(realStates);
430 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
431 TProbability sumValue = getTransitionProbability(hmm, value(itRealState), eState);
432 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
433 sumValue += value(bMat, len * numRows + value(itSilentState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
434 }
435 value(bMat, len * numRows + value(itRealState)) = sumValue;
436 }
437
438
439 // Recurrence
440 if (len > 0) {
441 for(TSize i=len - 1; i>0; --i) {
442 // Iterate over silent states
443 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
444 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
445 TProbability sumValue = 0.0;
446 // Iterate over real states
447 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
448 sumValue += value(bMat, (i+1) * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itSilentState), value(itRealState))* getEmissionProbability(hmm, value(itRealState), value(seq, i));
449 }
450 // Iterate over silent states in decreasing order
451 for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
452 if ((value(itAbove) == bState) || (value(itAbove) == eState)) continue;
453 sumValue += value(bMat, i * numRows + value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
454 }
455 value(bMat, i * numRows + value(itSilentState)) = sumValue;
456 }
457
458 // Iteration over real states
459 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
460 TProbability sumValue = 0.0;
461 // Iterate over silent states
462 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
463 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
464 sumValue += value(bMat, i * numRows + value(itSilentState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
465 }
466 // Iterate over real states
467 for(TStateIter itR = begin(realStates); itR != itRealStateEnd; goNext(itR)) {
468 sumValue += value(bMat, (i+1) * numRows + value(itR)) * getTransitionProbability(hmm, value(itRealState), value(itR)) * getEmissionProbability(hmm, value(itR), value(seq, i));
469 }
470 value(bMat, i * numRows + value(itRealState)) = sumValue;
471 }
472 }
473
474 // Termination
475 // Iterate over silent states
476 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
477 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
478 TProbability sumValue = 0.0;
479 // Iterate over real states
480 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
481 sumValue += value(bMat, 1 * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itSilentState), value(itRealState))* getEmissionProbability(hmm, value(itRealState), value(seq, 0));
482 }
483 // Iterate over silent states in decreasing order
484 for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
485 if ((value(itAbove) == bState) || (value(itAbove) == eState)) continue;
486 sumValue += value(bMat, value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
487 }
488 value(bMat, value(itSilentState)) = sumValue;
489 }
490 // Sum up all values
491 TProbability sumValue = 0.0;
492 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
493 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
494 sumValue += value(bMat, value(itSilentState)) * getTransitionProbability(hmm, bState, value(itSilentState));
495 }
496 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
497 sumValue += value(bMat, 1 * numRows + value(itRealState)) * getTransitionProbability(hmm, bState, value(itRealState)) * getEmissionProbability(hmm, value(itRealState), value(seq, 0));
498 }
499 value(bMat, bState) = sumValue;
500 }
501
502 //// Debug code
503 //for(TSize i = 0; i<numRows; ++i) {
504 // for(TSize j=0; j<numCols; ++j) {
505 // std::cout << value(bMat, j*numRows + i) << ',';
506 // }
507 // std::cout << std::endl;
508 //}
509
510 return value(bMat, bState);
511 }
512
513 /*!
514 * @fn HmmAlgorithms#backwardAlgorithm
515 * @headerfile <seqan/graph_algorithms.h>
516 * @brief Given a Hidden Markov Model <tt>hmm</tt>, the backward algorithm computes the probability of the sequence
517 * <tt>seq</tt>.
518 *
519 * @signature TCargo backwardAlgorithm(hmm, seq);
520 *
521 * @param[in] hmm The @link HmmGraph @endlink with the HMM to use.
522 * @param[in] seq Input sequence to use in the backward algorithm.
523 *
524 * @return TCargo The probability of the sequence <tt>seq</tt>. <tt>TProbability</tt> is the type parameter
525 * <tt>TCargo</tt> of the type of <tt>hmm</tt>.
526 *
527 * See the <a href="http://en.wikipedia.org/wiki/Forward-backward_algorithm">Wikipedia article on the Forward-backward
528 * algorithm</a> for an introduction to the algorithm itself.
529 *
530 * @see HmmAlgorithms#viterbiAlgorithm
531 * @see HmmAlgorithms#forwardAlgorithm
532 */
533 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
534 TProbability
backwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq)535 backwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
536 TSequence const & seq)
537 {
538 SEQAN_CHECKPOINT
539 String<TProbability> bMat;
540 return _backwardAlgorithm(hmm, seq, bMat);
541 }
542
543 // --------------------------------------------------------------------------
544 // Function generateSequence()
545 // --------------------------------------------------------------------------
546
547 /*!
548 * @fn HmmAlgorithms#generateSequence
549 * @headerfile <seqan/graph_algorithms.h>
550 * @brief Generates random state and alphabet sequence of a given HMM.
551 *
552 * @signature void generateSequence(hmm, seq, states, numSeq, maxLen);
553 *
554 * @param[out] seq A @link StringSet @endlink of alphabet sequences.
555 * @param[out] states A @link ContainerConcept @endlink object of state sequences.
556 * @param[in] hmm The @link HmmGraph @endlink to use.
557 * @param[in] numSeq The number of sequences to generate.
558 * @param[in] maxLen The maximum length of the sequences. The sequences might be shorter if the ends tate is reached
559 * before maxLen.
560 *
561 * @section Remarks
562 *
563 * Because of silent states, generated alphabet and state sequences might have different lengths.
564 */
565 template<typename TAlphabet, typename TProbability, typename TSpec,typename TSequenceSet, typename TStateSeqSet, typename TSize>
generateSequence(TSequenceSet & sequences,TStateSeqSet & states,Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSize numSeq,TSize maxLength)566 void generateSequence(TSequenceSet & sequences,
567 TStateSeqSet & states,
568 Graph<Hmm<TAlphabet, TProbability, TSpec> > const & hmm,
569 TSize numSeq,
570 TSize maxLength)
571 {
572 SEQAN_CHECKPOINT
573 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
574 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
575 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
576 typedef typename Value<TSequenceSet>::Type TSequence;
577 typedef typename Value<TStateSeqSet>::Type TStateSeq;
578
579 // TODO(holtgrew): It should be possible to use a per-call RNG.
580 typedef typename GetDefaultRng<TGraph>::Type TRng;
581 typedef Pdf<Uniform<double> > TPdf;
582 TPdf pdf(0.0, 1.0);
583 TRng & rng = defaultRng(TGraph());
584
585 // Initialization
586 clear(sequences);
587 clear(states);
588 TSize alphSize = ValueSize<TAlphabet>::VALUE;
589
590 // Simulate sequences
591 TVertexDescriptor currentState;
592 TVertexDescriptor endState = getEndState(hmm);
593 for(TSize i=0;i<numSeq;++i){
594 currentState = getBeginState(hmm);
595 TSequence seq;
596 TStateSeq stat;
597 appendValue(stat, getBeginState(hmm));
598 bool stop = false;
599 TSize pos = 0;
600 while (pos < maxLength) {
601 TProbability prob = pickRandomNumber(rng, pdf);
602 TProbability compareProb = 0.0;
603 TOutEdgeIterator itState(hmm, currentState);
604 // Determine the next state
605 for(;!atEnd(itState);++itState) {
606 // Probability of the next transition
607 compareProb += getTransitionProbability(hmm, value(itState));
608 // Compare with random probability
609 if (prob <= compareProb){
610 TVertexDescriptor nextState = targetVertex(hmm, value(itState));
611 if (nextState == endState) {
612 stop = true;
613 break;
614 }
615 appendValue(stat, nextState);
616 if (!isSilent(hmm, nextState)) {
617 compareProb =0.0;
618 prob = pickRandomNumber(rng, pdf);
619 for (TSize c=0;c<alphSize;++c){
620 compareProb += getEmissionProbability(hmm,targetVertex(hmm, value(itState)), TAlphabet(c));
621 if (prob <= compareProb) {
622 appendValue(seq, TAlphabet(c));
623 ++pos;
624 break;
625 }
626 }
627 }
628 currentState = nextState;
629 break;
630 }
631 }
632 if (stop==true) break;
633 }
634 appendValue(stat, getEndState(hmm));
635 appendValue(sequences, seq);
636 appendValue(states, stat);
637 }
638 }
639
640 template<typename TAlphabet, typename TProbability, typename TSpec,typename TSequenceSet, typename TSize>
641 inline void
generateSequence(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequenceSet & sequences,TSize numSeq,TSize maxLength)642 generateSequence(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
643 TSequenceSet& sequences,
644 TSize numSeq,
645 TSize maxLength)
646 {
647 SEQAN_CHECKPOINT
648 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
649 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
650 StringSet<String<TVertexDescriptor> > states;
651 generateSequence(hmm, sequences, states, numSeq, maxLength);
652 }
653
654 // --------------------------------------------------------------------------
655 // Function estimationWithStates()
656 // --------------------------------------------------------------------------
657
658 template<typename TAlphabet, typename TProbability, typename TSpec,typename TEmissionCounter, typename TTransitionCounter>
659 inline void
_parameterEstimator(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TEmissionCounter const & emission,TTransitionCounter const & transition)660 _parameterEstimator(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
661 TEmissionCounter const& emission,
662 TTransitionCounter const& transition)
663 {
664 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
665 typedef typename Size<TGraph>::Type TSize;
666 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
667 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
668 typedef typename Value<TTransitionCounter>::Type TCounterValue;
669
670 // Initialization
671 TSize alphSize = ValueSize<TAlphabet>::VALUE;
672 TCounterValue pseudoCount = (TCounterValue) 1.0;
673
674 // Estimate the parameters from the counter values
675 TVertexIterator itAll(hmm);
676 for(;!atEnd(itAll);++itAll){
677 if (!isSilent(hmm, value(itAll))) {
678 TCounterValue summedCount = (TCounterValue) (0.0);
679 for(TSize i=0; i<alphSize;++i) summedCount += (pseudoCount + value(value(emission, value(itAll)),i));
680 for(TSize i=0; i<alphSize;++i) emissionProbability(hmm,value(itAll),TAlphabet(i)) = (pseudoCount + value(value(emission, value(itAll)),i)) / summedCount;
681 }
682 TCounterValue summedCount = (TCounterValue) (0.0);
683 TOutEdgeIterator itOutSum(hmm,value(itAll));
684 for(;!atEnd(itOutSum);++itOutSum) summedCount += (pseudoCount + getProperty(transition, value(itOutSum)));
685 TOutEdgeIterator itOut(hmm, value(itAll));
686 for(;!atEnd(itOut);++itOut) transitionProbability(hmm, value(itOut)) = (pseudoCount + getProperty(transition, value(itOut))) / summedCount;
687 }
688 }
689
690 ///////////////////////////////////////////////////////////////////////////////////////
691
692 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequenceSet, typename TStateSeqSet>
693 inline void
estimationWithStates(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TSequenceSet & sequences,TStateSeqSet & states)694 estimationWithStates(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
695 TSequenceSet& sequences,
696 TStateSeqSet& states)
697 {
698 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
699 typedef typename Size<TGraph>::Type TSize;
700 typedef typename Value<TStateSeqSet>::Type TStateSeq;
701 typedef typename Value<TStateSeq>::Type TState;
702 typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
703
704 // Initialization
705 TSize alphSize = ValueSize<TAlphabet>::VALUE;
706 typedef String<TSize> TCountString;
707 TCountString transitionCounter;
708 resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(hmm)), 0);
709 StringSet<TCountString> emissionCounter;
710 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
711 for(TSize i =0; i<numRows;++i) {
712 TCountString emisCount;
713 resize(emisCount,alphSize,0);
714 appendValue(emissionCounter,emisCount);
715 }
716
717 // Iterate over all sequences
718 for (TSize j=0;j<length(states);++j) {
719 TSize posSeq = 0;
720 for (TSize i = 0;i<length(value(states,j));++i) {
721 TState s = value(value(states,j),i);
722 if (!isSilent(hmm, s)) {
723 TAlphabet c = value(value(sequences,j),posSeq);
724 value(value(emissionCounter, s), ordValue(c)) += 1;
725 ++posSeq;
726 }
727 if (i<(length(value(states,j))-1)) {
728 TEdgeDescriptor currEdge = findEdge(hmm, s, value(value(states,j), (i+1)));
729 property(transitionCounter, currEdge) += 1;
730 }
731 }
732 }
733
734 //// Debug Code
735 //typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
736 //TEdgeIterator itE(hmm);
737 //for(;!atEnd(itE);goNext(itE)) std::cout << sourceVertex(itE) << ',' << targetVertex(itE) << ':' << property(transitionCounter, value(itE)) << std::endl;
738 //for(TSize j=0; j<length(emissionCounter); ++j) {
739 // for(TSize i=0;i<length(value(emissionCounter, j));++i) {
740 // std::cout << j << ":" << TAlphabet(i) << '=' << value(value(emissionCounter, j), i) << std::endl;
741 // }
742 //}
743
744 // Estimate Parameters
745 _parameterEstimator(hmm,emissionCounter, transitionCounter);
746 }
747
748 // --------------------------------------------------------------------------
749 // Function baumWelchAlgorithm()
750 // --------------------------------------------------------------------------
751
752 template<typename TAlphabet, typename TProbability, typename TSpec>
753 inline void
_fillHmmUniform(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm)754 _fillHmmUniform(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm)
755 {
756 SEQAN_CHECKPOINT
757 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
758 typedef typename Size<TGraph>::Type TSize;
759 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
760 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
761
762 // Initialization
763 TSize alphSize = ValueSize<TAlphabet>::VALUE;
764
765 // Iterate over all states
766 TVertexIterator itState(hmm);
767 for(;!atEnd(itState);goNext(itState)) { //pass through the states of the hmm
768 TSize oD = outDegree(hmm, value(itState));
769 TOutEdgeIterator itOut(hmm, value(itState));
770 for (;!atEnd(itOut);goNext(itOut)) transitionProbability(hmm, value(itOut)) = (TProbability) (1.0 / (double) (oD));
771 if (!isSilent(hmm, value(itState))) {
772 for(TSize i=0;i<alphSize;++i){
773 emissionProbability(hmm,value(itState),TAlphabet(i)) = (TProbability) (1.0 / (double) (alphSize));
774 }
775 }
776 }
777 }
778
779 template<typename TAlphabet, typename TProbability, typename TSpec, typename TRNG>
780 inline void
_fillHmmRandom(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TRNG & rng)781 _fillHmmRandom(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
782 TRNG & rng)
783 {
784 SEQAN_CHECKPOINT
785 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
786 typedef typename Size<TGraph>::Type TSize;
787 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
788 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
789
790 // Initialization
791 Pdf<Uniform<TSize> > pdf(20, 99);
792 TSize alphSize = ValueSize<TAlphabet>::VALUE;
793
794 // Iterate over all states
795 TVertexIterator itState(hmm);
796 for(;!atEnd(itState);goNext(itState)) { //pass through the states of the hmm
797 TSize oD = outDegree(hmm, value(itState));
798 if (oD > 0) {
799 String<TSize> counts;
800 TSize sum = 0;
801 for(TSize i = 0;i<oD;++i) {
802 TSize rd = pickRandomNumber(rng, pdf);
803 sum += rd;
804 appendValue(counts, rd);
805 }
806 TSize pos = 0;
807 TOutEdgeIterator itOut(hmm, value(itState));
808 for (;!atEnd(itOut);goNext(itOut), ++pos) transitionProbability(hmm, value(itOut)) = (TProbability) ((double) (value(counts, pos)) / (double) (sum));
809 }
810 if (!isSilent(hmm, value(itState))) {
811 String<TSize> counts;
812 TSize sum = 0;
813 for(TSize i = 0;i<alphSize;++i) {
814 TSize rd = pickRandomNumber(rng, pdf);
815 sum += rd;
816 appendValue(counts, rd);
817 }
818 for(TSize i=0;i<alphSize;++i) emissionProbability(hmm,value(itState),TAlphabet(i)) = (TProbability) ((double) (value(counts, i)) / (double) (sum));
819 }
820 }
821 }
822
823 template<typename TAlphabet, typename TProbability, typename TSpec, typename TRNG>
824 inline void
randomizeHmm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TRNG &)825 randomizeHmm(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
826 TRNG & /*rng*/)
827 {
828 // TODO(holtgrew): Why uniform and not random?
829 //_fillHmmRandom(hmm, rng);
830 _fillHmmUniform(hmm);
831 }
832
833 template <typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TSize>
834 inline TProbability
_baumWelchAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,StringSet<TSequence> const & seqSet,TSize maxIter,TProbability epsilon)835 _baumWelchAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec > >& hmm,
836 StringSet<TSequence> const& seqSet,
837 TSize maxIter,
838 TProbability epsilon)
839 {
840 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
841 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
842 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
843 typedef String<TProbability> TCountString;
844
845 // Initialization
846 TSize alphSize = ValueSize<TAlphabet>::VALUE;
847 TProbability lastTotalModelProb = 0.0;
848
849 // Randomize current HMM so only topology is preserved
850 randomizeHmm(hmm);
851
852 // Iterative Optimization
853 for(TSize iter=0; iter<maxIter; ++iter){
854 std::cout << "Iteration: "<< iter << std::endl;
855 std::cout << hmm << std::endl;
856 TCountString transitionCounter;
857 resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(hmm)), 0.0);
858 StringSet<TCountString> emissionCounter;
859 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
860 for(TSize i =0; i<numRows;++i) {
861 TCountString emisCount;
862 resize(emisCount,alphSize, 0.0);
863 appendValue(emissionCounter,emisCount);
864 }
865
866 // MaximationStep
867 // Determine new contributions
868 TProbability totalModelLogProb = 0.0;
869 for (TSize i=0; i <length(seqSet);++i){ //sequences
870 TSize len = length(value(seqSet,i));
871
872 // Forward algorithm
873 String<TProbability> fMat;
874 TProbability modelLogProb = _forwardAlgorithm(hmm, value(seqSet,i), fMat);
875 totalModelLogProb += modelLogProb;
876
877 // Backward algorithm
878 String<TProbability> bMat;
879 _backwardAlgorithm(hmm, value(seqSet,i), bMat);
880
881 // Use the posterior probabilities to estimate counter values
882 for (TSize j=0;j<len;++j){
883 TAlphabet c = value(value(seqSet,i),j);
884 TAlphabet nextC = c;
885 if (j < (len - 1)) nextC = value(value(seqSet,i),(j+1));
886
887 // Iterate over all states
888 TVertexIterator itAll(hmm);
889 for(;!atEnd(itAll);++itAll) {
890 // Handle begin state
891 if (value(itAll) == beginState(hmm)) {
892 if (j == 0) {
893 TOutEdgeIterator itOut(hmm, value(itAll));
894 for(;!atEnd(itOut); goNext(itOut)) {
895 if (!isSilent(hmm, targetVertex(itOut))) {
896 property(transitionCounter, value(itOut)) += (value(fMat, beginState(hmm)) * getTransitionProbability(hmm, value(itOut)) * getEmissionProbability(hmm, targetVertex(itOut), c) * value(bMat, numRows + targetVertex(itOut)) / modelLogProb);
897 } else {
898 property(transitionCounter, value(itOut)) += (value(fMat, beginState(hmm)) * getTransitionProbability(hmm, value(itOut)) * value(bMat, targetVertex(itOut)) / modelLogProb);
899 }
900 }
901
902 }
903 continue;
904 }
905 // Ignore the end state
906 if (value(itAll) == endState(hmm)) continue;
907
908 // Determine emission expectation values
909 if (!isSilent(hmm, value(itAll))) value(value(emissionCounter, value(itAll)),ordValue(c)) += ((value(fMat, (j+1) * numRows + value(itAll)) * value(bMat, (j+1) * numRows + value(itAll))) / modelLogProb);
910
911 // Determine transition expectation values
912 TOutEdgeIterator itOut(hmm, value(itAll));
913 for(;!atEnd(itOut); goNext(itOut)) {
914 // Handle the end state
915 if ((j == (len - 1)) && (targetVertex(itOut)==endState(hmm))) {
916 property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll),endState(hmm)) / modelLogProb);
917 } else {
918 if (!isSilent(hmm, targetVertex(itOut))) {
919 if (j < (len - 1)) property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itOut)) * getEmissionProbability(hmm, targetVertex(itOut), nextC) * value(bMat, (j+2) * numRows + targetVertex(itOut)) / modelLogProb);
920 } else {
921 property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itOut)) * value(bMat, (j+1) * numRows + targetVertex(itOut)) / modelLogProb);
922 }
923 }
924 }
925 }
926 }
927 }
928 // Expectation step
929 _parameterEstimator(hmm,emissionCounter, transitionCounter);
930
931
932 // Termination?
933 if ((iter > 5) && ((totalModelLogProb - lastTotalModelProb) < epsilon)) {
934 lastTotalModelProb = totalModelLogProb;
935 break;
936 } else {
937 lastTotalModelProb = totalModelLogProb;
938 }
939 }
940 return lastTotalModelProb;
941 }
942
943 template <typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
944 inline TProbability
baumWelchAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,StringSet<TSequence> const & seqSet)945 baumWelchAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec > >& hmm,
946 StringSet<TSequence> const& seqSet)
947 {
948 SEQAN_CHECKPOINT
949 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
950 typedef typename Size<TGraph>::Type TSize;
951 TSize maxIter = 100;
952 TProbability epsilon = 0.00001;
953 return _baumWelchAlgorithm(hmm, seqSet, maxIter, epsilon);
954 }
955
956 /*
957 //////////////////////////////////////////////////////////////////////////////
958 // Profile HMMs
959 //////////////////////////////////////////////////////////////////////////////
960
961 //////////////////////////////////////////////////////////////////////////////
962
963
964 template < typename TAlphabet, typename TProbability, typename TSpec, typename TMat, typename TConsensus, typename TSize>
965 inline void
966 _profileHmmCounter(Graph<Hmm<TAlphabet, TProbability, TSpec> >& pHmm,
967 TMat const& matr,
968 TConsensus const& consensus,
969 TSize const& numRows,
970 TSize const& numCols)
971 {
972 SEQAN_CHECKPOINT
973 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
974 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
975 typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
976 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
977 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
978 typedef typename Value<TConsensus>::Type TValue;
979
980 // Initialization
981 TSize alphSize = ValueSize<TAlphabet>::VALUE;
982 TValue gapChar = gapValue<TValue>();
983 TVertexDescriptor begState = beginState(pHmm);
984 TVertexDescriptor eState = endState(pHmm);
985 TVertexDescriptor matchState = 1;
986 TVertexDescriptor insertState = 2;
987 TVertexDescriptor deleteState = 3;
988
989 typedef String<TProbability> TCountString;
990 TCountString transitionCounter;
991 resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(pHmm)), 0.0);
992 StringSet<TCountString> emissionCounter;
993 TSize nR = getIdUpperBound(_getVertexIdManager(pHmm));
994 for(TSize i =0; i<nR;++i) {
995 TCountString emisCount;
996 resize(emisCount,alphSize, 0.0);
997 appendValue(emissionCounter,emisCount);
998 }
999
1000
1001 String<TVertexDescriptor> oldCol;
1002 String<TVertexDescriptor> currCol;
1003 resize(oldCol, numRows, begState);
1004 resize(currCol, numRows, 0);
1005 TEdgeDescriptor currEdge;
1006
1007 for(TSize i = 0; i<length(consensus); ++i) {
1008 //transitionvalues
1009
1010 //being in insertState
1011 if (value(consensus, i)==gapChar){
1012 for(TSize j = 0; j<numRows; ++j){
1013 if ((value(matr, j * numCols + i)!=gapChar)){
1014 value(currCol,j) = insertState;
1015 currEdge = findEdge(pHmm, value(oldCol,j), value(currCol,j));
1016 property(transitionCounter, currEdge) += 1;
1017 value(oldCol,j) = value(currCol,j);
1018 }
1019 else if ((value(matr, j * numCols + i)==gapChar) ){
1020 value(currCol,j) = value(oldCol,j);
1021 }
1022 }
1023 }
1024 //being in normal State
1025 else{
1026 for(TSize j = 0; j<numRows; ++j){
1027 if(value(matr, j * numCols + i)!=gapChar) value(currCol,j) = matchState;
1028 else value(currCol,j) = deleteState;
1029 currEdge = findEdge(pHmm, value(oldCol,j), value(currCol,j));
1030 property(transitionCounter, currEdge) += 1;
1031 value(oldCol,j)=value(currCol,j);
1032 }
1033 }
1034 //emissionvalues
1035 if (value(consensus, i)==gapChar) {
1036 for(TSize j = 0; j<numRows; ++j)
1037 if(value(matr, j * numCols + i)!=gapChar)
1038 value(value(emissionCounter, insertState), ordValue( (TAlphabet) value(matr, j * numCols + i) ) ) += 1;
1039
1040 continue;
1041 }
1042 else
1043 for(TSize j = 0; j<numRows; ++j)
1044 if(value(matr, j * numCols + i)!=gapChar)
1045 value(value(emissionCounter, matchState), ordValue( (TAlphabet) value(matr, j * numCols + i) ) ) += 1;
1046
1047 matchState+=3;
1048 if ((insertState+3)==eState) insertState+=2;
1049 else insertState+=3;
1050 deleteState+=3;
1051 }
1052
1053 //transition in endState
1054 for(TSize j = 0; j<numRows; ++j){
1055 currEdge = findEdge(pHmm,value(currCol,j),eState);
1056 property(transitionCounter, currEdge) += 1;
1057 }
1058
1059 _parameterEstimator(pHmm, emissionCounter, transitionCounter);
1060 }
1061
1062
1063 //////////////////////////////////////////////////////////////////////////////
1064
1065 template<typename TAlphabet, typename TCargo, typename TSpec, typename TConsensus>
1066 inline void
1067 _createProfileHmm(Graph<Hmm<TAlphabet, TCargo, TSpec> >& pHmm,
1068 TConsensus const& consensus)
1069 {
1070 SEQAN_CHECKPOINT
1071 typedef Graph<Hmm<TAlphabet, TCargo, TSpec> > TGraph;
1072 typedef typename Size<TGraph>::Type TSize;
1073 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
1074 typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
1075 typedef typename Value<TConsensus>::Type TValue;
1076
1077 // Initialization
1078 TSize alphSize = ValueSize<TAlphabet>::VALUE;
1079 TValue gapChar = gapValue<TValue>();
1080 clear(pHmm);
1081
1082 // Add begin state
1083 TVertexDescriptor begState = addVertex(pHmm);
1084 assignBeginState(pHmm, begState);
1085
1086 // Add for each consensus letter 3 states
1087 for (TSize i=0;i<length(consensus);++i){
1088 if (value(consensus,i) == gapChar) continue;
1089 addVertex(pHmm); addVertex(pHmm); addVertex(pHmm, true);
1090 }
1091
1092 // Add last insertion state
1093 TVertexDescriptor lastIState = addVertex(pHmm);
1094
1095 // Add end state
1096 TVertexDescriptor endState = addVertex(pHmm);
1097 assignEndState(pHmm, endState);
1098
1099 // Is there no consensus letter?
1100 if (lastIState == 1) {
1101 clear(pHmm);
1102 return;
1103 }
1104
1105 // Remember the kind of state
1106 TVertexDescriptor mState = 1;
1107 TVertexDescriptor iState = 2;
1108 TVertexDescriptor dState = 3;
1109
1110 // Add tranistions from begin state
1111 addEdge(pHmm, begState, mState);
1112 addEdge(pHmm, begState, iState);
1113 addEdge(pHmm, begState, dState);
1114
1115 // Add all remaining transitions
1116 for (TSize i=0;i<length(consensus);++i){
1117 if (value(consensus,i) == gapChar) continue;
1118 else if ((mState + 3) == lastIState) {
1119 addEdge(pHmm, iState, mState);
1120 addEdge(pHmm, iState, iState);
1121 addEdge(pHmm, iState, dState);
1122 break;
1123 }
1124 else{
1125 addEdge(pHmm, mState, (mState+3));
1126 addEdge(pHmm, iState, mState);
1127 addEdge(pHmm, dState, (mState+3));
1128 addEdge(pHmm, mState, (iState+3));
1129 addEdge(pHmm, iState, iState);
1130 addEdge(pHmm, dState, (iState+3));
1131 addEdge(pHmm, mState, (dState+3));
1132 addEdge(pHmm, iState, dState);
1133 addEdge(pHmm, dState, (dState+3));
1134 mState+=3;
1135 iState+=3;
1136 dState+=3;
1137 }
1138 }
1139
1140 // Transitions to the endState and the last I-state
1141 addEdge(pHmm, mState, endState);
1142 addEdge(pHmm, mState, lastIState);
1143 addEdge(pHmm, lastIState, endState);
1144 addEdge(pHmm, lastIState, lastIState);
1145 addEdge(pHmm, dState, endState);
1146 addEdge(pHmm, dState, lastIState);
1147 }
1148
1149 //////////////////////////////////////////////////////////////////////////////
1150
1151 //e.g.
1152 //String<char> matr = "-AT---GAG-G-AG-CT-C--A--GT-G-CT---G";
1153 //msaToProfileHmm(matr, hmm, 5);
1154
1155 template<typename TAlignmentChar, typename TAlphabet, typename TProbability, typename TSpec, typename TSize>
1156 inline void
1157 msaToProfileHmm(String<TAlignmentChar> const& matr,
1158 Graph<Hmm<TAlphabet, TProbability, TSpec> >& pHmm,
1159 TSize nSeq)
1160 {
1161 SEQAN_CHECKPOINT
1162 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > THmm;
1163
1164 // Consensus
1165 String<unsigned int> coverage;
1166 String<char> gappedConsensus;
1167 String<Dna> consensusSequence;
1168 consensusCalling(matr, consensusSequence, gappedConsensus, coverage, nSeq, MajorityVote() );
1169
1170 // Build the HMM topology
1171 _createProfileHmm(pHmm,gappedConsensus);
1172
1173 // Parameterize the pHmm
1174 TSize numCols = length(matr) / nSeq;
1175 _profileHmmCounter(pHmm, matr, gappedConsensus, nSeq, numCols);
1176 }
1177
1178 */
1179
1180
1181 } // namespace seqan
1182
1183 #endif //#ifndef SEQAN_HEADER_...
1184