1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32
33 #ifndef SEQAN_HEADER_GRAPH_ALGORITHM_HMM_H
34 #define SEQAN_HEADER_GRAPH_ALGORITHM_HMM_H
35
36 namespace SEQAN_NAMESPACE_MAIN
37 {
38
39 //////////////////////////////////////////////////////////////////////////////
40 // Basic HMM algorithms
41 //////////////////////////////////////////////////////////////////////////////
42
43
44 //////////////////////////////////////////////////////////////////////////////
45
46 /**
47 .Function.viterbiAlgorithm:
48 ..cat:Graph
49 ..summary:Implements the viterbi algorithm.
50 ..signature:viterbiAlgorithm(hmm, seq, path)
51 ..param.hmm:In-parameter:Input HMM.
52 ...type:Spec.Hmm
53 ..param.seq:In-parameter:Input sequence.
54 ..param.path:Out-parameter:State path.
55 ..returns:TCargo
56 ...remarks:Probability of the path.
57 ..see:Function.forwardAlgorithm
58 ..see:Function.backwardAlgorithm
59 ..include:seqan/graph_algorithms.h
60 */
61 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TPath>
62 inline TProbability
viterbiAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq,TPath & path)63 viterbiAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
64 TSequence const& seq,
65 TPath& path)
66 {
67 SEQAN_CHECKPOINT
68 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
69 typedef typename Size<TGraph>::Type TSize;
70 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
71 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
72
73 // Initialization
74 String<TProbability> vMat;
75 String<TSize> traceback;
76 TSize numCols = length(seq) + 2;
77 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
78 resize(vMat, numCols * numRows, 0.0);
79 resize(traceback, numCols * numRows);
80 value(vMat, getBeginState(hmm)) = 1.0;
81 TVertexDescriptor bState = getBeginState(hmm);
82 TVertexDescriptor eState = getEndState(hmm);
83 TVertexDescriptor nilVertex = getNil<TVertexDescriptor>();
84
85 // Distinct between silent states and real states
86 typedef String<TVertexDescriptor> TStateSet;
87 typedef typename Iterator<TStateSet>::Type TStateIter;
88 TStateSet silentStates;
89 TStateSet realStates;
90 TVertexIterator itVertex(hmm);
91 for(;!atEnd(itVertex);goNext(itVertex)) {
92 if (isSilent(hmm, value(itVertex))) {
93 appendValue(silentStates, value(itVertex));
94 } else {
95 appendValue(realStates, value(itVertex));
96 }
97 }
98
99 // Initialization for silent states connected to the begin state
100 TStateIter itSilentStateEnd = end(silentStates);
101 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
102 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
103 TProbability maxValue = 0.0;
104 TVertexDescriptor maxVertex = nilVertex;
105
106 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
107 TProbability local = value(vMat, value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
108 if (local > maxValue) {
109 maxValue = local;
110 maxVertex = value(itBelow);
111 }
112 }
113
114 // Set traceback vertex
115 if (maxVertex != nilVertex) {
116 value(vMat, value(itSilentState)) = maxValue;
117 value(traceback, value(itSilentState)) = maxVertex;
118 }
119 }
120
121 // Recurrence
122 TSize len = length(seq);
123 for(TSize i=1; i<=len; ++i) {
124 // Iterate over real states
125 TStateIter itRealStateEnd = end(realStates);
126 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
127 // Find maximum
128 TProbability maxValue = 0.0;
129 TVertexDescriptor maxVertex = nilVertex;
130 TVertexIterator itMax(hmm);
131 for(;!atEnd(itMax);++itMax) {
132 TProbability local = value(vMat, (i-1) * numRows + value(itMax)) * getTransitionProbability(hmm, value(itMax), value(itRealState));
133 if (local > maxValue) {
134 maxValue = local;
135 maxVertex = *itMax;
136 }
137 }
138 // Set traceback vertex
139 if (maxVertex != nilVertex) {
140 value(vMat, i * numRows + value(itRealState)) = maxValue * getEmissionProbability(hmm, value(itRealState), value(seq, i-1));
141 value(traceback, i * numRows + value(itRealState)) = maxVertex;
142 }
143 }
144
145 // Iterate over silent states
146 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
147 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
148 // Find maximum
149 TProbability maxValue = 0.0;
150 TVertexDescriptor maxVertex = nilVertex;
151
152 // Iterate over real states
153 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
154 TProbability local = value(vMat, i * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
155 if (local > maxValue) {
156 maxValue = local;
157 maxVertex = value(itRealState);
158 }
159 }
160 // Iterate over silent states in increasing order
161 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
162 TProbability local = value(vMat, i * numRows + value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
163 if (local > maxValue) {
164 maxValue = local;
165 maxVertex = value(itBelow);
166 }
167 }
168 // Set traceback vertex
169 if (maxVertex != nilVertex) {
170 value(traceback, i * numRows + value(itSilentState)) = maxVertex;
171 value(vMat, i * numRows + value(itSilentState)) = maxValue;
172 }
173 }
174 }
175
176 // Termination
177 TProbability maxValue = 0.0;
178 TVertexDescriptor maxVertex = 0;
179 TVertexIterator itMax(hmm);
180 for(;!atEnd(itMax);++itMax) {
181 TProbability local = value(vMat, len * numRows + *itMax) * getTransitionProbability(hmm, value(itMax), eState);
182 if (local > maxValue) {
183 maxValue = local;
184 maxVertex = value(itMax);
185 }
186 }
187 value(traceback, (len + 1) * numRows + eState) = maxVertex;
188 if (maxVertex != nilVertex) value(vMat, (len+1) * numRows + eState) = maxValue;
189
190 // Traceback
191 if (maxValue > 0.0) {
192 clear(path);
193 TVertexDescriptor oldState = eState;
194 appendValue(path, oldState);
195 for(TSize i = len + 1; i>=1; --i) {
196 do {
197 if ((!isSilent(hmm, oldState)) || (oldState == eState)) oldState = value(traceback, i * numRows + oldState);
198 else oldState = value(traceback, (i - 1) * numRows + oldState);
199 appendValue(path, oldState);
200 } while ((isSilent(hmm, oldState)) && (oldState != bState));
201 }
202 std::reverse(begin(path), end(path));
203 }
204
205 //// Debug code
206 //for(TSize i = 0; i<numRows; ++i) {
207 // for(TSize j=0; j<numCols; ++j) {
208 // std::cout << value(vMat, j*numRows + i) << ',';
209 // //std::cout << value(traceback, j*numRows + i) << ',';
210 // }
211 // std::cout << std::endl;
212 //}
213 //for(TSize i = 0; i<length(path); ++i) {
214 // std::cout << path[i] << ',';
215 //}
216 //std::cout << std::endl;
217
218 return value(vMat, (len+1) * numRows + eState);
219 }
220
221
222 //////////////////////////////////////////////////////////////////////////////
223
224 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TForwardMatrix>
225 inline TProbability
_forwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq,TForwardMatrix & fMat)226 _forwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
227 TSequence const& seq,
228 TForwardMatrix& fMat)
229 {
230 SEQAN_CHECKPOINT
231 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
232 typedef typename Size<TGraph>::Type TSize;
233 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
234 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
235
236 // Initialization
237 TSize numCols = length(seq) + 2;
238 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
239 resize(fMat, numCols * numRows, 0.0);
240 value(fMat, getBeginState(hmm)) = 1.0;
241 TVertexDescriptor bState = getBeginState(hmm);
242 TVertexDescriptor eState = getEndState(hmm);
243
244 // Distinct between silent states and real states
245 typedef String<TVertexDescriptor> TStateSet;
246 typedef typename Iterator<TStateSet>::Type TStateIter;
247 TStateSet silentStates;
248 TStateSet realStates;
249 TVertexIterator itVertex(hmm);
250 for(;!atEnd(itVertex);goNext(itVertex)) {
251 if (isSilent(hmm, value(itVertex))) {
252 appendValue(silentStates, value(itVertex));
253 } else {
254 appendValue(realStates, value(itVertex));
255 }
256 }
257
258 // Initialization for silent states connected to the begin state
259 TStateIter itSilentStateEnd = end(silentStates);
260 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
261 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
262 TProbability sumValue = 0.0;
263 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
264 sumValue += value(fMat, value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
265 }
266 value(fMat, value(itSilentState)) = sumValue;
267 }
268
269 // Recurrence
270 TSize len = length(seq);
271 for(TSize i=1; i<=len; ++i) {
272 // Iterate over real states
273 TStateIter itRealStateEnd = end(realStates);
274 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
275 TProbability sum = 0.0;
276 TVertexIterator itAll(hmm);
277 for(;!atEnd(itAll);++itAll) sum += value(fMat, (i-1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll), value(itRealState));
278 value(fMat, i * numRows + value(itRealState)) = getEmissionProbability(hmm, value(itRealState), seq[i-1]) * sum;
279 }
280
281 // Iterate over silent states
282 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
283 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
284 TProbability sumValue = 0.0;
285 // Iterate over real states
286 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
287 sumValue += value(fMat, i * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
288 }
289 // Iterate over silent states in increasing order
290 for(TStateIter itBelow = begin(silentStates); itBelow != itSilentState; goNext(itBelow)) {
291 sumValue += value(fMat, i * numRows + value(itBelow)) * getTransitionProbability(hmm, value(itBelow), value(itSilentState));
292 }
293 value(fMat, i * numRows + value(itSilentState)) = sumValue;
294 }
295 }
296
297 // Termination
298 TProbability sum = 0.0;
299 TVertexIterator itAll(hmm);
300 for(;!atEnd(itAll);++itAll) {
301 sum += value(fMat, len * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll), eState);
302 }
303 value(fMat, (len+1) * numRows + eState) = sum;
304
305 //// Debug code
306 //for(TSize i = 0; i<numRows; ++i) {
307 // for(TSize j=0; j<numCols; ++j) {
308 // std::cout << value(fMat, j*numRows + i) << ',';
309 // }
310 // std::cout << std::endl;
311 //}
312
313 return value(fMat, (len+1) * numRows + eState);
314 }
315
316 //////////////////////////////////////////////////////////////////////////////
317
318 /**
319 .Function.forwardAlgorithm:
320 ..cat:Graph
321 ..summary:Implements the forward algorithm.
322 ..signature:forwardAlgorithm(hmm, seq)
323 ..param.hmm:In-parameter:Input HMM.
324 ...type:Spec.Hmm
325 ..param.seq:In-parameter:Input sequence.
326 ..returns:TProbability
327 ...remarks:Probability of the sequence.
328 ..see:Function.viterbiAlgorithm
329 ..see:Function.backwardAlgorithm
330 ..include:seqan/graph_algorithms.h
331 */
332 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
333 inline TProbability
forwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq)334 forwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
335 TSequence const& seq)
336 {
337 SEQAN_CHECKPOINT
338 String<TProbability> fMat;
339 return _forwardAlgorithm(hmm, seq, fMat);
340 }
341
342 //////////////////////////////////////////////////////////////////////////////
343
344 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TBackwardMatrix>
345 inline TProbability
_backwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq,TBackwardMatrix & bMat)346 _backwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
347 TSequence const& seq,
348 TBackwardMatrix& bMat)
349 {
350 SEQAN_CHECKPOINT
351 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
352 typedef typename Size<TGraph>::Type TSize;
353 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
354 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
355
356 // Initialization
357 TSize numCols = length(seq) + 2;
358 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
359 resize(bMat, numCols * numRows, 0.0);
360 TVertexDescriptor bState = getBeginState(hmm);
361 TVertexDescriptor eState = getEndState(hmm);
362 TSize len = length(seq);
363 value(bMat, (len + 1) * numRows + eState) = 1.0;
364
365 // Distinct between silent states and real states
366 typedef String<TVertexDescriptor> TStateSet;
367 typedef typename Iterator<TStateSet>::Type TStateIter;
368 TStateSet silentStates;
369 TStateSet realStates;
370 TVertexIterator itVertex(hmm);
371 for(;!atEnd(itVertex);goNext(itVertex)) {
372 if (isSilent(hmm, value(itVertex))) {
373 appendValue(silentStates, value(itVertex));
374 } else {
375 appendValue(realStates, value(itVertex));
376 }
377 }
378 // Reverse the silent states order
379 std::reverse(begin(silentStates), end(silentStates));
380
381 // Initialization for silent states connected to the end state
382 TStateIter itSilentStateEnd = end(silentStates);
383 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
384 if (value(itSilentState) == eState) continue;
385 TProbability sumValue = getTransitionProbability(hmm, value(itSilentState), eState);
386 for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
387 sumValue += value(bMat, len * numRows + value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
388 }
389 value(bMat, len * numRows + value(itSilentState)) = sumValue;
390 }
391
392 // Initialization for real states
393 TStateIter itRealStateEnd = end(realStates);
394 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
395 TProbability sumValue = getTransitionProbability(hmm, value(itRealState), eState);
396 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
397 sumValue += value(bMat, len * numRows + value(itSilentState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
398 }
399 value(bMat, len * numRows + value(itRealState)) = sumValue;
400 }
401
402
403 // Recurrence
404 if (len > 0) {
405 for(TSize i=len - 1; i>0; --i) {
406 // Iterate over silent states
407 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
408 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
409 TProbability sumValue = 0.0;
410 // Iterate over real states
411 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
412 sumValue += value(bMat, (i+1) * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itSilentState), value(itRealState))* getEmissionProbability(hmm, value(itRealState), value(seq, i));
413 }
414 // Iterate over silent states in decreasing order
415 for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
416 if ((value(itAbove) == bState) || (value(itAbove) == eState)) continue;
417 sumValue += value(bMat, i * numRows + value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
418 }
419 value(bMat, i * numRows + value(itSilentState)) = sumValue;
420 }
421
422 // Iteration over real states
423 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
424 TProbability sumValue = 0.0;
425 // Iterate over silent states
426 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
427 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
428 sumValue += value(bMat, i * numRows + value(itSilentState)) * getTransitionProbability(hmm, value(itRealState), value(itSilentState));
429 }
430 // Iterate over real states
431 for(TStateIter itR = begin(realStates); itR != itRealStateEnd; goNext(itR)) {
432 sumValue += value(bMat, (i+1) * numRows + value(itR)) * getTransitionProbability(hmm, value(itRealState), value(itR)) * getEmissionProbability(hmm, value(itR), value(seq, i));
433 }
434 value(bMat, i * numRows + value(itRealState)) = sumValue;
435 }
436 }
437
438 // Termination
439 // Iterate over silent states
440 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
441 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
442 TProbability sumValue = 0.0;
443 // Iterate over real states
444 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
445 sumValue += value(bMat, 1 * numRows + value(itRealState)) * getTransitionProbability(hmm, value(itSilentState), value(itRealState))* getEmissionProbability(hmm, value(itRealState), value(seq, 0));
446 }
447 // Iterate over silent states in decreasing order
448 for(TStateIter itAbove = begin(silentStates); itAbove != itSilentState; goNext(itAbove)) {
449 if ((value(itAbove) == bState) || (value(itAbove) == eState)) continue;
450 sumValue += value(bMat, value(itAbove)) * getTransitionProbability(hmm, value(itSilentState), value(itAbove));
451 }
452 value(bMat, value(itSilentState)) = sumValue;
453 }
454 // Sum up all values
455 TProbability sumValue = 0.0;
456 for(TStateIter itSilentState = begin(silentStates); itSilentState != itSilentStateEnd; goNext(itSilentState)) {
457 if ((value(itSilentState) == bState) || (value(itSilentState) == eState)) continue;
458 sumValue += value(bMat, value(itSilentState)) * getTransitionProbability(hmm, bState, value(itSilentState));
459 }
460 for(TStateIter itRealState = begin(realStates); itRealState != itRealStateEnd; goNext(itRealState)) {
461 sumValue += value(bMat, 1 * numRows + value(itRealState)) * getTransitionProbability(hmm, bState, value(itRealState)) * getEmissionProbability(hmm, value(itRealState), value(seq, 0));
462 }
463 value(bMat, bState) = sumValue;
464 }
465
466 //// Debug code
467 //for(TSize i = 0; i<numRows; ++i) {
468 // for(TSize j=0; j<numCols; ++j) {
469 // std::cout << value(bMat, j*numRows + i) << ',';
470 // }
471 // std::cout << std::endl;
472 //}
473
474 return value(bMat, bState);
475 }
476
477
478
479 //////////////////////////////////////////////////////////////////////////////
480
481 /**
482 .Function.backwardAlgorithm:
483 ..cat:Graph
484 ..summary:Implements the backward algorithm.
485 ..signature:backwardAlgorithm(hmm, seq)
486 ..param.hmm:In-parameter:Input HMM.
487 ...type:Spec.Hmm
488 ..param.seq:In-parameter:Input sequence.
489 ..returns:TProbability
490 ...remarks:Probability of the sequence.
491 ..see:Function.viterbiAlgorithm
492 ..see:Function.forwardAlgorithm
493 ..include:seqan/graph_algorithms.h
494 */
495 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
496 inline TProbability
backwardAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequence const & seq)497 backwardAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
498 TSequence const& seq)
499 {
500 SEQAN_CHECKPOINT
501 String<TProbability> bMat;
502 return _backwardAlgorithm(hmm, seq, bMat);
503 }
504
505 ///////////////////////////////////////////////////////////////////////////////////////
506
507 /**
508 .Function.generateSequence:
509 ..cat:Graph
510 ..summary:Generates random state and alphabet sequences of a given HMM.
511 ..signature:generateSequence(hmm, sequences, states, numSeq, maxLength)
512 ..param.hmm:In-parameter:Input HMM.
513 ...type:Spec.Hmm
514 ..param.sequences:The StringSet of alphabet sequences.
515 ...type:Class.StringSet
516 ..param.sequences:The StringSet of state sequences.
517 ...type:Class.StringSet
518 ..param.numSeq:The number of sequences to generate.
519 ..param.maxLength:The maximum length of the sequences.
520 ...remarks:Sequences might be shorter if the end state is reached prior to maxLength.
521 ..remarks: Because of silent states, generated alphabet and state sequences might have different length.
522 ..returns:void
523 ..include:seqan/graph_algorithms.h
524 */
525 template<typename TAlphabet, typename TProbability, typename TSpec,typename TSequenceSet, typename TStateSeqSet, typename TSize>
526 inline void
generateSequence(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequenceSet & sequences,TStateSeqSet & states,TSize numSeq,TSize maxLength)527 generateSequence(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
528 TSequenceSet& sequences,
529 TStateSeqSet& states,
530 TSize numSeq,
531 TSize maxLength)
532 {
533 SEQAN_CHECKPOINT
534 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
535 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
536 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
537 typedef typename Value<TSequenceSet>::Type TSequence;
538 typedef typename Value<TStateSeqSet>::Type TStateSeq;
539
540 // Initialization
541 mtRandInit();
542 clear(sequences);
543 clear(states);
544 TSize alphSize = ValueSize<TAlphabet>::VALUE;
545
546 // Simulate sequences
547 TVertexDescriptor currentState;
548 TVertexDescriptor endState = getEndState(hmm);
549 for(TSize i=0;i<numSeq;++i){
550 currentState = getBeginState(hmm);
551 TSequence seq;
552 TStateSeq stat;
553 appendValue(stat, getBeginState(hmm));
554 bool stop = false;
555 TSize pos = 0;
556 while (pos < maxLength) {
557 TProbability prob = (double) (mtRand() % 100000) / (double) (100000);
558 TProbability compareProb = 0.0;
559 TOutEdgeIterator itState(hmm, currentState);
560 // Determine the next state
561 for(;!atEnd(itState);++itState) {
562 // Probability of the next transition
563 compareProb += getTransitionProbability(hmm, value(itState));
564 // Compare with random probability
565 if (prob <= compareProb){
566 TVertexDescriptor nextState = targetVertex(hmm, value(itState));
567 if (nextState == endState) {
568 stop = true;
569 break;
570 }
571 appendValue(stat, nextState);
572 if (!isSilent(hmm, nextState)) {
573 compareProb =0.0;
574 prob = (double) (mtRand() % 100000) / (double) (100000);
575 for (TSize c=0;c<alphSize;++c){
576 compareProb += getEmissionProbability(hmm,targetVertex(hmm, value(itState)), TAlphabet(c));
577 if (prob <= compareProb) {
578 appendValue(seq, TAlphabet(c));
579 ++pos;
580 break;
581 }
582 }
583 }
584 currentState = nextState;
585 break;
586 }
587 }
588 if (stop==true) break;
589 }
590 appendValue(stat, getEndState(hmm));
591 appendValue(sequences, seq);
592 appendValue(states, stat);
593 }
594 }
595
596 ///////////////////////////////////////////////////////////////////////////////////////
597
598 template<typename TAlphabet, typename TProbability, typename TSpec,typename TSequenceSet, typename TSize>
599 inline void
generateSequence(Graph<Hmm<TAlphabet,TProbability,TSpec>> const & hmm,TSequenceSet & sequences,TSize numSeq,TSize maxLength)600 generateSequence(Graph<Hmm<TAlphabet, TProbability, TSpec> > const& hmm,
601 TSequenceSet& sequences,
602 TSize numSeq,
603 TSize maxLength)
604 {
605 SEQAN_CHECKPOINT
606 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
607 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
608 StringSet<String<TVertexDescriptor> > states;
609 generateSequence(hmm, sequences, states, numSeq, maxLength);
610 }
611
612
613
614 //////////////////////////////////////////////////////////////////////////////
615 // Training algorithms
616 //////////////////////////////////////////////////////////////////////////////
617
618
619 //////////////////////////////////////////////////////////////////////////////
620
621 template<typename TAlphabet, typename TProbability, typename TSpec,typename TEmissionCounter, typename TTransitionCounter>
622 inline void
_parameterEstimator(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TEmissionCounter const & emission,TTransitionCounter const & transition)623 _parameterEstimator(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
624 TEmissionCounter const& emission,
625 TTransitionCounter const& transition)
626 {
627 SEQAN_CHECKPOINT
628 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
629 typedef typename Size<TGraph>::Type TSize;
630 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
631 typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
632 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
633 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
634 typedef typename Value<TTransitionCounter>::Type TCounterValue;
635
636 // Initialization
637 TSize alphSize = ValueSize<TAlphabet>::VALUE;
638 TCounterValue pseudoCount = (TCounterValue) 1.0;
639
640 // Estimate the parameters from the counter values
641 TVertexIterator itAll(hmm);
642 for(;!atEnd(itAll);++itAll){
643 if (!isSilent(hmm, value(itAll))) {
644 TCounterValue summedCount = (TCounterValue) (0.0);
645 for(TSize i=0; i<alphSize;++i) summedCount += (pseudoCount + value(value(emission, value(itAll)),i));
646 for(TSize i=0; i<alphSize;++i) emissionProbability(hmm,value(itAll),TAlphabet(i)) = (pseudoCount + value(value(emission, value(itAll)),i)) / summedCount;
647 }
648 TCounterValue summedCount = (TCounterValue) (0.0);
649 TOutEdgeIterator itOutSum(hmm,value(itAll));
650 for(;!atEnd(itOutSum);++itOutSum) summedCount += (pseudoCount + getProperty(transition, value(itOutSum)));
651 TOutEdgeIterator itOut(hmm, value(itAll));
652 for(;!atEnd(itOut);++itOut) transitionProbability(hmm, value(itOut)) = (pseudoCount + getProperty(transition, value(itOut))) / summedCount;
653 }
654 }
655
656 ///////////////////////////////////////////////////////////////////////////////////////
657
658 template<typename TAlphabet, typename TProbability, typename TSpec, typename TSequenceSet, typename TStateSeqSet>
659 inline void
estimationWithStates(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TSequenceSet & sequences,TStateSeqSet & states)660 estimationWithStates(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
661 TSequenceSet& sequences,
662 TStateSeqSet& states)
663 {
664 SEQAN_CHECKPOINT
665 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
666 typedef typename Size<TGraph>::Type TSize;
667 typedef typename Value<TStateSeqSet>::Type TStateSeq;
668 typedef typename Value<TStateSeq>::Type TState;
669 typedef typename Value<TSequenceSet>::Type TSequence;
670 typedef typename Value<TSequence>::Type TChar;
671 typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
672
673 // Initialization
674 TSize alphSize = ValueSize<TAlphabet>::VALUE;
675 typedef String<TSize> TCountString;
676 TCountString transitionCounter;
677 resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(hmm)), 0);
678 StringSet<TCountString> emissionCounter;
679 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
680 for(TSize i =0; i<numRows;++i) {
681 TCountString emisCount;
682 resize(emisCount,alphSize,0);
683 appendValue(emissionCounter,emisCount);
684 }
685
686 // Iterate over all sequences
687 for (TSize j=0;j<length(states);++j) {
688 TSize posSeq = 0;
689 for (TSize i = 0;i<length(value(states,j));++i) {
690 TState s = value(value(states,j),i);
691 if (!isSilent(hmm, s)) {
692 TAlphabet c = value(value(sequences,j),posSeq);
693 value(value(emissionCounter, s), ordValue(c)) += 1;
694 ++posSeq;
695 }
696 if (i<(length(value(states,j))-1)) {
697 TEdgeDescriptor currEdge = findEdge(hmm, s, value(value(states,j), (i+1)));
698 property(transitionCounter, currEdge) += 1;
699 }
700 }
701 }
702
703 //// Debug Code
704 //typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
705 //TEdgeIterator itE(hmm);
706 //for(;!atEnd(itE);goNext(itE)) std::cout << sourceVertex(itE) << ',' << targetVertex(itE) << ':' << property(transitionCounter, value(itE)) << std::endl;
707 //for(TSize j=0; j<length(emissionCounter); ++j) {
708 // for(TSize i=0;i<length(value(emissionCounter, j));++i) {
709 // std::cout << j << ":" << TAlphabet(i) << '=' << value(value(emissionCounter, j), i) << std::endl;
710 // }
711 //}
712
713 // Estimate Parameters
714 _parameterEstimator(hmm,emissionCounter, transitionCounter);
715 }
716
717 ///////////////////////////////////////////////////////////////////////////////////////
718
719 template<typename TAlphabet, typename TProbability, typename TSpec>
720 inline void
_fillHmmUniform(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm)721 _fillHmmUniform(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm)
722 {
723 SEQAN_CHECKPOINT
724 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
725 typedef typename Size<TGraph>::Type TSize;
726 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
727 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
728
729 // Initialization
730 TSize alphSize = ValueSize<TAlphabet>::VALUE;
731
732 // Iterate over all states
733 TVertexIterator itState(hmm);
734 for(;!atEnd(itState);goNext(itState)) { //pass through the states of the hmm
735 TSize oD = outDegree(hmm, value(itState));
736 TOutEdgeIterator itOut(hmm, value(itState));
737 for (;!atEnd(itOut);goNext(itOut)) transitionProbability(hmm, value(itOut)) = (TProbability) (1.0 / (double) (oD));
738 if (!isSilent(hmm, value(itState))) {
739 for(TSize i=0;i<alphSize;++i){
740 emissionProbability(hmm,value(itState),TAlphabet(i)) = (TProbability) (1.0 / (double) (alphSize));
741 }
742 }
743 }
744 }
745
746 ///////////////////////////////////////////////////////////////////////////////////////
747
748 template<typename TAlphabet, typename TProbability, typename TSpec, typename TRNG>
749 inline void
_fillHmmRandom(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TRNG & rng)750 _fillHmmRandom(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
751 TRNG & rng)
752 {
753 SEQAN_CHECKPOINT
754 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
755 typedef typename Size<TGraph>::Type TSize;
756 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
757 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
758
759 // Initialization
760 Pdf<Uniform<TSize> > pdf(20, 99);
761 TSize alphSize = ValueSize<TAlphabet>::VALUE;
762
763 // Iterate over all states
764 TVertexIterator itState(hmm);
765 for(;!atEnd(itState);goNext(itState)) { //pass through the states of the hmm
766 TSize oD = outDegree(hmm, value(itState));
767 if (oD > 0) {
768 String<TSize> counts;
769 TSize sum = 0;
770 for(TSize i = 0;i<oD;++i) {
771 TSize rd = pickRandomNumber(rng, pdf);
772 sum += rd;
773 appendValue(counts, rd);
774 }
775 TSize pos = 0;
776 TOutEdgeIterator itOut(hmm, value(itState));
777 for (;!atEnd(itOut);goNext(itOut), ++pos) transitionProbability(hmm, value(itOut)) = (TProbability) ((double) (value(counts, pos)) / (double) (sum));
778 }
779 if (!isSilent(hmm, value(itState))) {
780 String<TSize> counts;
781 TSize sum = 0;
782 for(TSize i = 0;i<alphSize;++i) {
783 TSize rd = pickRandomNumber(rng, pdf);
784 sum += rd;
785 appendValue(counts, rd);
786 }
787 for(TSize i=0;i<alphSize;++i) emissionProbability(hmm,value(itState),TAlphabet(i)) = (TProbability) ((double) (value(counts, i)) / (double) (sum));
788 }
789 }
790 }
791
792 ///////////////////////////////////////////////////////////////////////////////////////
793
794 template<typename TAlphabet, typename TProbability, typename TSpec, typename TRNG>
795 inline void
randomizeHmm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,TRNG &)796 randomizeHmm(Graph<Hmm<TAlphabet, TProbability, TSpec> >& hmm,
797 TRNG & /*rng*/)
798 {
799 //_fillHmmRandom(hmm, rng);
800 _fillHmmUniform(hmm);
801 }
802
803 ///////////////////////////////////////////////////////////////////////////////////////
804
805 template <typename TAlphabet, typename TProbability, typename TSpec, typename TSequence, typename TSize>
806 inline TProbability
_baumWelchAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,StringSet<TSequence> const & seqSet,TSize maxIter,TProbability epsilon)807 _baumWelchAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec > >& hmm,
808 StringSet<TSequence> const& seqSet,
809 TSize maxIter,
810 TProbability epsilon)
811 {
812
813 SEQAN_CHECKPOINT
814 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
815 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
816 typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
817 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
818 typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
819 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
820 typedef String<TProbability> TCountString;
821
822 // Initialization
823 TSize alphSize = ValueSize<TAlphabet>::VALUE;
824 TProbability lastTotalModelProb = 0.0;
825
826 // Randomize current HMM so only topology is preserved
827 randomizeHmm(hmm);
828
829 // Iterative Optimization
830 for(TSize iter=0; iter<maxIter; ++iter){
831 std::cout << "Iteration: "<< iter << std::endl;
832 std::cout << hmm << std::endl;
833 TCountString transitionCounter;
834 resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(hmm)), 0.0);
835 StringSet<TCountString> emissionCounter;
836 TSize numRows = getIdUpperBound(_getVertexIdManager(hmm));
837 for(TSize i =0; i<numRows;++i) {
838 TCountString emisCount;
839 resize(emisCount,alphSize, 0.0);
840 appendValue(emissionCounter,emisCount);
841 }
842
843 // MaximationStep
844 // Determine new contributions
845 TProbability totalModelLogProb = 0.0;
846 for (TSize i=0; i <length(seqSet);++i){ //sequences
847 TSize len = length(value(seqSet,i));
848
849 // Forward algorithm
850 String<TProbability> fMat;
851 TProbability modelLogProb = _forwardAlgorithm(hmm, value(seqSet,i), fMat);
852 totalModelLogProb += modelLogProb;
853
854 // Backward algorithm
855 String<TProbability> bMat;
856 _backwardAlgorithm(hmm, value(seqSet,i), bMat);
857
858 // Use the posterior probabilities to estimate counter values
859 for (TSize j=0;j<len;++j){
860 TAlphabet c = value(value(seqSet,i),j);
861 TAlphabet nextC = c;
862 if (j < (len - 1)) nextC = value(value(seqSet,i),(j+1));
863
864 // Iterate over all states
865 TVertexIterator itAll(hmm);
866 for(;!atEnd(itAll);++itAll) {
867 // Handle begin state
868 if (value(itAll) == beginState(hmm)) {
869 if (j == 0) {
870 TOutEdgeIterator itOut(hmm, value(itAll));
871 for(;!atEnd(itOut); goNext(itOut)) {
872 if (!isSilent(hmm, targetVertex(itOut))) {
873 property(transitionCounter, value(itOut)) += (value(fMat, beginState(hmm)) * getTransitionProbability(hmm, value(itOut)) * getEmissionProbability(hmm, targetVertex(itOut), c) * value(bMat, numRows + targetVertex(itOut)) / modelLogProb);
874 } else {
875 property(transitionCounter, value(itOut)) += (value(fMat, beginState(hmm)) * getTransitionProbability(hmm, value(itOut)) * value(bMat, targetVertex(itOut)) / modelLogProb);
876 }
877 }
878
879 }
880 continue;
881 }
882 // Ignore the end state
883 if (value(itAll) == endState(hmm)) continue;
884
885 // Determine emission expectation values
886 if (!isSilent(hmm, value(itAll))) value(value(emissionCounter, value(itAll)),ordValue(c)) += ((value(fMat, (j+1) * numRows + value(itAll)) * value(bMat, (j+1) * numRows + value(itAll))) / modelLogProb);
887
888 // Determine transition expectation values
889 TOutEdgeIterator itOut(hmm, value(itAll));
890 for(;!atEnd(itOut); goNext(itOut)) {
891 // Handle the end state
892 if ((j == (len - 1)) && (targetVertex(itOut)==endState(hmm))) {
893 property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itAll),endState(hmm)) / modelLogProb);
894 } else {
895 if (!isSilent(hmm, targetVertex(itOut))) {
896 if (j < (len - 1)) property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itOut)) * getEmissionProbability(hmm, targetVertex(itOut), nextC) * value(bMat, (j+2) * numRows + targetVertex(itOut)) / modelLogProb);
897 } else {
898 property(transitionCounter, value(itOut)) += (value(fMat, (j+1) * numRows + value(itAll)) * getTransitionProbability(hmm, value(itOut)) * value(bMat, (j+1) * numRows + targetVertex(itOut)) / modelLogProb);
899 }
900 }
901 }
902 }
903 }
904 }
905 // Expectation step
906 _parameterEstimator(hmm,emissionCounter, transitionCounter);
907
908
909 // Termination?
910 if ((iter > 5) && ((totalModelLogProb - lastTotalModelProb) < epsilon)) {
911 lastTotalModelProb = totalModelLogProb;
912 break;
913 } else {
914 lastTotalModelProb = totalModelLogProb;
915 }
916 }
917 return lastTotalModelProb;
918 }
919
920 ///////////////////////////////////////////////////////////////////////////////////////
921
922 template <typename TAlphabet, typename TProbability, typename TSpec, typename TSequence>
923 inline TProbability
baumWelchAlgorithm(Graph<Hmm<TAlphabet,TProbability,TSpec>> & hmm,StringSet<TSequence> const & seqSet)924 baumWelchAlgorithm(Graph<Hmm<TAlphabet, TProbability, TSpec > >& hmm,
925 StringSet<TSequence> const& seqSet)
926 {
927 SEQAN_CHECKPOINT
928 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
929 typedef typename Size<TGraph>::Type TSize;
930 TSize maxIter = 100;
931 TProbability epsilon = 0.00001;
932 return _baumWelchAlgorithm(hmm, seqSet, maxIter, epsilon);
933 }
934
935 /*
936 //////////////////////////////////////////////////////////////////////////////
937 // Profile HMMs
938 //////////////////////////////////////////////////////////////////////////////
939
940 //////////////////////////////////////////////////////////////////////////////
941
942
943 template < typename TAlphabet, typename TProbability, typename TSpec, typename TMat, typename TConsensus, typename TSize>
944 inline void
945 _profileHmmCounter(Graph<Hmm<TAlphabet, TProbability, TSpec> >& pHmm,
946 TMat const& matr,
947 TConsensus const& consensus,
948 TSize const& numRows,
949 TSize const& numCols)
950 {
951 SEQAN_CHECKPOINT
952 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > TGraph;
953 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
954 typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
955 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
956 typedef typename Iterator<TGraph, OutEdgeIterator>::Type TOutEdgeIterator;
957 typedef typename Value<TConsensus>::Type TValue;
958
959 // Initialization
960 TSize alphSize = ValueSize<TAlphabet>::VALUE;
961 TValue gapChar = gapValue<TValue>();
962 TVertexDescriptor begState = beginState(pHmm);
963 TVertexDescriptor eState = endState(pHmm);
964 TVertexDescriptor matchState = 1;
965 TVertexDescriptor insertState = 2;
966 TVertexDescriptor deleteState = 3;
967
968 typedef String<TProbability> TCountString;
969 TCountString transitionCounter;
970 resize(transitionCounter, getIdUpperBound(_getEdgeIdManager(pHmm)), 0.0);
971 StringSet<TCountString> emissionCounter;
972 TSize nR = getIdUpperBound(_getVertexIdManager(pHmm));
973 for(TSize i =0; i<nR;++i) {
974 TCountString emisCount;
975 resize(emisCount,alphSize, 0.0);
976 appendValue(emissionCounter,emisCount);
977 }
978
979
980 String<TVertexDescriptor> oldCol;
981 String<TVertexDescriptor> currCol;
982 resize(oldCol, numRows, begState);
983 resize(currCol, numRows, 0);
984 TEdgeDescriptor currEdge;
985
986 for(TSize i = 0; i<length(consensus); ++i) {
987 //transitionvalues
988
989 //being in insertState
990 if (value(consensus, i)==gapChar){
991 for(TSize j = 0; j<numRows; ++j){
992 if ((value(matr, j * numCols + i)!=gapChar)){
993 value(currCol,j) = insertState;
994 currEdge = findEdge(pHmm, value(oldCol,j), value(currCol,j));
995 property(transitionCounter, currEdge) += 1;
996 value(oldCol,j) = value(currCol,j);
997 }
998 else if ((value(matr, j * numCols + i)==gapChar) ){
999 value(currCol,j) = value(oldCol,j);
1000 }
1001 }
1002 }
1003 //being in normal State
1004 else{
1005 for(TSize j = 0; j<numRows; ++j){
1006 if(value(matr, j * numCols + i)!=gapChar) value(currCol,j) = matchState;
1007 else value(currCol,j) = deleteState;
1008 currEdge = findEdge(pHmm, value(oldCol,j), value(currCol,j));
1009 property(transitionCounter, currEdge) += 1;
1010 value(oldCol,j)=value(currCol,j);
1011 }
1012 }
1013 //emissionvalues
1014 if (value(consensus, i)==gapChar) {
1015 for(TSize j = 0; j<numRows; ++j)
1016 if(value(matr, j * numCols + i)!=gapChar)
1017 value(value(emissionCounter, insertState), ordValue( (TAlphabet) value(matr, j * numCols + i) ) ) += 1;
1018
1019 continue;
1020 }
1021 else
1022 for(TSize j = 0; j<numRows; ++j)
1023 if(value(matr, j * numCols + i)!=gapChar)
1024 value(value(emissionCounter, matchState), ordValue( (TAlphabet) value(matr, j * numCols + i) ) ) += 1;
1025
1026 matchState+=3;
1027 if ((insertState+3)==eState) insertState+=2;
1028 else insertState+=3;
1029 deleteState+=3;
1030 }
1031
1032 //transition in endState
1033 for(TSize j = 0; j<numRows; ++j){
1034 currEdge = findEdge(pHmm,value(currCol,j),eState);
1035 property(transitionCounter, currEdge) += 1;
1036 }
1037
1038 _parameterEstimator(pHmm, emissionCounter, transitionCounter);
1039 }
1040
1041
1042 //////////////////////////////////////////////////////////////////////////////
1043
1044 template<typename TAlphabet, typename TCargo, typename TSpec, typename TConsensus>
1045 inline void
1046 _createProfileHmm(Graph<Hmm<TAlphabet, TCargo, TSpec> >& pHmm,
1047 TConsensus const& consensus)
1048 {
1049 SEQAN_CHECKPOINT
1050 typedef Graph<Hmm<TAlphabet, TCargo, TSpec> > TGraph;
1051 typedef typename Size<TGraph>::Type TSize;
1052 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
1053 typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
1054 typedef typename Value<TConsensus>::Type TValue;
1055
1056 // Initialization
1057 TSize alphSize = ValueSize<TAlphabet>::VALUE;
1058 TValue gapChar = gapValue<TValue>();
1059 clear(pHmm);
1060
1061 // Add begin state
1062 TVertexDescriptor begState = addVertex(pHmm);
1063 assignBeginState(pHmm, begState);
1064
1065 // Add for each consensus letter 3 states
1066 for (TSize i=0;i<length(consensus);++i){
1067 if (value(consensus,i) == gapChar) continue;
1068 addVertex(pHmm); addVertex(pHmm); addVertex(pHmm, true);
1069 }
1070
1071 // Add last insertion state
1072 TVertexDescriptor lastIState = addVertex(pHmm);
1073
1074 // Add end state
1075 TVertexDescriptor endState = addVertex(pHmm);
1076 assignEndState(pHmm, endState);
1077
1078 // Is there no consensus letter?
1079 if (lastIState == 1) {
1080 clear(pHmm);
1081 return;
1082 }
1083
1084 // Remember the kind of state
1085 TVertexDescriptor mState = 1;
1086 TVertexDescriptor iState = 2;
1087 TVertexDescriptor dState = 3;
1088
1089 // Add tranistions from begin state
1090 addEdge(pHmm, begState, mState);
1091 addEdge(pHmm, begState, iState);
1092 addEdge(pHmm, begState, dState);
1093
1094 // Add all remaining transitions
1095 for (TSize i=0;i<length(consensus);++i){
1096 if (value(consensus,i) == gapChar) continue;
1097 else if ((mState + 3) == lastIState) {
1098 addEdge(pHmm, iState, mState);
1099 addEdge(pHmm, iState, iState);
1100 addEdge(pHmm, iState, dState);
1101 break;
1102 }
1103 else{
1104 addEdge(pHmm, mState, (mState+3));
1105 addEdge(pHmm, iState, mState);
1106 addEdge(pHmm, dState, (mState+3));
1107 addEdge(pHmm, mState, (iState+3));
1108 addEdge(pHmm, iState, iState);
1109 addEdge(pHmm, dState, (iState+3));
1110 addEdge(pHmm, mState, (dState+3));
1111 addEdge(pHmm, iState, dState);
1112 addEdge(pHmm, dState, (dState+3));
1113 mState+=3;
1114 iState+=3;
1115 dState+=3;
1116 }
1117 }
1118
1119 // Transitions to the endState and the last I-state
1120 addEdge(pHmm, mState, endState);
1121 addEdge(pHmm, mState, lastIState);
1122 addEdge(pHmm, lastIState, endState);
1123 addEdge(pHmm, lastIState, lastIState);
1124 addEdge(pHmm, dState, endState);
1125 addEdge(pHmm, dState, lastIState);
1126 }
1127
1128 //////////////////////////////////////////////////////////////////////////////
1129
1130 //e.g.
1131 //String<char> matr = "-AT---GAG-G-AG-CT-C--A--GT-G-CT---G";
1132 //msaToProfileHmm(matr, hmm, 5);
1133
1134 template<typename TAlignmentChar, typename TAlphabet, typename TProbability, typename TSpec, typename TSize>
1135 inline void
1136 msaToProfileHmm(String<TAlignmentChar> const& matr,
1137 Graph<Hmm<TAlphabet, TProbability, TSpec> >& pHmm,
1138 TSize nSeq)
1139 {
1140 SEQAN_CHECKPOINT
1141 typedef Graph<Hmm<TAlphabet, TProbability, TSpec> > THmm;
1142
1143 // Consensus
1144 String<unsigned int> coverage;
1145 String<char> gappedConsensus;
1146 String<Dna> consensusSequence;
1147 consensusCalling(matr, consensusSequence, gappedConsensus, coverage, nSeq, MajorityVote() );
1148
1149 // Build the HMM topology
1150 _createProfileHmm(pHmm,gappedConsensus);
1151
1152 // Parameterize the pHmm
1153 TSize numCols = length(matr) / nSeq;
1154 _profileHmmCounter(pHmm, matr, gappedConsensus, nSeq, numCols);
1155 }
1156
1157 */
1158
1159
1160 }// namespace SEQAN_NAMESPACE_MAIN
1161
1162 #endif //#ifndef SEQAN_HEADER_...
1163