1 /*******************************************************************************
2 *
3 * This file is part of the General Hidden Markov Model Library,
4 * GHMM version __VERSION__, see http://ghmm.org
5 *
6 * Filename: ghmm/ghmm/viterbi.c
7 * Authors: Wasinee Rungsarityotin, Benjamin Georgi
8 *
9 * Copyright (C) 1998-2004 Alexander Schliep
10 * Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln
11 * Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,
12 * Berlin
13 *
14 * Contact: schliep@ghmm.org
15 *
16 * This library is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU Library General Public
18 * License as published by the Free Software Foundation; either
19 * version 2 of the License, or (at your option) any later version.
20 *
21 * This library is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 * Library General Public License for more details.
25 *
26 * You should have received a copy of the GNU Library General Public
27 * License along with this library; if not, write to the Free
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 *
30 *
31 * This file is version $Revision: 1713 $
32 * from $Date: 2006-10-16 10:06:28 -0400 (Mon, 16 Oct 2006) $
33 * last change by $Author: grunau $.
34 *
35 *******************************************************************************/
36
37 /* Possible sources of errors: initialisation, pushback (the loop after) */
38
39 #ifdef HAVE_CONFIG_H
40 # include "../config.h"
41 #endif
42
43 #include <float.h>
44 #include <math.h>
45 #include <assert.h>
46 #include <stdlib.h>
47
48 #include "ghmm.h"
49 #include "mprintf.h"
50 #include "mes.h"
51 #include "matrix.h"
52 #include "pmodel.h"
53 #include "psequence.h"
54 #include "pviterbi.h"
55 #include "ghmm_internals.h"
56
57
58 typedef struct plocal_store_t {
59 /** precomputed log probabilities for transitions into the states
60 for each transition class **/
61 double *** log_in_a;
62 /** precomputed log probabilities for each state for the emissions **/
63 double ** log_b;
64 /** lookback matrix for the last offset steps **/
65 double *** phi;
66 /** log probabilities for the current u, v and every state **/
67 double *phi_new;
68 /** traceback matrix **/
69 int ***psi;
70 /** for convinience store a pointer to the model **/
71 ghmm_dpmodel * mo;
72 /** for the debug mode store information of matrix sizes **/
73 /** length of sequence X determines size of psi **/
74 int len_x;
75 /** length of sequence Y determines size of phi and psi **/
76 int len_y;
77 /** non functional stuff **/
78 int *topo_order;
79 int topo_order_length;
80 } plocal_store_t;
81
82
83 static void ghmm_dpmodel_print_viterbi_store(plocal_store_t * pv);
84
85 static plocal_store_t *pviterbi_alloc(ghmm_dpmodel *mo, int len_x, int len_y);
86
87 static int pviterbi_free(plocal_store_t **v, int n, int len_x, int len_y,
88 int max_offset_x, int max_offset_y);
89
90 static void init_phi(plocal_store_t * pv, ghmm_dpseq * X, ghmm_dpseq * Y);
91
92 static double get_phi(plocal_store_t * pv, int x, int y, int offset_x,
93 int offset_y, int state);
94
95 static void set_phi(plocal_store_t * pv, int x, int y, int ghmm_dstate,
96 double prob);
97
98 static void push_back_phi(plocal_store_t * pv, int length_y);
99
100 static void set_psi(plocal_store_t * pv, int x, int y, int ghmm_dstate,
101 int from_state);
102
103
104 /*============================================================================*/
pviterbi_alloc(ghmm_dpmodel * mo,int len_x,int len_y)105 static plocal_store_t *pviterbi_alloc(ghmm_dpmodel *mo, int len_x, int len_y) {
106 #define CUR_PROC "pviterbi_alloc"
107 plocal_store_t* v = NULL;
108 int i, j;
109 ARRAY_CALLOC (v, 1);
110 v->mo = mo;
111 v->len_y = len_y;
112 v->len_x = len_x;
113 /* Allocate the log_in_a's -> individal lenghts */
114 ARRAY_CALLOC (v->log_in_a, mo->N);
115 /* first index of log_in_a: target state */
116 for (j = 0; j < mo->N; j++){
117 /* second index: source state */
118 ARRAY_CALLOC (v->log_in_a[j], mo->s[j].in_states);
119 for (i=0; i<mo->s[j].in_states; i++) {
120 /* third index: transition classes of source state */
121 ARRAY_CALLOC (v->log_in_a[j][i], mo->s[mo->s[j].in_id[i]].kclasses);
122 }
123 }
124 ARRAY_CALLOC (v->log_b, mo->N);
125 for (j=0; j<mo->N; j++) {
126 ARRAY_CALLOC (v->log_b[j], ghmm_dpmodel_emission_table_size(mo, j) + 1);
127 }
128 if (!(v->log_b)) {GHMM_LOG_QUEUED(LCONVERTED); goto STOP;}
129 v->phi = ighmm_cmatrix_3d_alloc(mo->max_offset_x + 1, len_y + mo->max_offset_y + 1, mo->N);
130 if (!(v->phi)) {GHMM_LOG_QUEUED(LCONVERTED); goto STOP;}
131 ARRAY_CALLOC (v->phi_new, mo->N);
132 v->psi = ighmm_dmatrix_3d_alloc(len_x + mo->max_offset_x + 1, len_y + mo->max_offset_y + 1, mo->N);
133 if (!(v->psi)) {GHMM_LOG_QUEUED(LCONVERTED); goto STOP;}
134
135 v->topo_order_length = 0;
136 ARRAY_CALLOC (v->topo_order, mo->N);
137
138 return(v);
139 STOP: /* Label STOP from ARRAY_[CM]ALLOC */
140 pviterbi_free((&v), mo->N, len_x, len_y, mo->max_offset_x, mo->max_offset_y);
141 return(NULL);
142 #undef CUR_PROC
143 } /* viterbi_alloc */
144
145
146 /*============================================================================*/
pviterbi_free(plocal_store_t ** v,int n,int len_x,int len_y,int max_offset_x,int max_offset_y)147 static int pviterbi_free(plocal_store_t **v, int n, int len_x, int len_y,
148 int max_offset_x, int max_offset_y) {
149 #define CUR_PROC "pviterbi_free"
150 int i, j;
151 mes_check_ptr(v, return(-1));
152 if( !*v ) return(0);
153 for (j = 0; j < n; j++) {
154 for (i=0; i < (*v)->mo->s[j].in_states; i++)
155 m_free((*v)->log_in_a[j][i]);
156 m_free((*v)->log_in_a[j]);
157 }
158 m_free((*v)->log_in_a);
159 for (j=0; j<n; j++)
160 m_free((*v)->log_b[j]);
161 m_free((*v)->log_b);
162 ighmm_cmatrix_3d_free( &((*v)->phi), max_offset_x + 1,
163 len_y + max_offset_y + 1);
164 m_free((*v)->phi_new);
165 ighmm_dmatrix_3d_free( &((*v)->psi), len_x + max_offset_x + 1, len_y + max_offset_y + 1);
166 m_free((*v)->topo_order);
167 (*v)->mo = NULL;
168 m_free(*v);
169 return(0);
170 #undef CUR_PROC
171 } /* viterbi_free */
172
173 /*============================================================================*/
ghmm_dpmodel_print_viterbi_store(plocal_store_t * pv)174 static void ghmm_dpmodel_print_viterbi_store(plocal_store_t * pv) {
175 int j, k;
176 ghmm_dpmodel * mo;
177
178 printf("Local store for pair HMM viterbi algorithm\n");
179 printf("Log in a:\n");
180 mo = pv->mo;
181 for (j = 0; j < mo->N; j++){
182 printf("state %i in states %i\n", j, mo->s[j].in_states);
183 for (k=0; k<mo->s[j].in_states; k++)
184 printf("FIXME: log_in_a has three dimensions!"/*From: %i %f\n", mo->s[j].in_id[k], pv->log_in_a[j][k]*/);
185 }
186 printf("Log b:\n");
187 for (j = 0; j < mo->N; j++){
188 printf("state %i #chars: %i\n", j, ghmm_dpmodel_emission_table_size(mo, j));
189 for (k=0; k<ghmm_dpmodel_emission_table_size(mo, j); k++)
190 printf("Emission prob char: %i %f\n", k, pv->log_b[j][k]);
191 }
192 }
193
194 /*============================================================================*/
pviterbi_precompute(ghmm_dpmodel * mo,plocal_store_t * v)195 static void pviterbi_precompute( ghmm_dpmodel *mo, plocal_store_t *v) {
196 #define CUR_PROC "pviterbi_precompute"
197 int i, j, emission, t_class;
198
199 /* Precomputing the log(a_ij) */
200
201 for (j = 0; j < mo->N; j++)
202 for (i = 0; i < mo->s[j].in_states; i++)
203 for (t_class = 0; t_class < mo->s[mo->s[j].in_id[i]].kclasses; t_class++){
204 if ( mo->s[j].in_a[i][t_class] == 0.0 ) /* DBL_EPSILON ? */
205 v->log_in_a[j][i][t_class] = +1; /*Not used any further in the calculations */
206 else
207 v->log_in_a[j][i][t_class] = log( mo->s[j].in_a[i][t_class] );
208 }
209
210 /* Precomputing the log emission probabilities for each state*/
211 for (j = 0; j < mo->N; j++) {
212 for (emission = 0; emission < ghmm_dpmodel_emission_table_size(mo,j); emission++) {
213 if (1) {
214 if ( mo->s[j].b[emission] == 0.0 ) /* DBL_EPSILON ? */
215 v->log_b[j][emission] = +1;
216 else
217 v->log_b[j][emission] = log( mo->s[j].b[emission] );
218 }
219 else{
220 v->log_b[j][emission] = 0.0;
221 }
222 }
223 v->log_b[j][emission] = 1; /* the last field is for invalid emissions */
224 }
225 #undef CUR_PROC
226 }/* viterbi_precompute */
227
228 /*============================================================================*/
229 /** */
230 /* static void p__viterbi_silent( ghmm_dmodel *mo, int t, plocal_store_t *v ) */
231 /* { */
232 /* int topocount; */
233 /* int i, k; */
234 /* double max_value, value; */
235
236 /* for ( topocount = 0; topocount < mo->topo_order_length; topocount++) { */
237 /* k = mo->topo_order[topocount]; */
238 /* if ( mo->silent[k] ) { /\* Silent states *\/ */
239 /* /\* Determine the maximum *\/ */
240 /* /\* max_phi = phi[i] + log_in_a[j][i] ... *\/ */
241 /* max_value = -DBL_MAX; */
242 /* v->psi[t][k] = -1; */
243 /* for (i = 0; i < mo->s[k].in_states; i++) { */
244
245 /* if ( v->phi[ mo->s[k].in_id[i] ] != +1 && */
246 /* v->log_in_a[k][i] != +1) { */
247 /* value = v->phi[ mo->s[k].in_id[i] ] + v->log_in_a[k][i]; */
248 /* if (value > max_value) { */
249 /* max_value = value; */
250 /* v->psi[t][k] = mo->s[k].in_id[i]; */
251 /* } */
252 /* } */
253 /* } */
254
255 /* /\* No maximum found (that is, state never reached) */
256 /* or the output O[t] = 0.0: *\/ */
257 /* if (max_value == -DBL_MAX) { */
258 /* v->phi[k] = +1; */
259 /* } else { */
260 /* v->phi[k] = max_value; */
261 /* } */
262
263 /* } */
264 /* } */
265 /* } */
266
267 /*============================================================================*/
sget_log_in_a(plocal_store_t * pv,int i,int j,ghmm_dpseq * X,ghmm_dpseq * Y,int index_x,int index_y)268 static double sget_log_in_a(plocal_store_t * pv, int i, int j, ghmm_dpseq * X, ghmm_dpseq * Y, int index_x, int index_y) {
269 /* determine the transition class for the source ghmm_dstate */
270 int id = pv->mo->s[i].in_id[j];
271 int cl = pv->mo->s[id].class_change->get_class(pv->mo, X, Y, index_x,index_y,
272 pv->mo->s[id].class_change->user_data);
273 return pv->log_in_a[i][j][cl];
274 }
275
276 /*============================================================================*/
log_b(plocal_store_t * pv,int state,int emission)277 static double log_b(plocal_store_t * pv, int state, int emission) {
278 #ifdef DEBUG
279 if (ghmm_dstate > pv->mo->N)
280 fprintf(stderr, "log_b: State index out of bounds %i > %i\n", ghmm_dstate, pv->mo->N);
281 if (emission > ghmm_dpmodel_emission_table_size(pv->mo, state))
282 fprintf(stderr, "log_b: Emission index out of bounds %i > %i for ghmm_dstate %i\n",
283 emission, ghmm_dpmodel_emission_table_size(pv->mo, state), state);
284 #endif
285 return pv->log_b[state][emission];
286 }
287
288 /*============================================================================*/
init_phi(plocal_store_t * pv,ghmm_dpseq * X,ghmm_dpseq * Y)289 static void init_phi(plocal_store_t * pv, ghmm_dpseq * X, ghmm_dpseq * Y) {
290 #ifdef DEBUG
291 int emission;
292 #endif
293 int u, v, j, i, off_x, y;
294 double log_in_a_ij;
295 double value, max_value, previous_prob, log_b_i;
296 /* printf("ghmm_dpmodel_viterbi init\n"); */
297 ghmm_dpmodel * mo = pv->mo;
298 double (*log_in_a)(plocal_store_t*, int, int, ghmm_dpseq*, ghmm_dpseq*,
299 int, int);
300 log_in_a = &sget_log_in_a;
301
302 /* Initialize the lookback matrix (for positions [-offsetX,0], [0, len_y]*/
303 for (off_x=0; off_x<mo->max_offset_x + 1; off_x++)
304 for (y=0; y<Y->length + mo->max_offset_y + 1; y++)
305 for (j=0; j<mo->N; j++) {
306 pv->phi[off_x][y][j] = +1;
307 }
308 if ( mo->model_type & GHMM_kSilentStates ) { /* could go into silent state at t=0 */
309
310 /*p__viterbi_silent( mo, t=0, v);*/
311 }
312 /*for (j = 0; j < mo->N; j++)
313 {
314 printf("\npsi[%d],in:%d, phi=%f\n", t, v->psi[t][j], v->phi[j]);
315 }
316
317 for( i = 0; i < mo->N; i++){
318 printf("%d\t", former_matchcount[i]);
319 }
320 for (i = 0; i < mo->N; i++){
321 printf("%d\t", recent_matchcount[i]);
322 }*/
323
324 /* initialize for offsets > 1 (u < max_offset_x, v < max_offset_y) */
325 /* this is in principle the same as the main recurrence but adds initial
326 probabilities to states that cannot be inhabitated at u=0, v=0 because
327 of greater offsets than one
328 iteration start is u=-1 v=-1 to allow states with offset_x == 0
329 which corresponds to a series of gap states before reading the first
330 character of x at position x=0, y=v or equally for offset_y == 0 */
331 /* u, v <= max offsets */
332 for (u = -1; u <= mo->max_offset_x; u++) {
333 for (v = -mo->max_offset_y; v < Y->length; v++) {
334 for (j = 0; j < mo->N; j++)
335 {
336 /** initialization of phi (lookback matrix), psi (traceback) **/
337 set_phi(pv, u, v, j, +1);
338 set_psi(pv, u, v, j, -1);
339 }
340 /* for each state i */
341 for (i = 0; i < mo->N; i++) {
342 /* Determine the maximum */
343 /* max_phi = phi[i] + log_in_a[j][i] ... */
344 if (!(mo->model_type & GHMM_kSilentStates) || !mo->silent[i] ) {
345 max_value = -DBL_MAX;
346 set_psi(pv, u, v, i, -1);
347 for (j = 0; j < mo->s[i].in_states; j++) {
348 /* look back in the phi matrix at the offsets */
349 previous_prob = get_phi(pv, u, v, mo->s[i].offset_x,
350 mo->s[i].offset_y, mo->s[i].in_id[j]);
351 log_in_a_ij = (*log_in_a)(pv, i, j, X, Y, u, v);
352 if ( previous_prob != +1 && log_in_a_ij != +1) {
353 value = previous_prob + log_in_a_ij;
354 if (value > max_value) {
355 max_value = value;
356 set_psi(pv, u, v, i, mo->s[i].in_id[j]);
357 }
358 }
359 else
360 {;} /* fprintf(stderr, " %d --> %d = %f, \n", i,i,v->log_in_a[i][i]); */
361 }
362 #ifdef DEBUG
363 emission = ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
364 ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
365 mo->size_of_alphabet[mo->s[i].alphabet],
366 mo->s[i].offset_x, mo->s[i].offset_y);
367 if (emission > ghmm_dpmodel_emission_table_size(mo, i)){
368 printf("State %i\n", i);
369 ghmm_dpmodel_state_print(&(mo->s[i]));
370 printf("charX: %i charY: %i alphabet size: %i emission table: %i emission index: %i\n",
371 ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
372 ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
373 mo->size_of_alphabet[mo->s[i].alphabet],
374 ghmm_dpmodel_emission_table_size(mo, i), emission);
375 }
376 #endif
377 log_b_i = log_b(pv, i, ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
378 ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
379 mo->size_of_alphabet[mo->s[i].alphabet],
380 mo->s[i].offset_x, mo->s[i].offset_y));
381
382 /* this is the difference from the main loop:
383 check whether this state could be an initial state and add the
384 initial probability */
385 if (log_b_i == +1 ) {
386 set_phi(pv, u, v, i, +1);
387 }
388 else {
389 if (max_value == -DBL_MAX)
390 set_phi(pv, u, v, i, +1);
391 else
392 set_phi(pv, u, v, i, max_value);
393 /* if (mo->s[i].pi != 0 && mo->s[i].offset_x - 1 == u &&
394 mo->s[i].offset_y - 1 == v) { */
395 if (mo->s[i].log_pi != 1 && mo->s[i].offset_x - 1 == u &&
396 mo->s[i].offset_y - 1 == v) {
397 set_phi(pv, u, v, i, mo->s[i].log_pi);
398 #ifdef DEBUG
399 printf("Initial log prob state %i at (%i, %i) = %f\n", i, u, v, get_phi(pv, u, v, 0, 0, i));
400 printf("Characters emitted X: %i, Y: %i\n",
401 ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
402 ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v));
403 #endif
404 }
405 if (get_phi(pv, u, v, 0, 0, i) != 1)
406 set_phi(pv, u, v, i, get_phi(pv, u, v, 0, 0, i) + log_b_i);
407 }
408 }
409 /* if (v == 0) {
410 printf"(%i, %i, %i) preceding %i\n", u, v, i, pv->psi[u][v][i]);
411 } */
412 } /* complete time step for emitting states */
413
414 /* last_osc = osc; */
415 /* save last transition class */
416
417 /*if (mo->model_type & kSilentStates) {
418 p__viterbi_silent( mo, t, v );
419 }*/ /* complete time step for silent states */
420
421 /**************
422 for (j = 0; j < mo->N; j++)
423 {
424 printf("\npsi[%d],in:%d, phi=%f\n", t, v->psi[t][j], v->phi[j]);
425 }
426
427 for (i = 0; i < mo->N; i++){
428 printf("%d\t", former_matchcount[i]);
429 }
430
431 for (i = 0; i < mo->N; i++){
432 printf("%d\t", recent_matchcount[i]);
433 }
434 ****************/
435 } /* End for v in Y */
436 /* Next character in X */
437 /* push back the old phi values */
438 push_back_phi(pv, Y->length);
439 } /* End for u in X */
440 }
441
442 /*============================================================================*/
443 /* call this function when you are at position (x, y) and want to get the
444 probability of state 'state' which is offset_x and offset_y away from x,y */
get_phi(plocal_store_t * pv,int x,int y,int offset_x,int offset_y,int state)445 static double get_phi(plocal_store_t * pv, int x, int y, int offset_x, int offset_y, int state){
446 #ifdef DEBUG
447 if (y > pv->len_y || state > pv->mo->N || y < - pv->mo->max_offset_y)
448 fprintf(stderr, "get_phi: out of bounds %i %i %i\n",
449 x + pv->mo->max_offset_x, y + pv->mo->max_offset_y, state);
450 #endif
451
452 if (y - offset_y + pv->mo->max_offset_y >= 0)
453 return pv->phi[offset_x][y - offset_y + pv->mo->max_offset_y][state];
454 else
455 return 1;
456 }
457
458 /*============================================================================*/
459 /* set the value of this matrix cell */
set_phi(plocal_store_t * pv,int x,int y,int state,double prob)460 static void set_phi(plocal_store_t * pv, int x, int y, int state, double prob){
461 #ifdef DEBUG
462 if (y > pv->len_y || state > pv->mo->N || y < - pv->mo->max_offset_y)
463 fprintf(stderr, "set_phi: out of bounds %i %i %i %f\n",
464 x + pv->mo->max_offset_x, y + pv->mo->max_offset_y,
465 state, prob);
466 #endif
467 pv->phi[0][y + pv->mo->max_offset_y][state] = prob;
468 }
469
470 /*============================================================================*/
471 /* since we only keep the frontier we have to push back when ever a row is
472 complete */
push_back_phi(plocal_store_t * pv,int length_y)473 static void push_back_phi(plocal_store_t * pv, int length_y){
474 int off_x, y, j;
475 /* push back the old phi values */
476 for (off_x=pv->mo->max_offset_x; off_x>0; off_x--)
477 for (y=0; y<length_y + pv->mo->max_offset_y + 1; y++)
478 for (j=0; j<pv->mo->N; j++)
479 pv->phi[off_x][y][j] = pv->phi[off_x-1][y][j];
480 }
481
482 /*============================================================================*/
set_psi(plocal_store_t * pv,int x,int y,int state,int from_state)483 static void set_psi(plocal_store_t * pv, int x, int y, int state, int from_state){
484 /* shift by max_offsets for negative indices */
485 #ifdef DEBUG
486 if (x > pv->len_x || y > pv->len_y || state > pv->mo->N ||
487 x < - pv->mo->max_offset_x || y < - pv->mo->max_offset_y)
488 fprintf(stderr, "set_psi: out of bounds %i %i %i %i\n",
489 x + pv->mo->max_offset_x, y + pv->mo->max_offset_y,
490 state, from_state);
491 #endif
492 pv->psi[x + pv->mo->max_offset_x][y + pv->mo->max_offset_y][state] = from_state;
493 }
494
495 /*============================================================================*/
get_psi(plocal_store_t * pv,int x,int y,int state)496 static int get_psi(plocal_store_t * pv, int x, int y, int state) {
497 /* shift by max_offsets for negative indices*/
498 #ifdef DEBUG
499 if (x > pv->len_x || y > pv->len_y || state > pv->mo->N ||
500 x < - pv->mo->max_offset_x || y < - pv->mo->max_offset_y)
501 fprintf(stderr, "get_psi: out of bounds %i %i %i\n",
502 x + pv->mo->max_offset_x, y + pv->mo->max_offset_y,
503 state);
504 #endif
505 return pv->psi[x + pv->mo->max_offset_x][y + pv->mo->max_offset_y][state];
506 }
507
508 /*============================================================================*/
ghmm_dpmodel_viterbi_test(ghmm_dpmodel * mo,ghmm_dpseq * X,ghmm_dpseq * Y,double * log_p,int * path_length)509 int *ghmm_dpmodel_viterbi_test(ghmm_dpmodel *mo, ghmm_dpseq * X, ghmm_dpseq * Y,
510 double *log_p, int *path_length) {
511 plocal_store_t *pv;
512 printf("---- viterbi test -----\n");
513 /*ghmm_dpmodel_print(mo);*/
514 pv = pviterbi_alloc(mo, X->length, Y->length);
515 printf("try free within pviterbi_test\n");
516 pviterbi_free(&pv, mo->N, X->length, Y->length, mo->max_offset_x ,
517 mo->max_offset_y);
518 printf("OK\n");
519 return NULL;
520 }
521
522
523 /*============================================================================*/
ghmm_dpmodel_viterbi(ghmm_dpmodel * mo,ghmm_dpseq * X,ghmm_dpseq * Y,double * log_p,int * path_length)524 int *ghmm_dpmodel_viterbi(ghmm_dpmodel *mo, ghmm_dpseq * X, ghmm_dpseq * Y, double *log_p,
525 int *path_length) {
526 return ghmm_dpmodel_viterbi_variable_tb(mo, X, Y, log_p, path_length, -1);
527 }
528
529 /*============================================================================*/
ghmm_dpmodel_viterbi_variable_tb(ghmm_dpmodel * mo,ghmm_dpseq * X,ghmm_dpseq * Y,double * log_p,int * path_length,int start_traceback_with)530 int *ghmm_dpmodel_viterbi_variable_tb(ghmm_dpmodel *mo, ghmm_dpseq * X, ghmm_dpseq * Y,
531 double *log_p, int *path_length,
532 int start_traceback_with) {
533 #define CUR_PROC "ghmm_dpmodel_viterbi"
534 int u, v, j, i, off_x, off_y, current_state_index;
535 double value, max_value, previous_prob;
536 plocal_store_t *pv;
537 int *state_seq = NULL;
538 int emission;
539 double log_b_i, log_in_a_ij;
540 double (*log_in_a)(plocal_store_t*, int, int, ghmm_dpseq*, ghmm_dpseq*, int, int);
541
542 /* printf("---- viterbi -----\n"); */
543 i_list * state_list;
544 state_list = ighmm_list_init_list();
545 log_in_a = &sget_log_in_a;
546 /* int len_path = mo->N*len; the length of the path is not known apriori */
547
548 /* if (mo->model_type & kSilentStates && */
549 /* mo->silent != NULL && */
550 /* mo->topo_order == NULL) { */
551 /* ghmm_dmodel_topo_order( mo ); */
552 /* } */
553
554 /* Allocate the matrices log_in_a, log_b,Vektor phi, phi_new, Matrix psi */
555 pv = pviterbi_alloc(mo, X->length, Y->length);
556 if (!pv) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; }
557
558 /* Precomputing the log(a_ij) and log(bj(ot)) */
559 pviterbi_precompute(mo, pv);
560 /* Initialize the lookback matrix (for positions [-offsetX,0], [-1, len_y]*/
561 init_phi(pv, X, Y);
562
563 /* u > max_offset_x , v starts -1 to allow states with offset_x == 0
564 which corresponds to a series of gap states before reading the first
565 character of x at position x=0, y=v */
566 /** THIS IS THE MAIN RECURRENCE **/
567 for (u = mo->max_offset_x + 1; u < X->length; u++) {
568 for (v = -mo->max_offset_y; v < Y->length; v++) {
569 for (j = 0; j < mo->N; j++)
570 {
571 /** initialization of phi (lookback matrix), psi (traceback) **/
572 set_phi(pv, u, v, j, +1);
573 set_psi(pv, u, v, j, -1);
574 }
575
576 for (i = 0; i < mo->N; i++) {
577 /* Determine the maximum */
578 /* max_phi = phi[i] + log_in_a[j][i] ... */
579 if (!(mo->model_type & GHMM_kSilentStates) || !mo->silent[i] ) {
580 max_value = -DBL_MAX;
581 set_psi(pv, u, v, i, -1);
582 for (j = 0; j < mo->s[i].in_states; j++) {
583 /* look back in the phi matrix at the offsets */
584 previous_prob = get_phi(pv, u, v, mo->s[i].offset_x, mo->s[i].offset_y, mo->s[i].in_id[j]);
585 log_in_a_ij = (*log_in_a)(pv, i, j, X, Y, u, v);
586 if ( previous_prob != +1 && log_in_a_ij != +1) {
587 value = previous_prob + log_in_a_ij;
588 if (value > max_value) {
589 max_value = value;
590 set_psi(pv, u, v, i, mo->s[i].in_id[j]);
591 }
592 }
593 else
594 {;} /* fprintf(stderr, " %d --> %d = %f, \n", i,i,v->log_in_a[i][i]); */
595 }
596
597 emission = ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
598 ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
599 mo->size_of_alphabet[mo->s[i].alphabet],
600 mo->s[i].offset_x, mo->s[i].offset_y);
601 #ifdef DEBUG
602 if (emission > ghmm_dpmodel_emission_table_size(mo, i)){
603 printf("State %i\n", i);
604 ghmm_dpmodel_state_print(&(mo->s[i]));
605 printf("charX: %i charY: %i alphabet size: %i emission table: %i emission index: %i\n",
606 ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
607 ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
608 mo->size_of_alphabet[mo->s[i].alphabet],
609 ghmm_dpmodel_emission_table_size(mo, i), emission);
610 }
611 #endif
612 log_b_i = log_b(pv, i, ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
613 ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
614 mo->size_of_alphabet[mo->s[i].alphabet],
615 mo->s[i].offset_x, mo->s[i].offset_y));
616
617 /* No maximum found (that is, state never reached)
618 or the output O[t] = 0.0: */
619 if (max_value == -DBL_MAX ||/* and then also: (v->psi[t][j] == -1) */
620 log_b_i == +1 ) {
621 set_phi(pv, u, v, i, +1);
622 }
623 else
624 set_phi(pv, u, v, i, max_value + log_b_i);
625 }
626 } /* complete time step for emitting states */
627
628 /* last_osc = osc; */
629 /* save last transition class */
630
631 /*if (mo->model_type & kSilentStates) {
632 p__viterbi_silent( mo, t, v );
633 }*/ /* complete time step for silent states */
634
635 /**************
636 for (j = 0; j < mo->N; j++)
637 {
638 printf("\npsi[%d],in:%d, phi=%f\n", t, v->psi[t][j], v->phi[j]);
639 }
640
641 for (i = 0; i < mo->N; i++){
642 printf("%d\t", former_matchcount[i]);
643 }
644
645 for (i = 0; i < mo->N; i++){
646 printf("%d\t", recent_matchcount[i]);
647 }
648 ****************/
649 } /* End for v in Y */
650 /* Next character in X */
651 push_back_phi(pv, Y->length);
652 } /* End for u in X */
653
654 /* Termination */
655 max_value = -DBL_MAX;
656 ighmm_list_append(state_list, -1);
657 /* if start_traceback_with is -1 (it is by default) search for the most
658 likely state at the end of both sequences */
659 if (start_traceback_with == -1) {
660 for (j = 0; j < mo->N; j++){
661 #ifdef DEBUG
662 printf("phi(len_x)(len_y)(%i)=%f\n", j, get_phi(pv, u, Y->length-1, 0, 0, j));
663 #endif
664 if ( get_phi(pv, u, Y->length-1, 0, 0, j) != +1 &&
665 get_phi(pv, u, Y->length-1, 0, 0, j) > max_value) {
666 max_value = get_phi(pv, X->length-1, Y->length-1, 0, 0, j);
667 state_list->last->val = j;
668 }
669 }
670 }
671 /* this is the special traceback mode for the d & c algorithm that also
672 connects the traceback to the first state of the rest of the path */
673 else {
674 #ifdef DEBUG
675 printf("D & C traceback from state %i!\n", start_traceback_with);
676 printf("Last characters emitted X: %i, Y: %i\n",
677 ghmm_dpseq_get_char(X, mo->s[start_traceback_with].alphabet,
678 X->length-1),
679 ghmm_dpseq_get_char(Y, mo->s[start_traceback_with].alphabet,
680 Y->length-1));
681 for (j = 0; j < mo->N; j++){
682 printf("phi(len_x)(len_y)(%i)=%f\n", j, get_phi(pv, X->length-1, Y->length-1, 0, 0, j));
683 }
684 #endif
685 max_value = get_phi(pv, X->length-1, Y->length-1, 0, 0, start_traceback_with);
686 if (max_value != 1 && max_value > -DBL_MAX)
687 state_list->last->val = start_traceback_with;
688 }
689 if (max_value == -DBL_MAX) {
690 /* Sequence can't be generated from the model! */
691 *log_p = +1;
692 /* Backtracing doesn't work, because state_seq[*] allocated with -1 */
693 /* for (t = len - 2; t >= 0; t--)
694 state_list->last->val = -1; */
695 }
696 else {
697 /* Backtracing, should put DEL path nicely */
698 *log_p = max_value;
699 /* removed the handling of silent states here */
700 /* start trace back at the end of both sequences */
701 u = X->length - 1;
702 v = Y->length - 1;
703 current_state_index = state_list->first->val;
704 off_x = mo->s[current_state_index].offset_x;
705 off_y = mo->s[current_state_index].offset_y;
706 while (u - off_x >= -1 && v - off_y >= -1 && current_state_index != -1) {
707 /* while (u > 0 && v > 0) { */
708 /* look up the preceding state and save it in the first position of the
709 state list */
710 /* printf("Current state %i at (%i,%i) -> preceding state %i\n",
711 current_state_index, u, v, get_psi(pv, u, v, current_state_index)); */
712 /* update the current state */
713 current_state_index = get_psi(pv, u, v, current_state_index);
714 if (current_state_index != -1)
715 ighmm_list_insert(state_list, current_state_index);
716 /* move in the alignment matrix */
717 u -= off_x;
718 v -= off_y;
719 /* get the next offsets */
720 off_x = mo->s[current_state_index].offset_x;
721 off_y = mo->s[current_state_index].offset_y;
722 }
723 }
724
725 /* Free the memory space */
726 pviterbi_free(&pv, mo->N, X->length, Y->length, mo->max_offset_x ,
727 mo->max_offset_y);
728 /* printf("After traceback: last state = %i\n", state_list->last->val); */
729 state_seq = ighmm_list_to_array(state_list);
730 *path_length = state_list->length;
731 /* PRINT PATH */
732
733 /* fprintf(stderr, "Viterbi path: " ); */
734 /* int t; */
735 /* for(t=0; t < *path_length; t++) */
736 /* if (state_seq[t] >= 0) fprintf(stderr, " %d ", state_seq[t]); */
737 /* fprintf(stderr, "\n Freeing ... \n"); */
738 return (state_seq);
739 STOP: /* Label STOP from ARRAY_[CM]ALLOC */
740 /* Free the memory space */
741 pviterbi_free(&pv, mo->N, X->length, Y->length, mo->max_offset_x,
742 mo->max_offset_y);
743 m_free(state_seq);
744 ighmm_list_free(state_list);
745 return NULL;
746 #undef CUR_PROC
747 } /* viterbi */
748
749
750 /*============================================================================*/
ghmm_dpmodel_viterbi_logp(ghmm_dpmodel * mo,ghmm_dpseq * X,ghmm_dpseq * Y,int * state_seq,int state_seq_len)751 double ghmm_dpmodel_viterbi_logp(ghmm_dpmodel *mo, ghmm_dpseq * X, ghmm_dpseq * Y,
752 int *state_seq, int state_seq_len) {
753 #define CUR_PROC "ghmm_dpmodel_viterbi_logp"
754 int s, t, i, j, u, v;
755 double log_p = 0.0;
756 double log_b_i = 1.0;
757 double log_in_a = 1.0;
758 plocal_store_t *pv;
759
760 /* Allocate the matrices log_in_a, log_b,Vektor phi, phi_new, Matrix psi */
761 pv = pviterbi_alloc(mo, 0, 0);
762 pviterbi_precompute(mo, pv);
763 /* initial state */
764 t = 0; u = -1; v = -1;
765 if (state_seq_len > t) {
766 i = state_seq[t];
767 /* initial state probability */
768 /* log_p += log(mo->s[i].pi); */
769 log_p += mo->s[i].log_pi;
770 if (log_p == 1.0) {
771 pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
772 fprintf(stderr, "the initial probability of state %i is zero\n", i);
773 return 1.0;/* the initial prob is zero */
774 }
775 /* consume the first characters of the sequences */
776 u += mo->s[i].offset_x;
777 v += mo->s[i].offset_y;
778 /* get the emission probability */
779 log_b_i = log_b(pv, i, ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
780 ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
781 mo->size_of_alphabet[mo->s[i].alphabet],
782 mo->s[i].offset_x, mo->s[i].offset_y));
783 if (log_b_i == 1.0) { /* chars cant be emitted */
784 pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
785 fprintf(stderr, "characters (%i, %i) at position (%i, %i) cannot be emitted by state %i (t=%i)\n",
786 ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
787 ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v), u, v, i, t);
788 return 1.0;
789 }
790 log_p += log_b_i;
791 }
792 else { /* there is no path.. */
793 pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
794 fprintf(stderr, "No path given!\n");
795 return 1.0;
796 }
797 /* rest of the sequence */
798 for (t=1; t<state_seq_len; t++) {
799 j = i; /* predecessor state */
800 i = state_seq[t]; /* current state */
801 /* consume the next characters */
802 u += mo->s[i].offset_x;
803 v += mo->s[i].offset_y;
804 if (u >= X->length || v >= Y->length) { /* path consumes too many chars */
805 pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
806 fprintf(stderr, "path consumes too many chars\n");
807 return 1.0;
808 }
809 /* transition prob state j -> i*/
810 log_in_a = 1.0;
811 for (s=0; s<mo->s[i].in_states; s++) {
812 if (mo->s[i].in_id[s] == j) {
813 log_in_a = sget_log_in_a(pv, i, s, X, Y, u, v);
814 break;
815 }
816 }
817 if (log_in_a == 1.0) {
818 pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
819 fprintf(stderr, "transition (%i -> %i) at t=%i not possible\n", j, i,t);
820 return 1.0; /* transition not possible */
821 }
822 /* emission probability */
823 log_b_i = log_b(pv, i, ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
824 ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
825 mo->size_of_alphabet[mo->s[i].alphabet],
826 mo->s[i].offset_x, mo->s[i].offset_y));
827 if (log_b_i == 1.0) {
828 pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
829 fprintf(stderr, "characters (%i, %i) at position (%i, %i) cannot be emitted by state %i (t=%i)\n",
830 ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
831 ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v), u, v, i, t);
832 return 1.0; /* characters cant be emitted */
833 }
834 log_p += log_in_a + log_b_i;
835 }
836 pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x,
837 mo->max_offset_y);
838 /* check if all of the sequences has been consumed */
839 if (u != X->length - 1 && v != Y->length - 1) {
840 fprintf(stderr, "path consumes not all characters (%i of %i, %i of %i)\n",
841 u + 1, X->length, v + 1, Y->length);
842 return 1.0;
843 }
844 return log_p;
845
846 #undef CUR_PROC
847 } /* ghmm_dpmodel_viterbi_logp */
848
849