1 /*******************************************************************************
2 *
3 *       This file is part of the General Hidden Markov Model Library,
4 *       GHMM version __VERSION__, see http://ghmm.org
5 *
6 *       Filename: ghmm/ghmm/viterbi.c
7 *       Authors:  Wasinee Rungsarityotin, Benjamin Georgi
8 *
9 *       Copyright (C) 1998-2004 Alexander Schliep
10 *       Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln
11 *       Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,
12 *                               Berlin
13 *
14 *       Contact: schliep@ghmm.org
15 *
16 *       This library is free software; you can redistribute it and/or
17 *       modify it under the terms of the GNU Library General Public
18 *       License as published by the Free Software Foundation; either
19 *       version 2 of the License, or (at your option) any later version.
20 *
21 *       This library is distributed in the hope that it will be useful,
22 *       but WITHOUT ANY WARRANTY; without even the implied warranty of
23 *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24 *       Library General Public License for more details.
25 *
26 *       You should have received a copy of the GNU Library General Public
27 *       License along with this library; if not, write to the Free
28 *       Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 *
30 *
31 *       This file is version $Revision: 1713 $
32 *                       from $Date: 2006-10-16 10:06:28 -0400 (Mon, 16 Oct 2006) $
33 *             last change by $Author: grunau $.
34 *
35 *******************************************************************************/
36 
37 /* Possible sources of errors: initialisation, pushback (the loop after) */
38 
39 #ifdef HAVE_CONFIG_H
40 #  include "../config.h"
41 #endif
42 
43 #include <float.h>
44 #include <math.h>
45 #include <assert.h>
46 #include <stdlib.h>
47 
48 #include "ghmm.h"
49 #include "mprintf.h"
50 #include "mes.h"
51 #include "matrix.h"
52 #include "pmodel.h"
53 #include "psequence.h"
54 #include "pviterbi.h"
55 #include "ghmm_internals.h"
56 
57 
58 typedef struct plocal_store_t {
59   /** precomputed log probabilities for transitions into the states
60       for each transition class **/
61   double *** log_in_a;
62   /** precomputed log probabilities for each state for the emissions  **/
63   double ** log_b;
64   /** lookback matrix for the last offset steps **/
65   double *** phi;
66   /** log probabilities for the current u, v and every state **/
67   double *phi_new;
68   /** traceback matrix **/
69   int ***psi;
70   /** for convinience store a pointer to the model **/
71   ghmm_dpmodel * mo;
72   /** for the debug mode store information of matrix sizes **/
73   /** length of sequence X determines size of psi **/
74   int len_x;
75   /** length of sequence Y determines size of phi and psi **/
76   int len_y;
77   /** non functional stuff **/
78   int    *topo_order;
79   int    topo_order_length;
80 } plocal_store_t;
81 
82 
83   static void ghmm_dpmodel_print_viterbi_store(plocal_store_t * pv);
84 
85   static plocal_store_t *pviterbi_alloc(ghmm_dpmodel *mo, int len_x, int len_y);
86 
87   static int pviterbi_free(plocal_store_t **v, int n, int len_x, int len_y,
88 			   int max_offset_x, int max_offset_y);
89 
90   static void init_phi(plocal_store_t * pv, ghmm_dpseq * X, ghmm_dpseq * Y);
91 
92   static double get_phi(plocal_store_t * pv, int x, int y, int offset_x,
93 			int offset_y, int state);
94 
95   static void set_phi(plocal_store_t * pv, int x, int y, int ghmm_dstate,
96 		      double prob);
97 
98   static void push_back_phi(plocal_store_t * pv, int length_y);
99 
100   static void set_psi(plocal_store_t * pv, int x, int y, int ghmm_dstate,
101 		      int from_state);
102 
103 
104 /*============================================================================*/
pviterbi_alloc(ghmm_dpmodel * mo,int len_x,int len_y)105 static plocal_store_t *pviterbi_alloc(ghmm_dpmodel *mo, int len_x, int len_y) {
106 #define CUR_PROC "pviterbi_alloc"
107   plocal_store_t* v = NULL;
108   int i, j;
109   ARRAY_CALLOC (v, 1);
110   v->mo = mo;
111   v->len_y = len_y;
112   v->len_x = len_x;
113   /* Allocate the log_in_a's -> individal lenghts */
114   ARRAY_CALLOC (v->log_in_a, mo->N);
115   /* first index of log_in_a: target state */
116   for (j = 0; j < mo->N; j++){
117     /* second index: source state */
118     ARRAY_CALLOC (v->log_in_a[j], mo->s[j].in_states);
119     for (i=0; i<mo->s[j].in_states; i++) {
120       /* third index: transition classes of source state */
121       ARRAY_CALLOC (v->log_in_a[j][i], mo->s[mo->s[j].in_id[i]].kclasses);
122     }
123   }
124   ARRAY_CALLOC (v->log_b, mo->N);
125   for (j=0; j<mo->N; j++) {
126     ARRAY_CALLOC (v->log_b[j], ghmm_dpmodel_emission_table_size(mo, j) + 1);
127   }
128   if (!(v->log_b)) {GHMM_LOG_QUEUED(LCONVERTED); goto STOP;}
129   v->phi = ighmm_cmatrix_3d_alloc(mo->max_offset_x + 1, len_y + mo->max_offset_y + 1, mo->N);
130   if (!(v->phi)) {GHMM_LOG_QUEUED(LCONVERTED); goto STOP;}
131   ARRAY_CALLOC (v->phi_new, mo->N);
132   v->psi = ighmm_dmatrix_3d_alloc(len_x + mo->max_offset_x + 1, len_y + mo->max_offset_y + 1, mo->N);
133   if (!(v->psi)) {GHMM_LOG_QUEUED(LCONVERTED); goto STOP;}
134 
135   v->topo_order_length = 0;
136   ARRAY_CALLOC (v->topo_order, mo->N);
137 
138   return(v);
139 STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
140   pviterbi_free((&v), mo->N, len_x, len_y, mo->max_offset_x, mo->max_offset_y);
141   return(NULL);
142 #undef CUR_PROC
143 } /* viterbi_alloc */
144 
145 
146 /*============================================================================*/
pviterbi_free(plocal_store_t ** v,int n,int len_x,int len_y,int max_offset_x,int max_offset_y)147 static int pviterbi_free(plocal_store_t **v, int n, int len_x, int len_y,
148 			 int max_offset_x, int max_offset_y) {
149 #define CUR_PROC "pviterbi_free"
150   int i, j;
151   mes_check_ptr(v, return(-1));
152   if( !*v ) return(0);
153   for (j = 0; j < n; j++) {
154     for (i=0; i < (*v)->mo->s[j].in_states; i++)
155       m_free((*v)->log_in_a[j][i]);
156     m_free((*v)->log_in_a[j]);
157   }
158   m_free((*v)->log_in_a);
159   for (j=0; j<n; j++)
160     m_free((*v)->log_b[j]);
161   m_free((*v)->log_b);
162   ighmm_cmatrix_3d_free( &((*v)->phi), max_offset_x + 1,
163 		   len_y + max_offset_y + 1);
164   m_free((*v)->phi_new);
165   ighmm_dmatrix_3d_free( &((*v)->psi), len_x + max_offset_x + 1, len_y + max_offset_y + 1);
166   m_free((*v)->topo_order);
167   (*v)->mo = NULL;
168   m_free(*v);
169   return(0);
170 #undef CUR_PROC
171 } /* viterbi_free */
172 
173 /*============================================================================*/
ghmm_dpmodel_print_viterbi_store(plocal_store_t * pv)174 static void ghmm_dpmodel_print_viterbi_store(plocal_store_t * pv) {
175   int j, k;
176   ghmm_dpmodel * mo;
177 
178   printf("Local store for pair HMM viterbi algorithm\n");
179   printf("Log in a:\n");
180   mo = pv->mo;
181   for (j = 0; j < mo->N; j++){
182     printf("state %i in states %i\n", j, mo->s[j].in_states);
183     for (k=0; k<mo->s[j].in_states; k++)
184       printf("FIXME: log_in_a has three dimensions!"/*From: %i %f\n", mo->s[j].in_id[k], pv->log_in_a[j][k]*/);
185   }
186   printf("Log b:\n");
187   for (j = 0; j < mo->N; j++){
188     printf("state %i #chars: %i\n", j, ghmm_dpmodel_emission_table_size(mo, j));
189     for (k=0; k<ghmm_dpmodel_emission_table_size(mo, j); k++)
190       printf("Emission prob char: %i %f\n", k, pv->log_b[j][k]);
191   }
192 }
193 
194 /*============================================================================*/
pviterbi_precompute(ghmm_dpmodel * mo,plocal_store_t * v)195 static void pviterbi_precompute( ghmm_dpmodel *mo, plocal_store_t *v) {
196 #define CUR_PROC "pviterbi_precompute"
197   int i, j, emission, t_class;
198 
199   /* Precomputing the log(a_ij) */
200 
201   for (j = 0; j < mo->N; j++)
202     for (i = 0; i < mo->s[j].in_states; i++)
203       for (t_class = 0; t_class < mo->s[mo->s[j].in_id[i]].kclasses; t_class++){
204 	if ( mo->s[j].in_a[i][t_class] == 0.0 )   /* DBL_EPSILON ? */
205 	  v->log_in_a[j][i][t_class] = +1; /*Not used any further in the calculations */
206 	else
207 	  v->log_in_a[j][i][t_class] = log( mo->s[j].in_a[i][t_class] );
208       }
209 
210   /* Precomputing the log emission probabilities for each state*/
211   for (j = 0; j < mo->N; j++) {
212     for (emission = 0; emission < ghmm_dpmodel_emission_table_size(mo,j); emission++) {
213       if (1) {
214 	if ( mo->s[j].b[emission] == 0.0 )   /* DBL_EPSILON ? */
215 	  v->log_b[j][emission] = +1;
216 	else
217 	  v->log_b[j][emission] = log( mo->s[j].b[emission] );
218       }
219       else{
220 	v->log_b[j][emission] = 0.0;
221       }
222     }
223     v->log_b[j][emission] = 1; /* the last field is for invalid emissions */
224   }
225 #undef CUR_PROC
226 }/* viterbi_precompute */
227 
228 /*============================================================================*/
229 /** */
230 /* static void p__viterbi_silent( ghmm_dmodel *mo, int t, plocal_store_t *v ) */
231 /* { */
232 /*   int topocount; */
233 /*   int i, k; */
234 /*   double max_value, value; */
235 
236 /*   for ( topocount = 0; topocount < mo->topo_order_length; topocount++) { */
237 /*       k = mo->topo_order[topocount]; */
238 /*       if ( mo->silent[k] ) { /\* Silent states *\/	 */
239 /* 	  /\* Determine the maximum *\/ */
240 /* 	  /\* max_phi = phi[i] + log_in_a[j][i] ... *\/ */
241 /* 	  max_value = -DBL_MAX; */
242 /* 	  v->psi[t][k] = -1; */
243 /* 	  for (i = 0; i < mo->s[k].in_states; i++) { */
244 
245 /* 	  if ( v->phi[ mo->s[k].in_id[i] ] != +1 && */
246 /* 		 v->log_in_a[k][i]      != +1) { */
247 /* 	      value = v->phi[ mo->s[k].in_id[i] ] + v->log_in_a[k][i]; */
248 /* 	      if (value > max_value) { */
249 /* 		max_value = value; */
250 /* 		v->psi[t][k] = mo->s[k].in_id[i]; */
251 /* 	      } */
252 /* 	    } */
253 /* 	  } */
254 
255 /* 	  /\* No maximum found (that is, state never reached) */
256 /* 	     or the output O[t] = 0.0: *\/ */
257 /* 	  if (max_value    == -DBL_MAX) { */
258 /* 	    v->phi[k] = +1; */
259 /* 	  } else { */
260 /* 	    v->phi[k] = max_value; */
261 /* 	  } */
262 
263 /* 	} */
264 /*   } */
265 /* } */
266 
267 /*============================================================================*/
sget_log_in_a(plocal_store_t * pv,int i,int j,ghmm_dpseq * X,ghmm_dpseq * Y,int index_x,int index_y)268 static double sget_log_in_a(plocal_store_t * pv, int i, int j, ghmm_dpseq * X, ghmm_dpseq * Y, int index_x, int index_y) {
269   /* determine the transition class for the source ghmm_dstate */
270   int id = pv->mo->s[i].in_id[j];
271   int cl = pv->mo->s[id].class_change->get_class(pv->mo, X, Y, index_x,index_y,
272 				   pv->mo->s[id].class_change->user_data);
273   return pv->log_in_a[i][j][cl];
274 }
275 
276 /*============================================================================*/
log_b(plocal_store_t * pv,int state,int emission)277 static double log_b(plocal_store_t * pv, int state, int emission) {
278 #ifdef DEBUG
279   if (ghmm_dstate > pv->mo->N)
280     fprintf(stderr, "log_b: State index out of bounds %i > %i\n", ghmm_dstate, pv->mo->N);
281   if (emission > ghmm_dpmodel_emission_table_size(pv->mo, state))
282     fprintf(stderr, "log_b: Emission index out of bounds %i > %i for ghmm_dstate %i\n",
283 	    emission, ghmm_dpmodel_emission_table_size(pv->mo, state), state);
284 #endif
285   return pv->log_b[state][emission];
286 }
287 
288 /*============================================================================*/
init_phi(plocal_store_t * pv,ghmm_dpseq * X,ghmm_dpseq * Y)289 static void init_phi(plocal_store_t * pv, ghmm_dpseq * X, ghmm_dpseq * Y) {
290 #ifdef DEBUG
291   int emission;
292 #endif
293   int u, v, j, i, off_x, y;
294   double log_in_a_ij;
295   double value, max_value, previous_prob, log_b_i;
296   /* printf("ghmm_dpmodel_viterbi init\n"); */
297   ghmm_dpmodel * mo = pv->mo;
298   double (*log_in_a)(plocal_store_t*, int, int, ghmm_dpseq*, ghmm_dpseq*,
299 		     int, int);
300   log_in_a = &sget_log_in_a;
301 
302   /* Initialize the lookback matrix (for positions [-offsetX,0], [0, len_y]*/
303   for (off_x=0; off_x<mo->max_offset_x + 1; off_x++)
304     for (y=0; y<Y->length + mo->max_offset_y + 1; y++)
305       for (j=0; j<mo->N; j++) {
306 	pv->phi[off_x][y][j] = +1;
307       }
308     if ( mo->model_type & GHMM_kSilentStates ) { /* could go into silent state at t=0 */
309 
310     /*p__viterbi_silent( mo, t=0, v);*/
311   }
312   /*for (j = 0; j < mo->N; j++)
313     {
314       printf("\npsi[%d],in:%d, phi=%f\n", t, v->psi[t][j], v->phi[j]);
315     }
316 
317   for( i = 0; i < mo->N; i++){
318     printf("%d\t", former_matchcount[i]);
319   }
320   for (i = 0; i < mo->N; i++){
321     printf("%d\t", recent_matchcount[i]);
322   }*/
323 
324   /* initialize for offsets > 1 (u < max_offset_x, v < max_offset_y) */
325   /* this is in principle the same as the main recurrence but adds initial
326      probabilities to states that cannot be inhabitated at u=0, v=0 because
327      of greater offsets than one
328      iteration start is u=-1 v=-1 to allow states with offset_x == 0
329      which corresponds to a series of gap states before reading the first
330      character of x at position x=0, y=v or equally for offset_y == 0 */
331   /* u, v <= max offsets */
332     for (u = -1; u <= mo->max_offset_x; u++) {
333       for (v = -mo->max_offset_y; v < Y->length; v++) {
334 	for (j = 0; j < mo->N; j++)
335 	  {
336 	    /** initialization of phi (lookback matrix), psi (traceback) **/
337 	    set_phi(pv, u, v, j, +1);
338 	    set_psi(pv, u, v, j, -1);
339 	  }
340 	/* for each state i */
341 	for (i = 0; i < mo->N; i++) {
342 	/* Determine the maximum */
343 	/* max_phi = phi[i] + log_in_a[j][i] ... */
344 	  if (!(mo->model_type & GHMM_kSilentStates) || !mo->silent[i] ) {
345 	    max_value = -DBL_MAX;
346 	    set_psi(pv, u, v, i, -1);
347 	    for (j = 0; j < mo->s[i].in_states; j++) {
348 	      /* look back in the phi matrix at the offsets */
349 	      previous_prob = get_phi(pv, u, v, mo->s[i].offset_x,
350 				      mo->s[i].offset_y, mo->s[i].in_id[j]);
351 	      log_in_a_ij = (*log_in_a)(pv, i, j, X, Y, u, v);
352 	      if ( previous_prob != +1 && log_in_a_ij != +1) {
353 		value = previous_prob + log_in_a_ij;
354 		if (value > max_value) {
355 		  max_value = value;
356 		  set_psi(pv, u, v, i, mo->s[i].in_id[j]);
357 		}
358 	      }
359 	      else
360 		{;} /* fprintf(stderr, " %d --> %d = %f, \n", i,i,v->log_in_a[i][i]); */
361 	    }
362 #ifdef DEBUG
363 	    emission = ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
364 				ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
365 				mo->size_of_alphabet[mo->s[i].alphabet],
366 				mo->s[i].offset_x, mo->s[i].offset_y);
367 	    if (emission > ghmm_dpmodel_emission_table_size(mo, i)){
368 	      printf("State %i\n", i);
369 	      ghmm_dpmodel_state_print(&(mo->s[i]));
370 	      printf("charX: %i charY: %i alphabet size: %i emission table: %i emission index: %i\n",
371 		     ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
372 		     ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
373 		     mo->size_of_alphabet[mo->s[i].alphabet],
374 		     ghmm_dpmodel_emission_table_size(mo, i), emission);
375 	    }
376 #endif
377 	    log_b_i = log_b(pv, i, ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
378 					ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
379 					mo->size_of_alphabet[mo->s[i].alphabet],
380 					mo->s[i].offset_x, mo->s[i].offset_y));
381 
382 	    /* this is the difference from the main loop:
383 	       check whether this state could be an initial state and add the
384 	       initial probability */
385 	    if (log_b_i == +1 ) {
386 	      set_phi(pv, u, v, i, +1);
387 	    }
388 	    else {
389 	      if (max_value == -DBL_MAX)
390 		set_phi(pv, u, v, i, +1);
391 	      else
392 		set_phi(pv, u, v, i, max_value);
393 	      /* if (mo->s[i].pi != 0 && mo->s[i].offset_x - 1 == u &&
394 		 mo->s[i].offset_y - 1 == v) { */
395 	      if (mo->s[i].log_pi != 1 && mo->s[i].offset_x - 1 == u &&
396 		  mo->s[i].offset_y - 1 == v) {
397 		set_phi(pv, u, v, i, mo->s[i].log_pi);
398 #ifdef DEBUG
399 		printf("Initial log prob state %i at (%i, %i) = %f\n", i, u, v, get_phi(pv, u, v, 0, 0, i));
400 		printf("Characters emitted X: %i, Y: %i\n",
401 		       ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
402 		       ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v));
403 #endif
404 	      }
405 	      if (get_phi(pv, u, v, 0, 0, i) != 1)
406 		set_phi(pv, u, v, i, get_phi(pv, u, v, 0, 0, i) + log_b_i);
407 	    }
408 	  }
409 	  /* if (v == 0) {
410 	     printf"(%i, %i, %i) preceding %i\n", u, v, i, pv->psi[u][v][i]);
411 	     } */
412 	} /* complete time step for emitting states */
413 
414 	/* last_osc = osc; */
415 	/* save last transition class */
416 
417 	/*if (mo->model_type & kSilentStates) {
418 	  p__viterbi_silent( mo, t, v );
419 	  }*/ /* complete time step for silent states */
420 
421 	/**************
422     for (j = 0; j < mo->N; j++)
423       {
424 	printf("\npsi[%d],in:%d, phi=%f\n", t, v->psi[t][j], v->phi[j]);
425        }
426 
427     for (i = 0; i < mo->N; i++){
428       printf("%d\t", former_matchcount[i]);
429     }
430 
431     for (i = 0; i < mo->N; i++){
432       printf("%d\t", recent_matchcount[i]);
433     }
434       ****************/
435       } /* End for v in Y */
436     /* Next character in X */
437     /* push back the old phi values */
438       push_back_phi(pv, Y->length);
439     } /* End for u in X */
440 }
441 
442 /*============================================================================*/
443 /* call this function when you are at position (x, y) and want to get the
444    probability of state 'state' which is offset_x and offset_y away from x,y */
get_phi(plocal_store_t * pv,int x,int y,int offset_x,int offset_y,int state)445 static double get_phi(plocal_store_t * pv, int x, int y, int offset_x, int offset_y, int state){
446 #ifdef DEBUG
447   if (y > pv->len_y || state > pv->mo->N || y < - pv->mo->max_offset_y)
448     fprintf(stderr, "get_phi: out of bounds %i %i %i\n",
449 	    x + pv->mo->max_offset_x, y + pv->mo->max_offset_y, state);
450 #endif
451 
452   if (y - offset_y + pv->mo->max_offset_y >= 0)
453     return pv->phi[offset_x][y - offset_y + pv->mo->max_offset_y][state];
454   else
455     return 1;
456 }
457 
458 /*============================================================================*/
459 /* set the value of this matrix cell */
set_phi(plocal_store_t * pv,int x,int y,int state,double prob)460 static void set_phi(plocal_store_t * pv, int x, int y, int state, double prob){
461 #ifdef DEBUG
462   if (y > pv->len_y || state > pv->mo->N || y < - pv->mo->max_offset_y)
463     fprintf(stderr, "set_phi: out of bounds %i %i %i %f\n",
464 	    x + pv->mo->max_offset_x, y + pv->mo->max_offset_y,
465 	    state, prob);
466 #endif
467   pv->phi[0][y + pv->mo->max_offset_y][state] = prob;
468 }
469 
470 /*============================================================================*/
471 /* since we only keep the frontier we have to push back when ever a row is
472    complete */
push_back_phi(plocal_store_t * pv,int length_y)473 static void push_back_phi(plocal_store_t * pv, int length_y){
474   int off_x, y, j;
475   /* push back the old phi values */
476   for (off_x=pv->mo->max_offset_x; off_x>0; off_x--)
477     for (y=0; y<length_y + pv->mo->max_offset_y + 1; y++)
478       for (j=0; j<pv->mo->N; j++)
479 	pv->phi[off_x][y][j] = pv->phi[off_x-1][y][j];
480 }
481 
482 /*============================================================================*/
set_psi(plocal_store_t * pv,int x,int y,int state,int from_state)483 static void set_psi(plocal_store_t * pv, int x, int y, int state, int from_state){
484   /* shift by max_offsets for negative indices */
485 #ifdef DEBUG
486   if (x > pv->len_x || y > pv->len_y || state > pv->mo->N ||
487       x < - pv->mo->max_offset_x || y < - pv->mo->max_offset_y)
488     fprintf(stderr, "set_psi: out of bounds %i %i %i %i\n",
489 	    x + pv->mo->max_offset_x, y + pv->mo->max_offset_y,
490 	    state, from_state);
491 #endif
492   pv->psi[x + pv->mo->max_offset_x][y + pv->mo->max_offset_y][state] = from_state;
493 }
494 
495 /*============================================================================*/
get_psi(plocal_store_t * pv,int x,int y,int state)496 static int get_psi(plocal_store_t * pv, int x, int y, int state) {
497   /* shift by max_offsets for negative indices*/
498 #ifdef DEBUG
499   if (x > pv->len_x || y > pv->len_y || state > pv->mo->N ||
500       x < - pv->mo->max_offset_x || y < - pv->mo->max_offset_y)
501     fprintf(stderr, "get_psi: out of bounds %i %i %i\n",
502 	    x + pv->mo->max_offset_x, y + pv->mo->max_offset_y,
503 	    state);
504 #endif
505   return pv->psi[x + pv->mo->max_offset_x][y + pv->mo->max_offset_y][state];
506 }
507 
508 /*============================================================================*/
ghmm_dpmodel_viterbi_test(ghmm_dpmodel * mo,ghmm_dpseq * X,ghmm_dpseq * Y,double * log_p,int * path_length)509 int *ghmm_dpmodel_viterbi_test(ghmm_dpmodel *mo, ghmm_dpseq * X, ghmm_dpseq * Y,
510 			  double *log_p, int *path_length) {
511   plocal_store_t *pv;
512   printf("---- viterbi test -----\n");
513   /*ghmm_dpmodel_print(mo);*/
514   pv = pviterbi_alloc(mo, X->length, Y->length);
515   printf("try free within pviterbi_test\n");
516   pviterbi_free(&pv, mo->N, X->length, Y->length, mo->max_offset_x ,
517 		mo->max_offset_y);
518   printf("OK\n");
519   return NULL;
520 }
521 
522 
523 /*============================================================================*/
ghmm_dpmodel_viterbi(ghmm_dpmodel * mo,ghmm_dpseq * X,ghmm_dpseq * Y,double * log_p,int * path_length)524 int *ghmm_dpmodel_viterbi(ghmm_dpmodel *mo, ghmm_dpseq * X, ghmm_dpseq * Y, double *log_p,
525 		     int *path_length) {
526   return ghmm_dpmodel_viterbi_variable_tb(mo, X, Y, log_p, path_length, -1);
527 }
528 
529 /*============================================================================*/
ghmm_dpmodel_viterbi_variable_tb(ghmm_dpmodel * mo,ghmm_dpseq * X,ghmm_dpseq * Y,double * log_p,int * path_length,int start_traceback_with)530 int *ghmm_dpmodel_viterbi_variable_tb(ghmm_dpmodel *mo, ghmm_dpseq * X, ghmm_dpseq * Y,
531 				 double *log_p, int *path_length,
532 				 int start_traceback_with) {
533 #define CUR_PROC "ghmm_dpmodel_viterbi"
534   int u, v, j, i, off_x, off_y, current_state_index;
535   double value, max_value, previous_prob;
536   plocal_store_t *pv;
537   int *state_seq = NULL;
538   int emission;
539   double log_b_i, log_in_a_ij;
540   double (*log_in_a)(plocal_store_t*, int, int, ghmm_dpseq*, ghmm_dpseq*, int, int);
541 
542   /* printf("---- viterbi -----\n"); */
543   i_list * state_list;
544   state_list = ighmm_list_init_list();
545   log_in_a = &sget_log_in_a;
546   /* int len_path  = mo->N*len; the length of the path is not known apriori */
547 
548 /*   if (mo->model_type & kSilentStates &&  */
549 /*       mo->silent != NULL &&  */
550 /*       mo->topo_order == NULL) { */
551 /*     ghmm_dmodel_topo_order( mo );  */
552 /*   } */
553 
554   /* Allocate the matrices log_in_a, log_b,Vektor phi, phi_new, Matrix psi */
555   pv = pviterbi_alloc(mo, X->length, Y->length);
556   if (!pv)                        { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; }
557 
558   /* Precomputing the log(a_ij) and log(bj(ot)) */
559   pviterbi_precompute(mo, pv);
560   /* Initialize the lookback matrix (for positions [-offsetX,0], [-1, len_y]*/
561   init_phi(pv, X, Y);
562 
563   /* u > max_offset_x , v starts -1 to allow states with offset_x == 0
564      which corresponds to a series of gap states before reading the first
565      character of x at position x=0, y=v */
566   /** THIS IS THE MAIN RECURRENCE **/
567   for (u = mo->max_offset_x + 1; u < X->length; u++) {
568     for (v = -mo->max_offset_y; v < Y->length; v++) {
569       for (j = 0; j < mo->N; j++)
570 	{
571 	  /** initialization of phi (lookback matrix), psi (traceback) **/
572 	  set_phi(pv, u, v, j, +1);
573 	  set_psi(pv, u, v, j, -1);
574 	}
575 
576       for (i = 0; i < mo->N; i++) {
577 	/* Determine the maximum */
578 	/* max_phi = phi[i] + log_in_a[j][i] ... */
579 	if (!(mo->model_type & GHMM_kSilentStates) || !mo->silent[i] ) {
580 	  max_value = -DBL_MAX;
581 	  set_psi(pv, u, v, i, -1);
582 	  for (j = 0; j < mo->s[i].in_states; j++) {
583 	    /* look back in the phi matrix at the offsets */
584 	    previous_prob = get_phi(pv, u, v, mo->s[i].offset_x, mo->s[i].offset_y, mo->s[i].in_id[j]);
585 	    log_in_a_ij = (*log_in_a)(pv, i, j, X, Y, u, v);
586 	    if ( previous_prob != +1 && log_in_a_ij != +1) {
587 	      value = previous_prob + log_in_a_ij;
588 	      if (value > max_value) {
589 		max_value = value;
590 		set_psi(pv, u, v, i, mo->s[i].in_id[j]);
591 	      }
592 	    }
593 	    else
594 	      {;} /* fprintf(stderr, " %d --> %d = %f, \n", i,i,v->log_in_a[i][i]); */
595 	  }
596 
597 	  emission = ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
598 			      ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
599 			      mo->size_of_alphabet[mo->s[i].alphabet],
600 			      mo->s[i].offset_x, mo->s[i].offset_y);
601 #ifdef DEBUG
602 	  if (emission > ghmm_dpmodel_emission_table_size(mo, i)){
603 	    printf("State %i\n", i);
604 	    ghmm_dpmodel_state_print(&(mo->s[i]));
605 	    printf("charX: %i charY: %i alphabet size: %i emission table: %i emission index: %i\n",
606 		   ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
607 		   ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
608 		   mo->size_of_alphabet[mo->s[i].alphabet],
609 		   ghmm_dpmodel_emission_table_size(mo, i), emission);
610 	  }
611 #endif
612 	  log_b_i = log_b(pv, i, ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
613 				      ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
614 				      mo->size_of_alphabet[mo->s[i].alphabet],
615 				      mo->s[i].offset_x, mo->s[i].offset_y));
616 
617 	  /* No maximum found (that is, state never reached)
618 	     or the output O[t] = 0.0: */
619 	  if (max_value == -DBL_MAX ||/* and then also: (v->psi[t][j] == -1) */
620 	      log_b_i == +1 ) {
621 	    set_phi(pv, u, v, i, +1);
622 	  }
623 	  else
624 	    set_phi(pv, u, v, i, max_value + log_b_i);
625 	}
626       } /* complete time step for emitting states */
627 
628 	/* last_osc = osc; */
629         /* save last transition class */
630 
631       /*if (mo->model_type & kSilentStates) {
632 	p__viterbi_silent( mo, t, v );
633 	}*/ /* complete time step for silent states */
634 
635       /**************
636     for (j = 0; j < mo->N; j++)
637       {
638 	printf("\npsi[%d],in:%d, phi=%f\n", t, v->psi[t][j], v->phi[j]);
639        }
640 
641     for (i = 0; i < mo->N; i++){
642       printf("%d\t", former_matchcount[i]);
643     }
644 
645     for (i = 0; i < mo->N; i++){
646       printf("%d\t", recent_matchcount[i]);
647     }
648       ****************/
649     } /* End for v in Y */
650     /* Next character in X */
651     push_back_phi(pv, Y->length);
652   } /* End for u in X */
653 
654   /* Termination */
655   max_value = -DBL_MAX;
656   ighmm_list_append(state_list, -1);
657   /* if start_traceback_with is -1 (it is by default) search for the most
658      likely state at the end of both sequences */
659   if (start_traceback_with == -1) {
660     for (j = 0; j < mo->N; j++){
661 #ifdef DEBUG
662       printf("phi(len_x)(len_y)(%i)=%f\n", j, get_phi(pv, u, Y->length-1, 0, 0, j));
663 #endif
664       if ( get_phi(pv, u, Y->length-1, 0, 0, j) != +1 &&
665 	   get_phi(pv, u, Y->length-1, 0, 0, j) > max_value) {
666 	max_value = get_phi(pv, X->length-1, Y->length-1, 0, 0, j);
667 	state_list->last->val = j;
668       }
669     }
670   }
671   /* this is the special traceback mode for the d & c algorithm that also
672      connects the traceback to the first state of the rest of the path */
673   else {
674 #ifdef DEBUG
675     printf("D & C traceback from state %i!\n", start_traceback_with);
676     printf("Last characters emitted X: %i, Y: %i\n",
677 	   ghmm_dpseq_get_char(X, mo->s[start_traceback_with].alphabet,
678 			       X->length-1),
679 	   ghmm_dpseq_get_char(Y, mo->s[start_traceback_with].alphabet,
680 			       Y->length-1));
681     for (j = 0; j < mo->N; j++){
682       printf("phi(len_x)(len_y)(%i)=%f\n", j, get_phi(pv, X->length-1, Y->length-1, 0, 0, j));
683     }
684 #endif
685     max_value = get_phi(pv, X->length-1, Y->length-1, 0, 0, start_traceback_with);
686     if (max_value != 1 && max_value > -DBL_MAX)
687       state_list->last->val = start_traceback_with;
688   }
689   if (max_value == -DBL_MAX) {
690     /* Sequence can't be generated from the model! */
691     *log_p = +1;
692     /* Backtracing doesn't work, because state_seq[*] allocated with -1 */
693     /* for (t = len - 2; t >= 0; t--)
694        state_list->last->val = -1;    */
695   }
696   else {
697     /* Backtracing, should put DEL path nicely */
698     *log_p = max_value;
699     /* removed the handling of silent states here */
700     /* start trace back at the end of both sequences */
701     u = X->length - 1;
702     v = Y->length - 1;
703     current_state_index = state_list->first->val;
704     off_x = mo->s[current_state_index].offset_x;
705     off_y = mo->s[current_state_index].offset_y;
706     while (u - off_x >= -1 && v - off_y >= -1 && current_state_index != -1) {
707       /* while (u > 0 && v > 0) { */
708       /* look up the preceding state and save it in the first position of the
709 	 state list */
710       /* printf("Current state %i at (%i,%i) -> preceding state %i\n",
711 	 current_state_index, u, v, get_psi(pv, u, v, current_state_index)); */
712       /* update the current state */
713       current_state_index = get_psi(pv, u, v, current_state_index);
714       if (current_state_index != -1)
715 	ighmm_list_insert(state_list, current_state_index);
716       /* move in the alignment matrix */
717       u -= off_x;
718       v -= off_y;
719       /* get the next offsets */
720       off_x = mo->s[current_state_index].offset_x;
721       off_y = mo->s[current_state_index].offset_y;
722     }
723   }
724 
725   /* Free the memory space */
726   pviterbi_free(&pv, mo->N, X->length, Y->length, mo->max_offset_x ,
727 		mo->max_offset_y);
728   /* printf("After traceback: last state = %i\n", state_list->last->val); */
729   state_seq = ighmm_list_to_array(state_list);
730   *path_length = state_list->length;
731   /* PRINT PATH */
732 
733 /*   fprintf(stderr, "Viterbi path: " ); */
734 /*   int t; */
735 /*   for(t=0; t < *path_length; t++) */
736 /*     if (state_seq[t] >= 0) fprintf(stderr, " %d ",  state_seq[t]); */
737 /*   fprintf(stderr, "\n Freeing ... \n");  */
738   return (state_seq);
739 STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
740   /* Free the memory space */
741   pviterbi_free(&pv, mo->N, X->length, Y->length, mo->max_offset_x,
742 		mo->max_offset_y);
743   m_free(state_seq);
744   ighmm_list_free(state_list);
745   return NULL;
746 #undef CUR_PROC
747 } /* viterbi */
748 
749 
750 /*============================================================================*/
ghmm_dpmodel_viterbi_logp(ghmm_dpmodel * mo,ghmm_dpseq * X,ghmm_dpseq * Y,int * state_seq,int state_seq_len)751 double ghmm_dpmodel_viterbi_logp(ghmm_dpmodel *mo, ghmm_dpseq * X, ghmm_dpseq * Y,
752 			    int *state_seq, int state_seq_len) {
753 #define CUR_PROC "ghmm_dpmodel_viterbi_logp"
754   int s, t, i, j, u, v;
755   double log_p = 0.0;
756   double log_b_i = 1.0;
757   double log_in_a = 1.0;
758   plocal_store_t *pv;
759 
760   /* Allocate the matrices log_in_a, log_b,Vektor phi, phi_new, Matrix psi */
761   pv = pviterbi_alloc(mo, 0, 0);
762   pviterbi_precompute(mo, pv);
763   /* initial state */
764   t = 0; u = -1; v = -1;
765   if (state_seq_len > t) {
766     i = state_seq[t];
767     /* initial state probability */
768     /* log_p += log(mo->s[i].pi); */
769     log_p += mo->s[i].log_pi;
770     if (log_p == 1.0) {
771       pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
772       fprintf(stderr, "the initial probability of state %i is zero\n", i);
773       return 1.0;/* the initial prob is zero */
774     }
775     /* consume the first characters of the sequences */
776     u += mo->s[i].offset_x;
777     v += mo->s[i].offset_y;
778     /* get the emission probability */
779     log_b_i = log_b(pv, i, ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
780 				ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
781 				mo->size_of_alphabet[mo->s[i].alphabet],
782 				mo->s[i].offset_x, mo->s[i].offset_y));
783     if (log_b_i == 1.0) { /* chars cant be emitted */
784       pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
785       fprintf(stderr, "characters (%i, %i) at position (%i, %i) cannot be emitted by state %i (t=%i)\n",
786 	      ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
787 	      ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v), u, v, i, t);
788       return 1.0;
789     }
790     log_p += log_b_i;
791   }
792   else { /* there is no path.. */
793     pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
794     fprintf(stderr, "No path given!\n");
795     return 1.0;
796   }
797   /* rest of the sequence */
798   for (t=1; t<state_seq_len; t++) {
799     j = i; /* predecessor state */
800     i = state_seq[t]; /* current state */
801     /* consume the next characters */
802     u += mo->s[i].offset_x;
803     v += mo->s[i].offset_y;
804     if (u >= X->length || v >= Y->length) { /* path consumes too many chars */
805       pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
806       fprintf(stderr, "path consumes too many chars\n");
807       return 1.0;
808     }
809     /* transition prob state j -> i*/
810     log_in_a = 1.0;
811     for (s=0; s<mo->s[i].in_states; s++) {
812       if (mo->s[i].in_id[s] == j) {
813 	log_in_a = sget_log_in_a(pv, i, s, X, Y, u, v);
814 	break;
815       }
816     }
817     if (log_in_a == 1.0) {
818       pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
819       fprintf(stderr, "transition (%i -> %i) at t=%i not possible\n", j, i,t);
820       return 1.0; /* transition not possible */
821     }
822     /* emission probability */
823     log_b_i = log_b(pv, i, ghmm_dpmodel_pair(ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
824 				ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v),
825 				mo->size_of_alphabet[mo->s[i].alphabet],
826 				mo->s[i].offset_x, mo->s[i].offset_y));
827     if (log_b_i == 1.0) {
828       pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x, mo->max_offset_y);
829       fprintf(stderr, "characters (%i, %i) at position (%i, %i) cannot be emitted by state %i (t=%i)\n",
830 	      ghmm_dpseq_get_char(X, mo->s[i].alphabet, u),
831 	      ghmm_dpseq_get_char(Y, mo->s[i].alphabet, v), u, v, i, t);
832       return 1.0; /* characters cant be emitted */
833     }
834     log_p += log_in_a + log_b_i;
835   }
836   pviterbi_free(&pv, mo->N, 0, 0, mo->max_offset_x,
837 		mo->max_offset_y);
838   /* check if all of the sequences has been consumed */
839   if (u != X->length - 1 && v != Y->length - 1) {
840     fprintf(stderr, "path consumes not all characters (%i of %i, %i of %i)\n",
841 	    u + 1, X->length, v + 1, Y->length);
842     return 1.0;
843   }
844   return log_p;
845 
846 #undef CUR_PROC
847 } /* ghmm_dpmodel_viterbi_logp */
848 
849