1 #include "model.h"
2 #include "fbgibbs.h"
3 #include "foba.h"
4 #include "matrix.h"
5 #include "ghmm.h"
6 #include "mes.h"
7 #include "ghmm_internals.h"
8 #include "sequence.h"
9 #include "rng.h"
10 #include "randvar.h"
11 #include "obsolete.h"
12 
13 #ifdef HAVE_CONFIG_H
14 #  include "../config.h"
15 #endif
16 
17 //===========================================================================================
18 //=====================            sampleing           ======================================
19 //===========================================================================================
20 //returns a sample from a cdf  **could use binary search here**
sample(int seed,double * dist,int N)21 int sample(int seed, double* dist, int N){
22     //printf("sample\n\n");
23 
24     double total = dist[N-1];
25     //printf("total = %f\n", total);
26     double rn = ighmm_rand_uniform_cont(seed, total, 0.0f);//XXX exception handleing what if total=0
27     //printf("rn = %f \n\n", rn);
28     int i;
29     if(rn <= dist[0])
30       return 0;
31     for(i = 1; i < N; i++){
32        if(dist[i-1] <rn && rn <= dist[i])
33          return i;
34    }
35    return N-1;
36 }
37 
sampleStatePath(int N,double * alpha,double *** pmats,int T,int * states)38 void sampleStatePath(int N, double *alpha, double ***pmats, int T, int* states){
39 #define CUR_PROC "sampleStatePath"
40   //printf("sampleStatePath\n\n");
41   int i, j;
42   double temp[N];
43   double dist[N];
44   temp[0] = alpha[0];
45   for(i = 1; i < N; i++)
46     temp[i] = temp[i-1] + alpha[i];
47   //printf("tmp1 = %f, tmp2 = %f\n", alpha[0], temp[1]);
48   states[T-1] = sample(0, temp, N);
49   //printf("State T-1 = %d\n", states[T-1]);
50   for(i = T-2; i >=0; i--){
51     //printf("pmats %d, %d, %d = %f\n", i+1, states[i+1], mo->N-1, pmats[i+1][states[i+1]][mo->N-1]);
52     states[i] = sample(0, pmats[i+1][states[i+1]], N);
53     //printf("State %d = %d\n", i, states[i]);
54   }
55 #undef CUR_PROC
56 }
57 
58 //===========================================================================================
59 //========= Modified forward to get an array of matrices used for sampling ==================
60 //===========================================================================================
61                                  /* ghmm_dmodel_forwardGibbs_init */
ghmm_dmodel_forwardGibbs_init(ghmm_dmodel * mo,double * alpha_1,int symb,double * scale)62 int ghmm_dmodel_forwardGibbs_init (ghmm_dmodel * mo, double *alpha_1, int symb, double *scale){
63 # define CUR_PROC "ghmm_dmodel_forwardGibbs_init"
64   int i, j, id, in_id;
65   double c_0;
66   *scale = 0.0;
67 
68   /*printf(" *** foba_initforward\n");*/
69 
70   /*iterate over non-silent states*/
71   /*printf(" *** iterate over non-silent states \n");*/
72   for (i = 0; i < mo->N; i++) {
73     if (!(mo->model_type & GHMM_kSilentStates) || !(mo->silent[i])) {
74       /*no starting in states with order > 0 !!!*/
75       if (!(mo->model_type & GHMM_kHigherOrderEmissions) || mo->order[i] == 0) {
76         alpha_1[i] = mo->s[i].pi * mo->s[i].b[symb];
77         /*printf("\nalpha1[%i]=%f\n",i,alpha_1[i]);*/
78 
79         *scale += alpha_1[i];
80       }
81       else {
82         alpha_1[i] = 0;
83       }
84     }
85   }
86   /*iterate over silent states*/
87   /*printf(" *** iterate over silent states \n");*/
88   if (mo->model_type & GHMM_kSilentStates) {
89     for (i = 0; i < mo->topo_order_length; i++) {
90       id = mo->topo_order[i];
91       alpha_1[id] = mo->s[id].pi;
92 
93       /*printf("\nsilent_start alpha1[%i]=%f\n",id,alpha_1[id]);*/
94 
95       for (j = 0; j < mo->s[id].in_states; j++) {
96         in_id = mo->s[id].in_id[j];
97         alpha_1[id] += mo->s[id].in_a[j] * alpha_1[in_id];
98 
99         /*printf("\n\tsilent_run alpha1[%i]=%f\n",id,alpha_1[id]);*/
100 
101       }
102       *scale += alpha_1[id];
103     }
104   }
105 
106   /* printf("\nwo label: scale[0] = %f\n",scale[0]); */
107 
108   if (*scale >= GHMM_EPS_PREC) {
109     c_0 = 1 / *scale;
110     for (i = 0; i < mo->N; i++)
111       alpha_1[i] *= c_0;
112   }
113   return (0);                   /* attention: scale might be 0 */
114 # undef CUR_PROC
115 }                               /* ghmm_dmodel_forwardGibbs_init */
116 
117 /*----------------------------------------------------------------------------*/
118 
ghmm_dmodel_forwardGibbs_step(ghmm_dmodel * mo,ghmm_dstate * s,double * alpha_t,const double b_symb,double *** pmats,int t,int j)119 double ghmm_dmodel_forwardGibbs_step (ghmm_dmodel *mo, ghmm_dstate * s, double *alpha_t, const double b_symb, double*** pmats, int t, int j)
120 {
121 #define CUR_PROC "ghmm_dmodel_forwardGibbs_step"
122   int i, id;
123   double value = 0.0;
124   int prv;
125   if (b_symb < GHMM_EPS_PREC)
126     return 0.0;
127 
128   /*printf(" *** fobagibbs_stepforward\n");*/
129   //printf("%d\n", s->in_states);
130   prv = mo->N;
131   for (i = 0; i < s->in_states; i++) {
132     id = s->in_id[i];
133     pmats[t][j][id] = s->in_a[i] * alpha_t[id]*b_symb;
134     value += pmats[t][j][id];
135 
136     //printf("t = %d,j = %d,id = %d, value %f, p_symb %f, pmats %f\n",t,j,id, value, b_symb, pmats[t][j][id]);
137     for(; prv < id; prv++){
138         pmats[t][j][prv+1] += pmats[t][j][prv];
139         //printf("asdf t = %d,j = %d,i = %d, value %f, p_symb %f, pmats %f\n",t,j,prv+1, value, b_symb, pmats[t][j][prv+1]);
140     }
141     prv = id;
142     //printf("t = %d,j = %d,id = %d, value %f, p_symb %f, pmats %f\n",t,j,id, value, b_symb, pmats[t][j][id]);
143   }
144   for(prv+=1;prv<mo->N;prv++){
145       pmats[t][j][prv] += pmats[t][j][prv-1];
146       //printf("t = %d,j = %d,i = %d, value %f, p_symb %f, pmats %f\n",t,j,prv, value, b_symb, pmats[t][j][prv]);
147   }
148   return (value);
149 #undef CUR_PROC
150 }                               /* ghmm_dmodel_forwardGibbs_step */
151 
152 /*============================================================================*/
153 
ghmm_dmodel_forwardGibbs(ghmm_dmodel * mo,const int * O,int len,double ** alpha,double *** pmats)154 int ghmm_dmodel_forwardGibbs (ghmm_dmodel * mo, const int *O, int len, double **alpha, double*** pmats)
155 {
156 # define CUR_PROC "ghmm_dmodel_forwardGibbs"
157   char * str;
158   int i, t, id;
159   int e_index;
160   double c_t;
161   double scale;
162 
163 
164   if (mo->model_type & GHMM_kSilentStates)
165     ghmm_dmodel_order_topological(mo);
166 
167   ghmm_dmodel_forwardGibbs_init (mo, alpha[0], O[0], &scale);
168 
169   if (scale < GHMM_EPS_PREC) {
170     /* means: first symbol can't be generated by hmm */
171     printf("\nscale kleiner als eps (line_no: 123)\n");
172     return -1;
173   }
174   else {
175     for (t = 1; t < len; t++) {
176 
177       scale = 0.0;
178       update_emission_history (mo, O[t - 1]);
179 
180       //printf("\n\nStep t=%i mit len=%i, O[i]=%i\n",t,len,O[t]);
181       //printf("iterate over non-silent state\n");
182       /* iterate over non-silent states */
183       for (i = 0; i < mo->N; i++) {
184         if (!(mo->model_type & GHMM_kSilentStates) || !(mo->silent[i])) {
185           e_index = get_emission_index (mo, i, O[t], t);
186           if (e_index != -1) {
187             //printf("pmat %d, %d \n", t, i);
188             alpha[t][i] =
189               ghmm_dmodel_forwardGibbs_step (mo, &mo->s[i], alpha[t - 1], mo->s[i].b[e_index], pmats, t, i);
190             scale += alpha[t][i];
191           }
192           else {
193             alpha[t][i] = 0;
194           }
195         }
196       }
197       /* iterate over silent states */
198       //printf("iterate over silent state\n");
199       if (mo->model_type & GHMM_kSilentStates) {
200         for (i = 0; i < mo->topo_order_length; i++) {
201           /*printf("\nget id\n");*/
202           id = mo->topo_order[i];
203           /*printf("  akt_ state %d\n",id);*/
204           /*printf("\nin stepforward\n");*/
205           alpha[t][id] = ghmm_dmodel_forwardGibbs_step (mo, &mo->s[id], alpha[t], 1, pmats, t, i);
206           /*printf("\nnach stepforward\n");*/
207           scale += alpha[t][id];
208         }
209       }
210 
211       if (scale < GHMM_EPS_PREC) {
212         return -1;
213       }
214       c_t = 1 / scale;
215       for (i = 0; i < mo->N; i++) {
216         alpha[t][i] *= c_t;
217       }
218     }
219   }
220   return 0;
221 #undef CUR_PROC
222 }
223 
224 
225 			/* ghmm_dmodel_forwardGibbs */
226 //======================================================================================
227 //======================== update parameters ===========================================
228 //======================================================================================
allocCounts(ghmm_dmodel * mo,double *** transitions,double ** obsinstate,double *** obsinstatealpha)229 void allocCounts(ghmm_dmodel* mo, double ***transitions, double **obsinstate,
230         double ***obsinstatealpha){
231 #define CUR_PROC "allocCount"
232     *transitions = ighmm_cmatrix_alloc(mo->N,mo->N);
233     ARRAY_CALLOC(*obsinstate, mo->N);
234     *obsinstatealpha = ighmm_cmatrix_alloc(mo->N,mo->M);
235 STOP:
236     return;//XXX error handle
237 #undef CUR_PROC
238 }
allocCountsH(ghmm_dmodel * mo,double *** transitions,double ** obsinstate,double *** obsinstatealpha)239 void allocCountsH(ghmm_dmodel* mo, double ***transitions, double **obsinstate,
240         double ***obsinstatealpha){
241 #define CUR_PROC "allocCountsH"
242   *transitions = ighmm_cmatrix_alloc(mo->N, mo->N);
243   ARRAY_CALLOC(*obsinstate, mo->N);
244   ARRAY_CALLOC(*obsinstatealpha, mo->N);
245   int l;
246   for(l = 0; l < mo->N; l++){
247      ARRAY_CALLOC((*obsinstatealpha)[l], ghmm_ipow (mo, mo->M, mo->order[l]+1));
248   }
249 STOP:
250   return;
251 #undef CUR_PROC
252 }
253 
freeCounts(ghmm_dmodel * mo,double *** transitions,double ** obsinstate,double *** obsinstatealpha)254 void freeCounts(ghmm_dmodel* mo, double ***transitions, double **obsinstate, double ***obsinstatealpha){
255 #define CUR_PROC "freeCounts"
256     ighmm_cmatrix_free(transitions, mo->N);
257     ighmm_cmatrix_free(obsinstatealpha, mo->N);
258     m_free(*obsinstate);
259 #undef CUR_PROC
260 }
261 
freeCountsH(ghmm_dmodel * mo,double *** transitions,double ** obsinstate,double *** obsinstatealpha)262 void freeCountsH(ghmm_dmodel* mo, double ***transitions, double **obsinstate, double ***obsinstatealpha){
263 #define CUR_PROC "freeCountsH"
264     ighmm_cmatrix_free(transitions, mo->N);
265     m_free(*obsinstate);
266     int l;
267     for(l=0;l<mo->N;l++)
268         m_free((*obsinstatealpha)[l]);
269     m_free(*obsinstatealpha);
270 #undef CUR_PROC
271 }
272 
273 
initCounts(ghmm_dmodel * mo,double ** transition,double * obsinstate,double ** obsinstatealpha,double ** pA,double ** pB,double * pPi)274 void initCounts(ghmm_dmodel* mo, double **transition, double *obsinstate, double **obsinstatealpha,
275         double **pA, double **pB, double *pPi){
276     //initialize
277     int i, k;
278     for(i=0;i<mo->N;i++){
279         obsinstate[i]  = pPi[i];
280         for(k=0;k<mo->N;k++){
281             transition[i][k] = pA[i][k];
282         }
283         for(k=0;k<mo->M;k++){
284             obsinstatealpha[i][k] = pB[i][k];
285         }
286     }
287 }
288 
initCountsH(ghmm_dmodel * mo,double ** transition,double * obsinstate,double ** obsinstatealpha,double ** pA,double ** pB,double * pPi)289 void initCountsH(ghmm_dmodel* mo, double **transition, double *obsinstate, double **obsinstatealpha,
290         double **pA, double **pB, double *pPi){
291     int i,k;
292     for(i=0;i<mo->N;i++){
293         obsinstate[i]  = pPi[i];
294         for(k=0;k<mo->N;k++){
295             transition[i][k] = pA[i][k];
296         }
297         for(k=0;k<ghmm_ipow (mo, mo->M, mo->order[i]+1);k++){
298             obsinstatealpha[i][k] = pB[i][k];
299         }
300     }
301 }
302 
getCounts(int * states,int * O,int T,double ** transition,double * obsinstate,double ** obsinstatealpha)303 void getCounts(int *states, int* O, int T,
304         double **transition, double *obsinstate,
305         double **obsinstatealpha){
306     int i,k;
307    //A, B
308     for(i=0;i<T;i++){
309         obsinstate[states[i]] += 1;
310         obsinstatealpha[states[i]][O[i]] += 1;
311     }
312     for(i=0;i<T-1;i++){
313         transition[states[i]][states[i+1]] += 1;
314     }
315 }
316 
getCountsH(ghmm_dmodel * mo,int * states,int * O,int T,double ** transition,double * obsinstate,double ** obsinstatealpha)317 void getCountsH(ghmm_dmodel* mo, int *states, int* O, int T,
318         double **transition, double *obsinstate,
319         double **obsinstatealpha){
320     mo->emission_history = 0;
321     int i;
322 
323     //A, B
324     for(i=0;i<T;i++){
325         if(mo->order[states[i]] == 0)//only want to count first order states for pi
326           obsinstate[states[i]] += 1;
327         int e_index = get_emission_index (mo, states[i], O[i], i);
328         if (-1 != e_index)
329             obsinstatealpha[states[i]][e_index] += 1;
330         update_emission_history (mo, O[i]);
331     }
332     for(i=0;i<T-1;i++){
333         transition[states[i]][states[i+1]] += 1;
334     }
335 }
336 //given states, psueodocount matrices pA, pB, pPi see wiki, calculates new A,B,Pi
337 //XXX should use fix in state
338 //assumes psuedocount preserves structure, ie doesnt add 1 to a zero transition.
update(ghmm_dmodel * mo,double ** transition,double * obsinstate,double ** obsinstatealpha)339 void update(ghmm_dmodel* mo, double **transition, double *obsinstate, double **obsinstatealpha){
340 #define CUR_PROC "update"
341   double tmp_m[mo->M];
342   double tmp_n[mo->N];
343   int i, k;
344   for(i=0;i<mo->N;i++){
345       if(!mo->s[i].fix)//fix == 1 dont change b
346             ighmm_rand_dirichlet(0, mo->M, obsinstatealpha[i], tmp_m);
347         ighmm_rand_dirichlet(0, mo->N, transition[i], tmp_n);
348         //update model
349         if(!mo->s[i].fix){//dont update if fix
350             for(k = 0; k < mo->M; k++){
351                 mo->s[i].b[k] = tmp_m[k];
352             }
353         }
354         for(k = 0; k < mo->N; k++){
355 	    ghmm_dmodel_set_transition(mo, i, k, tmp_n[k]);//linear search optimize for mo->N>10?
356         }
357     }
358 
359     ighmm_rand_dirichlet(0, mo->N, obsinstate, tmp_n);
360     for(k=0;k<mo->N;k++){
361         mo->s[k].pi = tmp_n[k];
362     }
363     if (mo->model_type & GHMM_kTiedEmissions)
364         ghmm_dmodel_update_tie_groups (mo);
365 
366 #undef CUR_PROC
367 }
368 
369 
updateH(ghmm_dmodel * mo,double ** transition,double * obsinstate,double ** obsinstatealpha)370 void updateH(ghmm_dmodel* mo, double **transition, double *obsinstate, double **obsinstatealpha){
371 #define CUR_PROC "updateH"
372     double tmp_n[mo->N];
373     double tmp_m[mo->M];
374     double *p;
375     int i,k,l;
376     for(i=0;i<mo->N;i++){
377         ighmm_rand_dirichlet(0, mo->N, transition[i], tmp_n);
378         for(k = 0; k < mo->N; k++){
379 	    ghmm_dmodel_set_transition(mo, i, k, tmp_n[k]);
380         }
381         if(!mo->s[i].fix){
382             p = obsinstatealpha[i];
383             for(k = 0; k < ghmm_ipow(mo, mo->M, mo->order[i]); k++){
384                 ighmm_rand_dirichlet(0,mo->M,p+k*mo->M, tmp_m);
385                 for(l = 0; l < mo->M; l++){
386                     mo->s[i].b[k*mo->M + l] = tmp_m[l];
387                 }
388             }
389         }
390     }
391     ighmm_rand_dirichlet(0, mo->N, obsinstate, tmp_n);
392     for(k=0;k<mo->N;k++){
393         mo->s[k].pi = tmp_n[k];
394     }
395     if (mo->model_type & GHMM_kTiedEmissions)
396         ghmm_dmodel_update_tie_groups (mo);
397 #undef CUR_PROC
398 }
399 //================================================================================================
400 //=============================fbgibbs============================================================
401 //================================================================================================
402 
403 //allocates space and initilizes prior counts to 1 if not null
init_priors(ghmm_dmodel * mo,double *** pA,double *** pB,double ** pPi)404 void init_priors(ghmm_dmodel *mo, double ***pA, double ***pB, double **pPi){
405   int i, j, k;
406   if(!(*pA)){
407     *pA = ighmm_cmatrix_alloc(mo->N, mo->N);
408     for(i=0;i<mo->N;i++){
409       for(j=0;j<mo->N;j++){
410           (*pA)[i][j] = 1;
411       }
412     }
413   }
414   if(!(*pPi)){
415     *pPi = malloc(sizeof(double)*mo->N);
416     for(i=0;i<mo->N;i++){
417        (*pPi)[i] = 1;
418     }
419   }
420   if(!(*pB)){
421     if(mo->model_type & GHMM_kHigherOrderEmissions){
422       *pB = malloc(sizeof(double*)*mo->N);
423       for(i = 0; i < mo->N; i++){
424         (*pB)[i] = (double*)malloc(sizeof(double)*ghmm_ipow (mo, mo->M, mo->order[i]+1));
425         for(j = 0; j < ghmm_ipow (mo, mo->M, mo->order[i]+1); j++){
426           (*pB)[i][j] = 1;
427         }
428       }
429     }
430     else{
431       *pB = ighmm_cmatrix_alloc(mo->N, mo->M);
432       for(i=0;i<mo->N;i++){
433         for(j = 0; j < mo->M; j++){
434           (*pB)[i][j] = 1;
435         }
436       }
437     }
438   }
439 }
440 
441 
ghmm_dmodel_fbgibbstep(ghmm_dmodel * mo,int * O,int len,int * Q,double ** alpha,double *** pmats)442 void ghmm_dmodel_fbgibbstep (ghmm_dmodel * mo, int* O, int len, int *Q, double** alpha, double***pmats){
443 #define CUR_PROC "ghmm_dmodel_fbgibbstep"
444   //sample state sequence
445   //update parameters
446   //printf("fbgibbsStep \n\n");
447   int i,j,k;
448   for(i = 0; i < len; i++){
449     for(j = 0; j < mo->N; j++){
450       alpha[i][j] = 0;
451       for(k = 0; k < mo->N; k++){
452         pmats[i][j][k] = 0;
453       }
454     }
455   }
456   ghmm_dmodel_forwardGibbs(mo, O, len, alpha, pmats);
457   sampleStatePath(mo->N, alpha[len-1], pmats, len, Q);
458   //printf("done samplestatepath\n\n");
459   //for(i = 0; i < len; i++){
460     //printf("%d ", Q[i]);
461   //}
462   //printf("\n");
463 #undef CUR_PROC
464 }
465 
ghmm_dmodel_fbgibbs(ghmm_dmodel * mo,ghmm_dseq * seq,double ** pA,double ** pB,double * pPi,int burnIn,int seed)466 int** ghmm_dmodel_fbgibbs(ghmm_dmodel * mo, ghmm_dseq*  seq, double **pA, double **pB, double *pPi, int burnIn, int seed){
467 #ifdef DO_WITH_GSL
468 #define CUR_PROC "ghmm_dmodel_fbgibbs"
469   //initilizations
470   GHMM_RNG_SET (RNG, seed);
471   int **Q;
472   ARRAY_MALLOC (Q, seq->seq_number);
473   int i;
474   int len = 0;
475   for(i = 0; i < seq->seq_number; i++){
476       ARRAY_MALLOC (Q[i], seq->seq_len[i]);
477       if(len < seq->seq_len[i])
478           len = seq->seq_len[i];
479   }
480   double **alpha = ighmm_cmatrix_alloc(len, mo->N);
481   double ***pmats = ighmm_cmatrix_3d_alloc(len, mo->N, mo->N);
482   double **transitions,**obsinstatealpha;
483   double *obsinstate;
484 
485   if(mo->model_type & GHMM_kHigherOrderEmissions){//higher order
486       allocCountsH(mo, &transitions, &obsinstate, &obsinstatealpha);
487       for(;burnIn > 0; burnIn--){
488          initCountsH(mo, transitions, obsinstate, obsinstatealpha, pA, pB, pPi);
489          if(burnIn % 100 == 0) printf("iter %d\n", burnIn);
490          for(i = 0; i < seq->seq_number; i++){
491              ghmm_dmodel_fbgibbstep(mo, seq->seq[i], seq->seq_len[i], Q[i], alpha, pmats);
492              getCountsH(mo, Q[i], seq->seq[i], seq->seq_len[i], transitions, obsinstate, obsinstatealpha);
493          }
494          updateH(mo, transitions, obsinstate, obsinstatealpha);
495       }
496       freeCountsH(mo, &transitions, &obsinstate, &obsinstatealpha);
497 
498   }
499   else{
500       allocCounts(mo, &transitions, &obsinstate, &obsinstatealpha);
501 
502       for(;burnIn > 0; burnIn--){
503          initCounts(mo, transitions, obsinstate, obsinstatealpha, pA, pB, pPi);
504          for(i = 0; i < seq->seq_number; i++){
505              ghmm_dmodel_fbgibbstep(mo, seq->seq[i], seq->seq_len[i], Q[i], alpha, pmats);
506              getCounts(Q[i], seq->seq[i], seq->seq_len[i], transitions, obsinstate, obsinstatealpha);
507          }
508          update(mo, transitions, obsinstate, obsinstatealpha);
509       }
510       freeCounts(mo, &transitions, &obsinstate, &obsinstatealpha);
511   }
512   ighmm_cmatrix_3d_free(&pmats, len, mo->N);
513   ighmm_cmatrix_free(&alpha, len);
514   return Q;
515 STOP:
516   return NULL;
517 #undef CUR_PROC
518 #else
519    printf("fbgibbs uses gsl for dirichlete distrubutions, compile with gsl\n");
520    return NULL;
521 #endif
522 }
523 
524