1 #include "model.h"
2 #include "fbgibbs.h"
3 #include "foba.h"
4 #include "matrix.h"
5 #include "ghmm.h"
6 #include "mes.h"
7 #include "ghmm_internals.h"
8 #include "sequence.h"
9 #include "rng.h"
10 #include "randvar.h"
11 #include "obsolete.h"
12
13 #ifdef HAVE_CONFIG_H
14 # include "../config.h"
15 #endif
16
17 //===========================================================================================
18 //===================== sampleing ======================================
19 //===========================================================================================
20 //returns a sample from a cdf **could use binary search here**
sample(int seed,double * dist,int N)21 int sample(int seed, double* dist, int N){
22 //printf("sample\n\n");
23
24 double total = dist[N-1];
25 //printf("total = %f\n", total);
26 double rn = ighmm_rand_uniform_cont(seed, total, 0.0f);//XXX exception handleing what if total=0
27 //printf("rn = %f \n\n", rn);
28 int i;
29 if(rn <= dist[0])
30 return 0;
31 for(i = 1; i < N; i++){
32 if(dist[i-1] <rn && rn <= dist[i])
33 return i;
34 }
35 return N-1;
36 }
37
sampleStatePath(int N,double * alpha,double *** pmats,int T,int * states)38 void sampleStatePath(int N, double *alpha, double ***pmats, int T, int* states){
39 #define CUR_PROC "sampleStatePath"
40 //printf("sampleStatePath\n\n");
41 int i, j;
42 double temp[N];
43 double dist[N];
44 temp[0] = alpha[0];
45 for(i = 1; i < N; i++)
46 temp[i] = temp[i-1] + alpha[i];
47 //printf("tmp1 = %f, tmp2 = %f\n", alpha[0], temp[1]);
48 states[T-1] = sample(0, temp, N);
49 //printf("State T-1 = %d\n", states[T-1]);
50 for(i = T-2; i >=0; i--){
51 //printf("pmats %d, %d, %d = %f\n", i+1, states[i+1], mo->N-1, pmats[i+1][states[i+1]][mo->N-1]);
52 states[i] = sample(0, pmats[i+1][states[i+1]], N);
53 //printf("State %d = %d\n", i, states[i]);
54 }
55 #undef CUR_PROC
56 }
57
58 //===========================================================================================
59 //========= Modified forward to get an array of matrices used for sampling ==================
60 //===========================================================================================
61 /* ghmm_dmodel_forwardGibbs_init */
ghmm_dmodel_forwardGibbs_init(ghmm_dmodel * mo,double * alpha_1,int symb,double * scale)62 int ghmm_dmodel_forwardGibbs_init (ghmm_dmodel * mo, double *alpha_1, int symb, double *scale){
63 # define CUR_PROC "ghmm_dmodel_forwardGibbs_init"
64 int i, j, id, in_id;
65 double c_0;
66 *scale = 0.0;
67
68 /*printf(" *** foba_initforward\n");*/
69
70 /*iterate over non-silent states*/
71 /*printf(" *** iterate over non-silent states \n");*/
72 for (i = 0; i < mo->N; i++) {
73 if (!(mo->model_type & GHMM_kSilentStates) || !(mo->silent[i])) {
74 /*no starting in states with order > 0 !!!*/
75 if (!(mo->model_type & GHMM_kHigherOrderEmissions) || mo->order[i] == 0) {
76 alpha_1[i] = mo->s[i].pi * mo->s[i].b[symb];
77 /*printf("\nalpha1[%i]=%f\n",i,alpha_1[i]);*/
78
79 *scale += alpha_1[i];
80 }
81 else {
82 alpha_1[i] = 0;
83 }
84 }
85 }
86 /*iterate over silent states*/
87 /*printf(" *** iterate over silent states \n");*/
88 if (mo->model_type & GHMM_kSilentStates) {
89 for (i = 0; i < mo->topo_order_length; i++) {
90 id = mo->topo_order[i];
91 alpha_1[id] = mo->s[id].pi;
92
93 /*printf("\nsilent_start alpha1[%i]=%f\n",id,alpha_1[id]);*/
94
95 for (j = 0; j < mo->s[id].in_states; j++) {
96 in_id = mo->s[id].in_id[j];
97 alpha_1[id] += mo->s[id].in_a[j] * alpha_1[in_id];
98
99 /*printf("\n\tsilent_run alpha1[%i]=%f\n",id,alpha_1[id]);*/
100
101 }
102 *scale += alpha_1[id];
103 }
104 }
105
106 /* printf("\nwo label: scale[0] = %f\n",scale[0]); */
107
108 if (*scale >= GHMM_EPS_PREC) {
109 c_0 = 1 / *scale;
110 for (i = 0; i < mo->N; i++)
111 alpha_1[i] *= c_0;
112 }
113 return (0); /* attention: scale might be 0 */
114 # undef CUR_PROC
115 } /* ghmm_dmodel_forwardGibbs_init */
116
117 /*----------------------------------------------------------------------------*/
118
ghmm_dmodel_forwardGibbs_step(ghmm_dmodel * mo,ghmm_dstate * s,double * alpha_t,const double b_symb,double *** pmats,int t,int j)119 double ghmm_dmodel_forwardGibbs_step (ghmm_dmodel *mo, ghmm_dstate * s, double *alpha_t, const double b_symb, double*** pmats, int t, int j)
120 {
121 #define CUR_PROC "ghmm_dmodel_forwardGibbs_step"
122 int i, id;
123 double value = 0.0;
124 int prv;
125 if (b_symb < GHMM_EPS_PREC)
126 return 0.0;
127
128 /*printf(" *** fobagibbs_stepforward\n");*/
129 //printf("%d\n", s->in_states);
130 prv = mo->N;
131 for (i = 0; i < s->in_states; i++) {
132 id = s->in_id[i];
133 pmats[t][j][id] = s->in_a[i] * alpha_t[id]*b_symb;
134 value += pmats[t][j][id];
135
136 //printf("t = %d,j = %d,id = %d, value %f, p_symb %f, pmats %f\n",t,j,id, value, b_symb, pmats[t][j][id]);
137 for(; prv < id; prv++){
138 pmats[t][j][prv+1] += pmats[t][j][prv];
139 //printf("asdf t = %d,j = %d,i = %d, value %f, p_symb %f, pmats %f\n",t,j,prv+1, value, b_symb, pmats[t][j][prv+1]);
140 }
141 prv = id;
142 //printf("t = %d,j = %d,id = %d, value %f, p_symb %f, pmats %f\n",t,j,id, value, b_symb, pmats[t][j][id]);
143 }
144 for(prv+=1;prv<mo->N;prv++){
145 pmats[t][j][prv] += pmats[t][j][prv-1];
146 //printf("t = %d,j = %d,i = %d, value %f, p_symb %f, pmats %f\n",t,j,prv, value, b_symb, pmats[t][j][prv]);
147 }
148 return (value);
149 #undef CUR_PROC
150 } /* ghmm_dmodel_forwardGibbs_step */
151
152 /*============================================================================*/
153
ghmm_dmodel_forwardGibbs(ghmm_dmodel * mo,const int * O,int len,double ** alpha,double *** pmats)154 int ghmm_dmodel_forwardGibbs (ghmm_dmodel * mo, const int *O, int len, double **alpha, double*** pmats)
155 {
156 # define CUR_PROC "ghmm_dmodel_forwardGibbs"
157 char * str;
158 int i, t, id;
159 int e_index;
160 double c_t;
161 double scale;
162
163
164 if (mo->model_type & GHMM_kSilentStates)
165 ghmm_dmodel_order_topological(mo);
166
167 ghmm_dmodel_forwardGibbs_init (mo, alpha[0], O[0], &scale);
168
169 if (scale < GHMM_EPS_PREC) {
170 /* means: first symbol can't be generated by hmm */
171 printf("\nscale kleiner als eps (line_no: 123)\n");
172 return -1;
173 }
174 else {
175 for (t = 1; t < len; t++) {
176
177 scale = 0.0;
178 update_emission_history (mo, O[t - 1]);
179
180 //printf("\n\nStep t=%i mit len=%i, O[i]=%i\n",t,len,O[t]);
181 //printf("iterate over non-silent state\n");
182 /* iterate over non-silent states */
183 for (i = 0; i < mo->N; i++) {
184 if (!(mo->model_type & GHMM_kSilentStates) || !(mo->silent[i])) {
185 e_index = get_emission_index (mo, i, O[t], t);
186 if (e_index != -1) {
187 //printf("pmat %d, %d \n", t, i);
188 alpha[t][i] =
189 ghmm_dmodel_forwardGibbs_step (mo, &mo->s[i], alpha[t - 1], mo->s[i].b[e_index], pmats, t, i);
190 scale += alpha[t][i];
191 }
192 else {
193 alpha[t][i] = 0;
194 }
195 }
196 }
197 /* iterate over silent states */
198 //printf("iterate over silent state\n");
199 if (mo->model_type & GHMM_kSilentStates) {
200 for (i = 0; i < mo->topo_order_length; i++) {
201 /*printf("\nget id\n");*/
202 id = mo->topo_order[i];
203 /*printf(" akt_ state %d\n",id);*/
204 /*printf("\nin stepforward\n");*/
205 alpha[t][id] = ghmm_dmodel_forwardGibbs_step (mo, &mo->s[id], alpha[t], 1, pmats, t, i);
206 /*printf("\nnach stepforward\n");*/
207 scale += alpha[t][id];
208 }
209 }
210
211 if (scale < GHMM_EPS_PREC) {
212 return -1;
213 }
214 c_t = 1 / scale;
215 for (i = 0; i < mo->N; i++) {
216 alpha[t][i] *= c_t;
217 }
218 }
219 }
220 return 0;
221 #undef CUR_PROC
222 }
223
224
225 /* ghmm_dmodel_forwardGibbs */
226 //======================================================================================
227 //======================== update parameters ===========================================
228 //======================================================================================
allocCounts(ghmm_dmodel * mo,double *** transitions,double ** obsinstate,double *** obsinstatealpha)229 void allocCounts(ghmm_dmodel* mo, double ***transitions, double **obsinstate,
230 double ***obsinstatealpha){
231 #define CUR_PROC "allocCount"
232 *transitions = ighmm_cmatrix_alloc(mo->N,mo->N);
233 ARRAY_CALLOC(*obsinstate, mo->N);
234 *obsinstatealpha = ighmm_cmatrix_alloc(mo->N,mo->M);
235 STOP:
236 return;//XXX error handle
237 #undef CUR_PROC
238 }
allocCountsH(ghmm_dmodel * mo,double *** transitions,double ** obsinstate,double *** obsinstatealpha)239 void allocCountsH(ghmm_dmodel* mo, double ***transitions, double **obsinstate,
240 double ***obsinstatealpha){
241 #define CUR_PROC "allocCountsH"
242 *transitions = ighmm_cmatrix_alloc(mo->N, mo->N);
243 ARRAY_CALLOC(*obsinstate, mo->N);
244 ARRAY_CALLOC(*obsinstatealpha, mo->N);
245 int l;
246 for(l = 0; l < mo->N; l++){
247 ARRAY_CALLOC((*obsinstatealpha)[l], ghmm_ipow (mo, mo->M, mo->order[l]+1));
248 }
249 STOP:
250 return;
251 #undef CUR_PROC
252 }
253
freeCounts(ghmm_dmodel * mo,double *** transitions,double ** obsinstate,double *** obsinstatealpha)254 void freeCounts(ghmm_dmodel* mo, double ***transitions, double **obsinstate, double ***obsinstatealpha){
255 #define CUR_PROC "freeCounts"
256 ighmm_cmatrix_free(transitions, mo->N);
257 ighmm_cmatrix_free(obsinstatealpha, mo->N);
258 m_free(*obsinstate);
259 #undef CUR_PROC
260 }
261
freeCountsH(ghmm_dmodel * mo,double *** transitions,double ** obsinstate,double *** obsinstatealpha)262 void freeCountsH(ghmm_dmodel* mo, double ***transitions, double **obsinstate, double ***obsinstatealpha){
263 #define CUR_PROC "freeCountsH"
264 ighmm_cmatrix_free(transitions, mo->N);
265 m_free(*obsinstate);
266 int l;
267 for(l=0;l<mo->N;l++)
268 m_free((*obsinstatealpha)[l]);
269 m_free(*obsinstatealpha);
270 #undef CUR_PROC
271 }
272
273
initCounts(ghmm_dmodel * mo,double ** transition,double * obsinstate,double ** obsinstatealpha,double ** pA,double ** pB,double * pPi)274 void initCounts(ghmm_dmodel* mo, double **transition, double *obsinstate, double **obsinstatealpha,
275 double **pA, double **pB, double *pPi){
276 //initialize
277 int i, k;
278 for(i=0;i<mo->N;i++){
279 obsinstate[i] = pPi[i];
280 for(k=0;k<mo->N;k++){
281 transition[i][k] = pA[i][k];
282 }
283 for(k=0;k<mo->M;k++){
284 obsinstatealpha[i][k] = pB[i][k];
285 }
286 }
287 }
288
initCountsH(ghmm_dmodel * mo,double ** transition,double * obsinstate,double ** obsinstatealpha,double ** pA,double ** pB,double * pPi)289 void initCountsH(ghmm_dmodel* mo, double **transition, double *obsinstate, double **obsinstatealpha,
290 double **pA, double **pB, double *pPi){
291 int i,k;
292 for(i=0;i<mo->N;i++){
293 obsinstate[i] = pPi[i];
294 for(k=0;k<mo->N;k++){
295 transition[i][k] = pA[i][k];
296 }
297 for(k=0;k<ghmm_ipow (mo, mo->M, mo->order[i]+1);k++){
298 obsinstatealpha[i][k] = pB[i][k];
299 }
300 }
301 }
302
getCounts(int * states,int * O,int T,double ** transition,double * obsinstate,double ** obsinstatealpha)303 void getCounts(int *states, int* O, int T,
304 double **transition, double *obsinstate,
305 double **obsinstatealpha){
306 int i,k;
307 //A, B
308 for(i=0;i<T;i++){
309 obsinstate[states[i]] += 1;
310 obsinstatealpha[states[i]][O[i]] += 1;
311 }
312 for(i=0;i<T-1;i++){
313 transition[states[i]][states[i+1]] += 1;
314 }
315 }
316
getCountsH(ghmm_dmodel * mo,int * states,int * O,int T,double ** transition,double * obsinstate,double ** obsinstatealpha)317 void getCountsH(ghmm_dmodel* mo, int *states, int* O, int T,
318 double **transition, double *obsinstate,
319 double **obsinstatealpha){
320 mo->emission_history = 0;
321 int i;
322
323 //A, B
324 for(i=0;i<T;i++){
325 if(mo->order[states[i]] == 0)//only want to count first order states for pi
326 obsinstate[states[i]] += 1;
327 int e_index = get_emission_index (mo, states[i], O[i], i);
328 if (-1 != e_index)
329 obsinstatealpha[states[i]][e_index] += 1;
330 update_emission_history (mo, O[i]);
331 }
332 for(i=0;i<T-1;i++){
333 transition[states[i]][states[i+1]] += 1;
334 }
335 }
336 //given states, psueodocount matrices pA, pB, pPi see wiki, calculates new A,B,Pi
337 //XXX should use fix in state
338 //assumes psuedocount preserves structure, ie doesnt add 1 to a zero transition.
update(ghmm_dmodel * mo,double ** transition,double * obsinstate,double ** obsinstatealpha)339 void update(ghmm_dmodel* mo, double **transition, double *obsinstate, double **obsinstatealpha){
340 #define CUR_PROC "update"
341 double tmp_m[mo->M];
342 double tmp_n[mo->N];
343 int i, k;
344 for(i=0;i<mo->N;i++){
345 if(!mo->s[i].fix)//fix == 1 dont change b
346 ighmm_rand_dirichlet(0, mo->M, obsinstatealpha[i], tmp_m);
347 ighmm_rand_dirichlet(0, mo->N, transition[i], tmp_n);
348 //update model
349 if(!mo->s[i].fix){//dont update if fix
350 for(k = 0; k < mo->M; k++){
351 mo->s[i].b[k] = tmp_m[k];
352 }
353 }
354 for(k = 0; k < mo->N; k++){
355 ghmm_dmodel_set_transition(mo, i, k, tmp_n[k]);//linear search optimize for mo->N>10?
356 }
357 }
358
359 ighmm_rand_dirichlet(0, mo->N, obsinstate, tmp_n);
360 for(k=0;k<mo->N;k++){
361 mo->s[k].pi = tmp_n[k];
362 }
363 if (mo->model_type & GHMM_kTiedEmissions)
364 ghmm_dmodel_update_tie_groups (mo);
365
366 #undef CUR_PROC
367 }
368
369
updateH(ghmm_dmodel * mo,double ** transition,double * obsinstate,double ** obsinstatealpha)370 void updateH(ghmm_dmodel* mo, double **transition, double *obsinstate, double **obsinstatealpha){
371 #define CUR_PROC "updateH"
372 double tmp_n[mo->N];
373 double tmp_m[mo->M];
374 double *p;
375 int i,k,l;
376 for(i=0;i<mo->N;i++){
377 ighmm_rand_dirichlet(0, mo->N, transition[i], tmp_n);
378 for(k = 0; k < mo->N; k++){
379 ghmm_dmodel_set_transition(mo, i, k, tmp_n[k]);
380 }
381 if(!mo->s[i].fix){
382 p = obsinstatealpha[i];
383 for(k = 0; k < ghmm_ipow(mo, mo->M, mo->order[i]); k++){
384 ighmm_rand_dirichlet(0,mo->M,p+k*mo->M, tmp_m);
385 for(l = 0; l < mo->M; l++){
386 mo->s[i].b[k*mo->M + l] = tmp_m[l];
387 }
388 }
389 }
390 }
391 ighmm_rand_dirichlet(0, mo->N, obsinstate, tmp_n);
392 for(k=0;k<mo->N;k++){
393 mo->s[k].pi = tmp_n[k];
394 }
395 if (mo->model_type & GHMM_kTiedEmissions)
396 ghmm_dmodel_update_tie_groups (mo);
397 #undef CUR_PROC
398 }
399 //================================================================================================
400 //=============================fbgibbs============================================================
401 //================================================================================================
402
403 //allocates space and initilizes prior counts to 1 if not null
init_priors(ghmm_dmodel * mo,double *** pA,double *** pB,double ** pPi)404 void init_priors(ghmm_dmodel *mo, double ***pA, double ***pB, double **pPi){
405 int i, j, k;
406 if(!(*pA)){
407 *pA = ighmm_cmatrix_alloc(mo->N, mo->N);
408 for(i=0;i<mo->N;i++){
409 for(j=0;j<mo->N;j++){
410 (*pA)[i][j] = 1;
411 }
412 }
413 }
414 if(!(*pPi)){
415 *pPi = malloc(sizeof(double)*mo->N);
416 for(i=0;i<mo->N;i++){
417 (*pPi)[i] = 1;
418 }
419 }
420 if(!(*pB)){
421 if(mo->model_type & GHMM_kHigherOrderEmissions){
422 *pB = malloc(sizeof(double*)*mo->N);
423 for(i = 0; i < mo->N; i++){
424 (*pB)[i] = (double*)malloc(sizeof(double)*ghmm_ipow (mo, mo->M, mo->order[i]+1));
425 for(j = 0; j < ghmm_ipow (mo, mo->M, mo->order[i]+1); j++){
426 (*pB)[i][j] = 1;
427 }
428 }
429 }
430 else{
431 *pB = ighmm_cmatrix_alloc(mo->N, mo->M);
432 for(i=0;i<mo->N;i++){
433 for(j = 0; j < mo->M; j++){
434 (*pB)[i][j] = 1;
435 }
436 }
437 }
438 }
439 }
440
441
ghmm_dmodel_fbgibbstep(ghmm_dmodel * mo,int * O,int len,int * Q,double ** alpha,double *** pmats)442 void ghmm_dmodel_fbgibbstep (ghmm_dmodel * mo, int* O, int len, int *Q, double** alpha, double***pmats){
443 #define CUR_PROC "ghmm_dmodel_fbgibbstep"
444 //sample state sequence
445 //update parameters
446 //printf("fbgibbsStep \n\n");
447 int i,j,k;
448 for(i = 0; i < len; i++){
449 for(j = 0; j < mo->N; j++){
450 alpha[i][j] = 0;
451 for(k = 0; k < mo->N; k++){
452 pmats[i][j][k] = 0;
453 }
454 }
455 }
456 ghmm_dmodel_forwardGibbs(mo, O, len, alpha, pmats);
457 sampleStatePath(mo->N, alpha[len-1], pmats, len, Q);
458 //printf("done samplestatepath\n\n");
459 //for(i = 0; i < len; i++){
460 //printf("%d ", Q[i]);
461 //}
462 //printf("\n");
463 #undef CUR_PROC
464 }
465
ghmm_dmodel_fbgibbs(ghmm_dmodel * mo,ghmm_dseq * seq,double ** pA,double ** pB,double * pPi,int burnIn,int seed)466 int** ghmm_dmodel_fbgibbs(ghmm_dmodel * mo, ghmm_dseq* seq, double **pA, double **pB, double *pPi, int burnIn, int seed){
467 #ifdef DO_WITH_GSL
468 #define CUR_PROC "ghmm_dmodel_fbgibbs"
469 //initilizations
470 GHMM_RNG_SET (RNG, seed);
471 int **Q;
472 ARRAY_MALLOC (Q, seq->seq_number);
473 int i;
474 int len = 0;
475 for(i = 0; i < seq->seq_number; i++){
476 ARRAY_MALLOC (Q[i], seq->seq_len[i]);
477 if(len < seq->seq_len[i])
478 len = seq->seq_len[i];
479 }
480 double **alpha = ighmm_cmatrix_alloc(len, mo->N);
481 double ***pmats = ighmm_cmatrix_3d_alloc(len, mo->N, mo->N);
482 double **transitions,**obsinstatealpha;
483 double *obsinstate;
484
485 if(mo->model_type & GHMM_kHigherOrderEmissions){//higher order
486 allocCountsH(mo, &transitions, &obsinstate, &obsinstatealpha);
487 for(;burnIn > 0; burnIn--){
488 initCountsH(mo, transitions, obsinstate, obsinstatealpha, pA, pB, pPi);
489 if(burnIn % 100 == 0) printf("iter %d\n", burnIn);
490 for(i = 0; i < seq->seq_number; i++){
491 ghmm_dmodel_fbgibbstep(mo, seq->seq[i], seq->seq_len[i], Q[i], alpha, pmats);
492 getCountsH(mo, Q[i], seq->seq[i], seq->seq_len[i], transitions, obsinstate, obsinstatealpha);
493 }
494 updateH(mo, transitions, obsinstate, obsinstatealpha);
495 }
496 freeCountsH(mo, &transitions, &obsinstate, &obsinstatealpha);
497
498 }
499 else{
500 allocCounts(mo, &transitions, &obsinstate, &obsinstatealpha);
501
502 for(;burnIn > 0; burnIn--){
503 initCounts(mo, transitions, obsinstate, obsinstatealpha, pA, pB, pPi);
504 for(i = 0; i < seq->seq_number; i++){
505 ghmm_dmodel_fbgibbstep(mo, seq->seq[i], seq->seq_len[i], Q[i], alpha, pmats);
506 getCounts(Q[i], seq->seq[i], seq->seq_len[i], transitions, obsinstate, obsinstatealpha);
507 }
508 update(mo, transitions, obsinstate, obsinstatealpha);
509 }
510 freeCounts(mo, &transitions, &obsinstate, &obsinstatealpha);
511 }
512 ighmm_cmatrix_3d_free(&pmats, len, mo->N);
513 ighmm_cmatrix_free(&alpha, len);
514 return Q;
515 STOP:
516 return NULL;
517 #undef CUR_PROC
518 #else
519 printf("fbgibbs uses gsl for dirichlete distrubutions, compile with gsl\n");
520 return NULL;
521 #endif
522 }
523
524