1 /* probify.c
2  * Convert counts to probabilities, using regularization.
3  * SRE, Fri Sep  3 08:02:49 1993
4  *
5  * Because our training sequence set is finite (and often small)
6  * and our number of free parameters is large, we have a small
7  * sample statistics problem in estimating each parameter.
8  *
9  * The simplest way to deal with this problem is to use the
10  * so-called Laplace law of succession. If I have N sequences
11  * with a base symbol assigned to some state, and see n A's,
12  * I calculate the emission probability of A as (n+1)/(N+4),
13  * i.e., adding 1 to the numerator and the number of possible
14  * outcomes (4) to the denominator. A summary of the proof of
15  * this appears in Berg & von Hippel (J Mol Biol 193:723-750, 1987).
16  * It is referred to as the "plus-one prior" by David Haussler
17  * and by us.
18  *
19  * The plus-one prior implies that we have no knowledge at all about
20  * the prior probabilities; in absence of any data, the probabilities
21  * default to 1/4. What if we do have prior information? (For instance,
22  * for state transitions, we know that deletes and inserts are
23  * relatively rare.) We use a generalization of the Laplace law
24  * of succession:
25  *                n(x) + alpha * R(x)
26  *         P(x) = -------------------
27  *                ---
28  * 		  \   n(i) + alpha * R(i)
29  * 		  /__
30  * 		   i
31  *
32  * Here, R(x) is a "regularizer" and alpha is a weight applied
33  * to the regularizer. Both were 1.0 in the plus-one prior.
34  * Now, we can bias R(x) to reflect our prior expectations
35  * about the probability distribution P(x). (In practice,
36  * we calculate R(x) by specifying a prior probability distribution
37  * and multiplying each term by the number of possible outcomes.)
38  * alpha is a "confidence" term; the higher alpha is, the more
39  * data it takes to outweigh the prior. We usually set alpha to
40  * 1.0, but sometimes -- such as for insert state emissions,
41  * where we may assert that the emission probabilities are
42  * the same as random regardless of the data -- we might use
43  * arbitrarily high alpha's to freeze certain probability distributions
44  * at their priors.
45  *
46  * All this follows the description in Krogh et. al's HMM paper
47  * (in press, JMB, 1993).
48  *
49  */
50 
51 
52 #include "structs.h"
53 #include "funcs.h"
54 
55 
56 /* Function: ProbifyCM()
57  *
58  * Purpose:  Convert all the state transitions and symbol emissions in
59  *           a covariance model from counts to probabilities.
60  *
61  * Args:     cm - the model to convert
62  *
63  * Return:   (void). Counts in cm become probabilities.
64  */
65 void
ProbifyCM(struct cm_s * cm,struct prior_s * prior)66 ProbifyCM(struct cm_s *cm, struct prior_s *prior)
67 {
68   int k;
69 
70   for (k = 0; k < cm->nodes; k++)
71     {
72       if (cm->nd[k].type != BIFURC_NODE)
73 	{
74 	  if (cm->nd[k].nxt == -1)
75 	    ProbifyTransitionMatrix(cm->nd[k].tmx, cm->nd[k].type, END_NODE, prior);
76 	  else
77 	    ProbifyTransitionMatrix(cm->nd[k].tmx, cm->nd[k].type, cm->nd[k+1].type, prior);
78 	}
79 
80       switch (cm->nd[k].type) {
81       case MATP_NODE:
82 	ProbifySingletEmission(cm->nd[k].il_emit, uINSL_ST, prior);
83 	ProbifySingletEmission(cm->nd[k].ir_emit, uINSR_ST, prior);
84 	ProbifySingletEmission(cm->nd[k].ml_emit, uMATL_ST, prior);
85 	ProbifySingletEmission(cm->nd[k].mr_emit, uMATR_ST, prior);
86 	ProbifyPairEmission(cm->nd[k].mp_emit, prior);
87 	break;
88 
89       case MATL_NODE:
90 	ProbifySingletEmission(cm->nd[k].il_emit, uINSL_ST, prior);
91 	ProbifySingletEmission(cm->nd[k].ml_emit, uMATL_ST, prior);
92 	break;
93 
94       case MATR_NODE:
95 	ProbifySingletEmission(cm->nd[k].ir_emit, uINSR_ST, prior);
96 	ProbifySingletEmission(cm->nd[k].mr_emit, uMATR_ST, prior);
97 	break;
98 
99       case BEGINR_NODE:
100 	ProbifySingletEmission(cm->nd[k].il_emit, uINSL_ST, prior);
101 	break;
102 
103       case ROOT_NODE:
104 	ProbifySingletEmission(cm->nd[k].il_emit, uINSL_ST, prior);
105 	ProbifySingletEmission(cm->nd[k].ir_emit, uINSR_ST, prior);
106 	break;
107 
108       case BIFURC_NODE: break;
109       case BEGINL_NODE: break;
110       default: Die("Unrecognized node type %d at node %d", cm->nd[k].type, k);
111       }
112     }
113 }
114 
115 
116 
117 /* Function: ProbifyTransitionMatrix()
118  *
119  * Purpose:  Convert the state transition matrix between two nodes
120  *           from counts to probabilities.
121  *
122  * Args:     tmx:       6x6 state transition matrix of counts
123  *           from_node: e.g. MATP_NODE, type of node we transit from
124  *           to_node:   type of node we transit to
125  *           prior:     prior probability distributions
126  *
127  * Return:   (void). Values in tmx become probabilities.
128  */
129 void
ProbifyTransitionMatrix(double tmx[STATETYPES][STATETYPES],int from_node,int to_node,struct prior_s * prior)130 ProbifyTransitionMatrix(double          tmx[STATETYPES][STATETYPES],
131 			int             from_node,
132 			int             to_node,
133 			struct prior_s *prior)
134 {
135   int    i,j;
136   double denom;
137 
138   for (i = 0; i < STATETYPES; i++)
139     {
140       /* if no transitions to DEL in prior, this must be an unused vector */
141       if (prior->tprior[from_node][to_node][i][0] > 0.0)
142 	{
143 	  denom = 0.0;
144 	  for (j = 0; j < STATETYPES; j++)
145 	    {
146 	      tmx[i][j] = tmx[i][j] + prior->talpha[i] * prior->tprior[from_node][to_node][i][j];
147 	      denom += tmx[i][j];
148 	    }
149 	  for (j = 0; j < STATETYPES; j++)
150 	    tmx[i][j] /= denom;
151 	}
152     }
153 }
154 
155 
156 
157 /* Function: ProbifySingletEmission()
158  *
159  * Purpose:  Convert a singlet emission vector from counts to probabilities.
160  *
161  * Args:     emvec:     the emission vector
162  *           statetype: type of state: uMATL_ST, uMATR_ST, uINSL_ST, uINSR_ST
163  *           prior:     prior probability distributions
164  *
165  * Return:   (void). Values in emvec become probabilities.
166  */
167 void
ProbifySingletEmission(double emvec[ALPHASIZE],int statetype,struct prior_s * prior)168 ProbifySingletEmission(double          emvec[ALPHASIZE],
169 		       int             statetype,
170 		       struct prior_s *prior)
171 {
172   int x;
173   double denom;
174   double *em_prior;
175 
176   /* Choose the correct prior probability distribution to use.
177    */
178   switch (statetype) {
179   case uMATL_ST: em_prior = prior->matl_prior; break;
180   case uMATR_ST: em_prior = prior->matr_prior; break;
181   case uINSL_ST: em_prior = prior->insl_prior; break;
182   case uINSR_ST: em_prior = prior->insr_prior; break;
183   default: Die("statetype %d is not a singlet emitting state\n", statetype);
184   }
185 
186   denom = 0.0;
187   for (x = 0; x < ALPHASIZE; x++)
188     {
189       emvec[x] = emvec[x] + prior->emalpha[StatetypeIndex(statetype)] * em_prior[x];
190       denom += emvec[x];
191     }
192   if (denom > 0.0)
193     for (x = 0; x < ALPHASIZE; x++)
194       emvec[x] /= denom;
195 }
196 
197 
198 /* Function: ProbifyPairEmission()
199  *
200  * Purpose:  Convert a MATP pairwise emission matrix from counts to probabilities.
201  *
202  * Args:     emx:       the emission matrix
203  *           prior:     prior probability distributions
204  *
205  * Return:   (void). Values in emx become probabilities.
206  */
207 void
ProbifyPairEmission(double emx[ALPHASIZE][ALPHASIZE],struct prior_s * prior)208 ProbifyPairEmission(double          emx[ALPHASIZE][ALPHASIZE],
209 		    struct prior_s *prior)
210 {
211   int x,y;
212   double denom;
213 
214   denom = 0.0;
215   for (x = 0; x < ALPHASIZE; x++)
216     for (y = 0; y < ALPHASIZE; y++)
217       {
218 	emx[x][y] = emx[x][y] + prior->emalpha[MATP_ST] * prior->matp_prior[x][y];
219 	denom += emx[x][y];
220       }
221   if (denom > 0.0)
222     for (x = 0; x < ALPHASIZE; x++)
223       for (y = 0; y < ALPHASIZE; y++)
224 	emx[x][y] /= denom;
225 }
226