1 /* probify.c
2 * Convert counts to probabilities, using regularization.
3 * SRE, Fri Sep 3 08:02:49 1993
4 *
5 * Because our training sequence set is finite (and often small)
6 * and our number of free parameters is large, we have a small
7 * sample statistics problem in estimating each parameter.
8 *
9 * The simplest way to deal with this problem is to use the
10 * so-called Laplace law of succession. If I have N sequences
11 * with a base symbol assigned to some state, and see n A's,
12 * I calculate the emission probability of A as (n+1)/(N+4),
13 * i.e., adding 1 to the numerator and the number of possible
14 * outcomes (4) to the denominator. A summary of the proof of
15 * this appears in Berg & von Hippel (J Mol Biol 193:723-750, 1987).
16 * It is referred to as the "plus-one prior" by David Haussler
17 * and by us.
18 *
19 * The plus-one prior implies that we have no knowledge at all about
20 * the prior probabilities; in absence of any data, the probabilities
21 * default to 1/4. What if we do have prior information? (For instance,
22 * for state transitions, we know that deletes and inserts are
23 * relatively rare.) We use a generalization of the Laplace law
24 * of succession:
25 * n(x) + alpha * R(x)
26 * P(x) = -------------------
27 * ---
28 * \ n(i) + alpha * R(i)
29 * /__
30 * i
31 *
32 * Here, R(x) is a "regularizer" and alpha is a weight applied
33 * to the regularizer. Both were 1.0 in the plus-one prior.
34 * Now, we can bias R(x) to reflect our prior expectations
35 * about the probability distribution P(x). (In practice,
36 * we calculate R(x) by specifying a prior probability distribution
37 * and multiplying each term by the number of possible outcomes.)
38 * alpha is a "confidence" term; the higher alpha is, the more
39 * data it takes to outweigh the prior. We usually set alpha to
40 * 1.0, but sometimes -- such as for insert state emissions,
41 * where we may assert that the emission probabilities are
42 * the same as random regardless of the data -- we might use
43 * arbitrarily high alpha's to freeze certain probability distributions
44 * at their priors.
45 *
46 * All this follows the description in Krogh et. al's HMM paper
47 * (in press, JMB, 1993).
48 *
49 */
50
51
52 #include "structs.h"
53 #include "funcs.h"
54
55
56 /* Function: ProbifyCM()
57 *
58 * Purpose: Convert all the state transitions and symbol emissions in
59 * a covariance model from counts to probabilities.
60 *
61 * Args: cm - the model to convert
62 *
63 * Return: (void). Counts in cm become probabilities.
64 */
65 void
ProbifyCM(struct cm_s * cm,struct prior_s * prior)66 ProbifyCM(struct cm_s *cm, struct prior_s *prior)
67 {
68 int k;
69
70 for (k = 0; k < cm->nodes; k++)
71 {
72 if (cm->nd[k].type != BIFURC_NODE)
73 {
74 if (cm->nd[k].nxt == -1)
75 ProbifyTransitionMatrix(cm->nd[k].tmx, cm->nd[k].type, END_NODE, prior);
76 else
77 ProbifyTransitionMatrix(cm->nd[k].tmx, cm->nd[k].type, cm->nd[k+1].type, prior);
78 }
79
80 switch (cm->nd[k].type) {
81 case MATP_NODE:
82 ProbifySingletEmission(cm->nd[k].il_emit, uINSL_ST, prior);
83 ProbifySingletEmission(cm->nd[k].ir_emit, uINSR_ST, prior);
84 ProbifySingletEmission(cm->nd[k].ml_emit, uMATL_ST, prior);
85 ProbifySingletEmission(cm->nd[k].mr_emit, uMATR_ST, prior);
86 ProbifyPairEmission(cm->nd[k].mp_emit, prior);
87 break;
88
89 case MATL_NODE:
90 ProbifySingletEmission(cm->nd[k].il_emit, uINSL_ST, prior);
91 ProbifySingletEmission(cm->nd[k].ml_emit, uMATL_ST, prior);
92 break;
93
94 case MATR_NODE:
95 ProbifySingletEmission(cm->nd[k].ir_emit, uINSR_ST, prior);
96 ProbifySingletEmission(cm->nd[k].mr_emit, uMATR_ST, prior);
97 break;
98
99 case BEGINR_NODE:
100 ProbifySingletEmission(cm->nd[k].il_emit, uINSL_ST, prior);
101 break;
102
103 case ROOT_NODE:
104 ProbifySingletEmission(cm->nd[k].il_emit, uINSL_ST, prior);
105 ProbifySingletEmission(cm->nd[k].ir_emit, uINSR_ST, prior);
106 break;
107
108 case BIFURC_NODE: break;
109 case BEGINL_NODE: break;
110 default: Die("Unrecognized node type %d at node %d", cm->nd[k].type, k);
111 }
112 }
113 }
114
115
116
117 /* Function: ProbifyTransitionMatrix()
118 *
119 * Purpose: Convert the state transition matrix between two nodes
120 * from counts to probabilities.
121 *
122 * Args: tmx: 6x6 state transition matrix of counts
123 * from_node: e.g. MATP_NODE, type of node we transit from
124 * to_node: type of node we transit to
125 * prior: prior probability distributions
126 *
127 * Return: (void). Values in tmx become probabilities.
128 */
129 void
ProbifyTransitionMatrix(double tmx[STATETYPES][STATETYPES],int from_node,int to_node,struct prior_s * prior)130 ProbifyTransitionMatrix(double tmx[STATETYPES][STATETYPES],
131 int from_node,
132 int to_node,
133 struct prior_s *prior)
134 {
135 int i,j;
136 double denom;
137
138 for (i = 0; i < STATETYPES; i++)
139 {
140 /* if no transitions to DEL in prior, this must be an unused vector */
141 if (prior->tprior[from_node][to_node][i][0] > 0.0)
142 {
143 denom = 0.0;
144 for (j = 0; j < STATETYPES; j++)
145 {
146 tmx[i][j] = tmx[i][j] + prior->talpha[i] * prior->tprior[from_node][to_node][i][j];
147 denom += tmx[i][j];
148 }
149 for (j = 0; j < STATETYPES; j++)
150 tmx[i][j] /= denom;
151 }
152 }
153 }
154
155
156
157 /* Function: ProbifySingletEmission()
158 *
159 * Purpose: Convert a singlet emission vector from counts to probabilities.
160 *
161 * Args: emvec: the emission vector
162 * statetype: type of state: uMATL_ST, uMATR_ST, uINSL_ST, uINSR_ST
163 * prior: prior probability distributions
164 *
165 * Return: (void). Values in emvec become probabilities.
166 */
167 void
ProbifySingletEmission(double emvec[ALPHASIZE],int statetype,struct prior_s * prior)168 ProbifySingletEmission(double emvec[ALPHASIZE],
169 int statetype,
170 struct prior_s *prior)
171 {
172 int x;
173 double denom;
174 double *em_prior;
175
176 /* Choose the correct prior probability distribution to use.
177 */
178 switch (statetype) {
179 case uMATL_ST: em_prior = prior->matl_prior; break;
180 case uMATR_ST: em_prior = prior->matr_prior; break;
181 case uINSL_ST: em_prior = prior->insl_prior; break;
182 case uINSR_ST: em_prior = prior->insr_prior; break;
183 default: Die("statetype %d is not a singlet emitting state\n", statetype);
184 }
185
186 denom = 0.0;
187 for (x = 0; x < ALPHASIZE; x++)
188 {
189 emvec[x] = emvec[x] + prior->emalpha[StatetypeIndex(statetype)] * em_prior[x];
190 denom += emvec[x];
191 }
192 if (denom > 0.0)
193 for (x = 0; x < ALPHASIZE; x++)
194 emvec[x] /= denom;
195 }
196
197
198 /* Function: ProbifyPairEmission()
199 *
200 * Purpose: Convert a MATP pairwise emission matrix from counts to probabilities.
201 *
202 * Args: emx: the emission matrix
203 * prior: prior probability distributions
204 *
205 * Return: (void). Values in emx become probabilities.
206 */
207 void
ProbifyPairEmission(double emx[ALPHASIZE][ALPHASIZE],struct prior_s * prior)208 ProbifyPairEmission(double emx[ALPHASIZE][ALPHASIZE],
209 struct prior_s *prior)
210 {
211 int x,y;
212 double denom;
213
214 denom = 0.0;
215 for (x = 0; x < ALPHASIZE; x++)
216 for (y = 0; y < ALPHASIZE; y++)
217 {
218 emx[x][y] = emx[x][y] + prior->emalpha[MATP_ST] * prior->matp_prior[x][y];
219 denom += emx[x][y];
220 }
221 if (denom > 0.0)
222 for (x = 0; x < ALPHASIZE; x++)
223 for (y = 0; y < ALPHASIZE; y++)
224 emx[x][y] /= denom;
225 }
226