1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-2003 Washington University School of Medicine
4 * All Rights Reserved
5 *
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
8 * for details.
9 *****************************************************************/
10
11 /* prior.c
12 * SRE, Mon Nov 18 15:44:08 1996
13 *
14 * Support for Dirichlet prior data structure, p7prior_s.
15 */
16
17 #include "funcs.h"
18
19
20 static struct p7prior_s *default_amino_prior(void);
21 static struct p7prior_s *default_nucleic_prior();
22
23 /* Function: P7AllocPrior(), P7FreePrior()
24 *
25 * Purpose: Allocation and free'ing of a prior structure.
26 * Very simple, but might get more complex someday.
27 */
28 struct p7prior_s *
P7AllocPrior(void)29 P7AllocPrior(void)
30 { return (struct p7prior_s *) MallocOrDie (sizeof(struct p7prior_s)); }
31
32 void
P7FreePrior(struct p7prior_s * pri)33 P7FreePrior(struct p7prior_s *pri)
34 { free(pri); }
35
36
37 /* Function: P7LaplacePrior()
38 *
39 * Purpose: Create a Laplace plus-one prior. (single component Dirichlets).
40 * Global alphabet info is assumed to have been set already.
41 *
42 * Args: (void)
43 *
44 * Return: prior. Allocated here; call FreePrior() to free it.
45 */
46 struct p7prior_s *
P7LaplacePrior(void)47 P7LaplacePrior(void)
48 {
49 //get HMMERTaskLocalData
50 HMMERTaskLocalData *tld = getHMMERTaskLocalData();
51 alphabet_s *al = &tld->al;
52
53 struct p7prior_s *pri;
54
55 pri = P7AllocPrior();
56 pri->strategy = PRI_DCHLET;
57
58 pri->tnum = 1;
59 pri->tq[0] = 1.;
60 FSet(pri->t[0], 8, 1.);
61
62 pri->mnum = 1;
63 pri->mq[0] = 1.;
64 FSet(pri->m[0], al->Alphabet_size, 1.);
65
66 pri->inum = 1;
67 pri->iq[0] = 1.;
68 FSet(pri->i[0], al->Alphabet_size, 1.);
69
70 return pri;
71 }
72
73 /* Function: P7DefaultPrior()
74 *
75 * Purpose: Set up a somewhat more realistic single component
76 * Dirichlet prior than Laplace.
77 */
P7DefaultPrior(void)78 struct p7prior_s * P7DefaultPrior(void)
79 {
80 //get HMMERTaskLocalData
81 HMMERTaskLocalData *tld = getHMMERTaskLocalData();
82 alphabet_s *al = &tld->al;
83
84 switch (al->Alphabet_type) {
85 case hmmAMINO: return default_amino_prior();
86 case hmmNUCLEIC: return default_nucleic_prior();
87 case hmmNOTSETYET: Die("Can't set prior; alphabet type not set yet");
88 }
89 /*NOTREACHED*/
90 return NULL;
91 }
92
93
94
95 /* Function: P7DefaultNullModel()
96 *
97 * Purpose: Set up a default random sequence model, using
98 * global aafq[]'s for protein or 1/Alphabet_size for anything
99 * else. randomseq is alloc'ed in caller. Alphabet information
100 * must already be known.
101 */
102 void
P7DefaultNullModel(float * null,float * ret_p1)103 P7DefaultNullModel(float *null, float *ret_p1)
104 {
105 //get HMMERTaskLocalData
106 HMMERTaskLocalData *tld = getHMMERTaskLocalData();
107 alphabet_s *al = &tld->al;
108
109 int x;
110 if (al->Alphabet_type == hmmAMINO) {
111 for (x = 0; x < al->Alphabet_size; x++)
112 null[x] = aafq[x];
113 *ret_p1 = 350./351.; /* rationale: approx avg protein length. */
114 } else {
115 for (x = 0; x < al->Alphabet_size; x++)
116 null[x] = 1.0 / (float) al->Alphabet_size;
117 *ret_p1 = 1000./1001.; /* rationale: approx inter-Alu distance. */
118 }
119 }
120
121
122 /* Function: P7PriorifyHMM()
123 *
124 * Purpose: Add pseudocounts to an HMM using Dirichlet priors,
125 * and renormalize the HMM.
126 *
127 * Args: hmm -- the HMM to add counts to (counts form)
128 * pri -- the Dirichlet prior to use
129 *
130 * Return: (void)
131 * HMM returns in probability form.
132 */
133 void
P7PriorifyHMM(struct plan7_s * hmm,struct p7prior_s * pri)134 P7PriorifyHMM(struct plan7_s *hmm, struct p7prior_s *pri)
135 {
136 int k; /* counter for model position */
137 float d; /* a denominator */
138 float tq[MAXDCHLET]; /* prior distribution over mixtures */
139 float mq[MAXDCHLET]; /* prior distribution over mixtures */
140 float iq[MAXDCHLET]; /* prior distribution over mixtures */
141
142 /* Model-dependent transitions are handled simply; Laplace.
143 */
144 FSet(hmm->begin+2, hmm->M-1, 0.); /* wipe internal BM entries */
145 FSet(hmm->end+1, hmm->M-1, 0.); /* wipe internal ME exits */
146 d = hmm->tbd1 + hmm->begin[1] + 2.;
147 hmm->tbd1 = (hmm->tbd1 + 1.)/ d;
148 hmm->begin[1] = (hmm->begin[1] + 1.)/ d;
149 hmm->end[hmm->M] = 1.0;
150
151 /* Main model transitions and emissions
152 */
153 for (k = 1; k < hmm->M; k++)
154 {
155 /* The following code chunk is experimental.
156 * Collaboration with Michael Asman, Erik Sonnhammer, CGR Stockholm.
157 * Only activated if X-PR* annotation has been used, in which
158 * priors are overridden and a single Dirichlet component is
159 * specified for each column (using structural annotation).
160 * If X-PR* annotation is not used, which is usually the case,
161 * the following code has no effect (observe how the real prior
162 * distributions are copied into tq, mq, iq).
163 */
164 if (hmm->tpri != NULL && hmm->tpri[k] >= 0)
165 {
166 if (hmm->tpri[k] >= pri->tnum) Die("X-PRT annotation out of range");
167 FSet(tq, pri->tnum, 0.0);
168 tq[hmm->tpri[k]] = 1.0;
169 }
170 else
171 FCopy(tq, pri->tq, pri->tnum);
172 if (hmm->mpri != NULL && hmm->mpri[k] >= 0)
173 {
174 if (hmm->mpri[k] >= pri->mnum) Die("X-PRM annotation out of range");
175 FSet(mq, pri->mnum, 0.0);
176 mq[hmm->mpri[k]] = 1.0;
177 }
178 else
179 FCopy(mq, pri->mq, pri->mnum);
180 if (hmm->ipri != NULL && hmm->ipri[k] >= 0)
181 {
182 if (hmm->ipri[k] >= pri->inum) Die("X-PRI annotation out of range");
183 FSet(iq, pri->inum, 0.0);
184 iq[hmm->ipri[k]] = 1.0;
185 }
186 else
187 FCopy(iq, pri->iq, pri->inum);
188
189 /* This is the main line of the code:
190 */
191 P7PriorifyTransitionVector(hmm->t[k], pri, tq);
192 P7PriorifyEmissionVector(hmm->mat[k], pri, pri->mnum, mq, pri->m, NULL);
193 P7PriorifyEmissionVector(hmm->ins[k], pri, pri->inum, iq, pri->i, NULL);
194 }
195
196 /* We repeat the above steps just for the final match state, M.
197 */
198 if (hmm->mpri != NULL && hmm->mpri[hmm->M] >= 0)
199 {
200 if (hmm->mpri[hmm->M] >= pri->mnum) Die("X-PRM annotation out of range");
201 FSet(mq, pri->mnum, 0.0);
202 mq[hmm->mpri[hmm->M]] = 1.0;
203 }
204 else
205 FCopy(mq, pri->mq, pri->mnum);
206
207 P7PriorifyEmissionVector(hmm->mat[hmm->M], pri, pri->mnum, mq, pri->m, NULL);
208
209 /* Now we're done. Convert the counts-based HMM to probabilities.
210 */
211 Plan7Renormalize(hmm);
212 }
213
214
215 /* Function: P7PriorifyEmissionVector()
216 *
217 * Purpose: Add prior pseudocounts to an observed
218 * emission count vector and renormalize.
219 *
220 * Can return the posterior mixture probabilities
221 * P(q | counts) if ret_mix[MAXDCHLET] is passed.
222 * Else, pass NULL.
223 *
224 * Args: vec - the 4 or 20-long vector of counts to modify
225 * pri - prior data structure
226 * num - pri->mnum or pri->inum; # of mixtures
227 * eq - pri->mq or pri->iq; prior mixture probabilities
228 * e - pri->i or pri->m; Dirichlet components
229 * ret_mix - filled with posterior mixture probabilities, or NULL
230 *
231 * Return: (void)
232 * The counts in vec are changed and normalized to probabilities.
233 */
234 void
P7PriorifyEmissionVector(float * vec,struct p7prior_s * pri,int num,float eq[MAXDCHLET],float e[MAXDCHLET][MAXABET],float * ret_mix)235 P7PriorifyEmissionVector(float *vec, struct p7prior_s *pri,
236 int num, float eq[MAXDCHLET], float e[MAXDCHLET][MAXABET],
237 float *ret_mix)
238 {
239 //get HMMERTaskLocalData
240 HMMERTaskLocalData *tld = getHMMERTaskLocalData();
241 alphabet_s *al = &tld->al;
242
243 int x; /* counter over vec */
244 int q; /* counter over mixtures */
245 float mix[MAXDCHLET]; /* posterior distribution over mixtures */
246 float totc; /* total counts */
247 float tota; /* total alpha terms */
248 float xi; /* X_i term, Sjolander eq. 41 */
249
250 /* Calculate mix[], which is the posterior probability
251 * P(q | n) of mixture component q given the count vector n
252 *
253 * (side effect note: note that an insert vector in a PAM prior
254 * is passed with num = 1, bypassing pam prior code; this means
255 * that inserts cannot be mixture Dirichlets...)
256 * [SRE, 12/24/00: the above comment is cryptic! what the hell does that
257 * mean, inserts can't be mixtures? doesn't seem to be true. it
258 * may mean that in a PAM prior, you can't have a mixture for inserts,
259 * but I don't even understand that. The insert vectors aren't passed
260 * with num=1!!]
261 */
262 mix[0] = 1.0;
263 if (pri->strategy == PRI_DCHLET && num > 1)
264 {
265 for (q = 0; q < num; q++)
266 {
267 mix[q] = eq[q] > 0.0 ? log(eq[q]) : -999.;
268 mix[q] += Logp_cvec(vec, al->Alphabet_size, e[q]);
269 }
270 LogNorm(mix, num); /* now mix[q] is P(component_q | n) */
271 }
272 else if (pri->strategy == PRI_PAM && num > 1)
273 { /* pam prior uses aa frequencies as `P(q|n)' */
274 for (q = 0; q < al->Alphabet_size; q++)
275 mix[q] = vec[q];
276 FNorm(mix, al->Alphabet_size);
277 }
278
279 /* Convert the counts to probabilities, following Sjolander (1996)
280 */
281 totc = FSum(vec, al->Alphabet_size);
282 for (x = 0; x < al->Alphabet_size; x++) {
283 xi = 0.0;
284 for (q = 0; q < num; q++) {
285 tota = FSum(e[q], al->Alphabet_size);
286 xi += mix[q] * (vec[x] + e[q][x]) / (totc + tota);
287 }
288 vec[x] = xi;
289 }
290 FNorm(vec, al->Alphabet_size);
291
292 if (ret_mix != NULL)
293 for (q = 0; q < num; q++)
294 ret_mix[q] = mix[q];
295 }
296
297
298
299 /* Function: P7PriorifyTransitionVector()
300 *
301 * Purpose: Add prior pseudocounts to transition vector,
302 * which contains three different probability vectors
303 * for m, d, and i.
304 *
305 * Args: t - state transitions, counts: 3 for M, 2 for I, 2 for D.
306 * prior - Dirichlet prior information
307 * tq - prior distribution over Dirichlet components.
308 * (overrides prior->iq[]; used for alternative
309 * methods of conditioning prior on structural data)
310 *
311 * Return: (void)
312 * t is changed, and renormalized -- comes back as
313 * probability vectors.
314 */
315 void
P7PriorifyTransitionVector(float * t,struct p7prior_s * prior,float tq[MAXDCHLET])316 P7PriorifyTransitionVector(float *t, struct p7prior_s *prior,
317 float tq[MAXDCHLET])
318 {
319 int ts;
320 int q;
321 float mix[MAXDCHLET];
322 float totm, totd, toti; /* total counts in three transition vecs */
323 float xi; /* Sjolander's X_i term */
324
325 mix[0] = 1.0; /* default is simple one component */
326 if ((prior->strategy == PRI_DCHLET || prior->strategy == PRI_PAM) && prior->mnum > 1)
327 {
328 for (q = 0; q < prior->tnum; q++)
329 {
330 mix[q] = tq[q] > 0.0 ? log(tq[q]) : -999.;
331 mix[q] += Logp_cvec(t, 3, prior->t[q]); /* 3 match */
332 mix[q] += Logp_cvec(t+3, 2, prior->t[q]+3); /* 2 insert */
333 mix[q] += Logp_cvec(t+5, 2, prior->t[q]+5); /* 2 delete */
334 }
335 LogNorm(mix, prior->tnum); /* mix[q] is now P(q | counts) */
336 }
337 /* precalc some denominators */
338 totm = FSum(t,3);
339 toti = t[TIM] + t[TII];
340 totd = t[TDM] + t[TDD];
341
342 for (ts = 0; ts < 7; ts++)
343 {
344 xi = 0.0;
345 for (q = 0; q < prior->tnum; q++)
346 {
347 switch (ts) {
348 case TMM: case TMI: case TMD:
349 xi += mix[q] * (t[ts] + prior->t[q][ts]) /
350 (totm + FSum(prior->t[q], 3));
351 break;
352 case TIM: case TII:
353 xi += mix[q] * (t[ts] + prior->t[q][ts]) /
354 (toti + prior->t[q][TIM] + prior->t[q][TII]);
355 break;
356 case TDM: case TDD:
357 xi += mix[q] * (t[ts] + prior->t[q][ts]) /
358 (totd + prior->t[q][TDM] + prior->t[q][TDD]);
359 break;
360 }
361 }
362 t[ts] = xi;
363 }
364 FNorm(t, 3); /* match */
365 FNorm(t+3, 2); /* insert */
366 FNorm(t+5, 2); /* delete */
367 }
368
369
370 /* Function: default_amino_prior()
371 *
372 * Purpose: Set the default protein prior.
373 */
374 static struct p7prior_s *
default_amino_prior(void)375 default_amino_prior(void)
376 {
377 struct p7prior_s *pri;
378 int q, x;
379 /* default match mixture coefficients */
380 static float defmq[9] = {
381 0.178091, 0.056591, 0.0960191, 0.0781233, 0.0834977,
382 0.0904123, 0.114468, 0.0682132, 0.234585 };
383
384 /* default match mixture Dirichlet components */
385 static float defm[9][20] = {
386 { 0.270671, 0.039848, 0.017576, 0.016415, 0.014268,
387 0.131916, 0.012391, 0.022599, 0.020358, 0.030727,
388 0.015315, 0.048298, 0.053803, 0.020662, 0.023612,
389 0.216147, 0.147226, 0.065438, 0.003758, 0.009621 },
390 { 0.021465, 0.010300, 0.011741, 0.010883, 0.385651,
391 0.016416, 0.076196, 0.035329, 0.013921, 0.093517,
392 0.022034, 0.028593, 0.013086, 0.023011, 0.018866,
393 0.029156, 0.018153, 0.036100, 0.071770, 0.419641 },
394 { 0.561459, 0.045448, 0.438366, 0.764167, 0.087364,
395 0.259114, 0.214940, 0.145928, 0.762204, 0.247320,
396 0.118662, 0.441564, 0.174822, 0.530840, 0.465529,
397 0.583402, 0.445586, 0.227050, 0.029510, 0.121090 },
398 { 0.070143, 0.011140, 0.019479, 0.094657, 0.013162,
399 0.048038, 0.077000, 0.032939, 0.576639, 0.072293,
400 0.028240, 0.080372, 0.037661, 0.185037, 0.506783,
401 0.073732, 0.071587, 0.042532, 0.011254, 0.028723 },
402 { 0.041103, 0.014794, 0.005610, 0.010216, 0.153602,
403 0.007797, 0.007175, 0.299635, 0.010849, 0.999446,
404 0.210189, 0.006127, 0.013021, 0.019798, 0.014509,
405 0.012049, 0.035799, 0.180085, 0.012744, 0.026466 },
406 { 0.115607, 0.037381, 0.012414, 0.018179, 0.051778,
407 0.017255, 0.004911, 0.796882, 0.017074, 0.285858,
408 0.075811, 0.014548, 0.015092, 0.011382, 0.012696,
409 0.027535, 0.088333, 0.944340, 0.004373, 0.016741 },
410 { 0.093461, 0.004737, 0.387252, 0.347841, 0.010822,
411 0.105877, 0.049776, 0.014963, 0.094276, 0.027761,
412 0.010040, 0.187869, 0.050018, 0.110039, 0.038668,
413 0.119471, 0.065802, 0.025430, 0.003215, 0.018742 },
414 { 0.452171, 0.114613, 0.062460, 0.115702, 0.284246,
415 0.140204, 0.100358, 0.550230, 0.143995, 0.700649,
416 0.276580, 0.118569, 0.097470, 0.126673, 0.143634,
417 0.278983, 0.358482, 0.661750, 0.061533, 0.199373 },
418 { 0.005193, 0.004039, 0.006722, 0.006121, 0.003468,
419 0.016931, 0.003647, 0.002184, 0.005019, 0.005990,
420 0.001473, 0.004158, 0.009055, 0.003630, 0.006583,
421 0.003172, 0.003690, 0.002967, 0.002772, 0.002686 },
422 };
423
424 pri = P7AllocPrior();
425 pri->strategy = PRI_DCHLET;
426
427 /* Transition priors are subjective, but borrowed from GJM's estimations
428 * on Pfam
429 */
430 pri->tnum = 1;
431 pri->tq[0] = 1.0;
432 pri->t[0][TMM] = 0.7939;
433 pri->t[0][TMI] = 0.0278; /* Markus suggests: ~10x MD: ~0.036: test! */
434 pri->t[0][TMD] = 0.0135; /* Markus suggests: ~0.1x MI: ~0.004 */
435 pri->t[0][TIM] = 0.1551;
436 pri->t[0][TII] = 0.1331;
437 pri->t[0][TDM] = 0.9002;
438 pri->t[0][TDD] = 0.5630;
439
440 /* Match emission priors are a mixture Dirichlet,
441 * from Kimmen Sjolander (Blocks9)
442 */
443 pri->mnum = 9;
444 for (q = 0; q < pri->mnum; q++)
445 {
446 pri->mq[q] = defmq[q];
447 for (x = 0; x < 20; x++)
448 pri->m[q][x] = defm[q][x];
449 }
450
451 /* These insert emission priors are subjective. Observed frequencies
452 * were obtained from PFAM 1.0, 10 Nov 96;
453 * see ~/projects/plan7/InsertStatistics.
454 * Inserts are slightly biased towards polar residues and away from
455 * hydrophobic residues.
456 */
457 pri->inum = 1;
458 pri->iq[0] = 1.;
459 pri->i[0][0] = 681.; /* A */
460 pri->i[0][1] = 120.; /* C */
461 pri->i[0][2] = 623.; /* D */
462 pri->i[0][3] = 651.; /* E */
463 pri->i[0][4] = 313.; /* F */
464 pri->i[0][5] = 902.; /* G */
465 pri->i[0][6] = 241.; /* H */
466 pri->i[0][7] = 371.; /* I */
467 pri->i[0][8] = 687.; /* K */
468 pri->i[0][9] = 676.; /* L */
469 pri->i[0][10] = 143.; /* M */
470 pri->i[0][11] = 548.; /* N */
471 pri->i[0][12] = 647.; /* P */
472 pri->i[0][13] = 415.; /* Q */
473 pri->i[0][14] = 551.; /* R */
474 pri->i[0][15] = 926.; /* S */
475 pri->i[0][16] = 623.; /* T */
476 pri->i[0][17] = 505.; /* V */
477 pri->i[0][18] = 102.; /* W */
478 pri->i[0][19] = 269.; /* Y */
479
480 return pri;
481 }
482
483
484 /* Function: default_nucleic_prior()
485 *
486 * Purpose: Set the default DNA prior. (for now, almost a Laplace)
487 */
488 static struct p7prior_s *
default_nucleic_prior(void)489 default_nucleic_prior(void)
490 {
491 //get HMMERTaskLocalData
492 HMMERTaskLocalData *tld = getHMMERTaskLocalData();
493 alphabet_s *al = &tld->al;
494
495 struct p7prior_s *pri;
496
497 pri = P7AllocPrior();
498 pri->strategy = PRI_DCHLET;
499
500 /* The use of the Pfam-trained amino acid transition priors
501 * here is TOTALLY bogus. But it works better than a straight
502 * Laplace, esp. for Maxmodelmaker(). For example, a Laplace
503 * prior builds M=1 models for a single sequence GAATTC (at
504 * one time an open "bug").
505 */
506 pri->tnum = 1;
507 pri->tq[0] = 1.;
508 pri->t[0][TMM] = 0.7939;
509 pri->t[0][TMI] = 0.0278;
510 pri->t[0][TMD] = 0.0135;
511 pri->t[0][TIM] = 0.1551;
512 pri->t[0][TII] = 0.1331;
513 pri->t[0][TDM] = 0.9002;
514 pri->t[0][TDD] = 0.5630;
515
516 pri->mnum = 1;
517 pri->mq[0] = 1.;
518 FSet(pri->m[0], al->Alphabet_size, 1.);
519
520 pri->inum = 1;
521 pri->iq[0] = 1.;
522 FSet(pri->i[0], al->Alphabet_size, 1.);
523
524 return pri;
525 }
526
527