1 /* Defining domain number and coordinates in a significant hit by
2 * posterior sampling and clustering.
3 *
4 * SRE, Wed Jan 9 07:26:34 2008 [Janelia]
5 */
6 #include "p7_config.h"
7 #include "easel.h"
8 #include "esl_cluster.h"
9 #include "esl_vectorops.h"
10 #include "hmmer.h"
11
12 /* Function: p7_spensemble_Create()
13 * Synopsis: Allocates a <P7_SPENSEMBLE>
14 * Incept: SRE, Wed Jan 9 10:00:14 2008 [Janelia]
15 *
16 * Purpose: Create a new <P7_SPENSEMBL> with specified initial
17 * allocation sizes: <init_n> for the number of sampled
18 * segment pairs, <init_epc> for the range over
19 * which one of a domain's (i,j,k,m) sampled endpoints
20 * falls, and <init_sigc> for the number of significant
21 * clusters (domains) that will be defined.
22 *
23 * The values of these initial allocations are only
24 * relevant to tuning memory performance, because the
25 * object is reallocated/grown as needed. You can make
26 * guesses, and the better your guess, the fewer
27 * reallocations will be needed; but everything will work
28 * fine regardless of what these initial allocations are.
29 *
30 * A <P7_SPENSEMBLE> is designed to be reused for many
31 * target sequences and/or models, to minimize alloc/free
32 * calls.
33 *
34 * Args: init_n - initial allocation for # of sampled segment pairs
35 * init_epc - initial allocation for maximum endpoint range
36 * init_sigc - initial allocation for # of significant clusters, domains
37 *
38 * Returns: a pointer to the new <P7_SPENSEMBLE>.
39 *
40 * Throws: <NULL> on allocation failure.
41 */
42 P7_SPENSEMBLE *
p7_spensemble_Create(int init_n,int init_epc,int init_sigc)43 p7_spensemble_Create(int init_n, int init_epc, int init_sigc)
44 {
45 P7_SPENSEMBLE *sp = NULL;
46 int status;
47
48 ESL_ALLOC(sp, sizeof(P7_SPENSEMBLE));
49 sp->sp = NULL;
50 sp->workspace = NULL;
51 sp->assignment = NULL;
52 sp->epc = NULL;
53 sp->sigc = NULL;
54
55 sp->nalloc = init_n;
56 sp->epc_alloc = init_epc;
57 sp->nsigc_alloc = init_sigc;
58
59 ESL_ALLOC(sp->sp, sizeof(struct p7_spcoord_s) * sp->nalloc);
60 ESL_ALLOC(sp->workspace, sizeof(int) * sp->nalloc * 2); /* workspace is 2n */
61 ESL_ALLOC(sp->assignment, sizeof(int) * sp->nalloc);
62 ESL_ALLOC(sp->epc, sizeof(int) * sp->epc_alloc);
63 ESL_ALLOC(sp->sigc, sizeof(struct p7_spcoord_s) * sp->nsigc_alloc);
64 sp->nsamples = 0;
65 sp->n = 0;
66 sp->nc = 0;
67 sp->nsigc = 0;
68 return sp;
69
70 ERROR:
71 p7_spensemble_Destroy(sp);
72 return NULL;
73 }
74
75 /* Function: p7_spensemble_Reuse()
76 * Synopsis: Reinitializes a <P7_SPENSEMBLE>.
77 * Incept: SRE, Wed Jan 9 10:26:36 2008 [Janelia]
78 *
79 * Purpose: Reinitialize <sp> so it can be used again to collect
80 * and process a new segment pair ensemble, without
81 * having to free and reallocate.
82 *
83 * Returns: <eslOK> on success.
84 */
85 int
p7_spensemble_Reuse(P7_SPENSEMBLE * sp)86 p7_spensemble_Reuse(P7_SPENSEMBLE *sp)
87 {
88 sp->nsamples = 0;
89 sp->n = 0;
90 sp->nc = 0;
91 sp->nsigc = 0;
92 return eslOK;
93 }
94
95 /* Function: p7_spsensemble_Add()
96 * Synopsis: Add a new segment pair to a growing ensemble.
97 * Incept: SRE, Wed Jan 9 10:28:04 2008 [Janelia]
98 *
99 * Purpose: Adds a new segment pair to a growing ensemble <sp>.
100 * The segment pair is defined by start/end positions
101 * <i>,<j> on a target sequence (1..L), and start/end
102 * positions <k>,<m> on a query model (1..M).
103 *
104 * You also provide the index <sampleidx> of which sampled
105 * traceback this segment pair came from; each traceback
106 * contains one or more segment pairs. These <sampleidx>
107 * indices start at 0 and they must arrive sequentially:
108 * that is, the caller must <Add()> all the segment pairs
109 * from traceback sample 0, then <Add()> all the segment
110 * pairs from traceback sample 1, and so on.
111 *
112 * The reason to enforce sequential addition has to do with
113 * the internals of the ensemble clustering algorithm;
114 * specifically with how it calculates the posterior
115 * probability of a cluster in the ensemble. You can't
116 * calculate the posterior probability of a cluster simply
117 * by dividing the number of segment pairs in a cluster by
118 * the total number of traces, because you can get
119 * "probabilities" of greater than one: sometimes more than
120 * one pair from the same trace get clustered together
121 * (because one domain got split into two or more segment
122 * pairs). Rather, what it does is to count the total
123 * number of traces that have one or more segments in the
124 * cluster, divided by the total number of traces. An
125 * efficient way to implement this is, when counting
126 * segments that belong to a cluster, only increment the
127 * numerator if the segment has a different traceback index
128 * than the last segment we counted in this cluster. (We'd
129 * rather not have to keep track of a table of all the
130 * traceback indices we've seen so far.)
131 *
132 * Args: sp - ensemble to add this segment pair to
133 * sampleidx - index of traceback that this seg pair came from (0..nsamples-1)
134 * i,j - start,end position on target sequence (1..L)
135 * k,m - start,end position on query model (1..M)
136 *
137 * Returns: <eslOK> on success.
138 *
139 * Throws: <eslEINVAL> if the <sampleidx> is out of order.
140 * <eslEMEM> if a reallocation fails.
141 */
142 int
p7_spensemble_Add(P7_SPENSEMBLE * sp,int sampleidx,int i,int j,int k,int m)143 p7_spensemble_Add(P7_SPENSEMBLE *sp, int sampleidx, int i, int j, int k, int m)
144 {
145 int status;
146
147 if (sampleidx > sp->nsamples) ESL_EXCEPTION(eslEINVAL, "seg pair's <sampleidx> is out of order");
148 else if (sampleidx == sp->nsamples) sp->nsamples++;
149
150 if (sp->n >= sp->nalloc) {
151 void *p;
152 ESL_RALLOC(sp->sp, p, sizeof(struct p7_spcoord_s) * sp->nalloc * 2);
153 ESL_RALLOC(sp->workspace, p, sizeof(int) * sp->nalloc * 4); /* remember, workspace is 2n */
154 ESL_RALLOC(sp->assignment, p, sizeof(int) * sp->nalloc * 2);
155 sp->nalloc *= 2;
156 }
157
158 sp->sp[sp->n].idx = sampleidx;
159 sp->sp[sp->n].i = i;
160 sp->sp[sp->n].j = j;
161 sp->sp[sp->n].k = k;
162 sp->sp[sp->n].m = m;
163 sp->n++;
164 return eslOK;
165
166 ERROR:
167 return status;
168 }
169
170
171 /* struct p7_linkparam_s:
172 * used just within this .c, as part of setting up the clustering problem in
173 * the form that Easel's general SLC algorithm will take it.
174 */
175 struct p7_linkparam_s {
176 float min_overlap; /* 0.8 means >= 80% overlap of (smaller/larger) segment is required, both in seq and hmm */
177 int of_smaller; /* TRUE means overlap fraction is w.r.t. smaller segment; FALSE means w.r.t. larger segment */
178 int max_diagdiff; /* 4 means either start or endpoints of two segments must be within <= 4 diagonals of each other */
179 float min_posterior; /* 0.25 means a cluster must occur w/ >= 25% posterior probability in the sample to be "significant" */
180 float min_endpointp; /* 0.02 means choose widest endpoint with post. prob. of at least 2% */
181 };
182
183
184 /* link_spsamples():
185 *
186 * Defines the rule used for single linkage clustering of sampled
187 * domain coordinates. (API is dictated by Easel's general single
188 * linkage clustering routine.)
189 */
190 static int
link_spsamples(const void * v1,const void * v2,const void * prm,int * ret_link)191 link_spsamples(const void *v1, const void *v2, const void *prm, int *ret_link)
192 {
193 struct p7_spcoord_s *h1 = (struct p7_spcoord_s *) v1;
194 struct p7_spcoord_s *h2 = (struct p7_spcoord_s *) v2;
195 struct p7_linkparam_s *param = (struct p7_linkparam_s *) prm;
196 int nov, n;
197 int d1, d2;
198
199 /* seq overlap test */
200 nov = ESL_MIN(h1->j, h2->j) - ESL_MAX(h1->i, h2->i) + 1; /* overlap */
201 n = (param->of_smaller ? ESL_MIN(h1->j - h1->i + 1, h2->j - h2->i + 1) : /* min length of the two hits */
202 ESL_MAX(h1->j - h1->i + 1, h2->j - h2->i + 1)); /* max length of the two hits */
203 if ((float) nov / (float) n < param->min_overlap) { *ret_link = FALSE; return eslOK; }
204
205 /* hmm overlap test */
206 nov = ESL_MIN(h1->m, h2->m) - ESL_MAX(h1->k, h2->k);
207 n = (param->of_smaller ? ESL_MIN(h1->m - h1->k + 1, h2->m - h2->k + 1) :
208 ESL_MAX(h1->m - h1->k + 1, h2->m - h2->k + 1));
209 if ((float) nov / (float) n < param->min_overlap) { *ret_link = FALSE; return eslOK; }
210
211 /* nearby diagonal test */
212 d1 = (h1->i - h1->k); d2 = (h2->i - h2->k); if (abs(d1-d2) <= param->max_diagdiff) { *ret_link = TRUE; return eslOK; }
213 d1 = (h1->j - h1->m); d2 = (h2->j - h2->m); if (abs(d1-d2) <= param->max_diagdiff) { *ret_link = TRUE; return eslOK; }
214
215 *ret_link = FALSE;
216 return eslOK;
217 }
218
219 /* cluster_orderer()
220 * is the routine that gets passed to qsort() to sort
221 * the significant clusters by order of occurrence on
222 * the target sequence
223 */
224 static int
cluster_orderer(const void * v1,const void * v2)225 cluster_orderer(const void *v1, const void *v2)
226 {
227 struct p7_spcoord_s *h1 = (struct p7_spcoord_s *) v1;
228 struct p7_spcoord_s *h2 = (struct p7_spcoord_s *) v2;
229
230 if (h1->i < h2->i) return -1;
231 else if (h1->i > h2->i) return 1;
232 else return 0;
233 }
234
235 /* Function: p7_spensemble_Cluster()
236 * Synopsis: Cluster a seg pair ensemble and define domains.
237 * Incept: SRE, Wed Jan 9 11:04:07 2008 [Janelia]
238 *
239 * Purpose: Given a collected segment pair ensemble <sp>, cluster it;
240 * identify significant clusters with high posterior probability;
241 * and define consensus endpoints for each significant cluster.
242 *
243 * Clustering is single-linkage. The linkage rule is
244 * controlled by the <min_overlap>, <of_smaller>, and
245 * <max_diagdiff> parameters. To be linked, two segments
246 * must overlap by a fraction $\geq$ <min_overlap>,
247 * relative to either the smaller or larger of the two
248 * segments (<of_smaller = TRUE> or <FALSE>), in both their
249 * sequence and model coords, and either the start or end of both
250 * segments must lie within $\leq$ <max_diagdiff> diagonals
251 * of each other.
252 *
253 * The threshold for cluster "significance" is controlled
254 * by the <min_posterior> parameter. Clusters with
255 * posterior probability $\geq$ this threshold are called
256 * significant.
257 *
258 * Consensus endpoint definition within a cluster is
259 * controlled by the <min_endpointp> parameter. The widest
260 * endpoint that has a posterior probability of $\geq
261 * min_endpointp> is chosen; this is done independently for
262 * each coordinate (i,j,k,m).
263 *
264 * A reasonable (and tested) parameterization is
265 * <min_overlap = 0.8>, <of_smaller = TRUE>, <max_diagdiff
266 * = 4>, <min_posterior = 0.25>, <min_endpointp = 0.02>.
267 *
268 * Args: sp - segment pair ensemble to cluster
269 * min_overlap - linkage requires fractional overlap >= this, in both seq and hmm segments
270 * of_smaller - overlap fraction denominators uses either the smaller (if TRUE) or larger (if FALSE) segment
271 * max_diagdiff - linkage requires that start, end points of both seg pairs are <= this
272 * min_posterior - clusters with posterior prob >= this are defined as significant
273 * min_endpointp - widest endpoint with post prob >= this is defined as consensus endpoint coord
274 *
275 * Returns: the number of significant clusters in <*ret_nclusters>.
276 * The caller can then obtain consensus endpoints for each cluster
277 * by making a series of <p7_spensemble_GetClusterCoords()> calls.
278 *
279 */
280 int
p7_spensemble_Cluster(P7_SPENSEMBLE * sp,float min_overlap,int of_smaller,int max_diagdiff,float min_posterior,float min_endpointp,int * ret_nclusters)281 p7_spensemble_Cluster(P7_SPENSEMBLE *sp,
282 float min_overlap, int of_smaller, int max_diagdiff, float min_posterior, float min_endpointp,
283 int *ret_nclusters)
284 {
285 struct p7_linkparam_s param;
286 int status;
287 int c;
288 int h;
289 int idx_of_last;
290 int *ninc = NULL;
291 int cwindow_width;
292 int epc_threshold;
293 int imin, jmin, kmin, mmin;
294 int imax, jmax, kmax, mmax;
295 int best_i, best_j, best_k, best_m;
296
297 /* set up a single linkage clustering problem for Easel's general routine */
298 param.min_overlap = min_overlap;
299 param.of_smaller = of_smaller;
300 param.max_diagdiff = max_diagdiff;
301 param.min_posterior = min_posterior;
302 param.min_endpointp = min_endpointp;
303 if ((status = esl_cluster_SingleLinkage(sp->sp, sp->n, sizeof(struct p7_spcoord_s), link_spsamples, (void *) ¶m,
304 sp->workspace, sp->assignment, &(sp->nc))) != eslOK) goto ERROR;
305
306 ESL_ALLOC(ninc, sizeof(int) * sp->nc);
307
308 /* Look at each cluster in turn; most will be too small to worry about. */
309 for (c = 0; c < sp->nc; c++)
310 {
311 /* Calculate posterior probability of each cluster.
312 * The extra wrinkle here is that this probability is w.r.t the number of sampled traces;
313 * but the clusters might contain more than one seg pair from a given trace.
314 * That's what the idx_of_last logic is doing, avoiding double-counting.
315 */
316 idx_of_last = -1;
317 for (ninc[c] = 0, h = 0; h < sp->n; h++) {
318 if (sp->assignment[h] == c) {
319 if (sp->sp[h].idx != idx_of_last) ninc[c]++;
320 idx_of_last = sp->sp[h].idx;
321 }
322 }
323 /* Reject low probability clusters: */
324 if ((float) ninc[c] / (float) sp->nsamples < min_posterior) continue;
325
326 /* Find the maximum extent of all seg pairs in this cluster in i,j k,m */
327 for (imin = 0, h = 0; h < sp->n; h++)
328 if (sp->assignment[h] == c)
329 {
330 if (imin == 0) {
331 imin = imax = sp->sp[h].i;
332 jmin = jmax = sp->sp[h].j;
333 kmin = kmax = sp->sp[h].k;
334 mmin = mmax = sp->sp[h].m;
335 } else {
336 imin = ESL_MIN(imin, sp->sp[h].i); imax = ESL_MAX(imax, sp->sp[h].i);
337 jmin = ESL_MIN(jmin, sp->sp[h].j); jmax = ESL_MAX(jmax, sp->sp[h].j);
338 kmin = ESL_MIN(kmin, sp->sp[h].k); kmax = ESL_MAX(kmax, sp->sp[h].k);
339 mmin = ESL_MIN(mmin, sp->sp[h].m); mmax = ESL_MAX(mmax, sp->sp[h].m);
340 }
341 }
342
343 /* Set up a window in which we can examine the end point distributions for i,j,k,m in turn, independently */
344 cwindow_width = ESL_MAX(ESL_MAX(imax-imin+1, jmax-jmin+1),
345 ESL_MAX(kmax-kmin+1, mmax-mmin+1));
346 if (cwindow_width > sp->epc_alloc) {
347 void *p;
348 ESL_RALLOC(sp->epc, p, sizeof(int) * cwindow_width);
349 sp->epc_alloc = cwindow_width;
350 }
351
352 epc_threshold = (int) ceilf((float) ninc[c] * min_endpointp); /* round up. freq of >= epc_threshold means we're >= min_p */
353
354 /* Identify the leftmost i that has enough endpoints. */
355 esl_vec_ISet(sp->epc, imax-imin+1, 0);
356 for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].i-imin]++;
357 for (best_i = imin; best_i <= imax; best_i++) if (sp->epc[best_i-imin] >= epc_threshold) break;
358 if (best_i > imax) best_i = imin + esl_vec_IArgMax(sp->epc, imax-imin+1);
359
360 /* Identify the leftmost k that has enough endpoints */
361 esl_vec_ISet(sp->epc, kmax-kmin+1, 0);
362 for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].k-kmin]++;
363 for (best_k = kmin; best_k <= kmax; best_k++) if (sp->epc[best_k-kmin] >= epc_threshold) break;
364 if (best_k > kmax) best_k = kmin + esl_vec_IArgMax(sp->epc, kmax-kmin+1);
365
366 /* Identify the rightmost j that has enough endpoints. */
367 esl_vec_ISet(sp->epc, jmax-jmin+1, 0);
368 for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].j-jmin]++;
369 for (best_j = jmax; best_j >= jmin; best_j--) if (sp->epc[best_j-jmin] >= epc_threshold) break;
370 if (best_j < jmin) best_j = jmin + esl_vec_IArgMax(sp->epc, jmax-jmin+1);
371
372 /* Identify the rightmost m that has enough endpoints. */
373 esl_vec_ISet(sp->epc, mmax-mmin+1, 0);
374 for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].m-mmin]++;
375 for (best_m = mmax; best_m >= mmin; best_m--) if (sp->epc[best_m-mmin] >= epc_threshold) break;
376 if (best_m < mmin) best_m = mmin + esl_vec_IArgMax(sp->epc, mmax-mmin+1);
377
378 /* If there's no well-defined domain (say, a long stretch of biased composition),
379 the coords above might come out inconsistent; in this case, just reject the domain.
380 */
381 if (best_i > best_j || best_k > best_m) continue;
382
383 if (sp->nsigc >= sp->nsigc_alloc) {
384 void *p;
385 ESL_RALLOC(sp->sigc, p, sizeof(struct p7_spcoord_s) * sp->nsigc_alloc * 2);
386 sp->nsigc_alloc *= 2;
387 }
388
389 sp->sigc[sp->nsigc].i = best_i;
390 sp->sigc[sp->nsigc].j = best_j;
391 sp->sigc[sp->nsigc].k = best_k;
392 sp->sigc[sp->nsigc].m = best_m;
393 sp->sigc[sp->nsigc].idx = c;
394 sp->sigc[sp->nsigc].prob = (float) ninc[c] / (float) sp->nsamples;
395 sp->nsigc++;
396 }
397
398 /* Now we need to make sure those domains are ordered by start point,
399 * because later we're going to calculate overlaps by i_cur - j_prv
400 */
401 qsort((void *) sp->sigc, sp->nsigc, sizeof(struct p7_spcoord_s), cluster_orderer);
402
403 free(ninc);
404 *ret_nclusters = sp->nsigc;
405 return eslOK;
406
407 ERROR:
408 if (ninc != NULL) free(ninc);
409 *ret_nclusters = 0;
410 return status;
411 }
412
413 /* Function: p7_spensemble_GetClusterCoords()
414 * Synopsis: Retrieve consensus coords of one significant segment pair cluster.
415 * Incept: SRE, Wed Jan 9 11:39:27 2008 [Janelia]
416 *
417 * Purpose: Retrieve the consensus coords of significant segment pair cluster <which>
418 * from the ensemble <sp>, which has already been clustered with
419 * <p7_spensemble_Cluster()>.
420 *
421 * Returns: <eslOK> on success, and the consensus coords are in <*opt_i>, <*opt_j>,
422 * <*opt_k>, and <*opt_m>; the (sampled) posterior probability of the
423 * cluster is in <*opt_p>. All of these returned values are optional;
424 * the caller can pass a <NULL> for any value it's not interested in
425 * retrieving.
426 */
427 int
p7_spensemble_GetClusterCoords(P7_SPENSEMBLE * sp,int which,int * opt_i,int * opt_j,int * opt_k,int * opt_m,float * opt_p)428 p7_spensemble_GetClusterCoords(P7_SPENSEMBLE *sp, int which, int *opt_i, int *opt_j, int *opt_k, int *opt_m, float *opt_p)
429 {
430 if (opt_i != NULL) *opt_i = sp->sigc[which].i;
431 if (opt_j != NULL) *opt_j = sp->sigc[which].j;
432 if (opt_k != NULL) *opt_k = sp->sigc[which].k;
433 if (opt_m != NULL) *opt_m = sp->sigc[which].m;
434 if (opt_p != NULL) *opt_p = sp->sigc[which].prob;
435 return eslOK;
436 }
437
438
439 /* Function: p7_spensemble_Destroy()
440 * Synopsis: Deallocate a <P7_SPENSEMBLE>
441 * Incept: SRE, Wed Jan 9 11:42:01 2008 [Janelia]
442 *
443 * Purpose: Destroys a <P7_SPENSEMBLE>.
444 */
445 void
p7_spensemble_Destroy(P7_SPENSEMBLE * sp)446 p7_spensemble_Destroy(P7_SPENSEMBLE *sp)
447 {
448 if (sp == NULL) return;
449 if (sp->sp != NULL) free(sp->sp);
450 if (sp->workspace != NULL) free(sp->workspace);
451 if (sp->assignment != NULL) free(sp->assignment);
452 if (sp->epc != NULL) free(sp->epc);
453 if (sp->sigc != NULL) free(sp->sigc);
454 free(sp);
455 }
456
457
458
459 /*****************************************************************
460 * Benchmark and example.
461 *****************************************************************/
462
463 #ifdef p7SPENSEMBLE_EXAMPLE
464 /*
465 gcc -g -I. -L. -I ../easel -L ../easel -Dp7SPENSEMBLE_EXAMPLE -o example p7_spensemble.c -lhmmer -leasel -lm
466 */
467 #include "p7_config.h"
468
469 #include <stdio.h>
470 #include <stdlib.h>
471
472 #include "easel.h"
473 #include "esl_getopts.h"
474 #include "esl_random.h"
475 #include "esl_alphabet.h"
476 #include "esl_dmatrix.h"
477 #include "esl_sq.h"
478 #include "esl_sqio.h"
479 #include "esl_cluster.h"
480 #include "esl_vectorops.h"
481 #include "esl_stopwatch.h"
482 #include "hmmer.h"
483
484 static ESL_OPTIONS options[] = {
485 /* name type default env range toggles reqs incomp help docgroup*/
486 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
487 { "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
488 { "-N", eslARG_INT, "1000", NULL, NULL, NULL, NULL, NULL, "number of trace samples to take", 0 },
489 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
490 };
491 static char usage[] = "[-options] <hmmfile> <seqfile>";
492 static char banner[] = "example, test, benchmark of defining domains by posterior sampling";
493
494 int
main(int argc,char ** argv)495 main(int argc, char **argv)
496 {
497 ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage);
498 ESL_STOPWATCH *w = esl_stopwatch_Create();
499 ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
500 char *hmmfile = esl_opt_GetArg(go, 1);
501 char *seqfile = esl_opt_GetArg(go, 2);
502 int format = eslSQFILE_FASTA;
503 ESL_ALPHABET *abc = NULL;
504 P7_HMMFILE *hfp = NULL;
505 P7_HMM *hmm = NULL;
506 P7_BG *bg = NULL;
507 P7_PROFILE *gm = NULL;
508 P7_OPROFILE *om = NULL;
509 P7_OMX *fwd = NULL;
510 ESL_SQ *sq = NULL;
511 ESL_SQFILE *sqfp = NULL;
512 P7_TRACE *tr = NULL;
513 P7_SPENSEMBLE *sp = NULL;
514 int N = esl_opt_GetInteger(go, "-N");
515 int t,d,nc;
516 int i,j,k,m;
517 float sc;
518 float prob;
519 int status;
520
521 /* Read in one HMM */
522 if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
523 if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM");
524 p7_hmmfile_Close(hfp);
525
526 /* Read in one sequence */
527 sq = esl_sq_CreateDigital(abc);
528 status = esl_sqfile_Open(seqfile, format, NULL, &sqfp);
529 if (status == eslENOTFOUND) p7_Fail("No such file.");
530 else if (status == eslEFORMAT) p7_Fail("Format unrecognized.");
531 else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz.");
532 else if (status != eslOK) p7_Fail("Open failed, code %d.", status);
533 if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence");
534 esl_sqfile_Close(sqfp);
535
536 /* Configure a profile from the HMM */
537 bg = p7_bg_Create(abc);
538 p7_bg_SetLength(bg, sq->n);
539 gm = p7_profile_Create(hmm->M, abc);
540 om = p7_oprofile_Create(hmm->M, abc);
541 p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL);
542 p7_oprofile_Convert(gm, om);
543 p7_oprofile_ReconfigLength(om, sq->n);
544
545 /* Allocate DP matrix for Forward, and run a Forward calculation in it */
546 fwd = p7_omx_Create(gm->M, sq->n, sq->n);
547 p7_Forward (sq->dsq, sq->n, om, fwd, &sc);
548
549 /* Allocate a trace container, and an spensemble */
550 tr = p7_trace_Create();
551 sp = p7_spensemble_Create(1024, 64, 32);
552
553 /* Start the stopwatch. Now we're in domain processing steps. */
554 esl_stopwatch_Start(w);
555
556 /* Collect N traces, add their domain coords to the ensemble, and cluster */
557 for (t = 0; t < N; t++) {
558 p7_StochasticTrace(r, sq->dsq, sq->n, om, fwd, tr);
559 p7_trace_Index(tr);
560
561 for (d = 0; d < tr->ndom; d++)
562 p7_spensemble_Add(sp, t, tr->sqfrom[d], tr->sqto[d], tr->hmmfrom[d], tr->hmmto[d]);
563 p7_trace_Reuse(tr);
564 }
565 p7_spensemble_Cluster(sp, 0.8, TRUE, 4, 0.25, 0.02, &nc);
566 for (d = 0; d < nc; d++) {
567 p7_spensemble_GetClusterCoords(sp, d, &i, &j, &k, &m, &prob);
568 printf("domain %-4d : %6d %6d %6d %6d p=%.4f\n", d, i, j, k, m, prob);
569 }
570
571 /* Done. */
572 esl_stopwatch_Stop(w);
573 esl_stopwatch_Display(stdout, w, "# CPU time: ");
574
575 p7_spensemble_Destroy(sp);
576 p7_trace_Destroy(tr);
577 esl_sq_Destroy(sq);
578 p7_omx_Destroy(fwd);
579 p7_profile_Destroy(gm);
580 p7_oprofile_Destroy(om);
581 p7_bg_Destroy(bg);
582 p7_hmm_Destroy(hmm);
583 esl_stopwatch_Destroy(w);
584 esl_randomness_Destroy(r);
585 esl_alphabet_Destroy(abc);
586 esl_getopts_Destroy(go);
587 return 0;
588 }
589 #endif /*p7SPENSEMBLE_EXAMPLE*/
590