1 /* Defining domain number and coordinates in a significant hit by
2  * posterior sampling and clustering.
3  *
4  * SRE, Wed Jan  9 07:26:34 2008 [Janelia]
5  */
6 #include "p7_config.h"
7 #include "easel.h"
8 #include "esl_cluster.h"
9 #include "esl_vectorops.h"
10 #include "hmmer.h"
11 
12 /* Function:  p7_spensemble_Create()
13  * Synopsis:  Allocates a <P7_SPENSEMBLE>
14  * Incept:    SRE, Wed Jan  9 10:00:14 2008 [Janelia]
15  *
16  * Purpose:   Create a new <P7_SPENSEMBL> with specified initial
17  *            allocation sizes: <init_n> for the number of sampled
18  *            segment pairs, <init_epc> for the range over
19  *            which one of a domain's (i,j,k,m) sampled endpoints
20  *            falls, and <init_sigc> for the number of significant
21  *            clusters (domains) that will be defined.
22  *
23  *            The values of these initial allocations are only
24  *            relevant to tuning memory performance, because the
25  *            object is reallocated/grown as needed. You can make
26  *            guesses, and the better your guess, the fewer
27  *            reallocations will be needed; but everything will work
28  *            fine regardless of what these initial allocations are.
29  *
30  *            A <P7_SPENSEMBLE> is designed to be reused for many
31  *            target sequences and/or models, to minimize alloc/free
32  *            calls.
33  *
34  * Args:      init_n     - initial allocation for # of sampled segment pairs
35  *            init_epc   - initial allocation for maximum endpoint range
36  *            init_sigc  - initial allocation for # of significant clusters, domains
37  *
38  * Returns:   a pointer to the new <P7_SPENSEMBLE>.
39  *
40  * Throws:    <NULL> on allocation failure.
41  */
42 P7_SPENSEMBLE *
p7_spensemble_Create(int init_n,int init_epc,int init_sigc)43 p7_spensemble_Create(int init_n, int init_epc, int init_sigc)
44 {
45   P7_SPENSEMBLE *sp = NULL;
46   int            status;
47 
48   ESL_ALLOC(sp, sizeof(P7_SPENSEMBLE));
49   sp->sp            = NULL;
50   sp->workspace     = NULL;
51   sp->assignment    = NULL;
52   sp->epc           = NULL;
53   sp->sigc          = NULL;
54 
55   sp->nalloc        = init_n;
56   sp->epc_alloc     = init_epc;
57   sp->nsigc_alloc   = init_sigc;
58 
59   ESL_ALLOC(sp->sp,         sizeof(struct p7_spcoord_s) * sp->nalloc);
60   ESL_ALLOC(sp->workspace,  sizeof(int)                 * sp->nalloc * 2); /* workspace is 2n */
61   ESL_ALLOC(sp->assignment, sizeof(int)                 * sp->nalloc);
62   ESL_ALLOC(sp->epc,        sizeof(int)                 * sp->epc_alloc);
63   ESL_ALLOC(sp->sigc,       sizeof(struct p7_spcoord_s) * sp->nsigc_alloc);
64   sp->nsamples  = 0;
65   sp->n         = 0;
66   sp->nc        = 0;
67   sp->nsigc     = 0;
68   return sp;
69 
70  ERROR:
71   p7_spensemble_Destroy(sp);
72   return NULL;
73 }
74 
75 /* Function:  p7_spensemble_Reuse()
76  * Synopsis:  Reinitializes a <P7_SPENSEMBLE>.
77  * Incept:    SRE, Wed Jan  9 10:26:36 2008 [Janelia]
78  *
79  * Purpose:   Reinitialize <sp> so it can be used again to collect
80  *            and process a new segment pair ensemble, without
81  *            having to free and reallocate.
82  *
83  * Returns:   <eslOK> on success.
84  */
85 int
p7_spensemble_Reuse(P7_SPENSEMBLE * sp)86 p7_spensemble_Reuse(P7_SPENSEMBLE *sp)
87 {
88   sp->nsamples = 0;
89   sp->n        = 0;
90   sp->nc       = 0;
91   sp->nsigc    = 0;
92   return eslOK;
93 }
94 
95 /* Function:  p7_spsensemble_Add()
96  * Synopsis:  Add a new segment pair to a growing ensemble.
97  * Incept:    SRE, Wed Jan  9 10:28:04 2008 [Janelia]
98  *
99  * Purpose:   Adds a new segment pair to a growing ensemble <sp>.
100  *            The segment pair is defined by start/end positions
101  *            <i>,<j> on a target sequence (1..L), and start/end
102  *            positions <k>,<m> on a query model (1..M).
103  *
104  *            You also provide the index <sampleidx> of which sampled
105  *            traceback this segment pair came from; each traceback
106  *            contains one or more segment pairs. These <sampleidx>
107  *            indices start at 0 and they must arrive sequentially:
108  *            that is, the caller must <Add()> all the segment pairs
109  *            from traceback sample 0, then <Add()> all the segment
110  *            pairs from traceback sample 1, and so on.
111  *
112  *            The reason to enforce sequential addition has to do with
113  *            the internals of the ensemble clustering algorithm;
114  *            specifically with how it calculates the posterior
115  *            probability of a cluster in the ensemble. You can't
116  *            calculate the posterior probability of a cluster simply
117  *            by dividing the number of segment pairs in a cluster by
118  *            the total number of traces, because you can get
119  *            "probabilities" of greater than one: sometimes more than
120  *            one pair from the same trace get clustered together
121  *            (because one domain got split into two or more segment
122  *            pairs). Rather, what it does is to count the total
123  *            number of traces that have one or more segments in the
124  *            cluster, divided by the total number of traces. An
125  *            efficient way to implement this is, when counting
126  *            segments that belong to a cluster, only increment the
127  *            numerator if the segment has a different traceback index
128  *            than the last segment we counted in this cluster. (We'd
129  *            rather not have to keep track of a table of all the
130  *            traceback indices we've seen so far.)
131  *
132  * Args:      sp        - ensemble to add this segment pair to
133  *            sampleidx - index of traceback that this seg pair came from (0..nsamples-1)
134  *            i,j       - start,end position on target sequence (1..L)
135  *            k,m       - start,end position on query model (1..M)
136  *
137  * Returns:   <eslOK> on success.
138  *
139  * Throws:    <eslEINVAL> if the <sampleidx> is out of order.
140  *            <eslEMEM> if a reallocation fails.
141  */
142 int
p7_spensemble_Add(P7_SPENSEMBLE * sp,int sampleidx,int i,int j,int k,int m)143 p7_spensemble_Add(P7_SPENSEMBLE *sp, int sampleidx, int i, int j, int k, int m)
144 {
145   int status;
146 
147   if      (sampleidx > sp->nsamples)  ESL_EXCEPTION(eslEINVAL, "seg pair's <sampleidx> is out of order");
148   else if (sampleidx == sp->nsamples) sp->nsamples++;
149 
150   if (sp->n >= sp->nalloc) {
151     void *p;
152     ESL_RALLOC(sp->sp,         p, sizeof(struct p7_spcoord_s)  * sp->nalloc * 2);
153     ESL_RALLOC(sp->workspace,  p, sizeof(int)                  * sp->nalloc * 4); /* remember, workspace is 2n */
154     ESL_RALLOC(sp->assignment, p, sizeof(int)                  * sp->nalloc * 2);
155     sp->nalloc *= 2;
156   }
157 
158   sp->sp[sp->n].idx = sampleidx;
159   sp->sp[sp->n].i   = i;
160   sp->sp[sp->n].j   = j;
161   sp->sp[sp->n].k   = k;
162   sp->sp[sp->n].m   = m;
163   sp->n++;
164   return eslOK;
165 
166  ERROR:
167   return status;
168 }
169 
170 
171 /* struct p7_linkparam_s:
172  * used just within this .c, as part of setting up the clustering problem in
173  * the form that Easel's general SLC algorithm will take it.
174  */
175 struct p7_linkparam_s {
176   float min_overlap;	/* 0.8 means >= 80% overlap of (smaller/larger) segment is required, both in seq and hmm               */
177   int   of_smaller;	/* TRUE means overlap fraction is w.r.t. smaller segment; FALSE means w.r.t. larger segment            */
178   int   max_diagdiff;	/* 4 means either start or endpoints of two segments must be within <= 4 diagonals of each other       */
179   float min_posterior;	/* 0.25 means a cluster must occur w/ >= 25% posterior probability in the sample to be "significant"   */
180   float min_endpointp;	/* 0.02 means choose widest endpoint with post. prob. of at least 2%                                   */
181 };
182 
183 
184 /* link_spsamples():
185  *
186  * Defines the rule used for single linkage clustering of sampled
187  * domain coordinates. (API is dictated by Easel's general single
188  * linkage clustering routine.)
189  */
190 static int
link_spsamples(const void * v1,const void * v2,const void * prm,int * ret_link)191 link_spsamples(const void *v1, const void *v2, const void *prm, int *ret_link)
192 {
193   struct p7_spcoord_s   *h1    = (struct p7_spcoord_s *)   v1;
194   struct p7_spcoord_s   *h2    = (struct p7_spcoord_s *)   v2;
195   struct p7_linkparam_s *param = (struct p7_linkparam_s *) prm;
196   int  nov, n;
197   int  d1, d2;
198 
199   /* seq overlap test */
200   nov = ESL_MIN(h1->j, h2->j) - ESL_MAX(h1->i, h2->i) + 1;      /* overlap  */
201   n = (param->of_smaller ? ESL_MIN(h1->j - h1->i + 1,  h2->j - h2->i + 1) :     /* min length of the two hits */
202                            ESL_MAX(h1->j - h1->i + 1,  h2->j - h2->i + 1));      /* max length of the two hits */
203   if ((float) nov / (float) n  < param->min_overlap) { *ret_link = FALSE; return eslOK; }
204 
205   /* hmm overlap test */
206   nov = ESL_MIN(h1->m, h2->m) - ESL_MAX(h1->k, h2->k);
207   n   = (param->of_smaller ? ESL_MIN(h1->m - h1->k + 1,  h2->m - h2->k + 1) :
208                              ESL_MAX(h1->m - h1->k + 1,  h2->m - h2->k + 1));
209   if ((float) nov / (float)  n < param->min_overlap) { *ret_link = FALSE; return eslOK; }
210 
211   /* nearby diagonal test */
212   d1 = (h1->i - h1->k); d2 = (h2->i - h2->k);  if (abs(d1-d2) <= param->max_diagdiff) { *ret_link = TRUE; return eslOK; }
213   d1 = (h1->j - h1->m); d2 = (h2->j - h2->m);  if (abs(d1-d2) <= param->max_diagdiff) { *ret_link = TRUE; return eslOK; }
214 
215   *ret_link = FALSE;
216   return eslOK;
217 }
218 
219 /* cluster_orderer()
220  * is the routine that gets passed to qsort() to sort
221  * the significant clusters by order of occurrence on
222  * the target sequence
223  */
224 static int
cluster_orderer(const void * v1,const void * v2)225 cluster_orderer(const void *v1, const void *v2)
226 {
227   struct p7_spcoord_s   *h1    = (struct p7_spcoord_s *)   v1;
228   struct p7_spcoord_s   *h2    = (struct p7_spcoord_s *)   v2;
229 
230   if      (h1->i < h2->i) return -1;
231   else if (h1->i > h2->i) return 1;
232   else                    return 0;
233 }
234 
235 /* Function:  p7_spensemble_Cluster()
236  * Synopsis:  Cluster a seg pair ensemble and define domains.
237  * Incept:    SRE, Wed Jan  9 11:04:07 2008 [Janelia]
238  *
239  * Purpose:   Given a collected segment pair ensemble <sp>, cluster it;
240  *            identify significant clusters with high posterior probability;
241  *            and define consensus endpoints for each significant cluster.
242  *
243  *            Clustering is single-linkage. The linkage rule is
244  *            controlled by the <min_overlap>, <of_smaller>, and
245  *            <max_diagdiff> parameters. To be linked, two segments
246  *            must overlap by a fraction $\geq$ <min_overlap>,
247  *            relative to either the smaller or larger of the two
248  *            segments (<of_smaller = TRUE> or <FALSE>), in both their
249  *            sequence and model coords, and either the start or end of both
250  *            segments must lie within $\leq$ <max_diagdiff> diagonals
251  *            of each other.
252  *
253  *            The threshold for cluster "significance" is controlled
254  *            by the <min_posterior> parameter. Clusters with
255  *            posterior probability $\geq$ this threshold are called
256  *            significant.
257  *
258  *            Consensus endpoint definition within a cluster is
259  *            controlled by the <min_endpointp> parameter. The widest
260  *            endpoint that has a posterior probability of $\geq
261  *            min_endpointp> is chosen; this is done independently for
262  *            each coordinate (i,j,k,m).
263  *
264  *            A reasonable (and tested) parameterization is
265  *            <min_overlap = 0.8>, <of_smaller = TRUE>, <max_diagdiff
266  *            = 4>, <min_posterior = 0.25>, <min_endpointp = 0.02>.
267  *
268  * Args:      sp            - segment pair ensemble to cluster
269  *            min_overlap   - linkage requires fractional overlap >= this, in both seq and hmm segments
270  *            of_smaller    - overlap fraction denominators uses either the smaller (if TRUE) or larger (if FALSE) segment
271  *            max_diagdiff  - linkage requires that start, end points of both seg pairs are <= this
272  *            min_posterior - clusters with posterior prob >= this are defined as significant
273  *            min_endpointp - widest endpoint with post prob >= this is defined as consensus endpoint coord
274  *
275  * Returns:   the number of significant clusters in <*ret_nclusters>.
276  *            The caller can then obtain consensus endpoints for each cluster
277  *            by making a series of <p7_spensemble_GetClusterCoords()> calls.
278  *
279  */
280 int
p7_spensemble_Cluster(P7_SPENSEMBLE * sp,float min_overlap,int of_smaller,int max_diagdiff,float min_posterior,float min_endpointp,int * ret_nclusters)281 p7_spensemble_Cluster(P7_SPENSEMBLE *sp,
282 		      float min_overlap, int of_smaller, int max_diagdiff, float min_posterior, float min_endpointp,
283 		      int *ret_nclusters)
284 {
285   struct p7_linkparam_s param;
286   int status;
287   int c;
288   int h;
289   int idx_of_last;
290   int *ninc = NULL;
291   int cwindow_width;
292   int epc_threshold;
293   int imin, jmin, kmin, mmin;
294   int imax, jmax, kmax, mmax;
295   int best_i, best_j, best_k, best_m;
296 
297   /* set up a single linkage clustering problem for Easel's general routine */
298   param.min_overlap   = min_overlap;
299   param.of_smaller    = of_smaller;
300   param.max_diagdiff  = max_diagdiff;
301   param.min_posterior = min_posterior;
302   param.min_endpointp = min_endpointp;
303   if ((status = esl_cluster_SingleLinkage(sp->sp, sp->n, sizeof(struct p7_spcoord_s), link_spsamples, (void *) &param,
304 					  sp->workspace, sp->assignment, &(sp->nc))) != eslOK) goto ERROR;
305 
306   ESL_ALLOC(ninc, sizeof(int) * sp->nc);
307 
308   /* Look at each cluster in turn; most will be too small to worry about. */
309   for (c = 0; c < sp->nc; c++)
310     {
311       /* Calculate posterior probability of each cluster.
312        * The extra wrinkle here is that this probability is w.r.t the number of sampled traces;
313        * but the clusters might contain more than one seg pair from a given trace.
314        * That's what the idx_of_last logic is doing, avoiding double-counting.
315        */
316       idx_of_last = -1;
317       for (ninc[c] = 0, h = 0; h < sp->n; h++) {
318 	if (sp->assignment[h] == c) {
319 	  if (sp->sp[h].idx != idx_of_last) ninc[c]++;
320 	  idx_of_last = sp->sp[h].idx;
321 	}
322       }
323       /* Reject low probability clusters: */
324       if ((float) ninc[c] / (float) sp->nsamples < min_posterior) continue;
325 
326       /* Find the maximum extent of all seg pairs in this cluster in i,j k,m */
327       for (imin = 0, h = 0; h < sp->n; h++)
328 	if (sp->assignment[h] == c)
329 	  {
330 	    if (imin == 0) {
331 	      imin = imax = sp->sp[h].i;
332 	      jmin = jmax = sp->sp[h].j;
333 	      kmin = kmax = sp->sp[h].k;
334 	      mmin = mmax = sp->sp[h].m;
335 	    } else {
336 	      imin = ESL_MIN(imin, sp->sp[h].i);  imax = ESL_MAX(imax, sp->sp[h].i);
337 	      jmin = ESL_MIN(jmin, sp->sp[h].j);  jmax = ESL_MAX(jmax, sp->sp[h].j);
338 	      kmin = ESL_MIN(kmin, sp->sp[h].k);  kmax = ESL_MAX(kmax, sp->sp[h].k);
339 	      mmin = ESL_MIN(mmin, sp->sp[h].m);  mmax = ESL_MAX(mmax, sp->sp[h].m);
340 	    }
341 	  }
342 
343       /* Set up a window in which we can examine the end point distributions for i,j,k,m in turn, independently */
344       cwindow_width = ESL_MAX(ESL_MAX(imax-imin+1, jmax-jmin+1),
345 			      ESL_MAX(kmax-kmin+1, mmax-mmin+1));
346       if (cwindow_width > sp->epc_alloc) {
347 	void *p;
348 	ESL_RALLOC(sp->epc, p, sizeof(int) * cwindow_width);
349 	sp->epc_alloc = cwindow_width;
350       }
351 
352       epc_threshold = (int) ceilf((float) ninc[c] * min_endpointp); /* round up.  freq of >= epc_threshold means we're >= min_p */
353 
354       /* Identify the leftmost i that has enough endpoints. */
355       esl_vec_ISet(sp->epc, imax-imin+1, 0);
356       for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].i-imin]++;
357       for (best_i = imin; best_i <= imax; best_i++) if (sp->epc[best_i-imin] >= epc_threshold) break;
358       if (best_i > imax) best_i = imin + esl_vec_IArgMax(sp->epc, imax-imin+1);
359 
360       /* Identify the leftmost k that has enough endpoints */
361       esl_vec_ISet(sp->epc, kmax-kmin+1, 0);
362       for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].k-kmin]++;
363       for (best_k = kmin; best_k <= kmax; best_k++) if (sp->epc[best_k-kmin] >= epc_threshold) break;
364       if (best_k > kmax) best_k = kmin + esl_vec_IArgMax(sp->epc, kmax-kmin+1);
365 
366       /* Identify the rightmost j that has enough endpoints. */
367       esl_vec_ISet(sp->epc, jmax-jmin+1, 0);
368       for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].j-jmin]++;
369       for (best_j = jmax; best_j >= jmin; best_j--) if (sp->epc[best_j-jmin] >= epc_threshold) break;
370       if (best_j < jmin) best_j = jmin + esl_vec_IArgMax(sp->epc, jmax-jmin+1);
371 
372       /* Identify the rightmost m that has enough endpoints. */
373       esl_vec_ISet(sp->epc, mmax-mmin+1, 0);
374       for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].m-mmin]++;
375       for (best_m = mmax; best_m >= mmin; best_m--) if (sp->epc[best_m-mmin] >= epc_threshold) break;
376       if (best_m < mmin) best_m = mmin + esl_vec_IArgMax(sp->epc, mmax-mmin+1);
377 
378       /* If there's no well-defined domain (say, a long stretch of biased composition),
379 	 the coords above might come out inconsistent; in this case, just reject the domain.
380        */
381       if (best_i > best_j || best_k > best_m) continue;
382 
383       if (sp->nsigc >= sp->nsigc_alloc) {
384 	void *p;
385 	ESL_RALLOC(sp->sigc, p, sizeof(struct p7_spcoord_s) * sp->nsigc_alloc * 2);
386 	sp->nsigc_alloc *= 2;
387       }
388 
389       sp->sigc[sp->nsigc].i    = best_i;
390       sp->sigc[sp->nsigc].j    = best_j;
391       sp->sigc[sp->nsigc].k    = best_k;
392       sp->sigc[sp->nsigc].m    = best_m;
393       sp->sigc[sp->nsigc].idx  = c;
394       sp->sigc[sp->nsigc].prob = (float) ninc[c] / (float) sp->nsamples;
395       sp->nsigc++;
396     }
397 
398   /* Now we need to make sure those domains are ordered by start point,
399    * because later we're going to calculate overlaps by i_cur - j_prv
400    */
401   qsort((void *) sp->sigc, sp->nsigc, sizeof(struct p7_spcoord_s), cluster_orderer);
402 
403   free(ninc);
404   *ret_nclusters = sp->nsigc;
405   return eslOK;
406 
407  ERROR:
408   if (ninc != NULL) free(ninc);
409   *ret_nclusters = 0;
410   return status;
411 }
412 
413 /* Function:  p7_spensemble_GetClusterCoords()
414  * Synopsis:  Retrieve consensus coords of one significant segment pair cluster.
415  * Incept:    SRE, Wed Jan  9 11:39:27 2008 [Janelia]
416  *
417  * Purpose:   Retrieve the consensus coords of significant segment pair cluster <which>
418  *            from the ensemble <sp>, which has already been clustered with
419  *            <p7_spensemble_Cluster()>.
420  *
421  * Returns:   <eslOK> on success, and the consensus coords are in <*opt_i>, <*opt_j>,
422  *            <*opt_k>, and <*opt_m>; the (sampled) posterior probability of the
423  *            cluster is in <*opt_p>. All of these returned values are optional;
424  *            the caller can pass a <NULL> for any value it's not interested in
425  *            retrieving.
426  */
427 int
p7_spensemble_GetClusterCoords(P7_SPENSEMBLE * sp,int which,int * opt_i,int * opt_j,int * opt_k,int * opt_m,float * opt_p)428 p7_spensemble_GetClusterCoords(P7_SPENSEMBLE *sp, int which, int *opt_i, int *opt_j, int *opt_k, int *opt_m, float *opt_p)
429 {
430   if (opt_i != NULL) *opt_i = sp->sigc[which].i;
431   if (opt_j != NULL) *opt_j = sp->sigc[which].j;
432   if (opt_k != NULL) *opt_k = sp->sigc[which].k;
433   if (opt_m != NULL) *opt_m = sp->sigc[which].m;
434   if (opt_p != NULL) *opt_p = sp->sigc[which].prob;
435   return eslOK;
436 }
437 
438 
439 /* Function:  p7_spensemble_Destroy()
440  * Synopsis:  Deallocate a <P7_SPENSEMBLE>
441  * Incept:    SRE, Wed Jan  9 11:42:01 2008 [Janelia]
442  *
443  * Purpose:   Destroys a <P7_SPENSEMBLE>.
444  */
445 void
p7_spensemble_Destroy(P7_SPENSEMBLE * sp)446 p7_spensemble_Destroy(P7_SPENSEMBLE *sp)
447 {
448   if (sp == NULL) return;
449   if (sp->sp         != NULL) free(sp->sp);
450   if (sp->workspace  != NULL) free(sp->workspace);
451   if (sp->assignment != NULL) free(sp->assignment);
452   if (sp->epc        != NULL) free(sp->epc);
453   if (sp->sigc       != NULL) free(sp->sigc);
454   free(sp);
455 }
456 
457 
458 
459 /*****************************************************************
460  * Benchmark and example.
461  *****************************************************************/
462 
463 #ifdef p7SPENSEMBLE_EXAMPLE
464 /*
465    gcc -g -I. -L. -I ../easel -L ../easel -Dp7SPENSEMBLE_EXAMPLE -o example p7_spensemble.c -lhmmer -leasel -lm
466  */
467 #include "p7_config.h"
468 
469 #include <stdio.h>
470 #include <stdlib.h>
471 
472 #include "easel.h"
473 #include "esl_getopts.h"
474 #include "esl_random.h"
475 #include "esl_alphabet.h"
476 #include "esl_dmatrix.h"
477 #include "esl_sq.h"
478 #include "esl_sqio.h"
479 #include "esl_cluster.h"
480 #include "esl_vectorops.h"
481 #include "esl_stopwatch.h"
482 #include "hmmer.h"
483 
484 static ESL_OPTIONS options[] = {
485   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
486   { "-h",        eslARG_NONE,   FALSE, NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
487   { "-s",        eslARG_INT,     "42", NULL, NULL,  NULL,  NULL, NULL, "set random number seed to <n>",                    0 },
488   { "-N",        eslARG_INT,   "1000", NULL, NULL,  NULL,  NULL, NULL, "number of trace samples to take",                  0 },
489   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
490 };
491 static char usage[]  = "[-options] <hmmfile> <seqfile>";
492 static char banner[] = "example, test, benchmark of defining domains by posterior sampling";
493 
494 int
main(int argc,char ** argv)495 main(int argc, char **argv)
496 {
497   ESL_GETOPTS    *go      = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage);
498   ESL_STOPWATCH  *w       = esl_stopwatch_Create();
499   ESL_RANDOMNESS *r       = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
500   char           *hmmfile = esl_opt_GetArg(go, 1);
501   char           *seqfile = esl_opt_GetArg(go, 2);
502   int             format  = eslSQFILE_FASTA;
503   ESL_ALPHABET   *abc     = NULL;
504   P7_HMMFILE     *hfp     = NULL;
505   P7_HMM         *hmm     = NULL;
506   P7_BG          *bg      = NULL;
507   P7_PROFILE     *gm      = NULL;
508   P7_OPROFILE    *om      = NULL;
509   P7_OMX         *fwd     = NULL;
510   ESL_SQ         *sq      = NULL;
511   ESL_SQFILE     *sqfp    = NULL;
512   P7_TRACE       *tr      = NULL;
513   P7_SPENSEMBLE  *sp      = NULL;
514   int             N       = esl_opt_GetInteger(go, "-N");
515   int             t,d,nc;
516   int             i,j,k,m;
517   float           sc;
518   float           prob;
519   int             status;
520 
521   /* Read in one HMM */
522   if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
523   if (p7_hmmfile_Read(hfp, &abc, &hmm)            != eslOK) p7_Fail("Failed to read HMM");
524   p7_hmmfile_Close(hfp);
525 
526   /* Read in one sequence */
527   sq     = esl_sq_CreateDigital(abc);
528   status = esl_sqfile_Open(seqfile, format, NULL, &sqfp);
529   if      (status == eslENOTFOUND) p7_Fail("No such file.");
530   else if (status == eslEFORMAT)   p7_Fail("Format unrecognized.");
531   else if (status == eslEINVAL)    p7_Fail("Can't autodetect stdin or .gz.");
532   else if (status != eslOK)        p7_Fail("Open failed, code %d.", status);
533   if  (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence");
534   esl_sqfile_Close(sqfp);
535 
536   /* Configure a profile from the HMM */
537   bg = p7_bg_Create(abc);
538   p7_bg_SetLength(bg, sq->n);
539   gm = p7_profile_Create(hmm->M, abc);
540   om = p7_oprofile_Create(hmm->M, abc);
541   p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL);
542   p7_oprofile_Convert(gm, om);
543   p7_oprofile_ReconfigLength(om, sq->n);
544 
545   /* Allocate DP matrix for Forward, and run a Forward calculation in it */
546   fwd = p7_omx_Create(gm->M, sq->n, sq->n);
547   p7_Forward (sq->dsq, sq->n, om, fwd, &sc);
548 
549   /* Allocate a trace container, and an spensemble */
550   tr = p7_trace_Create();
551   sp = p7_spensemble_Create(1024, 64, 32);
552 
553   /* Start the stopwatch. Now we're in domain processing steps. */
554   esl_stopwatch_Start(w);
555 
556   /* Collect N traces, add their domain coords to the ensemble, and cluster */
557   for (t = 0; t < N; t++) {
558     p7_StochasticTrace(r, sq->dsq, sq->n, om, fwd, tr);
559     p7_trace_Index(tr);
560 
561     for (d = 0; d < tr->ndom; d++)
562       p7_spensemble_Add(sp, t, tr->sqfrom[d], tr->sqto[d], tr->hmmfrom[d], tr->hmmto[d]);
563     p7_trace_Reuse(tr);
564   }
565   p7_spensemble_Cluster(sp, 0.8, TRUE, 4, 0.25, 0.02, &nc);
566   for (d = 0; d < nc; d++) {
567     p7_spensemble_GetClusterCoords(sp, d, &i, &j, &k, &m, &prob);
568     printf("domain %-4d :  %6d %6d   %6d %6d   p=%.4f\n", d, i, j, k, m, prob);
569   }
570 
571   /* Done. */
572   esl_stopwatch_Stop(w);
573   esl_stopwatch_Display(stdout, w, "# CPU time: ");
574 
575   p7_spensemble_Destroy(sp);
576   p7_trace_Destroy(tr);
577   esl_sq_Destroy(sq);
578   p7_omx_Destroy(fwd);
579   p7_profile_Destroy(gm);
580   p7_oprofile_Destroy(om);
581   p7_bg_Destroy(bg);
582   p7_hmm_Destroy(hmm);
583   esl_stopwatch_Destroy(w);
584   esl_randomness_Destroy(r);
585   esl_alphabet_Destroy(abc);
586   esl_getopts_Destroy(go);
587   return 0;
588 }
589 #endif /*p7SPENSEMBLE_EXAMPLE*/
590