1 /* P7_TOPHITS: implementation of ranked list of top-scoring hits
2 *
3 * Contents:
4 * 1. The P7_TOPHITS object.
5 * 2. Standard (human-readable) output of pipeline results.
6 * 3. Tabular (parsable) output of pipeline results.
7 * 4. Benchmark driver.
8 * 5. Test driver.
9 */
10 #include "p7_config.h"
11
12 #include <stdlib.h>
13 #include <string.h>
14 #include <limits.h>
15
16 #include "easel.h"
17 #include "hmmer.h"
18
19 /*****************************************************************
20 *= 1. The P7_TOPHITS object
21 *****************************************************************/
22
23 /* Function: p7_tophits_Create()
24 * Synopsis: Allocate a hit list.
25 *
26 * Purpose: Allocates a new <P7_TOPHITS> hit list and return a pointer
27 * to it.
28 *
29 * Throws: <NULL> on allocation failure.
30 */
31 P7_TOPHITS *
p7_tophits_Create(void)32 p7_tophits_Create(void)
33 {
34 P7_TOPHITS *h = NULL;
35 int default_nalloc = 256;
36 int status;
37
38 ESL_ALLOC(h, sizeof(P7_TOPHITS));
39 h->hit = NULL;
40 h->unsrt = NULL;
41
42 ESL_ALLOC(h->hit, sizeof(P7_HIT *) * default_nalloc);
43 ESL_ALLOC(h->unsrt, sizeof(P7_HIT) * default_nalloc);
44 h->Nalloc = default_nalloc;
45 h->N = 0;
46 h->nreported = 0;
47 h->nincluded = 0;
48 h->is_sorted_by_sortkey = TRUE; /* but only because there's 0 hits */
49 h->is_sorted_by_seqidx = FALSE;
50 h->hit[0] = h->unsrt; /* if you're going to call it "sorted" when it contains just one hit, you need this */
51 return h;
52
53 ERROR:
54 p7_tophits_Destroy(h);
55 return NULL;
56 }
57
58
59 /* Function: p7_tophits_Grow()
60 * Synopsis: Reallocates a larger hit list, if needed.
61 *
62 * Purpose: If list <h> cannot hold another hit, doubles
63 * the internal allocation.
64 *
65 * Returns: <eslOK> on success.
66 *
67 * Throws: <eslEMEM> on allocation failure; in this case,
68 * the data in <h> are unchanged.
69 */
70 int
p7_tophits_Grow(P7_TOPHITS * h)71 p7_tophits_Grow(P7_TOPHITS *h)
72 {
73 void *p;
74 P7_HIT *ori = h->unsrt;
75 uint64_t Nalloc = h->Nalloc * 2; /* grow by doubling */
76 int i;
77 int status;
78
79 if (h->N < h->Nalloc) return eslOK; /* we have enough room for another hit */
80
81 ESL_RALLOC(h->hit, p, sizeof(P7_HIT *) * Nalloc);
82 ESL_RALLOC(h->unsrt, p, sizeof(P7_HIT) * Nalloc);
83
84 /* If we grow a sorted list, we have to translate the pointers
85 * in h->hit, because h->unsrt might have just moved in memory.
86 */
87 if (h->is_sorted_by_seqidx || h->is_sorted_by_sortkey)
88 {
89 for (i = 0; i < h->N; i++)
90 h->hit[i] = h->unsrt + (h->hit[i] - ori);
91 }
92
93 h->Nalloc = Nalloc;
94 return eslOK;
95
96 ERROR:
97 return eslEMEM;
98 }
99
100
101 /* Function: p7_tophits_CreateNextHit()
102 * Synopsis: Get pointer to new structure for recording a hit.
103 *
104 * Purpose: Ask the top hits object <h> to do any necessary
105 * internal allocation and bookkeeping to add a new,
106 * empty hit to its list; return a pointer to
107 * this new <P7_HIT> structure for data to be filled
108 * in by the caller.
109 *
110 * Returns: <eslOK> on success.
111 *
112 * Throws: <eslEMEM> on allocation error.
113 */
114 int
p7_tophits_CreateNextHit(P7_TOPHITS * h,P7_HIT ** ret_hit)115 p7_tophits_CreateNextHit(P7_TOPHITS *h, P7_HIT **ret_hit)
116 {
117 P7_HIT *hit = NULL;
118 int status;
119
120 if ((status = p7_tophits_Grow(h)) != eslOK) goto ERROR;
121
122 hit = &(h->unsrt[h->N]);
123 h->N++;
124 if (h->N >= 2)
125 {
126 h->is_sorted_by_seqidx = FALSE;
127 h->is_sorted_by_sortkey = FALSE;
128 }
129
130 hit->name = NULL;
131 hit->acc = NULL;
132 hit->desc = NULL;
133 hit->sortkey = 0.0;
134
135 hit->score = 0.0;
136 hit->pre_score = 0.0;
137 hit->sum_score = 0.0;
138
139 hit->lnP = 0.0;
140 hit->pre_lnP = 0.0;
141 hit->sum_lnP = 0.0;
142
143 hit->ndom = 0;
144 hit->nexpected = 0.0;
145 hit->nregions = 0;
146 hit->nclustered = 0;
147 hit->noverlaps = 0;
148 hit->nenvelopes = 0;
149
150 hit->flags = p7_HITFLAGS_DEFAULT;
151 hit->nreported = 0;
152 hit->nincluded = 0;
153 hit->best_domain = -1;
154 hit->dcl = NULL;
155 hit->offset = 0;
156
157 *ret_hit = hit;
158 return eslOK;
159
160 ERROR:
161 *ret_hit = NULL;
162 return status;
163 }
164
165
166
167 /* Function: p7_tophits_Add()
168 * Synopsis: Add a hit to the top hits list.
169 *
170 * Purpose: Adds a hit to the top hits list <h>.
171 *
172 * <name>, <acc>, and <desc> are copied, so caller may free
173 * them if it likes.
174 *
175 * Only the pointer <ali> is kept. Caller turns over memory
176 * management of <ali> to the top hits object; <ali> will
177 * be free'd when the top hits structure is free'd.
178 *
179 * Args: h - active top hit list
180 * name - name of target
181 * acc - accession of target (may be NULL)
182 * desc - description of target (may be NULL)
183 * sortkey - value to sort by: bigger is better
184 * score - score of this hit
185 * lnP - log P-value of this hit
186 * mothersc - score of parent whole sequence
187 * mother_lnP - log P-value of parent whole sequence
188 * sqfrom - 1..L pos in target seq of start
189 * sqto - 1..L pos; sqfrom > sqto if rev comp
190 * sqlen - length of sequence, L
191 * hmmfrom - 0..M+1 pos in HMM of start
192 * hmmto - 0..M+1 pos in HMM of end
193 * hmmlen - length of HMM, M
194 * domidx - number of this domain
195 * ndom - total # of domains in sequence
196 * ali - optional printable alignment info
197 *
198 * Returns: <eslOK> on success.
199 *
200 * Throws: <eslEMEM> if reallocation failed.
201 *
202 * Note: Is this actually used anywhere? (SRE, 10 Dec 08)
203 * I think it's not up to date.
204 *
205 * That's right. This function is obsolete.
206 * But it is used in benchmark and test code, so you can't
207 * delete it yet; benchmarks and test code should be
208 * revised (SRE, 26 Oct 09)
209 */
210 int
p7_tophits_Add(P7_TOPHITS * h,char * name,char * acc,char * desc,double sortkey,float score,double lnP,float mothersc,double mother_lnP,int sqfrom,int sqto,int sqlen,int hmmfrom,int hmmto,int hmmlen,int domidx,int ndom,P7_ALIDISPLAY * ali)211 p7_tophits_Add(P7_TOPHITS *h,
212 char *name, char *acc, char *desc,
213 double sortkey,
214 float score, double lnP,
215 float mothersc, double mother_lnP,
216 int sqfrom, int sqto, int sqlen,
217 int hmmfrom, int hmmto, int hmmlen,
218 int domidx, int ndom,
219 P7_ALIDISPLAY *ali)
220 {
221 int status;
222
223 if ((status = p7_tophits_Grow(h)) != eslOK) return status;
224 if ((status = esl_strdup(name, -1, &(h->unsrt[h->N].name))) != eslOK) return status;
225 if ((status = esl_strdup(acc, -1, &(h->unsrt[h->N].acc))) != eslOK) return status;
226 if ((status = esl_strdup(desc, -1, &(h->unsrt[h->N].desc))) != eslOK) return status;
227 h->unsrt[h->N].sortkey = sortkey;
228 h->unsrt[h->N].score = score;
229 h->unsrt[h->N].pre_score = 0.0;
230 h->unsrt[h->N].sum_score = 0.0;
231 h->unsrt[h->N].lnP = lnP;
232 h->unsrt[h->N].pre_lnP = 0.0;
233 h->unsrt[h->N].sum_lnP = 0.0;
234 h->unsrt[h->N].nexpected = 0;
235 h->unsrt[h->N].nregions = 0;
236 h->unsrt[h->N].nclustered = 0;
237 h->unsrt[h->N].noverlaps = 0;
238 h->unsrt[h->N].nenvelopes = 0;
239 h->unsrt[h->N].ndom = ndom;
240 h->unsrt[h->N].flags = 0;
241 h->unsrt[h->N].nreported = 0;
242 h->unsrt[h->N].nincluded = 0;
243 h->unsrt[h->N].best_domain= 0;
244 h->unsrt[h->N].dcl = NULL;
245 h->N++;
246
247 if (h->N >= 2) {
248 h->is_sorted_by_seqidx = FALSE;
249 h->is_sorted_by_sortkey = FALSE;
250 }
251 return eslOK;
252 }
253
254 /* hit_sorter(): qsort's pawn, below */
255 static int
hit_sorter_by_sortkey(const void * vh1,const void * vh2)256 hit_sorter_by_sortkey(const void *vh1, const void *vh2)
257 {
258 P7_HIT *h1 = *((P7_HIT **) vh1); /* don't ask. don't change. Don't Panic. */
259 P7_HIT *h2 = *((P7_HIT **) vh2);
260 int c;
261
262 if (h1->sortkey < h2->sortkey) return 1;
263 else if (h1->sortkey > h2->sortkey) return -1;
264 else {
265 if ( (c = strcmp(h1->name, h2->name)) != 0) return c;
266
267 /* if on different strand, the positive strand goes first, else use position */
268 int dir1 = (h1->dcl[0].iali < h1->dcl[0].jali ? 1 : -1);
269 int dir2 = (h2->dcl[0].iali < h2->dcl[0].jali ? 1 : -1);
270 if (dir1 != dir2) return dir2; // so if dir1 is pos (1), and dir2 is neg (-1), this will return -1, placing h1 before h2; otherwise, vice versa
271 else {
272 if (h1->dcl[0].iali > h2->dcl[0].iali) return 1;
273 else if(h1->dcl[0].iali < h2->dcl[0].iali) return -1;
274 else return 0;
275 }
276 }
277 }
278
279 /* used before duplicate hit removal in an nhmmer longtarget pipeline: */
280 static int
hit_sorter_by_seqidx_aliposition(const void * vh1,const void * vh2)281 hit_sorter_by_seqidx_aliposition(const void *vh1, const void *vh2)
282 {
283 P7_HIT *h1 = *((P7_HIT **) vh1);
284 P7_HIT *h2 = *((P7_HIT **) vh2);
285 int64_t s1, e1, s2, e2;
286 int dir1, dir2;
287
288 if (h1->seqidx > h2->seqidx) return 1; /* first key, seq_idx (unique id for sequences), low to high */
289 else if (h1->seqidx < h2->seqidx) return -1;
290
291 // if on different strand, the positive strand goes first, else use position
292 s1 = h1->dcl[0].iali; e1 = h1->dcl[0].jali; if (s1 < e1) { dir1 = 1; } else { dir1 = -1; ESL_SWAP(s1, e1, int64_t); }
293 s2 = h2->dcl[0].iali; e2 = h2->dcl[0].jali; if (s2 < e2) { dir2 = 1; } else { dir2 = -1; ESL_SWAP(s2, e2, int64_t); }
294
295 if (dir1 != dir2) return dir2; // so if dir1 is pos (1), and dir2 is neg (-1), this will return -1, placing h1 before h2; otherwise, vice versa
296
297 if (s1 > s2) return 1; // sort primarily from smallest to largest start pos
298 else if (s1 < s2) return -1;
299 else if (e1 < e2) return 1; // secondarily, larger to smallest end position (i.e. longer hit first)
300 else if (e1 > e2) return -1;
301 else return 0;
302 }
303
304 /* similarly, for nhmmscan longtarget pipeline: */
305 static int
hit_sorter_by_modelname_aliposition(const void * vh1,const void * vh2)306 hit_sorter_by_modelname_aliposition(const void *vh1, const void *vh2)
307 {
308 P7_HIT *h1 = *((P7_HIT **) vh1);
309 P7_HIT *h2 = *((P7_HIT **) vh2);
310 int64_t s1, e1, s2, e2;
311 int dir1, dir2;
312 int res;
313
314 if ( ( res = esl_strcmp(h1->name, h2->name)) != 0 ) return res; /* first key, seq_idx (unique id for sequences), low to high */
315
316 s1 = h1->dcl[0].iali; e1 = h1->dcl[0].jali; if (s1 < e1) { dir1 = 1; } else { dir1 = -1; ESL_SWAP(s1, e1, int64_t); }
317 s2 = h2->dcl[0].iali; e2 = h2->dcl[0].jali; if (s2 < e2) { dir2 = 1; } else { dir2 = -1; ESL_SWAP(s2, e2, int64_t); }
318
319 if (dir1 != dir2) return dir2;
320
321 if (s1 > s2) return 1;
322 else if (s1 < s2) return -1;
323 else if (e1 < e2) return 1;
324 else if (e1 > e2) return -1;
325 else return 0;
326 }
327
328
329 /* Function: p7_tophits_SortBySortkey()
330 * Synopsis: Sorts a hit list.
331 *
332 * Purpose: Sorts a top hit list. After this call,
333 * <h->hit[i]> points to the i'th ranked
334 * <P7_HIT> for all <h->N> hits.
335 *
336 * Returns: <eslOK> on success.
337 */
338 int
p7_tophits_SortBySortkey(P7_TOPHITS * h)339 p7_tophits_SortBySortkey(P7_TOPHITS *h)
340 {
341 int i;
342
343 if (h->is_sorted_by_sortkey) return eslOK;
344 for (i = 0; i < h->N; i++) h->hit[i] = h->unsrt + i;
345 if (h->N > 1) qsort(h->hit, h->N, sizeof(P7_HIT *), hit_sorter_by_sortkey);
346 h->is_sorted_by_seqidx = FALSE;
347 h->is_sorted_by_sortkey = TRUE;
348 return eslOK;
349 }
350
351
352 /* Function: p7_tophits_SortBySeqidxAndAlipos()
353 * Synopsis: Sorts a hit list by sequence index and position for nhmmer.
354 * sequence at which the hit's first domain begins (used in nhmmer)
355 *
356 * Purpose: Sorts a top hit list, suitable for subsequent hit
357 * duplicate removal by `p7_tophits_RemoveDuplicates()` in
358 * an nhmmer longtarget pipeline: by sequence index, then
359 * by strand (+ before - hits), then by smallest-to-largest
360 * start position (in watson coords), then by
361 * largest-to-smallest end position (in watson
362 * coords). Correctness of logic of
363 * `p7_tophits_RemoveDuplicates()` is sensitive to this
364 * ordering.
365 *
366 * After this call, <h->hit[i]> points to the i'th ranked
367 * <P7_HIT> for all <h->N> hits.
368 *
369 * Returns: <eslOK> on success.
370 */
371 int
p7_tophits_SortBySeqidxAndAlipos(P7_TOPHITS * h)372 p7_tophits_SortBySeqidxAndAlipos(P7_TOPHITS *h)
373 {
374 int i;
375
376 if (h->is_sorted_by_seqidx) return eslOK;
377 for (i = 0; i < h->N; i++) h->hit[i] = h->unsrt + i;
378 if (h->N > 1) qsort(h->hit, h->N, sizeof(P7_HIT *), hit_sorter_by_seqidx_aliposition);
379 h->is_sorted_by_sortkey = FALSE;
380 h->is_sorted_by_seqidx = TRUE;
381 return eslOK;
382 }
383
384 /* Function: p7_tophits_SortByModelnameAndAlipos()
385 * Synopsis: Sorts a hit list by model name and position for nhmmscan.
386 *
387 * Purpose: Like `p7_tophits_SortBySeqidxAndAlipos()` but for
388 * nhmmscan, which sorts on model name not target seq idx.
389 *
390 * Returns: <eslOK> on success.
391 */
392 int
p7_tophits_SortByModelnameAndAlipos(P7_TOPHITS * h)393 p7_tophits_SortByModelnameAndAlipos(P7_TOPHITS *h)
394 {
395 int i;
396
397 if (h->is_sorted_by_seqidx) return eslOK;
398 for (i = 0; i < h->N; i++) h->hit[i] = h->unsrt + i;
399 if (h->N > 1) qsort(h->hit, h->N, sizeof(P7_HIT *), hit_sorter_by_modelname_aliposition);
400 h->is_sorted_by_sortkey = FALSE;
401 h->is_sorted_by_seqidx = TRUE;
402 return eslOK;
403 }
404
405
406 /* Function: p7_tophits_Merge()
407 * Synopsis: Merge two top hits lists.
408 *
409 * Purpose: Merge <h2> into <h1>. Upon return, <h1>
410 * contains the sorted, merged list. <h2>
411 * is effectively destroyed; caller should
412 * not access it further, and may as well free
413 * it immediately.
414 *
415 * Returns: <eslOK> on success.
416 *
417 * Throws: <eslEMEM> on allocation failure, and
418 * both <h1> and <h2> remain valid.
419 */
420 int
p7_tophits_Merge(P7_TOPHITS * h1,P7_TOPHITS * h2)421 p7_tophits_Merge(P7_TOPHITS *h1, P7_TOPHITS *h2)
422 {
423 void *p;
424 P7_HIT **new_hit = NULL;
425 P7_HIT *ori1 = h1->unsrt; /* original base of h1's data */
426 P7_HIT *new2;
427 int i,j,k;
428 uint64_t Nalloc = h1->N + h2->N;
429 int status;
430
431 if(h2->N <= 0) return eslOK;
432
433 /* Make sure the two lists are sorted */
434 if ((status = p7_tophits_SortBySortkey(h1)) != eslOK) goto ERROR;
435 if ((status = p7_tophits_SortBySortkey(h2)) != eslOK) goto ERROR;
436
437 /* Attempt our allocations, so we fail early if we fail.
438 * Reallocating h1->unsrt screws up h1->hit, so fix it.
439 */
440 ESL_RALLOC(h1->unsrt, p, sizeof(P7_HIT) * Nalloc);
441 ESL_ALLOC (new_hit, sizeof(P7_HIT *) * Nalloc);
442 for (i = 0; i < h1->N; i++)
443 h1->hit[i] = h1->unsrt + (h1->hit[i] - ori1);
444
445 /* Append h2's unsorted data array to h1. h2's data begin at <new2> */
446 new2 = h1->unsrt + h1->N;
447 memcpy(new2, h2->unsrt, sizeof(P7_HIT) * h2->N);
448
449 /* Merge the sorted hit lists */
450 for (i=0,j=0,k=0; i < h1->N && j < h2->N ; k++)
451 new_hit[k] = (hit_sorter_by_sortkey(&h1->hit[i], &h2->hit[j]) > 0) ? new2 + (h2->hit[j++] - h2->unsrt) : h1->hit[i++];
452 while (i < h1->N) new_hit[k++] = h1->hit[i++];
453 while (j < h2->N) new_hit[k++] = new2 + (h2->hit[j++] - h2->unsrt);
454
455 /* h2 now turns over management of name, acc, desc memory to h1;
456 * nullify its pointers, to prevent double free. */
457 for (i = 0; i < h2->N; i++)
458 {
459 h2->unsrt[i].name = NULL;
460 h2->unsrt[i].acc = NULL;
461 h2->unsrt[i].desc = NULL;
462 h2->unsrt[i].dcl = NULL;
463 }
464
465 /* Construct the new grown h1 */
466 free(h1->hit);
467 h1->hit = new_hit;
468 h1->Nalloc = Nalloc;
469 h1->N += h2->N;
470 /* and is_sorted is TRUE, as a side effect of p7_tophits_Sort() above. */
471 return eslOK;
472
473 ERROR:
474 if (new_hit != NULL) free(new_hit);
475 return status;
476 }
477
478 /* Function: p7_tophits_GetMaxPositionLength()
479 * Synopsis: Returns maximum position length in hit list (targets).
480 *
481 * Purpose: Returns the length of the longest hit location (start/end)
482 * of all the registered hits, in chars. This is useful when
483 * deciding how to format output.
484 *
485 * The maximum is taken over all registered hits. This
486 * opens a possible side effect: caller might print only
487 * the top hits, and the max name length in these top hits
488 * may be different than the max length over all the hits.
489 *
490 * Used specifically for nhmmer output, so expects only one
491 * domain per hit
492 *
493 * If there are no hits in <h>, or none of the
494 * hits have names, returns 0.
495 */
496 int
p7_tophits_GetMaxPositionLength(P7_TOPHITS * h)497 p7_tophits_GetMaxPositionLength(P7_TOPHITS *h)
498 {
499 int64_t i;
500 int max = 0;
501 int n;
502 char buffer [13];
503
504 for (i = 0; i < h->N; i++) {
505 if (h->unsrt[i].dcl[0].iali > 0) {
506 n = sprintf (buffer, "%" PRId64 "", h->unsrt[i].dcl[0].iali);
507 max = ESL_MAX(n, max);
508 n = sprintf (buffer, "%" PRId64 "", h->unsrt[i].dcl[0].jali);
509 max = ESL_MAX(n, max);
510 }
511 }
512 return max;
513 }
514
515 /* Function: p7_tophits_GetMaxNameLength()
516 * Synopsis: Returns maximum name length in hit list (targets).
517 *
518 * Purpose: Returns the maximum name length of all the registered
519 * hits, in chars. This is useful when deciding how to
520 * format output.
521 *
522 * The maximum is taken over all registered hits. This
523 * opens a possible side effect: caller might print only
524 * the top hits, and the max name length in these top hits
525 * may be different than the max length over all the hits.
526 *
527 * If there are no hits in <h>, or none of the
528 * hits have names, returns 0.
529 */
530 int
p7_tophits_GetMaxNameLength(P7_TOPHITS * h)531 p7_tophits_GetMaxNameLength(P7_TOPHITS *h)
532 {
533 int i, max, n;
534 for (max = 0, i = 0; i < h->N; i++)
535 if (h->unsrt[i].name != NULL) {
536 n = strlen(h->unsrt[i].name);
537 max = ESL_MAX(n, max);
538 }
539 return max;
540 }
541
542 /* Function: p7_tophits_GetMaxAccessionLength()
543 * Synopsis: Returns maximum accession length in hit list (targets).
544 *
545 * Purpose: Same as <p7_tophits_GetMaxNameLength()>, but for
546 * accessions. If there are no hits in <h>, or none
547 * of the hits have accessions, returns 0.
548 */
549 int
p7_tophits_GetMaxAccessionLength(P7_TOPHITS * h)550 p7_tophits_GetMaxAccessionLength(P7_TOPHITS *h)
551 {
552 int i, max, n;
553 for (max = 0, i = 0; i < h->N; i++)
554 if (h->unsrt[i].acc != NULL) {
555 n = strlen(h->unsrt[i].acc);
556 max = ESL_MAX(n, max);
557 }
558 return max;
559 }
560
561 /* Function: p7_tophits_GetMaxShownLength()
562 * Synopsis: Returns max shown name/accession length in hit list.
563 *
564 * Purpose: Same as <p7_tophits_GetMaxNameLength()>, but
565 * for the case when --acc is on, where
566 * we show accession if one is available, and
567 * fall back to showing the name if it is not.
568 * Returns the max length of whatever is being
569 * shown as the reported "name".
570 */
571 int
p7_tophits_GetMaxShownLength(P7_TOPHITS * h)572 p7_tophits_GetMaxShownLength(P7_TOPHITS *h)
573 {
574 int i, max, n;
575 for (max = 0, i = 0; i < h->N; i++)
576 {
577 if (h->unsrt[i].acc != NULL && h->unsrt[i].acc[0] != '\0')
578 {
579 n = strlen(h->unsrt[i].acc);
580 max = ESL_MAX(n, max);
581 }
582 else if (h->unsrt[i].name != NULL)
583 {
584 n = strlen(h->unsrt[i].name);
585 max = ESL_MAX(n, max);
586 }
587 }
588 return max;
589 }
590
591
592 /* Function: p7_tophits_Reuse()
593 * Synopsis: Reuse a hit list, freeing internals.
594 *
595 * Purpose: Reuse the tophits list <h>; save as
596 * many malloc/free cycles as possible,
597 * as opposed to <Destroy()>'ing it and
598 * <Create>'ing a new one.
599 */
600 int
p7_tophits_Reuse(P7_TOPHITS * h)601 p7_tophits_Reuse(P7_TOPHITS *h)
602 {
603 int i, j;
604
605 if (h == NULL) return eslOK;
606 if (h->unsrt != NULL)
607 {
608 for (i = 0; i < h->N; i++)
609 {
610 if (h->unsrt[i].name != NULL) free(h->unsrt[i].name);
611 if (h->unsrt[i].acc != NULL) free(h->unsrt[i].acc);
612 if (h->unsrt[i].desc != NULL) free(h->unsrt[i].desc);
613 if (h->unsrt[i].dcl != NULL) {
614 for (j = 0; j < h->unsrt[i].ndom; j++)
615 if (h->unsrt[i].dcl[j].ad != NULL) p7_alidisplay_Destroy(h->unsrt[i].dcl[j].ad);
616 free(h->unsrt[i].dcl);
617 }
618 }
619 }
620 h->N = 0;
621 h->is_sorted_by_seqidx = FALSE;
622 h->is_sorted_by_sortkey = TRUE; /* because there are 0 hits */
623 h->hit[0] = h->unsrt;
624 return eslOK;
625 }
626
627 /* Function: p7_tophits_Destroy()
628 * Synopsis: Frees a hit list.
629 */
630 void
p7_tophits_Destroy(P7_TOPHITS * h)631 p7_tophits_Destroy(P7_TOPHITS *h)
632 {
633 int i,j;
634 if (h == NULL) return;
635 if (h->hit != NULL) free(h->hit);
636 if (h->unsrt != NULL)
637 {
638 for (i = 0; i < h->N; i++)
639 {
640 if (h->unsrt[i].name != NULL) free(h->unsrt[i].name);
641 if (h->unsrt[i].acc != NULL) free(h->unsrt[i].acc);
642 if (h->unsrt[i].desc != NULL) free(h->unsrt[i].desc);
643 if (h->unsrt[i].dcl != NULL) {
644 for (j = 0; j < h->unsrt[i].ndom; j++) {
645 if (h->unsrt[i].dcl[j].ad != NULL) p7_alidisplay_Destroy(h->unsrt[i].dcl[j].ad);
646 if (h->unsrt[i].dcl[j].scores_per_pos != NULL) free (h->unsrt[i].dcl->scores_per_pos);
647 }
648 free(h->unsrt[i].dcl);
649 }
650 }
651 free(h->unsrt);
652 }
653 free(h);
654 return;
655 }
656 /*---------------- end, P7_TOPHITS object -----------------------*/
657
658
659
660
661
662
663 /*****************************************************************
664 * 2. Standard (human-readable) output of pipeline results
665 *****************************************************************/
666
667 /* workaround_bug_h74():
668 * Different envelopes, identical alignment
669 *
670 * Bug #h74, though extremely rare, arises from a limitation in H3's
671 * implementation of Forward/Backward, as follows:
672 *
673 * 1. A multidomain region is analyzed by stochastic clustering
674 * 2. Overlapping envelopes are found (w.r.t sequence coords), though
675 * trace clusters are distinct if HMM endpoints are also considered.
676 * 3. We have no facility for limiting Forward/Backward to a specified
677 * range of profile coordinates, so each envelope is passed to
678 * rescore_isolated_domain() and analyzed independently.
679 * 4. Optimal accuracy alignment may identify exactly the same alignment
680 * in the overlap region shared by the two envelopes.
681 *
682 * The disturbing result is two different envelopes that have
683 * identical alignments and alignment endpoints.
684 *
685 * The correct fix is to define envelopes not only by sequence
686 * endpoints but also by profile endpoints, passing them to
687 * rescore_isolated_domain(), and limiting F/B calculations to this
688 * pieces of the DP lattice. This requires a fair amount of work,
689 * adding to the optimized API.
690 *
691 * The workaround is to detect when there are duplicate alignments,
692 * and only display one. We show the one with the best bit score.
693 *
694 * If we ever implement envelope-limited versions of F/B, revisit this
695 * fix.
696 *
697 * SRE, Tue Dec 22 16:27:04 2009
698 * xref J5/130; notebook/2009/1222-hmmer-bug-h74
699 */
700 static int
workaround_bug_h74(P7_TOPHITS * th)701 workaround_bug_h74(P7_TOPHITS *th)
702 {
703 int h;
704 int d1, d2;
705 int dremoved;
706
707 for (h = 0; h < th->N; h++)
708 if (th->hit[h]->noverlaps)
709 {
710 for (d1 = 0; d1 < th->hit[h]->ndom; d1++)
711 for (d2 = d1+1; d2 < th->hit[h]->ndom; d2++)
712 if (th->hit[h]->dcl[d1].iali == th->hit[h]->dcl[d2].iali &&
713 th->hit[h]->dcl[d1].jali == th->hit[h]->dcl[d2].jali)
714 {
715 dremoved = (th->hit[h]->dcl[d1].bitscore >= th->hit[h]->dcl[d2].bitscore) ? d2 : d1;
716 if (th->hit[h]->dcl[dremoved].is_reported) { th->hit[h]->dcl[dremoved].is_reported = FALSE; th->hit[h]->nreported--; }
717 if (th->hit[h]->dcl[dremoved].is_included) { th->hit[h]->dcl[dremoved].is_included = FALSE; th->hit[h]->nincluded--; }
718 }
719 }
720 return eslOK;
721 }
722
723
724
725 /* Function: p7_tophits_ComputeNhmmerEvalues()
726 * Synopsis: Compute e-values based on pvalues and window sizes.
727 *
728 * Purpose: After nhmmer pipeline has completed, the th object contains
729 * hits where the p-values haven't yet been converted to
730 * e-values. That modification depends on an established
731 * number of sequences. In nhmmer, this is computed as N/W,
732 * for a database of N residues, where W is some standardized
733 * window length (nhmmer passes om->max_length). E-values are
734 * set here based on that formula. We also set the sortkey so
735 * the output will be sorted correctly.
736 *
737 * Returns: <eslOK> on success.
738 */
739 int
p7_tophits_ComputeNhmmerEvalues(P7_TOPHITS * th,double N,int W)740 p7_tophits_ComputeNhmmerEvalues(P7_TOPHITS *th, double N, int W)
741 {
742 int i; /* counters over hits */
743
744 for (i = 0; i < th->N ; i++)
745 {
746 th->unsrt[i].lnP += log((float)N / (float)W);
747 th->unsrt[i].dcl[0].lnP = th->unsrt[i].lnP;
748 th->unsrt[i].sortkey = -1.0 * th->unsrt[i].lnP;
749 }
750 return eslOK;
751 }
752
753
754 /* Function: p7_tophits_RemoveDuplicates()
755 * Synopsis: Remove overlapping hits.
756 *
757 * Purpose: After nhmmer pipeline has completed, the TopHits object may
758 * contain duplicates if the target was broken into overlapping
759 * windows. Scan through, and remove duplicates. Since the
760 * duplicates may be incomplete (one sequence is a partial
761 * hit because it's window didn't cover the full length of
762 * the hit), keep the one with better p-value
763 *
764 * Returns: <eslOK> on success.
765 */
766 int
p7_tophits_RemoveDuplicates(P7_TOPHITS * th,int using_bit_cutoffs)767 p7_tophits_RemoveDuplicates(P7_TOPHITS *th, int using_bit_cutoffs)
768 {
769 int i; /* counter over hits */
770 int j; /* previous un-duplicated hit */
771 int s_i, s_j, e_i, e_j, dir_i, dir_j, len_i, len_j;
772 int intersect_alistart, intersect_aliend, intersect_alilen;
773 int intersect_hmmstart, intersect_hmmend, intersect_hmmlen;
774 double p_i, p_j;
775 int remove;
776
777 if (th->N<2) return eslOK;
778
779 j=0;
780 for (i = 1; i < th->N; i++)
781 {
782 p_j = th->hit[j]->lnP;
783 s_j = th->hit[j]->dcl[0].iali;
784 e_j = th->hit[j]->dcl[0].jali;
785 dir_j = (s_j < e_j ? 1 : -1);
786 if (dir_j == -1) ESL_SWAP(s_j, e_j, int);
787 len_j = e_j - s_j + 1 ;
788
789 p_i = th->hit[i]->lnP;
790 s_i = th->hit[i]->dcl[0].iali;
791 e_i = th->hit[i]->dcl[0].jali;
792 dir_i = (s_i < e_i ? 1 : -1);
793 if (dir_i == -1) ESL_SWAP(s_i, e_i, int);
794 len_i = e_i - s_i + 1 ;
795
796 // these will only matter if seqidx and strand are the same
797 intersect_alistart = s_i>s_j ? s_i : s_j;
798 intersect_aliend = e_i<e_j ? e_i : e_j;
799 intersect_alilen = intersect_aliend - intersect_alistart + 1;
800
801 intersect_hmmstart = (th->hit[i]->dcl[0].ad->hmmfrom > th->hit[j]->dcl[0].ad->hmmfrom) ? th->hit[i]->dcl[0].ad->hmmfrom : th->hit[j]->dcl[0].ad->hmmfrom;
802 intersect_hmmend = (th->hit[i]->dcl[0].ad->hmmto < th->hit[j]->dcl[0].ad->hmmto) ? th->hit[i]->dcl[0].ad->hmmto : th->hit[j]->dcl[0].ad->hmmto;
803 intersect_hmmlen = intersect_hmmend - intersect_hmmstart + 1;
804
805 if ( esl_strcmp(th->hit[i]->name, th->hit[i-1]->name) == 0 && // same model
806 th->hit[i]->seqidx == th->hit[i-1]->seqidx && // same source sequence
807 dir_i == dir_j && // only bother removing if the overlapping hits are on the same strand
808 intersect_hmmlen > 0 && // only if they're both hitting similar parts of the model
809 (
810 ( s_i >= s_j-3 && s_i <= s_j+3) || // at least one side is essentially flush
811 ( e_i >= e_j-3 && e_i <= e_j+3) ||
812 ( intersect_alilen >= len_i * 0.95) || // or one of the hits covers >90% of the other
813 ( intersect_alilen >= len_j * 0.95)
814 ))
815 {
816 /* Force one to go unreported. I prefer to keep the one with the
817 * better e-value. This addresses two issues
818 * (1) longer hits sometimes encounter higher bias corrections,
819 * leading to lower scores; seems better to focus on the
820 * high-scoring heart of the alignment, if we have a
821 * choice
822 * (2) it is possible that a lower-scoring longer hit (see #1)
823 * that is close to threshold will pass the pipeline in
824 * one condition and not the other (e.g. --toponly, or
825 * single vs multi threaded), and if longer hits obscure
826 * shorter higher-scoring ones, a shorter "hit" might be
827 * lost by being obscured by a longer one that is subsequently
828 * removed due to insufficient score.
829 * see late notes in ~wheelert/notebook/2012/0518-dfam-scripts/00NOTES
830 */
831 //remove = 0; // 1 := keep i, 0 := keep i-1
832 remove = p_i < p_j ? j : i;
833
834 th->hit[remove]->flags |= p7_IS_DUPLICATE;
835 if (using_bit_cutoffs) { // report/include flags were already included, need to remove them here
836 th->hit[remove]->flags &= ~p7_IS_REPORTED;
837 th->hit[remove]->flags &= ~p7_IS_INCLUDED;
838 }
839
840 j = (remove == j ? i : j);
841 }
842 else j = i;
843 }
844 return eslOK;
845 }
846
847
848
849 /* Function: p7_tophits_Threshold()
850 * Synopsis: Apply score and E-value thresholds to a hitlist before output.
851 *
852 * Purpose: After a pipeline has completed, go through it and mark all
853 * the targets and domains that are "significant" (satisfying
854 * the reporting thresholds set for the pipeline).
855 *
856 * Also sets the final total number of reported and
857 * included targets, the number of reported and included
858 * targets in each target, and the size of the search space
859 * for per-domain conditional E-value calculations,
860 * <pli->domZ>. By default, <pli->domZ> is the number of
861 * significant targets reported.
862 *
863 * If model-specific thresholds were used in the pipeline,
864 * we cannot apply those thresholds now. They were already
865 * applied in the pipeline. In this case all we're
866 * responsible for here is counting them (setting
867 * nreported, nincluded counters).
868 *
869 * Returns: <eslOK> on success.
870 */
871 int
p7_tophits_Threshold(P7_TOPHITS * th,P7_PIPELINE * pli)872 p7_tophits_Threshold(P7_TOPHITS *th, P7_PIPELINE *pli)
873 {
874 int h, d; /* counters over sequence hits, domains in sequences */
875
876 /* Flag reported, included targets (if we're using general thresholds) */
877 if (! pli->use_bit_cutoffs)
878 {
879 for (h = 0; h < th->N; h++)
880 {
881
882 if ( !(th->hit[h]->flags & p7_IS_DUPLICATE) &&
883 p7_pli_TargetReportable(pli, th->hit[h]->score, th->hit[h]->lnP))
884 {
885 th->hit[h]->flags |= p7_IS_REPORTED;
886 if (p7_pli_TargetIncludable(pli, th->hit[h]->score, th->hit[h]->lnP))
887 th->hit[h]->flags |= p7_IS_INCLUDED;
888
889 if (pli->long_targets) { // no domains in dna search, so:
890 th->hit[h]->dcl[0].is_reported = th->hit[h]->flags & p7_IS_REPORTED;
891 th->hit[h]->dcl[0].is_included = th->hit[h]->flags & p7_IS_INCLUDED;
892 }
893 }
894 }
895 }
896
897 /* Count reported, included targets */
898 th->nreported = 0;
899 th->nincluded = 0;
900 for (h = 0; h < th->N; h++)
901 {
902 if (th->hit[h]->flags & p7_IS_REPORTED) th->nreported++;
903 if (th->hit[h]->flags & p7_IS_INCLUDED) th->nincluded++;
904 }
905
906 /* Now we can determined domZ, the effective search space in which additional domains are found */
907 if (pli->domZ_setby == p7_ZSETBY_NTARGETS) pli->domZ = (double) th->nreported;
908
909
910 /* Second pass is over domains, flagging reportable/includable ones.
911 * Depends on knowing the domZ we just set.
912 * Note how this enforces a hierarchical logic of
913 * (sequence|domain) must be reported to be included, and
914 * domain can only be (reported|included) if whole sequence is too.
915 */
916 if (! pli->use_bit_cutoffs && !pli->long_targets)
917 {
918 for (h = 0; h < th->N; h++)
919 {
920 if (th->hit[h]->flags & p7_IS_REPORTED)
921 {
922 for (d = 0; d < th->hit[h]->ndom; d++)
923 {
924 if (p7_pli_DomainReportable(pli, th->hit[h]->dcl[d].bitscore, th->hit[h]->dcl[d].lnP))
925 th->hit[h]->dcl[d].is_reported = TRUE;
926 if ((th->hit[h]->flags & p7_IS_INCLUDED) &&
927 p7_pli_DomainIncludable(pli, th->hit[h]->dcl[d].bitscore, th->hit[h]->dcl[d].lnP))
928 th->hit[h]->dcl[d].is_included = TRUE;
929 }
930 }
931 }
932 }
933
934 /* Count the reported, included domains */
935 for (h = 0; h < th->N; h++)
936 for (d = 0; d < th->hit[h]->ndom; d++)
937 {
938 if (th->hit[h]->dcl[d].is_reported) th->hit[h]->nreported++;
939 if (th->hit[h]->dcl[d].is_included) th->hit[h]->nincluded++;
940 }
941
942 workaround_bug_h74(th); /* blech. This function is defined above; see commentary and crossreferences there. */
943
944 return eslOK;
945 }
946
947
948
949
950
951 /* Function: p7_tophits_CompareRanking()
952 * Synopsis: Compare current top hits to previous top hits ranking.
953 *
954 * Purpose: Using a keyhash <kh> of the previous top hits and the
955 * their ranks, look at the current top hits list <th>
956 * and flag new hits that are included for the first time
957 * (by setting <p7_IS_NEW> flag) and hits that were
958 * included previously, but are now below the inclusion
959 * threshold in the list (<by setting <p7_IS_DROPPED>
960 * flag).
961 *
962 * The <th> must already have been processed by
963 * <p7_tophits_Threshold()>. We assume the <is_included>,
964 * <is_reported> flags are set on the appropriate hits.
965 *
966 * Upon return, the keyhash <kh> is updated to hash the
967 * current top hits list and their ranks.
968 *
969 * Optionally, <*opt_nnew> is set to the number of
970 * newly included hits. jackhmmer uses this as part of
971 * its convergence criteria, for example.
972 *
973 * These flags affect output of top target hits from
974 * <p7_tophits_Targets()>.
975 *
976 * It only makes sense to call this function in context of
977 * an iterative search.
978 *
979 * The <p7_IS_NEW> flag is comprehensive: all new hits
980 * are flagged (and counted in <*opt_nnew>). The <p7_WAS_DROPPED>
981 * flag is not comprehensive: only those hits that still
982 * appear in the current top hits list are flagged. If a
983 * hit dropped entirely off the list, it isn't counted
984 * as "dropped". (This could be done, but we would want
985 * to have two keyhashes, one old and one new, to do the
986 * necessary comparisons efficiently.)
987 *
988 * If the target names in <th> are not unique, results may
989 * be strange.
990 *
991 * Args: th - current top hits list
992 * kh - hash of top hits' ranks (in: previous tophits; out: <th>'s tophits)
993 * opt_nnew - optRETURN: number of new hits above inclusion threshold
994 *
995 * Returns: <eslOK> on success.
996 *
997 * Throws: <eslEMEM> if <kh> needed to be reallocated but this failed.
998 */
999 int
p7_tophits_CompareRanking(P7_TOPHITS * th,ESL_KEYHASH * kh,int * opt_nnew)1000 p7_tophits_CompareRanking(P7_TOPHITS *th, ESL_KEYHASH *kh, int *opt_nnew)
1001 {
1002 int nnew = 0;
1003 int oldrank;
1004 int h;
1005 int status;
1006
1007 /* Flag the hits in the list with whether they're new in the included top hits,
1008 * and whether they've dropped off the included list.
1009 */
1010 for (h = 0; h < th->N; h++)
1011 {
1012 esl_keyhash_Lookup(kh, th->hit[h]->name, -1, &oldrank);
1013
1014 if (th->hit[h]->flags & p7_IS_INCLUDED)
1015 {
1016 if (oldrank == -1) { th->hit[h]->flags |= p7_IS_NEW; nnew++; }
1017 }
1018 else
1019 {
1020 if (oldrank >= 0) th->hit[h]->flags |= p7_IS_DROPPED;
1021 }
1022 }
1023
1024 /* Replace the old rank list with the new one */
1025 esl_keyhash_Reuse(kh);
1026 for (h = 0; h < th->N; h++)
1027 {
1028 if (th->hit[h]->flags & p7_IS_INCLUDED)
1029 {
1030 /* What happens when the same sequence name appears twice? It gets stored with higher rank */
1031 status = esl_keyhash_Store(kh, th->hit[h]->name, -1, NULL);
1032 if (status != eslOK && status != eslEDUP) goto ERROR;
1033 }
1034 }
1035
1036 if (opt_nnew != NULL) *opt_nnew = nnew;
1037 return eslOK;
1038
1039 ERROR:
1040 if (opt_nnew != NULL) *opt_nnew = 0;
1041 return status;
1042 }
1043
1044
1045 /* Function: p7_tophits_Targets()
1046 * Synopsis: Format and write a top target hits list to an output stream.
1047 *
1048 * Purpose: Output a list of the reportable top target hits in <th>
1049 * in human-readable ASCII text format to stream <ofp>, using
1050 * final pipeline accounting stored in <pli>.
1051 *
1052 * The tophits list <th> should already be sorted (see
1053 * <p7_tophits_Sort()> and thresholded (see
1054 * <p7_tophits_Threshold>).
1055 *
1056 * Returns: <eslOK> on success.
1057 *
1058 * Throws: <eslEWRITE> on write failure.
1059 */
1060 int
p7_tophits_Targets(FILE * ofp,P7_TOPHITS * th,P7_PIPELINE * pli,int textw)1061 p7_tophits_Targets(FILE *ofp, P7_TOPHITS *th, P7_PIPELINE *pli, int textw)
1062 {
1063 char newness;
1064 int h;
1065 int d;
1066 int namew;
1067 int posw;
1068 int descw;
1069 char *showname;
1070
1071 int have_printed_incthresh = FALSE;
1072
1073 /* when --acc is on, we'll show accession if available, and fall back to name */
1074 if (pli->show_accessions) namew = ESL_MAX(8, p7_tophits_GetMaxShownLength(th));
1075 else namew = ESL_MAX(8, p7_tophits_GetMaxNameLength(th));
1076
1077
1078 if (pli->long_targets)
1079 {
1080 posw = ESL_MAX(6, p7_tophits_GetMaxPositionLength(th));
1081
1082 if (textw > 0) descw = ESL_MAX(32, textw - namew - 2*posw - 32); /* 32 chars excluding desc and two posw's is from the format: 2 + 9+2 +6+2 +5+2 +<name>+1 +<startpos>+1 +<endpos>+1 +1 */
1083 else descw = 0; /* unlimited desc length is handled separately */
1084
1085 if (fprintf(ofp, "Scores for complete hit%s:\n", pli->mode == p7_SEARCH_SEQS ? "s" : "") < 0)
1086 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1087 if (fprintf(ofp, " %9s %6s %5s %-*s %*s %*s %s\n",
1088 "E-value", " score", " bias", namew, (pli->mode == p7_SEARCH_SEQS ? "Sequence":"Model"), posw, "start", posw, "end", "Description") < 0)
1089 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1090 if (fprintf(ofp, " %9s %6s %5s %-*s %*s %*s %s\n",
1091 "-------", "------", "-----", namew, "--------", posw, "-----", posw, "-----", "-----------") < 0)
1092 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1093 }
1094 else
1095 {
1096
1097 if (textw > 0) descw = ESL_MAX(32, textw - namew - 61); /* 61 chars excluding desc is from the format: 2 + 22+2 +22+2 +8+2 +<name>+1 */
1098 else descw = 0; /* unlimited desc length is handled separately */
1099
1100
1101 /* The minimum width of the target table is 111 char: 47 from fields, 8 from min name, 32 from min desc, 13 spaces */
1102 if (fprintf(ofp, "Scores for complete sequence%s (score includes all domains):\n", pli->mode == p7_SEARCH_SEQS ? "s" : "") < 0)
1103 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1104 if (fprintf(ofp, " %22s %22s %8s\n", " --- full sequence ---", " --- best 1 domain ---", "-#dom-") < 0)
1105 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1106 if (fprintf(ofp, " %9s %6s %5s %9s %6s %5s %5s %2s %-*s %s\n",
1107 "E-value", " score", " bias", "E-value", " score", " bias", " exp", "N", namew, (pli->mode == p7_SEARCH_SEQS ? "Sequence":"Model"), "Description") < 0)
1108 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1109 if (fprintf(ofp, " %9s %6s %5s %9s %6s %5s %5s %2s %-*s %s\n",
1110 "-------", "------", "-----", "-------", "------", "-----", " ----", "--", namew, "--------", "-----------") < 0)
1111 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1112 }
1113
1114 for (h = 0; h < th->N; h++)
1115 {
1116 if (th->hit[h]->flags & p7_IS_REPORTED)
1117 {
1118 d = th->hit[h]->best_domain;
1119
1120 if (! (th->hit[h]->flags & p7_IS_INCLUDED) && ! have_printed_incthresh)
1121 {
1122 if (fprintf(ofp, " ------ inclusion threshold ------\n") < 0)
1123 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1124 have_printed_incthresh = TRUE;
1125 }
1126
1127 if (pli->show_accessions)
1128 { /* the --acc option: report accessions rather than names if possible */
1129 if (th->hit[h]->acc != NULL && th->hit[h]->acc[0] != '\0') showname = th->hit[h]->acc;
1130 else showname = th->hit[h]->name;
1131 }
1132 else
1133 showname = th->hit[h]->name;
1134
1135 if (th->hit[h]->flags & p7_IS_NEW) newness = '+';
1136 else if (th->hit[h]->flags & p7_IS_DROPPED) newness = '-';
1137 else newness = ' ';
1138
1139 if (pli->long_targets)
1140 {
1141 if (fprintf(ofp, "%c %9.2g %6.1f %5.1f %-*s %*" PRId64 " %*" PRId64 "",
1142 newness,
1143 exp(th->hit[h]->lnP), // * pli->Z,
1144 th->hit[h]->score,
1145 eslCONST_LOG2R * th->hit[h]->dcl[d].dombias, // an nhmmer hit is really a domain, so this is the hit's bias correction
1146 namew, showname,
1147 posw, th->hit[h]->dcl[d].iali,
1148 posw, th->hit[h]->dcl[d].jali) < 0)
1149 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1150 }
1151 else
1152 {
1153 if (fprintf(ofp, "%c %9.2g %6.1f %5.1f %9.2g %6.1f %5.1f %5.1f %2d %-*s ",
1154 newness,
1155 exp(th->hit[h]->lnP) * pli->Z,
1156 th->hit[h]->score,
1157 th->hit[h]->pre_score - th->hit[h]->score, /* bias correction */
1158 exp(th->hit[h]->dcl[d].lnP) * pli->Z,
1159 th->hit[h]->dcl[d].bitscore,
1160 eslCONST_LOG2R * th->hit[h]->dcl[d].dombias, /* convert NATS to BITS at last moment */
1161 th->hit[h]->nexpected,
1162 th->hit[h]->nreported,
1163 namew, showname) < 0)
1164 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1165 }
1166
1167 if (textw > 0)
1168 {
1169 if (fprintf(ofp, " %-.*s\n", descw, th->hit[h]->desc == NULL ? "" : th->hit[h]->desc) < 0)
1170 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1171 }
1172 else
1173 {
1174 if (fprintf(ofp, " %s\n", th->hit[h]->desc == NULL ? "" : th->hit[h]->desc) < 0)
1175 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1176 }
1177 /* do NOT use *s with unlimited (INT_MAX) line length. Some systems
1178 * have an fprintf() bug here (we found one on an Opteron/SUSE Linux
1179 * system (#h66)
1180 */
1181 }
1182 }
1183
1184 if (th->nreported == 0)
1185 {
1186 if (fprintf(ofp, "\n [No hits detected that satisfy reporting thresholds]\n") < 0)
1187 ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1188 }
1189 return eslOK;
1190 }
1191
1192
1193 /* Function: p7_tophits_Domains()
1194 * Synopsis: Standard output format for top domain hits and alignments.
1195 *
1196 * Purpose: For each reportable target sequence, output a tabular summary
1197 * of reportable domains found in it, followed by alignments of
1198 * each domain.
1199 *
1200 * Similar to <p7_tophits_Targets()>; see additional notes there.
1201 *
1202 * Returns: <eslOK> on success.
1203 *
1204 * Throws: <eslEWRITE> if a write to <ofp> fails; for example, if
1205 * the disk fills up.
1206 */
1207 int
p7_tophits_Domains(FILE * ofp,P7_TOPHITS * th,P7_PIPELINE * pli,int textw)1208 p7_tophits_Domains(FILE *ofp, P7_TOPHITS *th, P7_PIPELINE *pli, int textw)
1209 {
1210 int h, d;
1211 int nd;
1212 int namew, descw;
1213 char *showname;
1214 int status;
1215
1216 if (pli->long_targets)
1217 {
1218 if (fprintf(ofp, "Annotation for each hit %s:\n",
1219 pli->show_alignments ? " (and alignments)" : "") < 0)
1220 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1221 }
1222 else
1223 {
1224 if (fprintf(ofp, "Domain annotation for each %s%s:\n",
1225 pli->mode == p7_SEARCH_SEQS ? "sequence" : "model",
1226 pli->show_alignments ? " (and alignments)" : "") < 0)
1227 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1228 }
1229
1230 for (h = 0; h < th->N; h++)
1231 if (th->hit[h]->flags & p7_IS_REPORTED)
1232 {
1233 if (pli->show_accessions && th->hit[h]->acc != NULL && th->hit[h]->acc[0] != '\0')
1234 {
1235 showname = th->hit[h]->acc;
1236 namew = strlen(th->hit[h]->acc);
1237 }
1238 else
1239 {
1240 showname = th->hit[h]->name;
1241 namew = strlen(th->hit[h]->name);
1242 }
1243
1244 if (textw > 0)
1245 {
1246 descw = ESL_MAX(32, textw - namew - 5);
1247 if (fprintf(ofp, ">> %s %-.*s\n", showname, descw, (th->hit[h]->desc == NULL ? "" : th->hit[h]->desc)) < 0)
1248 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1249 }
1250 else
1251 {
1252 if (fprintf(ofp, ">> %s %s\n", showname, (th->hit[h]->desc == NULL ? "" : th->hit[h]->desc)) < 0)
1253 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1254 }
1255
1256 if (th->hit[h]->nreported == 0)
1257 {
1258 if (fprintf(ofp," [No individual domains that satisfy reporting thresholds (although complete target did)]\n\n") < 0)
1259 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1260 continue;
1261 }
1262
1263
1264 if (pli->long_targets)
1265 {
1266 /* The dna hit table is 119 char wide:
1267 score bias Evalue hmmfrom hmm to alifrom ali to envfrom env to hqfrom hq to sq len acc
1268 ------ ----- --------- ------- ------- --------- --------- --------- --------- --------- --------- --------- ----
1269 ! 82.7 104.4 4.9e-22 782 998 .. 241981174 241980968 .. 241981174 241980966 .. 241981174 241980968 234234233 0.78
1270 */
1271 if (fprintf(ofp, " %6s %5s %9s %9s %9s %2s %9s %9s %2s %9s %9s %9s %2s %4s\n", "score", "bias", " Evalue", "hmmfrom", "hmm to", " ", " alifrom ", " ali to ", " ", " envfrom ", " env to ", (pli->mode == p7_SEARCH_SEQS ? " sq len " : " mod len "), " ", "acc") < 0)
1272 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1273 if (fprintf(ofp, " %6s %5s %9s %9s %9s %2s %9s %9s %2s %9s %9s %9s %2s %4s\n", "------", "-----", "---------", "-------", "-------", " ", "---------", "---------", " ", "---------", "---------", "---------", " ", "----") < 0)
1274 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1275 }
1276 else
1277 {
1278 /* The domain table is 101 char wide:
1279 # score bias c-Evalue i-Evalue hmmfrom hmmto alifrom ali to envfrom env to acc
1280 --- ------ ----- --------- --------- ------- ------- ------- ------- ------- ------- ----
1281 1 ? 123.4 23.1 9.7e-11 6.8e-9 3 1230 .. 1 492 [] 2 490 .] 0.90
1282 123 ! 1234.5 123.4 123456789 123456789 1234567 1234567 .. 1234567 1234567 [] 1234567 1234568 .] 0.12
1283 */
1284 if (fprintf(ofp, " %3s %6s %5s %9s %9s %7s %7s %2s %7s %7s %2s %7s %7s %2s %4s\n", "#", "score", "bias", "c-Evalue", "i-Evalue", "hmmfrom", "hmm to", " ", "alifrom", "ali to", " ", "envfrom", "env to", " ", "acc") < 0)
1285 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1286 if (fprintf(ofp, " %3s %6s %5s %9s %9s %7s %7s %2s %7s %7s %2s %7s %7s %2s %4s\n", "---", "------", "-----", "---------", "---------", "-------", "-------", " ", "-------", "-------", " ", "-------", "-------", " ", "----") < 0)
1287 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1288 }
1289
1290
1291 /* Domain hit table for each reported domain in this reported sequence. */
1292 nd = 0;
1293 for (d = 0; d < th->hit[h]->ndom; d++)
1294 {
1295 if (th->hit[h]->dcl[d].is_reported)
1296 {
1297 nd++;
1298 if (pli->long_targets)
1299 {
1300 if (fprintf(ofp, " %c %6.1f %5.1f %9.2g %9d %9d %c%c %9" PRId64 " %9" PRId64 " %c%c %9" PRId64 " %9" PRId64 " %c%c %9" PRId64 " %4.2f\n",
1301 //nd,
1302 th->hit[h]->dcl[d].is_included ? '!' : '?',
1303 th->hit[h]->dcl[d].bitscore,
1304 th->hit[h]->dcl[d].dombias * eslCONST_LOG2R, /* convert NATS to BITS at last moment */
1305 exp(th->hit[h]->dcl[d].lnP),
1306 th->hit[h]->dcl[d].ad->hmmfrom,
1307 th->hit[h]->dcl[d].ad->hmmto,
1308 (th->hit[h]->dcl[d].ad->hmmfrom == 1) ? '[' : '.',
1309 (th->hit[h]->dcl[d].ad->hmmto == th->hit[h]->dcl[d].ad->M) ? ']' : '.',
1310 th->hit[h]->dcl[d].ad->sqfrom,
1311 th->hit[h]->dcl[d].ad->sqto,
1312 (th->hit[h]->dcl[d].ad->sqfrom == 1) ? '[' : '.',
1313 (th->hit[h]->dcl[d].ad->sqto == th->hit[h]->dcl[d].ad->L) ? ']' : '.',
1314 th->hit[h]->dcl[d].ienv,
1315 th->hit[h]->dcl[d].jenv,
1316 (th->hit[h]->dcl[d].ienv == 1) ? '[' : '.',
1317 (th->hit[h]->dcl[d].jenv == th->hit[h]->dcl[d].ad->L) ? ']' : '.',
1318 th->hit[h]->dcl[d].ad->L,
1319 (th->hit[h]->dcl[d].oasc / (1.0 + fabs((float) (th->hit[h]->dcl[d].jenv - th->hit[h]->dcl[d].ienv))))) < 0)
1320 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1321 }
1322 else
1323 {
1324 if (fprintf(ofp, " %3d %c %6.1f %5.1f %9.2g %9.2g %7d %7d %c%c",
1325 nd,
1326 th->hit[h]->dcl[d].is_included ? '!' : '?',
1327 th->hit[h]->dcl[d].bitscore,
1328 th->hit[h]->dcl[d].dombias * eslCONST_LOG2R, /* convert NATS to BITS at last moment */
1329 exp(th->hit[h]->dcl[d].lnP) * pli->domZ,
1330 exp(th->hit[h]->dcl[d].lnP) * pli->Z,
1331 th->hit[h]->dcl[d].ad->hmmfrom,
1332 th->hit[h]->dcl[d].ad->hmmto,
1333 (th->hit[h]->dcl[d].ad->hmmfrom == 1) ? '[' : '.',
1334 (th->hit[h]->dcl[d].ad->hmmto == th->hit[h]->dcl[d].ad->M ) ? ']' : '.') < 0)
1335 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1336
1337 if (fprintf(ofp, " %7" PRId64 " %7" PRId64 " %c%c",
1338 th->hit[h]->dcl[d].ad->sqfrom,
1339 th->hit[h]->dcl[d].ad->sqto,
1340 (th->hit[h]->dcl[d].ad->sqfrom == 1) ? '[' : '.',
1341 (th->hit[h]->dcl[d].ad->sqto == th->hit[h]->dcl[d].ad->L) ? ']' : '.') < 0)
1342 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1343
1344 if (fprintf(ofp, " %7" PRId64 " %7" PRId64 " %c%c",
1345 th->hit[h]->dcl[d].ienv,
1346 th->hit[h]->dcl[d].jenv,
1347 (th->hit[h]->dcl[d].ienv == 1) ? '[' : '.',
1348 (th->hit[h]->dcl[d].jenv == th->hit[h]->dcl[d].ad->L) ? ']' : '.') < 0)
1349 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1350
1351 if (fprintf(ofp, " %4.2f\n",
1352 (th->hit[h]->dcl[d].oasc / (1.0 + fabs((float) (th->hit[h]->dcl[d].jenv - th->hit[h]->dcl[d].ienv))))) < 0)
1353 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1354 }
1355
1356 }
1357 } // end of domain table in this reported sequence.
1358
1359 /* Alignment data for each reported domain in this reported sequence. */
1360 if (pli->show_alignments)
1361 {
1362 if (pli->long_targets)
1363 {
1364 if (fprintf(ofp, "\n Alignment:\n") < 0)
1365 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1366 }
1367 else
1368 {
1369 if (fprintf(ofp, "\n Alignments for each domain:\n") < 0)
1370 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1371 nd = 0;
1372 }
1373
1374 for (d = 0; d < th->hit[h]->ndom; d++)
1375 if (th->hit[h]->dcl[d].is_reported)
1376 {
1377 nd++;
1378 if (!pli->long_targets)
1379 {
1380 if (fprintf(ofp, " == domain %d", nd ) < 0)
1381 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1382 }
1383 if (fprintf(ofp, " score: %.1f bits", th->hit[h]->dcl[d].bitscore) < 0)
1384 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1385 if (!pli->long_targets)
1386 {
1387 if (fprintf(ofp, "; conditional E-value: %.2g\n", exp(th->hit[h]->dcl[d].lnP) * pli->domZ) < 0)
1388 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1389 }
1390 else
1391 {
1392 if (fprintf(ofp, "\n") < 0)
1393 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1394 }
1395
1396 if ((status = p7_alidisplay_Print(ofp, th->hit[h]->dcl[d].ad, 40, textw, pli)) != eslOK) return status;
1397
1398 if (fprintf(ofp, "\n") < 0)
1399 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1400 }
1401 }
1402 else // alignment reporting is off:
1403 {
1404 if (fprintf(ofp, "\n") < 0)
1405 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1406 }
1407
1408 } // end, loop over all reported hits
1409
1410 if (th->nreported == 0)
1411 {
1412 if (fprintf(ofp, "\n [No targets detected that satisfy reporting thresholds]\n") < 0)
1413 ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1414 }
1415 return eslOK;
1416 }
1417
1418
1419 /* Function: p7_tophits_Alignment()
1420 * Synopsis: Create multiple alignment of all included domains.
1421 *
1422 * Purpose: Create a multiple alignment of all domains marked
1423 * "includable" in the top hits list <th>. Return it in
1424 * <*ret_msa>.
1425 *
1426 * Use of <optflags> is identical to <optflags> in <p7_tracealign_Seqs()>.
1427 * Possible flags include <p7_DIGITIZE>, <p7_ALL_CONSENSUS_COLS>,
1428 * and <p7_TRIM>; they may be OR'ed together. Otherwise, pass
1429 * <p7_DEFAULT> to set no flags.
1430 *
1431 * Caller may optionally provide <inc_sqarr>, <inc_trarr>, and
1432 * <inc_n> to include additional sequences in the alignment
1433 * (the jackhmmer query, for example). Otherwise, pass <NULL, NULL, 0>.
1434 *
1435 * Returns: <eslOK> on success, and <*ret_msa> points to a new MSA that
1436 * the caller is responsible for freeing.
1437 *
1438 * Returns <eslFAIL> if there are no included domains,
1439 * in which case <*ret_msa> is <NULL>.
1440 *
1441 * Throws: <eslEMEM> on allocation failure;
1442 * <eslECORRUPT> on unexpected internal data corruption.
1443 *
1444 * Xref: J4/29: incept.
1445 * J4/76: added inc_sqarr, inc_trarr, inc_n, optflags
1446 * H4/72: iss131, segfault if top seq has no reported domains
1447 * H5/88: iss140, jackhmmer --fast segfaults
1448 */
1449 int
p7_tophits_Alignment(const P7_TOPHITS * th,const ESL_ALPHABET * abc,ESL_SQ ** inc_sqarr,P7_TRACE ** inc_trarr,int inc_n,int optflags,ESL_MSA ** ret_msa)1450 p7_tophits_Alignment(const P7_TOPHITS *th, const ESL_ALPHABET *abc,
1451 ESL_SQ **inc_sqarr, P7_TRACE **inc_trarr, int inc_n,
1452 int optflags, ESL_MSA **ret_msa)
1453 {
1454 ESL_SQ **sqarr = NULL;
1455 P7_TRACE **trarr = NULL;
1456 ESL_MSA *msa = NULL;
1457 int ndom = 0;
1458 int M = 0;
1459 int h, d, y;
1460 int status;
1461
1462 /* How many domains will be included in the new alignment? We also
1463 * set M here; we don't have hmm, but every ali has a copy.
1464 *
1465 * Beware: although p7_Pipeline() must not register hits that have
1466 * no domains, it did happen {iss131}. So we're careful (here, at
1467 * least) about testing ndom before reaching into dcl[] to get M.
1468 */
1469 for (h = 0; h < th->N; h++)
1470 if (th->hit[h]->flags & p7_IS_INCLUDED)
1471 for (d = 0; d < th->hit[h]->ndom; d++)
1472 if (th->hit[h]->dcl[d].is_included)
1473 {
1474 if (M == 0) M = th->hit[h]->dcl[d].ad->M;
1475 ndom++;
1476 }
1477
1478 if (inc_n)
1479 {
1480 if (M == 0) M = inc_trarr[0]->M; // unusual case where there's an included trace, but no other aligned hits: get M from included trace
1481 else if (M != inc_trarr[0]->M) // spot check that included traces appear to have same # of consensus positions; iss#140
1482 ESL_XEXCEPTION(eslECORRUPT, "top hits and included trace(s) have different profile lengths");
1483 }
1484 if (inc_n+ndom == 0) { status = eslFAIL; goto ERROR; }
1485
1486 /* Allocation */
1487 ESL_ALLOC(sqarr, sizeof(ESL_SQ *) * (ndom + inc_n));
1488 ESL_ALLOC(trarr, sizeof(P7_TRACE *) * (ndom + inc_n));
1489 /* Inclusion of preexisting seqs, traces: make copy of pointers */
1490 for (y = 0; y < inc_n; y++) { sqarr[y] = inc_sqarr[y]; trarr[y] = inc_trarr[y]; }
1491 for (; y < (ndom+inc_n); y++) { sqarr[y] = NULL; trarr[y] = NULL; }
1492
1493 /* Make faux sequences, traces from hit list */
1494 y = inc_n;
1495 for (h = 0; h < th->N; h++)
1496 if (th->hit[h]->flags & p7_IS_INCLUDED)
1497 {
1498 for (d = 0; d < th->hit[h]->ndom; d++)
1499 if (th->hit[h]->dcl[d].is_included)
1500 {
1501 if ((status = p7_alidisplay_Backconvert(th->hit[h]->dcl[d].ad, abc, &(sqarr[y]), &(trarr[y]))) != eslOK) goto ERROR;
1502 y++;
1503 }
1504 }
1505
1506 /* Make the multiple alignment */
1507 if ((status = p7_tracealign_Seqs(sqarr, trarr, inc_n+ndom, M, optflags, NULL, &msa)) != eslOK) goto ERROR;
1508
1509 /* Clean up */
1510 for (y = inc_n; y < ndom+inc_n; y++) esl_sq_Destroy(sqarr[y]);
1511 for (y = inc_n; y < ndom+inc_n; y++) p7_trace_Destroy(trarr[y]);
1512 free(sqarr);
1513 free(trarr);
1514 *ret_msa = msa;
1515 return eslOK;
1516
1517 ERROR:
1518 if (sqarr != NULL) { for (y = inc_n; y < ndom+inc_n; y++) if (sqarr[y] != NULL) esl_sq_Destroy(sqarr[y]); free(sqarr); }
1519 if (trarr != NULL) { for (y = inc_n; y < ndom+inc_n; y++) if (trarr[y] != NULL) p7_trace_Destroy(trarr[y]); free(trarr); }
1520 if (msa != NULL) esl_msa_Destroy(msa);
1521 *ret_msa = NULL;
1522 return status;
1523 }
1524 /*---------------- end, standard output format ------------------*/
1525
1526
1527
1528
1529
1530 /*****************************************************************
1531 * 3. Tabular (parsable) output of pipeline results.
1532 *****************************************************************/
1533
1534 /* Function: p7_tophits_TabularTargets()
1535 * Synopsis: Output parsable table of per-sequence hits.
1536 *
1537 * Purpose: Output a parseable table of reportable per-sequence hits
1538 * in sorted tophits list <th> in an easily parsed ASCII
1539 * tabular form to stream <ofp>, using final pipeline
1540 * accounting stored in <pli>.
1541 *
1542 * Designed to be concatenated for multiple queries and
1543 * multiple top hits list.
1544 *
1545 * Returns: <eslOK> on success.
1546 *
1547 * Throws: <eslEWRITE> if a write to <ofp> fails; for example, if
1548 * the disk fills up.
1549 */
1550 int
p7_tophits_TabularTargets(FILE * ofp,char * qname,char * qacc,P7_TOPHITS * th,P7_PIPELINE * pli,int show_header)1551 p7_tophits_TabularTargets(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli, int show_header)
1552 {
1553 int qnamew = ESL_MAX(20, strlen(qname));
1554 int tnamew = ESL_MAX(20, p7_tophits_GetMaxNameLength(th));
1555 int qaccw = ((qacc != NULL) ? ESL_MAX(10, strlen(qacc)) : 10);
1556 int taccw = ESL_MAX(10, p7_tophits_GetMaxAccessionLength(th));
1557 int posw = (pli->long_targets ? ESL_MAX(7, p7_tophits_GetMaxPositionLength(th)) : 0);
1558 int h,d;
1559
1560 if (show_header)
1561 {
1562 if (pli->long_targets)
1563 {
1564 if (fprintf(ofp, "#%-*s %-*s %-*s %-*s %s %s %*s %*s %*s %*s %*s %6s %9s %6s %5s %s\n",
1565 tnamew-1, " target name", taccw, "accession", qnamew, "query name", qaccw, "accession", "hmmfrom", "hmm to", posw, "alifrom", posw, "ali to", posw, "envfrom", posw, "env to", posw, ( pli->mode == p7_SCAN_MODELS ? "modlen" : "sq len" ), "strand", " E-value", " score", " bias", "description of target") < 0)
1566 ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1567 if (fprintf(ofp, "#%*s %*s %*s %*s %s %s %*s %*s %*s %*s %*s %6s %9s %6s %5s %s\n",
1568 tnamew-1, "-------------------", taccw, "----------", qnamew, "--------------------", qaccw, "----------", "-------", "-------", posw, "-------", posw, "-------", posw, "-------", posw, "-------", posw, "-------", "------", "---------", "------", "-----", "---------------------") < 0)
1569 ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-per-sequence hit list: write failed");
1570 }
1571 else
1572 {
1573 if (fprintf(ofp, "#%*s %22s %22s %33s\n", tnamew+qnamew+taccw+qaccw+2, "", "--- full sequence ----", "--- best 1 domain ----", "--- domain number estimation ----") < 0)
1574 ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1575 if (fprintf(ofp, "#%-*s %-*s %-*s %-*s %9s %6s %5s %9s %6s %5s %5s %3s %3s %3s %3s %3s %3s %3s %s\n",
1576 tnamew-1, " target name", taccw, "accession", qnamew, "query name", qaccw, "accession", " E-value", " score", " bias", " E-value", " score", " bias", "exp", "reg", "clu", " ov", "env", "dom", "rep", "inc", "description of target") < 0)
1577 ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1578 if (fprintf(ofp, "#%*s %*s %*s %*s %9s %6s %5s %9s %6s %5s %5s %3s %3s %3s %3s %3s %3s %3s %s\n",
1579 tnamew-1, "-------------------", taccw, "----------", qnamew, "--------------------", qaccw, "----------", "---------", "------", "-----", "---------", "------", "-----", "---", "---", "---", "---", "---", "---", "---", "---", "---------------------") < 0)
1580 ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1581 }
1582 }
1583
1584 for (h = 0; h < th->N; h++)
1585 if (th->hit[h]->flags & p7_IS_REPORTED)
1586 {
1587 d = th->hit[h]->best_domain;
1588 if (pli->long_targets)
1589 {
1590 if (fprintf(ofp, "%-*s %-*s %-*s %-*s %7d %7d %*" PRId64 " %*" PRId64 " %*" PRId64 " %*" PRId64 " %*" PRId64 " %6s %9.2g %6.1f %5.1f %s\n",
1591 tnamew, th->hit[h]->name,
1592 taccw, th->hit[h]->acc ? th->hit[h]->acc : "-",
1593 qnamew, qname,
1594 qaccw, ( (qacc != NULL && qacc[0] != '\0') ? qacc : "-"),
1595 th->hit[h]->dcl[d].ad->hmmfrom,
1596 th->hit[h]->dcl[d].ad->hmmto,
1597 posw, th->hit[h]->dcl[d].iali,
1598 posw, th->hit[h]->dcl[d].jali,
1599 posw, th->hit[h]->dcl[d].ienv,
1600 posw, th->hit[h]->dcl[d].jenv,
1601 posw, th->hit[h]->dcl[0].ad->L,
1602 (th->hit[h]->dcl[d].iali < th->hit[h]->dcl[d].jali ? " + " : " - "),
1603 exp(th->hit[h]->lnP),
1604 th->hit[h]->score,
1605 th->hit[h]->dcl[d].dombias * eslCONST_LOG2R, /* convert NATS to BITS at last moment */
1606 th->hit[h]->desc == NULL ? "-" : th->hit[h]->desc ) < 0)
1607 ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1608 }
1609 else
1610 {
1611 if (fprintf(ofp, "%-*s %-*s %-*s %-*s %9.2g %6.1f %5.1f %9.2g %6.1f %5.1f %5.1f %3d %3d %3d %3d %3d %3d %3d %s\n",
1612 tnamew, th->hit[h]->name,
1613 taccw, th->hit[h]->acc ? th->hit[h]->acc : "-",
1614 qnamew, qname,
1615 qaccw, ( (qacc != NULL && qacc[0] != '\0') ? qacc : "-"),
1616 exp(th->hit[h]->lnP) * pli->Z,
1617 th->hit[h]->score,
1618 th->hit[h]->pre_score - th->hit[h]->score, /* bias correction */
1619 exp(th->hit[h]->dcl[d].lnP) * pli->Z,
1620 th->hit[h]->dcl[d].bitscore,
1621 th->hit[h]->dcl[d].dombias * eslCONST_LOG2R, /* convert NATS to BITS at last moment */
1622 th->hit[h]->nexpected,
1623 th->hit[h]->nregions,
1624 th->hit[h]->nclustered,
1625 th->hit[h]->noverlaps,
1626 th->hit[h]->nenvelopes,
1627 th->hit[h]->ndom,
1628 th->hit[h]->nreported,
1629 th->hit[h]->nincluded,
1630 (th->hit[h]->desc == NULL ? "-" : th->hit[h]->desc)) < 0)
1631 ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1632 }
1633 }
1634 return eslOK;
1635 }
1636
1637
1638 /* Function: p7_tophits_TabularDomains()
1639 * Synopsis: Output parseable table of per-domain hits
1640 *
1641 * Purpose: Output a parseable table of reportable per-domain hits
1642 * in sorted tophits list <th> in an easily parsed ASCII
1643 * tabular form to stream <ofp>, using final pipeline
1644 * accounting stored in <pli>.
1645 *
1646 * Designed to be concatenated for multiple queries and
1647 * multiple top hits list.
1648 *
1649 * Returns: <eslOK> on success.
1650 *
1651 * Throws: <eslEWRITE> if a write to <ofp> fails; for example, if
1652 * the disk fills up.
1653 */
1654 int
p7_tophits_TabularDomains(FILE * ofp,char * qname,char * qacc,P7_TOPHITS * th,P7_PIPELINE * pli,int show_header)1655 p7_tophits_TabularDomains(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli, int show_header)
1656 {
1657
1658 int qnamew = ESL_MAX(20, strlen(qname));
1659 int tnamew = ESL_MAX(20, p7_tophits_GetMaxNameLength(th));
1660 int qaccw = (qacc ? ESL_MAX(10, strlen(qacc)) : 10);
1661 int taccw = ESL_MAX(10, p7_tophits_GetMaxAccessionLength(th));
1662 int tlen, qlen;
1663 int h,d,nd;
1664
1665 if (show_header)
1666 {
1667 if (fprintf(ofp, "#%*s %22s %40s %11s %11s %11s\n", tnamew+qnamew-1+15+taccw+qaccw, "", "--- full sequence ---", "-------------- this domain -------------", "hmm coord", "ali coord", "env coord") < 0)
1668 ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-domain hit list: write failed");
1669 if (fprintf(ofp, "#%-*s %-*s %5s %-*s %-*s %5s %9s %6s %5s %3s %3s %9s %9s %6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n",
1670 tnamew-1, " target name", taccw, "accession", "tlen", qnamew, "query name", qaccw, "accession", "qlen", "E-value", "score", "bias", "#", "of", "c-Evalue", "i-Evalue", "score", "bias", "from", "to", "from", "to", "from", "to", "acc", "description of target") < 0)
1671 ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-domain hit list: write failed");
1672 if (fprintf(ofp, "#%*s %*s %5s %*s %*s %5s %9s %6s %5s %3s %3s %9s %9s %6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n",
1673 tnamew-1, "-------------------", taccw, "----------", "-----", qnamew, "--------------------", qaccw, "----------", "-----", "---------", "------", "-----", "---", "---", "---------", "---------", "------", "-----", "-----", "-----", "-----", "-----", "-----", "-----", "----", "---------------------") < 0)
1674 ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-domain hit list: write failed");
1675 }
1676
1677 for (h = 0; h < th->N; h++)
1678 if (th->hit[h]->flags & p7_IS_REPORTED)
1679 {
1680 nd = 0;
1681 for (d = 0; d < th->hit[h]->ndom; d++)
1682 if (th->hit[h]->dcl[d].is_reported)
1683 {
1684 nd++;
1685
1686 /* in hmmsearch, targets are seqs and queries are HMMs;
1687 * in hmmscan, the reverse. but in the ALIDISPLAY
1688 * structure, lengths L and M are for seq and HMMs, not
1689 * for query and target, so sort it out.
1690 */
1691 if (pli->mode == p7_SEARCH_SEQS) { qlen = th->hit[h]->dcl[d].ad->M; tlen = th->hit[h]->dcl[d].ad->L; }
1692 else { qlen = th->hit[h]->dcl[d].ad->L; tlen = th->hit[h]->dcl[d].ad->M; }
1693
1694
1695
1696 if (fprintf(ofp, "%-*s %-*s %5d %-*s %-*s %5d %9.2g %6.1f %5.1f %3d %3d %9.2g %9.2g %6.1f %5.1f %5d %5d %5" PRId64 " %5" PRId64 " %5" PRId64 " %5" PRId64 " %4.2f %s\n",
1697 tnamew, th->hit[h]->name,
1698 taccw, th->hit[h]->acc ? th->hit[h]->acc : "-",
1699 tlen,
1700 qnamew, qname,
1701 qaccw, ( (qacc != NULL && qacc[0] != '\0') ? qacc : "-"),
1702 qlen,
1703 exp(th->hit[h]->lnP) * pli->Z,
1704 th->hit[h]->score,
1705 th->hit[h]->pre_score - th->hit[h]->score, /* bias correction */
1706 nd,
1707 th->hit[h]->nreported,
1708 exp(th->hit[h]->dcl[d].lnP) * pli->domZ,
1709 exp(th->hit[h]->dcl[d].lnP) * pli->Z,
1710 th->hit[h]->dcl[d].bitscore,
1711 th->hit[h]->dcl[d].dombias * eslCONST_LOG2R, /* NATS to BITS at last moment */
1712 th->hit[h]->dcl[d].ad->hmmfrom,
1713 th->hit[h]->dcl[d].ad->hmmto,
1714 th->hit[h]->dcl[d].ad->sqfrom,
1715 th->hit[h]->dcl[d].ad->sqto,
1716 th->hit[h]->dcl[d].ienv,
1717 th->hit[h]->dcl[d].jenv,
1718 (th->hit[h]->dcl[d].oasc / (1.0 + fabs((float) (th->hit[h]->dcl[d].jenv - th->hit[h]->dcl[d].ienv)))),
1719 (th->hit[h]->desc ? th->hit[h]->desc : "-")) < 0)
1720 ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-domain hit list: write failed");
1721
1722 }
1723 }
1724 return eslOK;
1725 }
1726
1727
1728 /* Function: p7_tophits_TabularXfam()
1729 * Synopsis: Output parsable table(s) of hits, in format desired by Xfam.
1730 *
1731 * Purpose: Output a parseable table of reportable hits in sorted
1732 * tophits list <th> in an easily parsed ASCII tabular
1733 * form to stream <ofp>, using final pipeline accounting
1734 * stored in <pli>.
1735 *
1736 * For long-target nucleotide queries, this will print the
1737 * same hits as p7_tophits_TabularTargets(), but with the
1738 * smaller number of (reordered) fields required by Dfam
1739 * scripts.
1740 *
1741 * For protein queries, this will print two tables:
1742 * (a) per-sequence hits as presented by
1743 * p7_tophits_TabularTargets(), but formatted for
1744 * Pfam scripts;
1745 * (b) per-domain hits, similar to those presented by
1746 * p7_tophits_TabularDomains(), but sorted by
1747 * score/e-value, and formated for Pfam scripts.
1748 *
1749 * Returns: <eslOK> on success.
1750 *
1751 * Throws: <eslEMEM> on allocation failure.
1752 * <eslEWRITE> if a write to <ofp> fails; for example, if
1753 * the disk fills up.
1754 */
1755 int
p7_tophits_TabularXfam(FILE * ofp,char * qname,char * qacc,P7_TOPHITS * th,P7_PIPELINE * pli)1756 p7_tophits_TabularXfam(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli)
1757 {
1758 P7_TOPHITS *domHitlist = NULL;
1759 P7_HIT *domhit = NULL;
1760 int tnamew = ESL_MAX(20, p7_tophits_GetMaxNameLength(th));
1761 int taccw = ESL_MAX(20, p7_tophits_GetMaxAccessionLength(th));
1762 int qnamew = ESL_MAX(20, strlen(qname));
1763 int ndom = 0;
1764 int posw = (pli->long_targets ? ESL_MAX(7, p7_tophits_GetMaxPositionLength(th)) : 0);
1765 int h,d;
1766 int status;
1767
1768
1769 if (pli->long_targets)
1770 {
1771 if (fprintf(ofp, "# hit scores\n# ----------\n#\n") < 0)
1772 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1773 if (fprintf(ofp, "# %-*s %-*s %-*s %6s %9s %5s %s %s %6s %*s %*s %*s %*s %*s %s\n",
1774 tnamew-1, "target name", taccw, "acc", qnamew, "query name", "bits", " e-value", " bias", "hmm-st", "hmm-en", "strand", posw, "ali-st", posw, "ali-en", posw, "env-st", posw, "env-en", posw, ( pli->mode == p7_SCAN_MODELS ? "modlen" : "sq-len" ), "description of target") < 0)
1775 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1776 if (fprintf(ofp, "# %-*s %-*s %-*s %6s %9s %5s %s %s %6s %*s %*s %*s %*s %*s %s\n",
1777 tnamew-1, "-------------------", taccw, "-------------------", qnamew, "-------------------", "------", "---------", "-----", "-------", "-------", "------", posw, "-------", posw, "-------", posw, "-------", posw, "-------", posw, "-------", "---------------------") < 0)
1778 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1779
1780 for (h = 0; h < th->N; h++)
1781 if (th->hit[h]->flags & p7_IS_REPORTED)
1782 {
1783 //d = th->hit[h]->best_domain;
1784 if (fprintf(ofp, "%-*s %-*s %-*s %6.1f %9.2g %5.1f %7d %7d %s %*" PRId64 " %*" PRId64 " %*" PRId64 " %*" PRId64 " %*" PRId64 " %s\n",
1785 tnamew, th->hit[h]->name,
1786 taccw, ( pli->mode == p7_SCAN_MODELS ? (th->hit[h]->acc ? th->hit[h]->acc : "-") : ((qacc && qacc[0] != '\0') ? qacc : "-")),
1787 qnamew, qname,
1788 th->hit[h]->score,
1789 exp(th->hit[h]->lnP),
1790 th->hit[h]->dcl[0].dombias * eslCONST_LOG2R, /* convert nats to bits at last moment */
1791 th->hit[h]->dcl[0].ad->hmmfrom,
1792 th->hit[h]->dcl[0].ad->hmmto,
1793 (th->hit[h]->dcl[0].iali < th->hit[h]->dcl[0].jali ? " + " : " - "),
1794 posw, th->hit[h]->dcl[0].iali,
1795 posw, th->hit[h]->dcl[0].jali,
1796 posw, th->hit[h]->dcl[0].ienv,
1797 posw, th->hit[h]->dcl[0].jenv,
1798 posw, th->hit[h]->dcl[0].ad->L,
1799 th->hit[h]->desc == NULL ? "-" : th->hit[h]->desc) < 0)
1800 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1801 }
1802 }
1803 else
1804 {
1805 if (fprintf(ofp, "# Sequence scores\n# ---------------\n#\n") < 0)
1806 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1807 if (fprintf(ofp, "# %-*s %6s %9s %3s %5s %5s %s\n",
1808 tnamew-1, "name", " bits", " E-value", "n", "exp", " bias", "description") < 0)
1809 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1810 if (fprintf(ofp, "# %*s %6s %9s %3s %5s %5s %s\n",
1811 tnamew-1, "-------------------", "------", "---------","---", "-----", "-----", "---------------------") < 0)
1812 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1813
1814 for (h = 0; h < th->N; h++)
1815 {
1816 if (th->hit[h]->flags & p7_IS_REPORTED)
1817 {
1818 if (fprintf(ofp, "%-*s %6.1f %9.2g %3d %5.1f %5.1f %s\n",
1819 tnamew, th->hit[h]->name,
1820 th->hit[h]->score,
1821 exp(th->hit[h]->lnP) * pli->Z,
1822 th->hit[h]->ndom,
1823 th->hit[h]->nexpected,
1824 th->hit[h]->pre_score - th->hit[h]->score, /* bias correction */
1825 (th->hit[h]->desc == NULL ? "-" : th->hit[h]->desc)) < 0)
1826 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1827
1828 for (d = 0; d < th->hit[h]->ndom; d++)
1829 if (th->hit[h]->dcl[d].is_reported)
1830 ndom ++;
1831 }
1832 }
1833 if (fprintf(ofp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1834
1835 /* Need to sort the domains. One way to do this is to re-use the hit sorting machinery,
1836 * so we create one "hit" for each domain, then hand it off to the sorter
1837 */
1838 if ((domHitlist = p7_tophits_Create()) == NULL) return eslEMEM;
1839 for (h = 0; h < th->N; h++)
1840 {
1841 if (th->hit[h]->flags & p7_IS_REPORTED)
1842 {
1843 int ndomReported = 0;
1844 for (d = 0; d < th->hit[h]->ndom; d++)
1845 {
1846 if (th->hit[h]->dcl[d].is_reported)
1847 {
1848 p7_tophits_CreateNextHit(domHitlist, &domhit);
1849 ndomReported++;
1850 ESL_ALLOC(domhit->dcl, sizeof(P7_DOMAIN) );
1851
1852 domhit->ndom = ndomReported; // re-using this variable to track the ordinal value of the domain in the original hit list that generated this pseudo-hit
1853 domhit->name = th->hit[h]->name;
1854 domhit->desc = th->hit[h]->desc;
1855 domhit->dcl[0] = th->hit[h]->dcl[d];
1856 domhit->sortkey = pli->inc_by_E ? -1.0 * th->hit[h]->dcl[d].lnP : th->hit[h]->dcl[d].bitscore;
1857 }
1858 }
1859 }
1860 }
1861 p7_tophits_SortBySortkey(domHitlist);
1862
1863 // Now with this list of sorted "hits" (really domains)
1864 if (fprintf(ofp, "# Domain scores\n# -------------\n#\n") < 0)
1865 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1866 if (fprintf(ofp, "# %-*s %6s %9s %5s %5s %6s %6s %6s %6s %6s %6s %s\n",
1867 tnamew-1, " name", "bits", "E-value", "hit", "bias", "env-st", "env-en", "ali-st", "ali-en", "hmm-st", "hmm-en", "description") < 0)
1868 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1869 if (fprintf(ofp, "# %*s %6s %9s %5s %5s %6s %6s %6s %6s %6s %6s %s\n",
1870 tnamew-1, "-------------------", "------", "---------", "-----", "-----", "------", "------", "------", "------", "------", "------", "---------------------") < 0)
1871 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1872
1873 for (h = 0; h < domHitlist->N; h++)
1874 {
1875 domhit = domHitlist->hit[h];
1876
1877 if (fprintf(ofp, "%-*s %6.1f %9.2g %5d %5.1f %6" PRId64 " %6" PRId64 " %6" PRId64 " %6" PRId64 " %6d %6d %s\n",
1878 tnamew, domHitlist->hit[h]->name,
1879 domhit->dcl[0].bitscore,
1880 exp(domhit->dcl[0].lnP) * pli->Z, //i-Evalue
1881 domhit->ndom,
1882 domhit->dcl[0].dombias * eslCONST_LOG2R, // NATS to BITS at last moment
1883 domhit->dcl[0].ienv,
1884 domhit->dcl[0].jenv,
1885 domhit->dcl[0].ad->sqfrom,
1886 domhit->dcl[0].ad->sqto,
1887 domhit->dcl[0].ad->hmmfrom,
1888 domhit->dcl[0].ad->hmmto,
1889 (domhit->desc ? domhit->desc : "-")) < 0)
1890 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1891 }
1892 free (domHitlist->unsrt);
1893 free (domHitlist->hit);
1894 free (domHitlist);
1895 }
1896 return eslOK;
1897
1898 ERROR:
1899 if (domHitlist)
1900 {
1901 free (domHitlist->unsrt);
1902 free (domHitlist->hit);
1903 free (domHitlist);
1904 }
1905 return status;
1906 }
1907
1908
1909 /* Function: p7_tophits_AliScores()
1910 * Synopsis: Output per-position scores for each position of each query/hit pair
1911 *
1912 * Purpose: This depends on per-alignment-position scores having been
1913 * previously computed, as in p7_pipeline_computeAliScores()
1914 *
1915 * Returns: <eslOK> on success.
1916 *
1917 * Throws: none
1918 */
1919 int
p7_tophits_AliScores(FILE * ofp,char * qname,P7_TOPHITS * th)1920 p7_tophits_AliScores(FILE *ofp, char *qname, P7_TOPHITS *th )
1921 {
1922 P7_HIT *hit;
1923 int h, i;
1924 float *scores;
1925
1926 for (h = 0; h < th->N; h++) {
1927 hit = th->hit[h];
1928 if (hit->flags & p7_IS_REPORTED)
1929 {
1930 fprintf (ofp, "%s %s %" PRId64 " %" PRId64 " :", qname, hit->name, hit->dcl[0].iali, hit->dcl[0].jali);
1931
1932 scores = hit->dcl[0].scores_per_pos;
1933 for (i=0; i<hit->dcl[0].ad->N; i++) {
1934 if (scores[i] == -eslINFINITY)
1935 fprintf (ofp, " >");
1936 else
1937 fprintf (ofp, " %.3f", scores[i] * eslCONST_LOG2R);
1938
1939 }
1940 fprintf (ofp, "\n");
1941 }
1942
1943 }
1944 return eslOK;
1945
1946 }
1947
1948
1949
1950
1951 /* Function: p7_tophits_TabularTail()
1952 * Synopsis: Print a trailer on a tabular output file.
1953 *
1954 * Purpose: Print some metadata as a trailer on a tabular output file:
1955 * date/time, the program, HMMER3 version info, the pipeline mode (SCAN or SEARCH),
1956 * the query and target filenames, a spoof commandline
1957 * recording the entire program configuration, and
1958 * a "fini!" that's useful for detecting successful
1959 * output completion.
1960 *
1961 * Args: ofp - open tabular output file (either --tblout or --domtblout)
1962 * progname - "hmmscan", for example
1963 * pipemode - p7_SEARCH_SEQS | p7_SCAN_MODELS
1964 * qfile - name of query file, or '-' for stdin, or '[none]' if NULL
1965 * tfile - name of target file, or '-' for stdin, or '[none]' if NULL
1966 * go - program configuration; used to generate spoofed command line
1967 *
1968 * Returns: <eslOK>.
1969 *
1970 * Throws: <eslEMEM> on allocation failure.
1971 * <eslESYS> if time() or ctime_r() system calls fail.
1972 * <eslEWRITE> on write failure.
1973 *
1974 * Xref: SRE:J7/54
1975 */
1976 int
p7_tophits_TabularTail(FILE * ofp,const char * progname,enum p7_pipemodes_e pipemode,const char * qfile,const char * tfile,const ESL_GETOPTS * go)1977 p7_tophits_TabularTail(FILE *ofp, const char *progname, enum p7_pipemodes_e pipemode, const char *qfile, const char *tfile, const ESL_GETOPTS *go)
1978 {
1979 time_t date = time(NULL);
1980 char *spoof_cmd = NULL;
1981 char *cwd = NULL;
1982 char timestamp[32];
1983 char modestamp[16];
1984 int status;
1985
1986
1987 if ((status = esl_opt_SpoofCmdline(go, &spoof_cmd)) != eslOK) goto ERROR;
1988 if (date == -1) ESL_XEXCEPTION(eslESYS, "time() failed");
1989 if ((ctime_r(&date, timestamp)) == NULL) ESL_XEXCEPTION(eslESYS, "ctime_r() failed");
1990 switch (pipemode) {
1991 case p7_SEARCH_SEQS: strcpy(modestamp, "SEARCH"); break;
1992 case p7_SCAN_MODELS: strcpy(modestamp, "SCAN"); break;
1993 default: ESL_EXCEPTION(eslEINCONCEIVABLE, "wait, what? no such pipemode");
1994 }
1995 esl_getcwd(&cwd);
1996
1997 if (fprintf(ofp, "#\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
1998 if (fprintf(ofp, "# Program: %s\n", (progname == NULL) ? "[none]" : progname) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
1999 if (fprintf(ofp, "# Version: %s (%s)\n", HMMER_VERSION, HMMER_DATE) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2000 if (fprintf(ofp, "# Pipeline mode: %s\n", modestamp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2001 if (fprintf(ofp, "# Query file: %s\n", (qfile == NULL) ? "[none]" : qfile) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2002 if (fprintf(ofp, "# Target file: %s\n", (tfile == NULL) ? "[none]" : tfile) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2003 if (fprintf(ofp, "# Option settings: %s\n", spoof_cmd) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2004 if (fprintf(ofp, "# Current dir: %s\n", (cwd == NULL) ? "[unknown]" : cwd) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2005 if (fprintf(ofp, "# Date: %s", timestamp) < 0) /* timestamp ends in \n */ ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2006 if (fprintf(ofp, "# [ok]\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2007
2008 free(spoof_cmd);
2009 if (cwd) free(cwd);
2010 return eslOK;
2011
2012 ERROR:
2013 if (spoof_cmd) free(spoof_cmd);
2014 if (cwd) free(cwd);
2015 return status;
2016 }
2017 /*------------------- end, tabular output -----------------------*/
2018
2019
2020
2021
2022 /*****************************************************************
2023 * 4. Benchmark driver
2024 *****************************************************************/
2025 #ifdef p7TOPHITS_BENCHMARK
2026 /*
2027 gcc -o benchmark-tophits -std=gnu99 -g -O2 -I. -L. -I../easel -L../easel -Dp7TOPHITS_BENCHMARK p7_tophits.c -lhmmer -leasel -lm
2028 ./benchmark-tophits
2029
2030 As of 28 Dec 07, shows 0.20u for 10 lists of 10,000 hits each (at least ~100x normal expectation),
2031 so we expect top hits list time to be negligible for typical hmmsearch/hmmscan runs.
2032
2033 If needed, we do have opportunity for optimization, however - especially in memory handling.
2034 */
2035 #include "p7_config.h"
2036
2037 #include "easel.h"
2038 #include "esl_getopts.h"
2039 #include "esl_stopwatch.h"
2040 #include "esl_random.h"
2041
2042 #include "hmmer.h"
2043
2044 static ESL_OPTIONS options[] = {
2045 /* name type default env range toggles reqs incomp help docgroup*/
2046 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
2047 { "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
2048 { "-M", eslARG_INT, "10", NULL, NULL, NULL, NULL, NULL, "number of top hits lists to simulate and merge", 0 },
2049 { "-N", eslARG_INT, "10000", NULL, NULL, NULL, NULL, NULL, "number of top hits to simulate", 0 },
2050 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2051 };
2052 static char usage[] = "[-options]";
2053 static char banner[] = "benchmark driver for P7_TOPHITS";
2054
2055 int
main(int argc,char ** argv)2056 main(int argc, char **argv)
2057 {
2058 ESL_GETOPTS *go = p7_CreateDefaultApp(options, 0, argc, argv, banner, usage);
2059 ESL_STOPWATCH *w = esl_stopwatch_Create();
2060 ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
2061 int N = esl_opt_GetInteger(go, "-N");
2062 int M = esl_opt_GetInteger(go, "-M");
2063 P7_TOPHITS **h = NULL;
2064 double *sortkeys = NULL;
2065 char name[] = "not_unique_name";
2066 char acc[] = "not_unique_acc";
2067 char desc[] = "Test description for the purposes of making the benchmark allocate space";
2068 int i,j;
2069 int status;
2070
2071 /* prep work: generate our sort keys before starting to time anything */
2072 ESL_ALLOC(h, sizeof(P7_TOPHITS *) * M); /* allocate pointers for M lists */
2073 ESL_ALLOC(sortkeys, sizeof(double) * N * M);
2074 for (i = 0; i < N*M; i++) sortkeys[i] = esl_random(r);
2075
2076 esl_stopwatch_Start(w);
2077
2078 /* generate M "random" lists and sort them */
2079 for (j = 0; j < M; j++)
2080 {
2081 h[j] = p7_tophits_Create();
2082 for (i = 0; i < N; i++)
2083 p7_tophits_Add(h[j], name, acc, desc, sortkeys[j*N + i],
2084 (float) sortkeys[j*N+i], sortkeys[j*N+i],
2085 (float) sortkeys[j*N+i], sortkeys[j*N+i],
2086 i, i, N,
2087 i, i, N,
2088 i, N, NULL);
2089 p7_tophits_SortBySortkey(h[j]);
2090 }
2091 /* then merge them into one big list in h[0] */
2092 for (j = 1; j < M; j++)
2093 {
2094 p7_tophits_Merge(h[0], h[j]);
2095 p7_tophits_Destroy(h[j]);
2096 }
2097
2098 esl_stopwatch_Stop(w);
2099
2100 p7_tophits_Destroy(h[0]);
2101 status = eslOK;
2102 ERROR:
2103 esl_getopts_Destroy(go);
2104 esl_stopwatch_Destroy(w);
2105 esl_randomness_Destroy(r);
2106 if (sortkeys != NULL) free(sortkeys);
2107 if (h != NULL) free(h);
2108 return status;
2109 }
2110 #endif /*p7TOPHITS_BENCHMARK*/
2111
2112
2113 /*****************************************************************
2114 * 5. Test driver
2115 *****************************************************************/
2116
2117 #ifdef p7TOPHITS_TESTDRIVE
2118 /*
2119 gcc -o tophits_utest -std=gnu99 -g -O2 -I. -L. -I../easel -L../easel -Dp7TOPHITS_TESTDRIVE p7_tophits.c -lhmmer -leasel -lm
2120 ./tophits_test
2121 */
2122 #include "p7_config.h"
2123
2124 #include "easel.h"
2125 #include "esl_getopts.h"
2126 #include "esl_stopwatch.h"
2127 #include "esl_random.h"
2128
2129 #include "hmmer.h"
2130
2131 static ESL_OPTIONS options[] = {
2132 /* name type default env range toggles reqs incomp help docgroup*/
2133 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
2134 { "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
2135 { "-N", eslARG_INT, "100", NULL, NULL, NULL, NULL, NULL, "number of top hits to simulate", 0 },
2136 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2137 };
2138
2139 static char usage[] = "[-options]";
2140 static char banner[] = "test driver for P7_TOPHITS";
2141
2142 int
main(int argc,char ** argv)2143 main(int argc, char **argv)
2144 {
2145 ESL_GETOPTS *go = p7_CreateDefaultApp(options, 0, argc, argv, banner, usage);
2146 ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
2147 int N = esl_opt_GetInteger(go, "-N");
2148 P7_TOPHITS *h1 = NULL;
2149 P7_TOPHITS *h2 = NULL;
2150 P7_TOPHITS *h3 = NULL;
2151 char name[] = "not_unique_name";
2152 char acc[] = "not_unique_acc";
2153 char desc[] = "Test description for the purposes of making the test driver allocate space";
2154 double key;
2155 int i;
2156
2157 h1 = p7_tophits_Create();
2158 h2 = p7_tophits_Create();
2159 h3 = p7_tophits_Create();
2160
2161 for (i = 0; i < N; i++)
2162 {
2163 key = esl_random(r);
2164 p7_tophits_Add(h1, name, acc, desc, key, (float) key, key, (float) key, key, i, i, N, i, i, N, 1, 1, NULL);
2165 key = 10.0 * esl_random(r);
2166 p7_tophits_Add(h2, name, acc, desc, key, (float) key, key, (float) key, key, i, i, N, i, i, N, 2, 2, NULL);
2167 key = 0.1 * esl_random(r);
2168 p7_tophits_Add(h3, name, acc, desc, key, (float) key, key, (float) key, key, i, i, N, i, i, N, 3, 3, NULL);
2169 }
2170 p7_tophits_Add(h1, "last", NULL, NULL, -1.0, (float) key, key, (float) key, key, i, i, N, i, i, N, 1, 1, NULL);
2171 p7_tophits_Add(h1, "first", NULL, NULL, 20.0, (float) key, key, (float) key, key, i, i, N, i, i, N, 1, 1, NULL);
2172
2173 p7_tophits_SortBySortkey(h1);
2174 if (strcmp(h1->hit[0]->name, "first") != 0) esl_fatal("sort failed (top is %s = %f)", h1->hit[0]->name, h1->hit[0]->sortkey);
2175 if (strcmp(h1->hit[N+1]->name, "last") != 0) esl_fatal("sort failed (last is %s = %f)", h1->hit[N+1]->name, h1->hit[N+1]->sortkey);
2176
2177 p7_tophits_Merge(h1, h2);
2178 if (strcmp(h1->hit[0]->name, "first") != 0) esl_fatal("after merge 1, sort failed (top is %s = %f)", h1->hit[0]->name, h1->hit[0]->sortkey);
2179 if (strcmp(h1->hit[2*N+1]->name, "last") != 0) esl_fatal("after merge 1, sort failed (last is %s = %f)", h1->hit[2*N+1]->name, h1->hit[2*N+1]->sortkey);
2180
2181 p7_tophits_Merge(h3, h1);
2182 if (strcmp(h3->hit[0]->name, "first") != 0) esl_fatal("after merge 2, sort failed (top is %s = %f)", h3->hit[0]->name, h3->hit[0]->sortkey);
2183 if (strcmp(h3->hit[3*N+1]->name, "last") != 0) esl_fatal("after merge 2, sort failed (last is %s = %f)", h3->hit[3*N+1]->name, h3->hit[3*N+1]->sortkey);
2184
2185 if (p7_tophits_GetMaxNameLength(h3) != strlen(name)) esl_fatal("GetMaxNameLength() failed");
2186
2187 p7_tophits_Destroy(h1);
2188 p7_tophits_Destroy(h2);
2189 p7_tophits_Destroy(h3);
2190 esl_randomness_Destroy(r);
2191 esl_getopts_Destroy(go);
2192 return eslOK;
2193 }
2194 #endif /*p7TOPHITS_TESTDRIVE*/
2195
2196
2197
2198