1 /* P7_TOPHITS: implementation of ranked list of top-scoring hits
2  *
3  * Contents:
4  *    1. The P7_TOPHITS object.
5  *    2. Standard (human-readable) output of pipeline results.
6  *    3. Tabular (parsable) output of pipeline results.
7  *    4. Benchmark driver.
8  *    5. Test driver.
9  */
10 #include "p7_config.h"
11 
12 #include <stdlib.h>
13 #include <string.h>
14 #include <limits.h>
15 
16 #include "easel.h"
17 #include "hmmer.h"
18 
19 /*****************************************************************
20  *= 1. The P7_TOPHITS object
21  *****************************************************************/
22 
23 /* Function:  p7_tophits_Create()
24  * Synopsis:  Allocate a hit list.
25  *
26  * Purpose:   Allocates a new <P7_TOPHITS> hit list and return a pointer
27  *            to it.
28  *
29  * Throws:    <NULL> on allocation failure.
30  */
31 P7_TOPHITS *
p7_tophits_Create(void)32 p7_tophits_Create(void)
33 {
34   P7_TOPHITS *h = NULL;
35   int         default_nalloc = 256;
36   int         status;
37 
38   ESL_ALLOC(h, sizeof(P7_TOPHITS));
39   h->hit    = NULL;
40   h->unsrt  = NULL;
41 
42   ESL_ALLOC(h->hit,   sizeof(P7_HIT *) * default_nalloc);
43   ESL_ALLOC(h->unsrt, sizeof(P7_HIT)   * default_nalloc);
44   h->Nalloc    = default_nalloc;
45   h->N         = 0;
46   h->nreported = 0;
47   h->nincluded = 0;
48   h->is_sorted_by_sortkey = TRUE; /* but only because there's 0 hits */
49   h->is_sorted_by_seqidx  = FALSE;
50   h->hit[0]    = h->unsrt;        /* if you're going to call it "sorted" when it contains just one hit, you need this */
51   return h;
52 
53  ERROR:
54   p7_tophits_Destroy(h);
55   return NULL;
56 }
57 
58 
59 /* Function:  p7_tophits_Grow()
60  * Synopsis:  Reallocates a larger hit list, if needed.
61  *
62  * Purpose:   If list <h> cannot hold another hit, doubles
63  *            the internal allocation.
64  *
65  * Returns:   <eslOK> on success.
66  *
67  * Throws:    <eslEMEM> on allocation failure; in this case,
68  *            the data in <h> are unchanged.
69  */
70 int
p7_tophits_Grow(P7_TOPHITS * h)71 p7_tophits_Grow(P7_TOPHITS *h)
72 {
73   void   *p;
74   P7_HIT *ori    = h->unsrt;
75   uint64_t Nalloc = h->Nalloc * 2;    /* grow by doubling */
76   int     i;
77   int     status;
78 
79   if (h->N < h->Nalloc) return eslOK; /* we have enough room for another hit */
80 
81   ESL_RALLOC(h->hit,   p, sizeof(P7_HIT *) * Nalloc);
82   ESL_RALLOC(h->unsrt, p, sizeof(P7_HIT)   * Nalloc);
83 
84   /* If we grow a sorted list, we have to translate the pointers
85    * in h->hit, because h->unsrt might have just moved in memory.
86    */
87   if (h->is_sorted_by_seqidx || h->is_sorted_by_sortkey)
88   {
89       for (i = 0; i < h->N; i++)
90         h->hit[i] = h->unsrt + (h->hit[i] - ori);
91   }
92 
93   h->Nalloc = Nalloc;
94   return eslOK;
95 
96  ERROR:
97   return eslEMEM;
98 }
99 
100 
101 /* Function:  p7_tophits_CreateNextHit()
102  * Synopsis:  Get pointer to new structure for recording a hit.
103  *
104  * Purpose:   Ask the top hits object <h> to do any necessary
105  *            internal allocation and bookkeeping to add a new,
106  *            empty hit to its list; return a pointer to
107  *            this new <P7_HIT> structure for data to be filled
108  *            in by the caller.
109  *
110  * Returns:   <eslOK> on success.
111  *
112  * Throws:    <eslEMEM> on allocation error.
113  */
114 int
p7_tophits_CreateNextHit(P7_TOPHITS * h,P7_HIT ** ret_hit)115 p7_tophits_CreateNextHit(P7_TOPHITS *h, P7_HIT **ret_hit)
116 {
117   P7_HIT *hit = NULL;
118   int     status;
119 
120   if ((status = p7_tophits_Grow(h)) != eslOK) goto ERROR;
121 
122   hit = &(h->unsrt[h->N]);
123   h->N++;
124   if (h->N >= 2)
125   {
126       h->is_sorted_by_seqidx = FALSE;
127       h->is_sorted_by_sortkey = FALSE;
128   }
129 
130   hit->name         = NULL;
131   hit->acc          = NULL;
132   hit->desc         = NULL;
133   hit->sortkey      = 0.0;
134 
135   hit->score        = 0.0;
136   hit->pre_score    = 0.0;
137   hit->sum_score    = 0.0;
138 
139   hit->lnP          = 0.0;
140   hit->pre_lnP      = 0.0;
141   hit->sum_lnP      = 0.0;
142 
143   hit->ndom         = 0;
144   hit->nexpected    = 0.0;
145   hit->nregions     = 0;
146   hit->nclustered   = 0;
147   hit->noverlaps    = 0;
148   hit->nenvelopes   = 0;
149 
150   hit->flags        = p7_HITFLAGS_DEFAULT;
151   hit->nreported    = 0;
152   hit->nincluded    = 0;
153   hit->best_domain  = -1;
154   hit->dcl          = NULL;
155   hit->offset       = 0;
156 
157   *ret_hit = hit;
158   return eslOK;
159 
160  ERROR:
161   *ret_hit = NULL;
162   return status;
163 }
164 
165 
166 
167 /* Function:  p7_tophits_Add()
168  * Synopsis:  Add a hit to the top hits list.
169  *
170  * Purpose:   Adds a hit to the top hits list <h>.
171  *
172  *            <name>, <acc>, and <desc> are copied, so caller may free
173  *            them if it likes.
174  *
175  *            Only the pointer <ali> is kept. Caller turns over memory
176  *            management of <ali> to the top hits object; <ali> will
177  *            be free'd when the top hits structure is free'd.
178  *
179  * Args:      h        - active top hit list
180  *            name     - name of target
181  *            acc      - accession of target (may be NULL)
182  *            desc     - description of target (may be NULL)
183  *            sortkey  - value to sort by: bigger is better
184  *            score    - score of this hit
185  *            lnP      - log P-value of this hit
186  *            mothersc - score of parent whole sequence
187  *            mother_lnP - log P-value of parent whole sequence
188  *            sqfrom   - 1..L pos in target seq  of start
189  *            sqto     - 1..L pos; sqfrom > sqto if rev comp
190  *            sqlen    - length of sequence, L
191  *            hmmfrom  - 0..M+1 pos in HMM of start
192  *            hmmto    - 0..M+1 pos in HMM of end
193  *            hmmlen   - length of HMM, M
194  *            domidx   - number of this domain
195  *            ndom     - total # of domains in sequence
196  *            ali      - optional printable alignment info
197  *
198  * Returns:   <eslOK> on success.
199  *
200  * Throws:    <eslEMEM> if reallocation failed.
201  *
202  * Note:      Is this actually used anywhere? (SRE, 10 Dec 08)
203  *            I think it's not up to date.
204  *
205  *            That's right. This function is obsolete.
206  *            But it is used in benchmark and test code, so you can't
207  *            delete it yet; benchmarks and test code should be
208  *            revised (SRE, 26 Oct 09)
209  */
210 int
p7_tophits_Add(P7_TOPHITS * h,char * name,char * acc,char * desc,double sortkey,float score,double lnP,float mothersc,double mother_lnP,int sqfrom,int sqto,int sqlen,int hmmfrom,int hmmto,int hmmlen,int domidx,int ndom,P7_ALIDISPLAY * ali)211 p7_tophits_Add(P7_TOPHITS *h,
212          char *name, char *acc, char *desc,
213          double sortkey,
214          float score,    double lnP,
215          float mothersc, double mother_lnP,
216          int sqfrom, int sqto, int sqlen,
217          int hmmfrom, int hmmto, int hmmlen,
218          int domidx, int ndom,
219          P7_ALIDISPLAY *ali)
220 {
221   int status;
222 
223   if ((status = p7_tophits_Grow(h))                           != eslOK) return status;
224   if ((status = esl_strdup(name, -1, &(h->unsrt[h->N].name))) != eslOK) return status;
225   if ((status = esl_strdup(acc,  -1, &(h->unsrt[h->N].acc)))  != eslOK) return status;
226   if ((status = esl_strdup(desc, -1, &(h->unsrt[h->N].desc))) != eslOK) return status;
227   h->unsrt[h->N].sortkey    = sortkey;
228   h->unsrt[h->N].score      = score;
229   h->unsrt[h->N].pre_score  = 0.0;
230   h->unsrt[h->N].sum_score  = 0.0;
231   h->unsrt[h->N].lnP        = lnP;
232   h->unsrt[h->N].pre_lnP    = 0.0;
233   h->unsrt[h->N].sum_lnP    = 0.0;
234   h->unsrt[h->N].nexpected  = 0;
235   h->unsrt[h->N].nregions   = 0;
236   h->unsrt[h->N].nclustered = 0;
237   h->unsrt[h->N].noverlaps  = 0;
238   h->unsrt[h->N].nenvelopes = 0;
239   h->unsrt[h->N].ndom       = ndom;
240   h->unsrt[h->N].flags      = 0;
241   h->unsrt[h->N].nreported  = 0;
242   h->unsrt[h->N].nincluded  = 0;
243   h->unsrt[h->N].best_domain= 0;
244   h->unsrt[h->N].dcl        = NULL;
245   h->N++;
246 
247   if (h->N >= 2) {
248     h->is_sorted_by_seqidx = FALSE;
249     h->is_sorted_by_sortkey = FALSE;
250   }
251   return eslOK;
252 }
253 
254 /* hit_sorter(): qsort's pawn, below */
255 static int
hit_sorter_by_sortkey(const void * vh1,const void * vh2)256 hit_sorter_by_sortkey(const void *vh1, const void *vh2)
257 {
258   P7_HIT *h1 = *((P7_HIT **) vh1);  /* don't ask. don't change. Don't Panic. */
259   P7_HIT *h2 = *((P7_HIT **) vh2);
260   int     c;
261 
262   if      (h1->sortkey < h2->sortkey) return  1;
263   else if (h1->sortkey > h2->sortkey) return -1;
264   else {
265     if ( (c = strcmp(h1->name, h2->name)) != 0) return c;
266 
267     /* if on different strand, the positive strand goes first, else use position */
268     int dir1 = (h1->dcl[0].iali < h1->dcl[0].jali ? 1 : -1);
269     int dir2 = (h2->dcl[0].iali < h2->dcl[0].jali ? 1 : -1);
270     if (dir1 != dir2) return dir2; // so if dir1 is pos (1), and dir2 is neg (-1), this will return -1, placing h1 before h2;  otherwise, vice versa
271     else {
272       if     (h1->dcl[0].iali > h2->dcl[0].iali) return  1;
273       else if(h1->dcl[0].iali < h2->dcl[0].iali) return -1;
274       else                                       return  0;
275     }
276   }
277 }
278 
279 /* used before duplicate hit removal in an nhmmer longtarget pipeline: */
280 static int
hit_sorter_by_seqidx_aliposition(const void * vh1,const void * vh2)281 hit_sorter_by_seqidx_aliposition(const void *vh1, const void *vh2)
282 {
283   P7_HIT  *h1 = *((P7_HIT **) vh1);
284   P7_HIT  *h2 = *((P7_HIT **) vh2);
285   int64_t  s1, e1, s2, e2;
286   int      dir1, dir2;
287 
288   if      (h1->seqidx > h2->seqidx) return  1; /* first key, seq_idx (unique id for sequences), low to high */
289   else if (h1->seqidx < h2->seqidx) return -1;
290 
291   // if on different strand, the positive strand goes first, else use position
292   s1 = h1->dcl[0].iali;  e1 = h1->dcl[0].jali;  if (s1 < e1) { dir1 = 1; } else { dir1 = -1; ESL_SWAP(s1, e1, int64_t); }
293   s2 = h2->dcl[0].iali;  e2 = h2->dcl[0].jali;  if (s2 < e2) { dir2 = 1; } else { dir2 = -1; ESL_SWAP(s2, e2, int64_t); }
294 
295   if (dir1 != dir2) return dir2; // so if dir1 is pos (1), and dir2 is neg (-1), this will return -1, placing h1 before h2;  otherwise, vice versa
296 
297   if      (s1 > s2) return  1;   // sort primarily from smallest to largest start pos
298   else if (s1 < s2) return -1;
299   else if (e1 < e2) return  1;   // secondarily, larger to smallest end position (i.e. longer hit first)
300   else if (e1 > e2) return -1;
301   else              return  0;
302 }
303 
304 /* similarly, for nhmmscan longtarget pipeline: */
305 static int
hit_sorter_by_modelname_aliposition(const void * vh1,const void * vh2)306 hit_sorter_by_modelname_aliposition(const void *vh1, const void *vh2)
307 {
308   P7_HIT  *h1 = *((P7_HIT **) vh1);
309   P7_HIT  *h2 = *((P7_HIT **) vh2);
310   int64_t  s1, e1, s2, e2;
311   int      dir1, dir2;
312   int      res;
313 
314   if  ( ( res = esl_strcmp(h1->name, h2->name))  != 0 ) return res; /* first key, seq_idx (unique id for sequences), low to high */
315 
316   s1 = h1->dcl[0].iali;  e1 = h1->dcl[0].jali;  if (s1 < e1) { dir1 = 1; } else { dir1 = -1; ESL_SWAP(s1, e1, int64_t); }
317   s2 = h2->dcl[0].iali;  e2 = h2->dcl[0].jali;  if (s2 < e2) { dir2 = 1; } else { dir2 = -1; ESL_SWAP(s2, e2, int64_t); }
318 
319   if (dir1 != dir2) return dir2;
320 
321   if      (s1 > s2) return  1;
322   else if (s1 < s2) return -1;
323   else if (e1 < e2) return  1;
324   else if (e1 > e2) return -1;
325   else              return  0;
326 }
327 
328 
329 /* Function:  p7_tophits_SortBySortkey()
330  * Synopsis:  Sorts a hit list.
331  *
332  * Purpose:   Sorts a top hit list. After this call,
333  *            <h->hit[i]> points to the i'th ranked
334  *            <P7_HIT> for all <h->N> hits.
335  *
336  * Returns:   <eslOK> on success.
337  */
338 int
p7_tophits_SortBySortkey(P7_TOPHITS * h)339 p7_tophits_SortBySortkey(P7_TOPHITS *h)
340 {
341   int i;
342 
343   if (h->is_sorted_by_sortkey)  return eslOK;
344   for (i = 0; i < h->N; i++) h->hit[i] = h->unsrt + i;
345   if (h->N > 1)  qsort(h->hit, h->N, sizeof(P7_HIT *), hit_sorter_by_sortkey);
346   h->is_sorted_by_seqidx  = FALSE;
347   h->is_sorted_by_sortkey = TRUE;
348   return eslOK;
349 }
350 
351 
352 /* Function:  p7_tophits_SortBySeqidxAndAlipos()
353  * Synopsis:  Sorts a hit list by sequence index and position for nhmmer.
354  *            sequence at which the hit's first domain begins (used in nhmmer)
355  *
356  * Purpose: Sorts a top hit list, suitable for subsequent hit
357  *            duplicate removal by `p7_tophits_RemoveDuplicates()` in
358  *            an nhmmer longtarget pipeline: by sequence index, then
359  *            by strand (+ before - hits), then by smallest-to-largest
360  *            start position (in watson coords), then by
361  *            largest-to-smallest end position (in watson
362  *            coords). Correctness of logic of
363  *            `p7_tophits_RemoveDuplicates()` is sensitive to this
364  *            ordering.
365  *
366  *            After this call, <h->hit[i]> points to the i'th ranked
367  *            <P7_HIT> for all <h->N> hits.
368  *
369  * Returns:   <eslOK> on success.
370  */
371 int
p7_tophits_SortBySeqidxAndAlipos(P7_TOPHITS * h)372 p7_tophits_SortBySeqidxAndAlipos(P7_TOPHITS *h)
373 {
374   int i;
375 
376   if (h->is_sorted_by_seqidx)  return eslOK;
377   for (i = 0; i < h->N; i++) h->hit[i] = h->unsrt + i;
378   if (h->N > 1)  qsort(h->hit, h->N, sizeof(P7_HIT *), hit_sorter_by_seqidx_aliposition);
379   h->is_sorted_by_sortkey = FALSE;
380   h->is_sorted_by_seqidx  = TRUE;
381   return eslOK;
382 }
383 
384 /* Function:  p7_tophits_SortByModelnameAndAlipos()
385  * Synopsis:  Sorts a hit list by model name and position for nhmmscan.
386  *
387  * Purpose:   Like `p7_tophits_SortBySeqidxAndAlipos()` but for
388  *            nhmmscan, which sorts on model name not target seq idx.
389  *
390  * Returns:   <eslOK> on success.
391  */
392 int
p7_tophits_SortByModelnameAndAlipos(P7_TOPHITS * h)393 p7_tophits_SortByModelnameAndAlipos(P7_TOPHITS *h)
394 {
395   int i;
396 
397   if (h->is_sorted_by_seqidx)  return eslOK;
398   for (i = 0; i < h->N; i++) h->hit[i] = h->unsrt + i;
399   if (h->N > 1)  qsort(h->hit, h->N, sizeof(P7_HIT *), hit_sorter_by_modelname_aliposition);
400   h->is_sorted_by_sortkey = FALSE;
401   h->is_sorted_by_seqidx  = TRUE;
402   return eslOK;
403 }
404 
405 
406 /* Function:  p7_tophits_Merge()
407  * Synopsis:  Merge two top hits lists.
408  *
409  * Purpose:   Merge <h2> into <h1>. Upon return, <h1>
410  *            contains the sorted, merged list. <h2>
411  *            is effectively destroyed; caller should
412  *            not access it further, and may as well free
413  *            it immediately.
414  *
415  * Returns:   <eslOK> on success.
416  *
417  * Throws:    <eslEMEM> on allocation failure, and
418  *            both <h1> and <h2> remain valid.
419  */
420 int
p7_tophits_Merge(P7_TOPHITS * h1,P7_TOPHITS * h2)421 p7_tophits_Merge(P7_TOPHITS *h1, P7_TOPHITS *h2)
422 {
423   void    *p;
424   P7_HIT **new_hit = NULL;
425   P7_HIT  *ori1    = h1->unsrt;    /* original base of h1's data */
426   P7_HIT  *new2;
427   int      i,j,k;
428   uint64_t Nalloc = h1->N + h2->N;
429   int      status;
430 
431   if(h2->N <= 0) return eslOK;
432 
433   /* Make sure the two lists are sorted */
434   if ((status = p7_tophits_SortBySortkey(h1)) != eslOK) goto ERROR;
435   if ((status = p7_tophits_SortBySortkey(h2)) != eslOK) goto ERROR;
436 
437   /* Attempt our allocations, so we fail early if we fail.
438    * Reallocating h1->unsrt screws up h1->hit, so fix it.
439    */
440   ESL_RALLOC(h1->unsrt, p, sizeof(P7_HIT) * Nalloc);
441   ESL_ALLOC (new_hit, sizeof(P7_HIT *)    * Nalloc);
442   for (i = 0; i < h1->N; i++)
443     h1->hit[i] = h1->unsrt + (h1->hit[i] - ori1);
444 
445   /* Append h2's unsorted data array to h1. h2's data begin at <new2> */
446   new2 = h1->unsrt + h1->N;
447   memcpy(new2, h2->unsrt, sizeof(P7_HIT) * h2->N);
448 
449   /* Merge the sorted hit lists */
450   for (i=0,j=0,k=0; i < h1->N && j < h2->N ; k++)
451     new_hit[k] = (hit_sorter_by_sortkey(&h1->hit[i], &h2->hit[j]) > 0) ? new2 + (h2->hit[j++] - h2->unsrt) : h1->hit[i++];
452   while (i < h1->N) new_hit[k++] = h1->hit[i++];
453   while (j < h2->N) new_hit[k++] = new2 + (h2->hit[j++] - h2->unsrt);
454 
455   /* h2 now turns over management of name, acc, desc memory to h1;
456    * nullify its pointers, to prevent double free.  */
457   for (i = 0; i < h2->N; i++)
458   {
459       h2->unsrt[i].name = NULL;
460       h2->unsrt[i].acc  = NULL;
461       h2->unsrt[i].desc = NULL;
462       h2->unsrt[i].dcl  = NULL;
463   }
464 
465   /* Construct the new grown h1 */
466   free(h1->hit);
467   h1->hit    = new_hit;
468   h1->Nalloc = Nalloc;
469   h1->N     += h2->N;
470   /* and is_sorted is TRUE, as a side effect of p7_tophits_Sort() above. */
471   return eslOK;
472 
473  ERROR:
474   if (new_hit != NULL) free(new_hit);
475   return status;
476 }
477 
478 /* Function:  p7_tophits_GetMaxPositionLength()
479  * Synopsis:  Returns maximum position length in hit list (targets).
480  *
481  * Purpose:   Returns the length of the longest hit location (start/end)
482  *               of all the registered hits, in chars. This is useful when
483  *               deciding how to format output.
484  *
485  *            The maximum is taken over all registered hits. This
486  *            opens a possible side effect: caller might print only
487  *            the top hits, and the max name length in these top hits
488  *            may be different than the max length over all the hits.
489  *
490  *            Used specifically for nhmmer output, so expects only one
491  *            domain per hit
492  *
493  *            If there are no hits in <h>, or none of the
494  *            hits have names, returns 0.
495  */
496 int
p7_tophits_GetMaxPositionLength(P7_TOPHITS * h)497 p7_tophits_GetMaxPositionLength(P7_TOPHITS *h)
498 {
499   int64_t i;
500   int max = 0;
501   int n;
502   char buffer [13];
503 
504   for (i = 0; i < h->N; i++) {
505     if (h->unsrt[i].dcl[0].iali > 0) {
506       n = sprintf (buffer, "%" PRId64 "", h->unsrt[i].dcl[0].iali);
507       max = ESL_MAX(n, max);
508       n = sprintf (buffer, "%" PRId64 "", h->unsrt[i].dcl[0].jali);
509       max = ESL_MAX(n, max);
510     }
511   }
512   return max;
513 }
514 
515 /* Function:  p7_tophits_GetMaxNameLength()
516  * Synopsis:  Returns maximum name length in hit list (targets).
517  *
518  * Purpose:   Returns the maximum name length of all the registered
519  *            hits, in chars. This is useful when deciding how to
520  *            format output.
521  *
522  *            The maximum is taken over all registered hits. This
523  *            opens a possible side effect: caller might print only
524  *            the top hits, and the max name length in these top hits
525  *            may be different than the max length over all the hits.
526  *
527  *            If there are no hits in <h>, or none of the
528  *            hits have names, returns 0.
529  */
530 int
p7_tophits_GetMaxNameLength(P7_TOPHITS * h)531 p7_tophits_GetMaxNameLength(P7_TOPHITS *h)
532 {
533   int i, max, n;
534   for (max = 0, i = 0; i < h->N; i++)
535     if (h->unsrt[i].name != NULL) {
536       n   = strlen(h->unsrt[i].name);
537       max = ESL_MAX(n, max);
538     }
539   return max;
540 }
541 
542 /* Function:  p7_tophits_GetMaxAccessionLength()
543  * Synopsis:  Returns maximum accession length in hit list (targets).
544  *
545  * Purpose:   Same as <p7_tophits_GetMaxNameLength()>, but for
546  *            accessions. If there are no hits in <h>, or none
547  *            of the hits have accessions, returns 0.
548  */
549 int
p7_tophits_GetMaxAccessionLength(P7_TOPHITS * h)550 p7_tophits_GetMaxAccessionLength(P7_TOPHITS *h)
551 {
552   int i, max, n;
553   for (max = 0, i = 0; i < h->N; i++)
554     if (h->unsrt[i].acc != NULL) {
555       n   = strlen(h->unsrt[i].acc);
556       max = ESL_MAX(n, max);
557     }
558   return max;
559 }
560 
561 /* Function:  p7_tophits_GetMaxShownLength()
562  * Synopsis:  Returns max shown name/accession length in hit list.
563  *
564  * Purpose:   Same as <p7_tophits_GetMaxNameLength()>, but
565  *            for the case when --acc is on, where
566  *            we show accession if one is available, and
567  *            fall back to showing the name if it is not.
568  *            Returns the max length of whatever is being
569  *            shown as the reported "name".
570  */
571 int
p7_tophits_GetMaxShownLength(P7_TOPHITS * h)572 p7_tophits_GetMaxShownLength(P7_TOPHITS *h)
573 {
574   int i, max, n;
575   for (max = 0, i = 0; i < h->N; i++)
576   {
577     if (h->unsrt[i].acc != NULL && h->unsrt[i].acc[0] != '\0')
578     {
579       n   = strlen(h->unsrt[i].acc);
580       max = ESL_MAX(n, max);
581     }
582     else if (h->unsrt[i].name != NULL)
583     {
584       n   = strlen(h->unsrt[i].name);
585       max = ESL_MAX(n, max);
586     }
587   }
588   return max;
589 }
590 
591 
592 /* Function:  p7_tophits_Reuse()
593  * Synopsis:  Reuse a hit list, freeing internals.
594  *
595  * Purpose:   Reuse the tophits list <h>; save as
596  *            many malloc/free cycles as possible,
597  *            as opposed to <Destroy()>'ing it and
598  *            <Create>'ing a new one.
599  */
600 int
p7_tophits_Reuse(P7_TOPHITS * h)601 p7_tophits_Reuse(P7_TOPHITS *h)
602 {
603   int i, j;
604 
605   if (h == NULL) return eslOK;
606   if (h->unsrt != NULL)
607   {
608     for (i = 0; i < h->N; i++)
609     {
610       if (h->unsrt[i].name != NULL) free(h->unsrt[i].name);
611       if (h->unsrt[i].acc  != NULL) free(h->unsrt[i].acc);
612       if (h->unsrt[i].desc != NULL) free(h->unsrt[i].desc);
613       if (h->unsrt[i].dcl  != NULL) {
614         for (j = 0; j < h->unsrt[i].ndom; j++)
615           if (h->unsrt[i].dcl[j].ad != NULL) p7_alidisplay_Destroy(h->unsrt[i].dcl[j].ad);
616         free(h->unsrt[i].dcl);
617       }
618     }
619   }
620   h->N         = 0;
621   h->is_sorted_by_seqidx = FALSE;
622   h->is_sorted_by_sortkey = TRUE;  /* because there are 0 hits */
623   h->hit[0]    = h->unsrt;
624   return eslOK;
625 }
626 
627 /* Function:  p7_tophits_Destroy()
628  * Synopsis:  Frees a hit list.
629  */
630 void
p7_tophits_Destroy(P7_TOPHITS * h)631 p7_tophits_Destroy(P7_TOPHITS *h)
632 {
633   int i,j;
634   if (h == NULL) return;
635   if (h->hit   != NULL) free(h->hit);
636   if (h->unsrt != NULL)
637   {
638     for (i = 0; i < h->N; i++)
639     {
640       if (h->unsrt[i].name != NULL) free(h->unsrt[i].name);
641       if (h->unsrt[i].acc  != NULL) free(h->unsrt[i].acc);
642       if (h->unsrt[i].desc != NULL) free(h->unsrt[i].desc);
643       if (h->unsrt[i].dcl  != NULL) {
644         for (j = 0; j < h->unsrt[i].ndom; j++) {
645           if (h->unsrt[i].dcl[j].ad             != NULL) p7_alidisplay_Destroy(h->unsrt[i].dcl[j].ad);
646 	  if (h->unsrt[i].dcl[j].scores_per_pos != NULL) free (h->unsrt[i].dcl->scores_per_pos);
647 	}
648         free(h->unsrt[i].dcl);
649       }
650     }
651     free(h->unsrt);
652   }
653   free(h);
654   return;
655 }
656 /*---------------- end, P7_TOPHITS object -----------------------*/
657 
658 
659 
660 
661 
662 
663 /*****************************************************************
664  * 2. Standard (human-readable) output of pipeline results
665  *****************************************************************/
666 
667 /* workaround_bug_h74():
668  * Different envelopes, identical alignment
669  *
670  * Bug #h74, though extremely rare, arises from a limitation in H3's
671  * implementation of Forward/Backward, as follows:
672  *
673  *  1. A multidomain region is analyzed by stochastic clustering
674  *  2. Overlapping envelopes are found (w.r.t sequence coords), though
675  *     trace clusters are distinct if HMM endpoints are also considered.
676  *  3. We have no facility for limiting Forward/Backward to a specified
677  *     range of profile coordinates, so each envelope is passed to
678  *     rescore_isolated_domain() and analyzed independently.
679  *  4. Optimal accuracy alignment may identify exactly the same alignment
680  *     in the overlap region shared by the two envelopes.
681  *
682  * The disturbing result is two different envelopes that have
683  * identical alignments and alignment endpoints.
684  *
685  * The correct fix is to define envelopes not only by sequence
686  * endpoints but also by profile endpoints, passing them to
687  * rescore_isolated_domain(), and limiting F/B calculations to this
688  * pieces of the DP lattice. This requires a fair amount of work,
689  * adding to the optimized API.
690  *
691  * The workaround is to detect when there are duplicate alignments,
692  * and only display one. We show the one with the best bit score.
693  *
694  * If we ever implement envelope-limited versions of F/B, revisit this
695  * fix.
696  *
697  * SRE, Tue Dec 22 16:27:04 2009
698  * xref J5/130; notebook/2009/1222-hmmer-bug-h74
699  */
700 static int
workaround_bug_h74(P7_TOPHITS * th)701 workaround_bug_h74(P7_TOPHITS *th)
702 {
703   int h;
704   int d1, d2;
705   int dremoved;
706 
707   for (h = 0; h < th->N; h++)
708     if (th->hit[h]->noverlaps)
709     {
710         for (d1 = 0; d1 < th->hit[h]->ndom; d1++)
711           for (d2 = d1+1; d2 < th->hit[h]->ndom; d2++)
712             if (th->hit[h]->dcl[d1].iali == th->hit[h]->dcl[d2].iali &&
713                 th->hit[h]->dcl[d1].jali == th->hit[h]->dcl[d2].jali)
714             {
715                 dremoved = (th->hit[h]->dcl[d1].bitscore >= th->hit[h]->dcl[d2].bitscore) ? d2 : d1;
716                 if (th->hit[h]->dcl[dremoved].is_reported) { th->hit[h]->dcl[dremoved].is_reported = FALSE; th->hit[h]->nreported--; }
717                 if (th->hit[h]->dcl[dremoved].is_included) { th->hit[h]->dcl[dremoved].is_included = FALSE; th->hit[h]->nincluded--; }
718             }
719     }
720   return eslOK;
721 }
722 
723 
724 
725 /* Function:  p7_tophits_ComputeNhmmerEvalues()
726  * Synopsis:  Compute e-values based on pvalues and window sizes.
727  *
728  * Purpose:   After nhmmer pipeline has completed, the th object contains
729  *               hits where the p-values haven't yet been converted to
730  *               e-values. That modification depends on an established
731  *               number of sequences. In nhmmer, this is computed as N/W,
732  *               for a database of N residues, where W is some standardized
733  *               window length (nhmmer passes om->max_length). E-values are
734  *               set here based on that formula. We also set the sortkey so
735  *               the output will be sorted correctly.
736  *
737  * Returns:   <eslOK> on success.
738  */
739 int
p7_tophits_ComputeNhmmerEvalues(P7_TOPHITS * th,double N,int W)740 p7_tophits_ComputeNhmmerEvalues(P7_TOPHITS *th, double N, int W)
741 {
742   int i;    /* counters over hits */
743 
744   for (i = 0; i < th->N ; i++)
745   {
746     th->unsrt[i].lnP        += log((float)N / (float)W);
747     th->unsrt[i].dcl[0].lnP  = th->unsrt[i].lnP;
748     th->unsrt[i].sortkey     = -1.0 * th->unsrt[i].lnP;
749   }
750   return eslOK;
751 }
752 
753 
754 /* Function:  p7_tophits_RemoveDuplicates()
755  * Synopsis:  Remove overlapping hits.
756  *
757  * Purpose:   After nhmmer pipeline has completed, the TopHits object may
758  *               contain duplicates if the target was broken into overlapping
759  *               windows. Scan through, and remove duplicates.  Since the
760  *               duplicates may be incomplete (one sequence is a partial
761  *               hit because it's window didn't cover the full length of
762  *               the hit), keep the one with better p-value
763  *
764  * Returns:   <eslOK> on success.
765  */
766 int
p7_tophits_RemoveDuplicates(P7_TOPHITS * th,int using_bit_cutoffs)767 p7_tophits_RemoveDuplicates(P7_TOPHITS *th, int using_bit_cutoffs)
768 {
769   int     i;    /* counter over hits */
770   int     j;    /* previous un-duplicated hit */
771   int     s_i, s_j, e_i, e_j, dir_i, dir_j, len_i, len_j;
772   int     intersect_alistart, intersect_aliend, intersect_alilen;
773   int     intersect_hmmstart, intersect_hmmend, intersect_hmmlen;
774   double  p_i, p_j;
775   int remove;
776 
777   if (th->N<2) return eslOK;
778 
779   j=0;
780   for (i = 1; i < th->N; i++)
781   {
782       p_j   = th->hit[j]->lnP;
783       s_j   = th->hit[j]->dcl[0].iali;
784       e_j   = th->hit[j]->dcl[0].jali;
785       dir_j = (s_j < e_j ? 1 : -1);
786       if (dir_j == -1) ESL_SWAP(s_j, e_j, int);
787       len_j = e_j - s_j + 1 ;
788 
789       p_i   = th->hit[i]->lnP;
790       s_i   = th->hit[i]->dcl[0].iali;
791       e_i   = th->hit[i]->dcl[0].jali;
792       dir_i = (s_i < e_i ? 1 : -1);
793       if (dir_i == -1) ESL_SWAP(s_i, e_i, int);
794       len_i = e_i - s_i + 1 ;
795 
796       // these will only matter if seqidx and strand are the same
797       intersect_alistart  = s_i>s_j ? s_i : s_j;
798       intersect_aliend    = e_i<e_j ? e_i : e_j;
799       intersect_alilen    = intersect_aliend - intersect_alistart + 1;
800 
801       intersect_hmmstart = (th->hit[i]->dcl[0].ad->hmmfrom > th->hit[j]->dcl[0].ad->hmmfrom) ? th->hit[i]->dcl[0].ad->hmmfrom : th->hit[j]->dcl[0].ad->hmmfrom;
802       intersect_hmmend   = (th->hit[i]->dcl[0].ad->hmmto   < th->hit[j]->dcl[0].ad->hmmto)   ? th->hit[i]->dcl[0].ad->hmmto : th->hit[j]->dcl[0].ad->hmmto;
803       intersect_hmmlen   = intersect_hmmend - intersect_hmmstart + 1;
804 
805       if ( esl_strcmp(th->hit[i]->name, th->hit[i-1]->name) == 0  && // same model
806            th->hit[i]->seqidx ==  th->hit[i-1]->seqidx  &&           // same source sequence
807            dir_i == dir_j &&                                         // only bother removing if the overlapping hits are on the same strand
808            intersect_hmmlen > 0 &&                                   // only if they're both hitting similar parts of the model
809            (
810                ( s_i >= s_j-3 && s_i <= s_j+3) ||     // at least one side is essentially flush
811                ( e_i >= e_j-3 && e_i <= e_j+3) ||
812                ( intersect_alilen >= len_i * 0.95) || // or one of the hits covers >90% of the other
813                ( intersect_alilen >= len_j * 0.95)
814 	   ))
815       {
816         /* Force one to go unreported.  I prefer to keep the one with the
817          * better e-value.  This addresses two issues
818          * (1) longer hits sometimes encounter higher bias corrections,
819          *     leading to lower scores; seems better to focus on the
820          *     high-scoring heart of the alignment, if we have a
821          *     choice
822          * (2) it is possible that a lower-scoring longer hit (see #1)
823          *     that is close to threshold will pass the pipeline in
824          *     one condition and not the other (e.g. --toponly, or
825          *     single vs multi threaded), and if longer hits obscure
826          *     shorter higher-scoring ones, a shorter "hit" might be
827          *     lost by being obscured by a longer one that is subsequently
828          *     removed due to insufficient score.
829          * see late notes in ~wheelert/notebook/2012/0518-dfam-scripts/00NOTES
830         */
831         //remove = 0; // 1 := keep i,  0 := keep i-1
832         remove = p_i < p_j ? j : i;
833 
834         th->hit[remove]->flags |= p7_IS_DUPLICATE;
835         if (using_bit_cutoffs) { // report/include flags were already included, need to remove them here
836           th->hit[remove]->flags &= ~p7_IS_REPORTED;
837           th->hit[remove]->flags &= ~p7_IS_INCLUDED;
838         }
839 
840         j = (remove == j ? i : j);
841       }
842       else j = i;
843   }
844   return eslOK;
845 }
846 
847 
848 
849 /* Function:  p7_tophits_Threshold()
850  * Synopsis:  Apply score and E-value thresholds to a hitlist before output.
851  *
852  * Purpose:   After a pipeline has completed, go through it and mark all
853  *            the targets and domains that are "significant" (satisfying
854  *            the reporting thresholds set for the pipeline).
855  *
856  *            Also sets the final total number of reported and
857  *            included targets, the number of reported and included
858  *            targets in each target, and the size of the search space
859  *            for per-domain conditional E-value calculations,
860  *            <pli->domZ>. By default, <pli->domZ> is the number of
861  *            significant targets reported.
862  *
863  *            If model-specific thresholds were used in the pipeline,
864  *            we cannot apply those thresholds now. They were already
865  *            applied in the pipeline. In this case all we're
866  *            responsible for here is counting them (setting
867  *            nreported, nincluded counters).
868  *
869  * Returns:   <eslOK> on success.
870  */
871 int
p7_tophits_Threshold(P7_TOPHITS * th,P7_PIPELINE * pli)872 p7_tophits_Threshold(P7_TOPHITS *th, P7_PIPELINE *pli)
873 {
874   int h, d;    /* counters over sequence hits, domains in sequences */
875 
876   /* Flag reported, included targets (if we're using general thresholds) */
877   if (! pli->use_bit_cutoffs)
878   {
879     for (h = 0; h < th->N; h++)
880     {
881 
882       if ( !(th->hit[h]->flags & p7_IS_DUPLICATE) &&
883           p7_pli_TargetReportable(pli, th->hit[h]->score, th->hit[h]->lnP))
884       {
885           th->hit[h]->flags |= p7_IS_REPORTED;
886           if (p7_pli_TargetIncludable(pli, th->hit[h]->score, th->hit[h]->lnP))
887               th->hit[h]->flags |= p7_IS_INCLUDED;
888 
889           if (pli->long_targets) { // no domains in dna search, so:
890             th->hit[h]->dcl[0].is_reported = th->hit[h]->flags & p7_IS_REPORTED;
891             th->hit[h]->dcl[0].is_included = th->hit[h]->flags & p7_IS_INCLUDED;
892           }
893       }
894     }
895   }
896 
897   /* Count reported, included targets */
898   th->nreported = 0;
899   th->nincluded = 0;
900   for (h = 0; h < th->N; h++)
901   {
902       if (th->hit[h]->flags & p7_IS_REPORTED)  th->nreported++;
903       if (th->hit[h]->flags & p7_IS_INCLUDED)  th->nincluded++;
904   }
905 
906   /* Now we can determined domZ, the effective search space in which additional domains are found */
907   if (pli->domZ_setby == p7_ZSETBY_NTARGETS) pli->domZ = (double) th->nreported;
908 
909 
910   /* Second pass is over domains, flagging reportable/includable ones.
911    * Depends on knowing the domZ we just set.
912    * Note how this enforces a hierarchical logic of
913    * (sequence|domain) must be reported to be included, and
914    * domain can only be (reported|included) if whole sequence is too.
915    */
916   if (! pli->use_bit_cutoffs && !pli->long_targets)
917   {
918     for (h = 0; h < th->N; h++)
919     {
920       if (th->hit[h]->flags & p7_IS_REPORTED)
921       {
922         for (d = 0; d < th->hit[h]->ndom; d++)
923         {
924           if (p7_pli_DomainReportable(pli, th->hit[h]->dcl[d].bitscore, th->hit[h]->dcl[d].lnP))
925             th->hit[h]->dcl[d].is_reported = TRUE;
926           if ((th->hit[h]->flags & p7_IS_INCLUDED) &&
927               p7_pli_DomainIncludable(pli, th->hit[h]->dcl[d].bitscore, th->hit[h]->dcl[d].lnP))
928             th->hit[h]->dcl[d].is_included = TRUE;
929         }
930       }
931     }
932   }
933 
934   /* Count the reported, included domains */
935   for (h = 0; h < th->N; h++)
936     for (d = 0; d < th->hit[h]->ndom; d++)
937     {
938         if (th->hit[h]->dcl[d].is_reported) th->hit[h]->nreported++;
939         if (th->hit[h]->dcl[d].is_included) th->hit[h]->nincluded++;
940     }
941 
942   workaround_bug_h74(th);  /* blech. This function is defined above; see commentary and crossreferences there. */
943 
944   return eslOK;
945 }
946 
947 
948 
949 
950 
951 /* Function:  p7_tophits_CompareRanking()
952  * Synopsis:  Compare current top hits to previous top hits ranking.
953  *
954  * Purpose:   Using a keyhash <kh> of the previous top hits and the
955  *            their ranks, look at the current top hits list <th>
956  *            and flag new hits that are included for the first time
957  *            (by setting <p7_IS_NEW> flag) and hits that were
958  *            included previously, but are now below the inclusion
959  *            threshold in the list (<by setting <p7_IS_DROPPED>
960  *            flag).
961  *
962  *            The <th> must already have been processed by
963  *            <p7_tophits_Threshold()>. We assume the <is_included>,
964  *            <is_reported> flags are set on the appropriate hits.
965  *
966  *            Upon return, the keyhash <kh> is updated to hash the
967  *            current top hits list and their ranks.
968  *
969  *            Optionally, <*opt_nnew> is set to the number of
970  *            newly included hits. jackhmmer uses this as part of
971  *            its convergence criteria, for example.
972  *
973  *            These flags affect output of top target hits from
974  *            <p7_tophits_Targets()>.
975  *
976  *            It only makes sense to call this function in context of
977  *            an iterative search.
978  *
979  *            The <p7_IS_NEW> flag is comprehensive: all new hits
980  *            are flagged (and counted in <*opt_nnew>). The <p7_WAS_DROPPED>
981  *            flag is not comprehensive: only those hits that still
982  *            appear in the current top hits list are flagged. If a
983  *            hit dropped entirely off the list, it isn't counted
984  *            as "dropped". (This could be done, but we would want
985  *            to have two keyhashes, one old and one new, to do the
986  *            necessary comparisons efficiently.)
987  *
988  *            If the target names in <th> are not unique, results may
989  *            be strange.
990  *
991  * Args:      th         - current top hits list
992  *            kh         - hash of top hits' ranks (in: previous tophits; out: <th>'s tophits)
993  *            opt_nnew   - optRETURN: number of new hits above inclusion threshold
994  *
995  * Returns:   <eslOK> on success.
996  *
997  * Throws:    <eslEMEM> if <kh> needed to be reallocated but this failed.
998  */
999 int
p7_tophits_CompareRanking(P7_TOPHITS * th,ESL_KEYHASH * kh,int * opt_nnew)1000 p7_tophits_CompareRanking(P7_TOPHITS *th, ESL_KEYHASH *kh, int *opt_nnew)
1001 {
1002   int nnew = 0;
1003   int oldrank;
1004   int h;
1005   int status;
1006 
1007   /* Flag the hits in the list with whether they're new in the included top hits,
1008    * and whether they've dropped off the included list.
1009    */
1010   for (h = 0; h < th->N; h++)
1011   {
1012     esl_keyhash_Lookup(kh, th->hit[h]->name, -1, &oldrank);
1013 
1014     if (th->hit[h]->flags & p7_IS_INCLUDED)
1015     {
1016       if (oldrank == -1) { th->hit[h]->flags |= p7_IS_NEW; nnew++; }
1017     }
1018     else
1019     {
1020       if (oldrank >=  0) th->hit[h]->flags |= p7_IS_DROPPED;
1021     }
1022   }
1023 
1024   /* Replace the old rank list with the new one */
1025   esl_keyhash_Reuse(kh);
1026   for (h = 0; h < th->N; h++)
1027   {
1028     if (th->hit[h]->flags & p7_IS_INCLUDED)
1029     {
1030       /* What happens when the same sequence name appears twice? It gets stored with higher rank */
1031       status = esl_keyhash_Store(kh, th->hit[h]->name, -1, NULL);
1032       if (status != eslOK && status != eslEDUP) goto ERROR;
1033     }
1034   }
1035 
1036   if (opt_nnew != NULL) *opt_nnew = nnew;
1037   return eslOK;
1038 
1039  ERROR:
1040   if (opt_nnew != NULL) *opt_nnew = 0;
1041   return status;
1042 }
1043 
1044 
1045 /* Function:  p7_tophits_Targets()
1046  * Synopsis:  Format and write a top target hits list to an output stream.
1047  *
1048  * Purpose:   Output a list of the reportable top target hits in <th>
1049  *            in human-readable ASCII text format to stream <ofp>, using
1050  *            final pipeline accounting stored in <pli>.
1051  *
1052  *            The tophits list <th> should already be sorted (see
1053  *            <p7_tophits_Sort()> and thresholded (see
1054  *            <p7_tophits_Threshold>).
1055  *
1056  * Returns:   <eslOK> on success.
1057  *
1058  * Throws:    <eslEWRITE> on write failure.
1059  */
1060 int
p7_tophits_Targets(FILE * ofp,P7_TOPHITS * th,P7_PIPELINE * pli,int textw)1061 p7_tophits_Targets(FILE *ofp, P7_TOPHITS *th, P7_PIPELINE *pli, int textw)
1062 {
1063   char   newness;
1064   int    h;
1065   int    d;
1066   int    namew;
1067   int    posw;
1068   int    descw;
1069   char  *showname;
1070 
1071   int    have_printed_incthresh = FALSE;
1072 
1073   /* when --acc is on, we'll show accession if available, and fall back to name */
1074   if (pli->show_accessions) namew = ESL_MAX(8, p7_tophits_GetMaxShownLength(th));
1075   else                      namew = ESL_MAX(8, p7_tophits_GetMaxNameLength(th));
1076 
1077 
1078   if (pli->long_targets)
1079   {
1080       posw = ESL_MAX(6, p7_tophits_GetMaxPositionLength(th));
1081 
1082       if (textw >  0)           descw = ESL_MAX(32, textw - namew - 2*posw - 32); /* 32 chars excluding desc and two posw's is from the format: 2 + 9+2 +6+2 +5+2 +<name>+1 +<startpos>+1 +<endpos>+1 +1 */
1083       else                      descw = 0;                               /* unlimited desc length is handled separately */
1084 
1085       if (fprintf(ofp, "Scores for complete hit%s:\n",     pli->mode == p7_SEARCH_SEQS ? "s" : "") < 0)
1086         ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1087       if (fprintf(ofp, "  %9s %6s %5s  %-*s %*s %*s  %s\n",
1088       "E-value", " score", " bias", namew, (pli->mode == p7_SEARCH_SEQS ? "Sequence":"Model"), posw, "start", posw, "end", "Description") < 0)
1089         ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1090       if (fprintf(ofp, "  %9s %6s %5s  %-*s %*s %*s  %s\n",
1091       "-------", "------", "-----", namew, "--------", posw, "-----", posw, "-----", "-----------") < 0)
1092         ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1093   }
1094   else
1095   {
1096 
1097       if (textw >  0)           descw = ESL_MAX(32, textw - namew - 61); /* 61 chars excluding desc is from the format: 2 + 22+2 +22+2 +8+2 +<name>+1 */
1098       else                      descw = 0;                               /* unlimited desc length is handled separately */
1099 
1100 
1101       /* The minimum width of the target table is 111 char: 47 from fields, 8 from min name, 32 from min desc, 13 spaces */
1102       if (fprintf(ofp, "Scores for complete sequence%s (score includes all domains):\n", pli->mode == p7_SEARCH_SEQS ? "s" : "") < 0)
1103         ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1104       if (fprintf(ofp, "  %22s  %22s  %8s\n",                              " --- full sequence ---",        " --- best 1 domain ---",   "-#dom-") < 0)
1105         ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1106       if (fprintf(ofp, "  %9s %6s %5s  %9s %6s %5s  %5s %2s  %-*s %s\n",
1107       "E-value", " score", " bias", "E-value", " score", " bias", "  exp",  "N", namew, (pli->mode == p7_SEARCH_SEQS ? "Sequence":"Model"), "Description") < 0)
1108         ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1109       if (fprintf(ofp, "  %9s %6s %5s  %9s %6s %5s  %5s %2s  %-*s %s\n",
1110       "-------", "------", "-----", "-------", "------", "-----", " ----", "--", namew, "--------", "-----------") < 0)
1111         ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1112   }
1113 
1114   for (h = 0; h < th->N; h++)
1115     {
1116       if (th->hit[h]->flags & p7_IS_REPORTED)
1117 	{
1118 	  d    = th->hit[h]->best_domain;
1119 
1120 	  if (! (th->hit[h]->flags & p7_IS_INCLUDED) && ! have_printed_incthresh)
1121 	    {
1122 	      if (fprintf(ofp, "  ------ inclusion threshold ------\n") < 0)
1123 		ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1124 	      have_printed_incthresh = TRUE;
1125 	    }
1126 
1127 	  if (pli->show_accessions)
1128 	    {   /* the --acc option: report accessions rather than names if possible */
1129 	      if (th->hit[h]->acc != NULL && th->hit[h]->acc[0] != '\0') showname = th->hit[h]->acc;
1130 	      else                                                       showname = th->hit[h]->name;
1131 	    }
1132 	  else
1133 	    showname = th->hit[h]->name;
1134 
1135 	  if      (th->hit[h]->flags & p7_IS_NEW)     newness = '+';
1136 	  else if (th->hit[h]->flags & p7_IS_DROPPED) newness = '-';
1137 	  else                                        newness = ' ';
1138 
1139 	  if (pli->long_targets)
1140 	    {
1141 	      if (fprintf(ofp, "%c %9.2g %6.1f %5.1f  %-*s %*" PRId64 " %*" PRId64 "",
1142 			  newness,
1143 			  exp(th->hit[h]->lnP), // * pli->Z,
1144 			  th->hit[h]->score,
1145 			  eslCONST_LOG2R * th->hit[h]->dcl[d].dombias, // an nhmmer hit is really a domain, so this is the hit's bias correction
1146 			  namew, showname,
1147 			  posw, th->hit[h]->dcl[d].iali,
1148 			  posw, th->hit[h]->dcl[d].jali) < 0)
1149 		ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1150 	    }
1151 	  else
1152 	    {
1153 	      if (fprintf(ofp, "%c %9.2g %6.1f %5.1f  %9.2g %6.1f %5.1f  %5.1f %2d  %-*s ",
1154 			  newness,
1155 			  exp(th->hit[h]->lnP) * pli->Z,
1156 			  th->hit[h]->score,
1157 			  th->hit[h]->pre_score - th->hit[h]->score, /* bias correction */
1158 			  exp(th->hit[h]->dcl[d].lnP) * pli->Z,
1159 			  th->hit[h]->dcl[d].bitscore,
1160 			  eslCONST_LOG2R * th->hit[h]->dcl[d].dombias, /* convert NATS to BITS at last moment */
1161 			  th->hit[h]->nexpected,
1162 			  th->hit[h]->nreported,
1163 			  namew, showname) < 0)
1164 		ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1165 	    }
1166 
1167 	  if (textw > 0)
1168 	    {
1169 	      if (fprintf(ofp, " %-.*s\n", descw, th->hit[h]->desc == NULL ? "" : th->hit[h]->desc) < 0)
1170 		ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1171 	    }
1172 	  else
1173 	    {
1174 	      if (fprintf(ofp, " %s\n",           th->hit[h]->desc == NULL ? "" : th->hit[h]->desc) < 0)
1175 		ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1176 	    }
1177 	  /* do NOT use *s with unlimited (INT_MAX) line length. Some systems
1178 	   * have an fprintf() bug here (we found one on an Opteron/SUSE Linux
1179 	   * system (#h66)
1180 	   */
1181 	}
1182     }
1183 
1184   if (th->nreported == 0)
1185     {
1186       if (fprintf(ofp, "\n   [No hits detected that satisfy reporting thresholds]\n") < 0)
1187         ESL_EXCEPTION_SYS(eslEWRITE, "per-sequence hit list: write failed");
1188     }
1189   return eslOK;
1190 }
1191 
1192 
1193 /* Function:  p7_tophits_Domains()
1194  * Synopsis:  Standard output format for top domain hits and alignments.
1195  *
1196  * Purpose:   For each reportable target sequence, output a tabular summary
1197  *            of reportable domains found in it, followed by alignments of
1198  *            each domain.
1199  *
1200  *            Similar to <p7_tophits_Targets()>; see additional notes there.
1201  *
1202  * Returns:   <eslOK> on success.
1203  *
1204  * Throws:    <eslEWRITE> if a write to <ofp> fails; for example, if
1205  *            the disk fills up.
1206  */
1207 int
p7_tophits_Domains(FILE * ofp,P7_TOPHITS * th,P7_PIPELINE * pli,int textw)1208 p7_tophits_Domains(FILE *ofp, P7_TOPHITS *th, P7_PIPELINE *pli, int textw)
1209 {
1210   int   h, d;
1211   int   nd;
1212   int   namew, descw;
1213   char *showname;
1214   int   status;
1215 
1216   if (pli->long_targets)
1217     {
1218       if (fprintf(ofp, "Annotation for each hit %s:\n",
1219 		  pli->show_alignments ? " (and alignments)" : "") < 0)
1220         ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1221     }
1222   else
1223     {
1224       if (fprintf(ofp, "Domain annotation for each %s%s:\n",
1225 		  pli->mode == p7_SEARCH_SEQS ? "sequence" : "model",
1226 		  pli->show_alignments ? " (and alignments)" : "") < 0)
1227         ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1228     }
1229 
1230   for (h = 0; h < th->N; h++)
1231     if (th->hit[h]->flags & p7_IS_REPORTED)
1232       {
1233 	if (pli->show_accessions && th->hit[h]->acc != NULL && th->hit[h]->acc[0] != '\0')
1234 	  {
1235 	    showname = th->hit[h]->acc;
1236 	    namew    = strlen(th->hit[h]->acc);
1237 	  }
1238 	else
1239 	  {
1240 	    showname = th->hit[h]->name;
1241 	    namew = strlen(th->hit[h]->name);
1242 	  }
1243 
1244 	if (textw > 0)
1245 	  {
1246 	    descw = ESL_MAX(32, textw - namew - 5);
1247 	    if (fprintf(ofp, ">> %s  %-.*s\n", showname, descw, (th->hit[h]->desc == NULL ? "" : th->hit[h]->desc)) < 0)
1248 	      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1249 	  }
1250 	else
1251 	  {
1252 	    if (fprintf(ofp, ">> %s  %s\n",    showname,        (th->hit[h]->desc == NULL ? "" : th->hit[h]->desc)) < 0)
1253 	      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1254 	  }
1255 
1256 	if (th->hit[h]->nreported == 0)
1257 	  {
1258 	    if (fprintf(ofp,"   [No individual domains that satisfy reporting thresholds (although complete target did)]\n\n") < 0)
1259 	      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1260 	    continue;
1261 	  }
1262 
1263 
1264 	if (pli->long_targets)
1265 	  {
1266 	    /* The dna hit table is 119 char wide:
1267                    score  bias    Evalue hmmfrom  hmm to     alifrom    ali to      envfrom    env to       hqfrom     hq to   sq len      acc
1268                    ------ ----- --------- ------- -------    --------- ---------    --------- ---------    --------- --------- ---------    ----
1269                !     82.7 104.4   4.9e-22     782     998 .. 241981174 241980968 .. 241981174 241980966 .. 241981174 241980968 234234233   0.78
1270              */
1271 	    if (fprintf(ofp, "   %6s %5s %9s %9s %9s %2s %9s %9s %2s %9s %9s    %9s %2s %4s\n",  "score",  "bias",  "  Evalue", "hmmfrom",  "hmm to", "  ", " alifrom ",  " ali to ", "  ",  " envfrom ",  " env to ",  (pli->mode == p7_SEARCH_SEQS ? "  sq len " : " mod len "), "  ",  "acc")  < 0)
1272 	      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1273 	    if (fprintf(ofp, "   %6s %5s %9s %9s %9s %2s %9s %9s %2s %9s %9s    %9s %2s %4s\n",  "------", "-----", "---------", "-------", "-------", "  ", "---------", "---------", "  ", "---------", "---------",  "---------", "  ", "----") < 0)
1274 	      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1275 	  }
1276 	else
1277 	  {
1278 	    /* The domain table is 101 char wide:
1279                 #     score  bias  c-Evalue  i-Evalue hmmfrom   hmmto    alifrom  ali to    envfrom  env to     acc
1280                ---   ------ ----- --------- --------- ------- -------    ------- -------    ------- -------    ----
1281                  1 ?  123.4  23.1   9.7e-11    6.8e-9       3    1230 ..       1     492 []       2     490 .] 0.90
1282                123 ! 1234.5 123.4 123456789 123456789 1234567 1234567 .. 1234567 1234567 [] 1234567 1234568 .] 0.12
1283 	    */
1284 	    if (fprintf(ofp, " %3s   %6s %5s %9s %9s %7s %7s %2s %7s %7s %2s %7s %7s %2s %4s\n",    "#",  "score",  "bias",  "c-Evalue",  "i-Evalue", "hmmfrom",  "hmm to", "  ", "alifrom",  "ali to", "  ", "envfrom",  "env to", "  ",  "acc")  < 0)
1285               ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1286 	    if (fprintf(ofp, " %3s   %6s %5s %9s %9s %7s %7s %2s %7s %7s %2s %7s %7s %2s %4s\n",  "---", "------", "-----", "---------", "---------", "-------", "-------", "  ", "-------", "-------", "  ", "-------", "-------", "  ", "----")  < 0)
1287 	      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1288 	  }
1289 
1290 
1291 	/* Domain hit table for each reported domain in this reported sequence. */
1292 	nd = 0;
1293 	for (d = 0; d < th->hit[h]->ndom; d++)
1294 	  {
1295 	    if (th->hit[h]->dcl[d].is_reported)
1296 	      {
1297 		nd++;
1298 		if (pli->long_targets)
1299 		  {
1300 		    if (fprintf(ofp, " %c %6.1f %5.1f %9.2g %9d %9d %c%c %9" PRId64 " %9" PRId64 " %c%c %9" PRId64 " %9" PRId64 " %c%c %9" PRId64 "    %4.2f\n",
1301 				//nd,
1302 				th->hit[h]->dcl[d].is_included ? '!' : '?',
1303 				th->hit[h]->dcl[d].bitscore,
1304 				th->hit[h]->dcl[d].dombias * eslCONST_LOG2R, /* convert NATS to BITS at last moment */
1305 				exp(th->hit[h]->dcl[d].lnP),
1306 				th->hit[h]->dcl[d].ad->hmmfrom,
1307 				th->hit[h]->dcl[d].ad->hmmto,
1308 				(th->hit[h]->dcl[d].ad->hmmfrom == 1) ? '[' : '.',
1309 				(th->hit[h]->dcl[d].ad->hmmto   == th->hit[h]->dcl[d].ad->M) ? ']' : '.',
1310 				th->hit[h]->dcl[d].ad->sqfrom,
1311 				th->hit[h]->dcl[d].ad->sqto,
1312 				(th->hit[h]->dcl[d].ad->sqfrom == 1) ? '[' : '.',
1313 				(th->hit[h]->dcl[d].ad->sqto   == th->hit[h]->dcl[d].ad->L) ? ']' : '.',
1314 				th->hit[h]->dcl[d].ienv,
1315 				th->hit[h]->dcl[d].jenv,
1316 				(th->hit[h]->dcl[d].ienv == 1) ? '[' : '.',
1317 				(th->hit[h]->dcl[d].jenv == th->hit[h]->dcl[d].ad->L) ? ']' : '.',
1318 				th->hit[h]->dcl[d].ad->L,
1319 				(th->hit[h]->dcl[d].oasc / (1.0 + fabs((float) (th->hit[h]->dcl[d].jenv - th->hit[h]->dcl[d].ienv))))) < 0)
1320 		      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1321 		  }
1322 		else
1323 		  {
1324 		    if (fprintf(ofp, " %3d %c %6.1f %5.1f %9.2g %9.2g %7d %7d %c%c",
1325 				nd,
1326 				th->hit[h]->dcl[d].is_included ? '!' : '?',
1327 				th->hit[h]->dcl[d].bitscore,
1328 				th->hit[h]->dcl[d].dombias * eslCONST_LOG2R, /* convert NATS to BITS at last moment */
1329 				exp(th->hit[h]->dcl[d].lnP) * pli->domZ,
1330 				exp(th->hit[h]->dcl[d].lnP) * pli->Z,
1331 				th->hit[h]->dcl[d].ad->hmmfrom,
1332 				th->hit[h]->dcl[d].ad->hmmto,
1333 				(th->hit[h]->dcl[d].ad->hmmfrom == 1) ? '[' : '.',
1334 				(th->hit[h]->dcl[d].ad->hmmto   == th->hit[h]->dcl[d].ad->M ) ? ']' : '.') < 0)
1335 		      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1336 
1337 		    if (fprintf(ofp, " %7" PRId64 " %7" PRId64 " %c%c",
1338 				th->hit[h]->dcl[d].ad->sqfrom,
1339 				th->hit[h]->dcl[d].ad->sqto,
1340 				(th->hit[h]->dcl[d].ad->sqfrom == 1) ? '[' : '.',
1341 				(th->hit[h]->dcl[d].ad->sqto   == th->hit[h]->dcl[d].ad->L) ? ']' : '.') < 0)
1342 		      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1343 
1344 		    if (fprintf(ofp, " %7" PRId64 " %7" PRId64 " %c%c",
1345 				th->hit[h]->dcl[d].ienv,
1346 				th->hit[h]->dcl[d].jenv,
1347 				(th->hit[h]->dcl[d].ienv == 1) ? '[' : '.',
1348 				(th->hit[h]->dcl[d].jenv == th->hit[h]->dcl[d].ad->L) ? ']' : '.') < 0)
1349 		      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1350 
1351 		    if (fprintf(ofp, " %4.2f\n",
1352 				(th->hit[h]->dcl[d].oasc / (1.0 + fabs((float) (th->hit[h]->dcl[d].jenv - th->hit[h]->dcl[d].ienv))))) < 0)
1353 		      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1354 		  }
1355 
1356 	      }
1357 	  } // end of domain table in this reported sequence.
1358 
1359 	/* Alignment data for each reported domain in this reported sequence. */
1360 	if (pli->show_alignments)
1361 	  {
1362 	    if (pli->long_targets)
1363 	      {
1364 		if (fprintf(ofp, "\n  Alignment:\n") < 0)
1365 		  ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1366 	      }
1367 	    else
1368 	      {
1369 		if (fprintf(ofp, "\n  Alignments for each domain:\n") < 0)
1370 		  ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1371 		nd = 0;
1372 	      }
1373 
1374 	    for (d = 0; d < th->hit[h]->ndom; d++)
1375 	      if (th->hit[h]->dcl[d].is_reported)
1376 		{
1377 		  nd++;
1378 		  if (!pli->long_targets)
1379 		    {
1380 		      if (fprintf(ofp, "  == domain %d", nd ) < 0)
1381 			ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1382 		    }
1383 		  if (fprintf(ofp, "  score: %.1f bits", th->hit[h]->dcl[d].bitscore) < 0)
1384 		    ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1385 		  if (!pli->long_targets)
1386 		    {
1387 		      if (fprintf(ofp, ";  conditional E-value: %.2g\n",  exp(th->hit[h]->dcl[d].lnP) * pli->domZ) < 0)
1388 			ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1389 		    }
1390 		  else
1391 		    {
1392 		      if (fprintf(ofp, "\n") < 0)
1393 			ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1394 		    }
1395 
1396 		  if ((status = p7_alidisplay_Print(ofp, th->hit[h]->dcl[d].ad, 40, textw, pli)) != eslOK) return status;
1397 
1398 		  if (fprintf(ofp, "\n") < 0)
1399 		    ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1400 		}
1401 	  }
1402 	else // alignment reporting is off:
1403 	  {
1404 	    if (fprintf(ofp, "\n") < 0)
1405 	      ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1406 	  }
1407 
1408       } // end, loop over all reported hits
1409 
1410   if (th->nreported == 0)
1411     {
1412       if (fprintf(ofp, "\n   [No targets detected that satisfy reporting thresholds]\n") < 0)
1413         ESL_EXCEPTION_SYS(eslEWRITE, "domain hit list: write failed");
1414     }
1415   return eslOK;
1416 }
1417 
1418 
1419 /* Function:  p7_tophits_Alignment()
1420  * Synopsis:  Create multiple alignment of all included domains.
1421  *
1422  * Purpose:   Create a multiple alignment of all domains marked
1423  *            "includable" in the top hits list <th>. Return it in
1424  *            <*ret_msa>.
1425  *
1426  *            Use of <optflags> is identical to <optflags> in <p7_tracealign_Seqs()>.
1427  *            Possible flags include <p7_DIGITIZE>, <p7_ALL_CONSENSUS_COLS>,
1428  *            and <p7_TRIM>; they may be OR'ed together. Otherwise, pass
1429  *            <p7_DEFAULT> to set no flags.
1430  *
1431  *            Caller may optionally provide <inc_sqarr>, <inc_trarr>, and
1432  *            <inc_n> to include additional sequences in the alignment
1433  *            (the jackhmmer query, for example). Otherwise, pass <NULL, NULL, 0>.
1434  *
1435  * Returns:   <eslOK> on success, and <*ret_msa> points to a new MSA that
1436  *            the caller is responsible for freeing.
1437  *
1438  *            Returns <eslFAIL> if there are no included domains,
1439  *            in which case <*ret_msa> is <NULL>.
1440  *
1441  * Throws:    <eslEMEM> on allocation failure;
1442  *            <eslECORRUPT> on unexpected internal data corruption.
1443  *
1444  * Xref:      J4/29: incept.
1445  *            J4/76: added inc_sqarr, inc_trarr, inc_n, optflags
1446  *            H4/72: iss131, segfault if top seq has no reported domains
1447  *            H5/88: iss140, jackhmmer --fast segfaults
1448  */
1449 int
p7_tophits_Alignment(const P7_TOPHITS * th,const ESL_ALPHABET * abc,ESL_SQ ** inc_sqarr,P7_TRACE ** inc_trarr,int inc_n,int optflags,ESL_MSA ** ret_msa)1450 p7_tophits_Alignment(const P7_TOPHITS *th, const ESL_ALPHABET *abc,
1451                      ESL_SQ **inc_sqarr, P7_TRACE **inc_trarr, int inc_n,
1452                      int optflags, ESL_MSA **ret_msa)
1453 {
1454   ESL_SQ   **sqarr = NULL;
1455   P7_TRACE **trarr = NULL;
1456   ESL_MSA   *msa   = NULL;
1457   int        ndom  = 0;
1458   int        M     = 0;
1459   int        h, d, y;
1460   int        status;
1461 
1462   /* How many domains will be included in the new alignment?  We also
1463    * set M here; we don't have hmm, but every ali has a copy.
1464    *
1465    * Beware: although p7_Pipeline() must not register hits that have
1466    * no domains, it did happen {iss131}. So we're careful (here, at
1467    * least) about testing ndom before reaching into dcl[] to get M.
1468    */
1469   for (h = 0; h < th->N; h++)
1470     if (th->hit[h]->flags & p7_IS_INCLUDED)
1471       for (d = 0; d < th->hit[h]->ndom; d++)
1472 	if (th->hit[h]->dcl[d].is_included)
1473 	  {
1474 	    if (M == 0) M = th->hit[h]->dcl[d].ad->M;
1475 	    ndom++;
1476 	  }
1477 
1478   if (inc_n)
1479     {
1480       if      (M == 0)  M = inc_trarr[0]->M;  // unusual case where there's an included trace, but no other aligned hits: get M from included trace
1481       else if (M != inc_trarr[0]->M)          // spot check that included traces appear to have same # of consensus positions; iss#140
1482 	ESL_XEXCEPTION(eslECORRUPT, "top hits and included trace(s) have different profile lengths");
1483     }
1484   if (inc_n+ndom == 0) { status = eslFAIL; goto ERROR; }
1485 
1486   /* Allocation */
1487   ESL_ALLOC(sqarr, sizeof(ESL_SQ *)   * (ndom + inc_n));
1488   ESL_ALLOC(trarr, sizeof(P7_TRACE *) * (ndom + inc_n));
1489   /* Inclusion of preexisting seqs, traces: make copy of pointers */
1490   for (y = 0; y < inc_n;        y++) { sqarr[y] = inc_sqarr[y];  trarr[y] = inc_trarr[y]; }
1491   for (;      y < (ndom+inc_n); y++) { sqarr[y] = NULL;          trarr[y] = NULL; }
1492 
1493   /* Make faux sequences, traces from hit list */
1494   y = inc_n;
1495   for (h = 0; h < th->N; h++)
1496     if (th->hit[h]->flags & p7_IS_INCLUDED)
1497     {
1498         for (d = 0; d < th->hit[h]->ndom; d++)
1499           if (th->hit[h]->dcl[d].is_included)
1500           {
1501               if ((status = p7_alidisplay_Backconvert(th->hit[h]->dcl[d].ad, abc, &(sqarr[y]), &(trarr[y]))) != eslOK) goto ERROR;
1502               y++;
1503           }
1504     }
1505 
1506   /* Make the multiple alignment */
1507   if ((status = p7_tracealign_Seqs(sqarr, trarr, inc_n+ndom, M, optflags, NULL, &msa)) != eslOK) goto ERROR;
1508 
1509   /* Clean up */
1510   for (y = inc_n; y < ndom+inc_n; y++) esl_sq_Destroy(sqarr[y]);
1511   for (y = inc_n; y < ndom+inc_n; y++) p7_trace_Destroy(trarr[y]);
1512   free(sqarr);
1513   free(trarr);
1514   *ret_msa = msa;
1515   return eslOK;
1516 
1517  ERROR:
1518   if (sqarr != NULL) { for (y = inc_n; y < ndom+inc_n; y++) if (sqarr[y] != NULL) esl_sq_Destroy(sqarr[y]);   free(sqarr); }
1519   if (trarr != NULL) { for (y = inc_n; y < ndom+inc_n; y++) if (trarr[y] != NULL) p7_trace_Destroy(trarr[y]); free(trarr); }
1520   if (msa   != NULL) esl_msa_Destroy(msa);
1521   *ret_msa = NULL;
1522   return status;
1523 }
1524 /*---------------- end, standard output format ------------------*/
1525 
1526 
1527 
1528 
1529 
1530 /*****************************************************************
1531  * 3. Tabular (parsable) output of pipeline results.
1532  *****************************************************************/
1533 
1534 /* Function:  p7_tophits_TabularTargets()
1535  * Synopsis:  Output parsable table of per-sequence hits.
1536  *
1537  * Purpose:   Output a parseable table of reportable per-sequence hits
1538  *            in sorted tophits list <th> in an easily parsed ASCII
1539  *            tabular form to stream <ofp>, using final pipeline
1540  *            accounting stored in <pli>.
1541  *
1542  *            Designed to be concatenated for multiple queries and
1543  *            multiple top hits list.
1544  *
1545  * Returns:   <eslOK> on success.
1546  *
1547  * Throws:    <eslEWRITE> if a write to <ofp> fails; for example, if
1548  *            the disk fills up.
1549  */
1550 int
p7_tophits_TabularTargets(FILE * ofp,char * qname,char * qacc,P7_TOPHITS * th,P7_PIPELINE * pli,int show_header)1551 p7_tophits_TabularTargets(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli, int show_header)
1552 {
1553   int qnamew = ESL_MAX(20, strlen(qname));
1554   int tnamew = ESL_MAX(20, p7_tophits_GetMaxNameLength(th));
1555   int qaccw  = ((qacc != NULL) ? ESL_MAX(10, strlen(qacc)) : 10);
1556   int taccw  = ESL_MAX(10, p7_tophits_GetMaxAccessionLength(th));
1557   int posw   = (pli->long_targets ? ESL_MAX(7, p7_tophits_GetMaxPositionLength(th)) : 0);
1558   int h,d;
1559 
1560   if (show_header)
1561   {
1562       if (pli->long_targets)
1563       {
1564         if (fprintf(ofp, "#%-*s %-*s %-*s %-*s %s %s %*s %*s %*s %*s %*s %6s %9s %6s %5s  %s\n",
1565           tnamew-1, " target name",        taccw, "accession",  qnamew, "query name",           qaccw, "accession", "hmmfrom", "hmm to", posw, "alifrom", posw, "ali to", posw, "envfrom", posw, "env to", posw, ( pli->mode == p7_SCAN_MODELS ? "modlen" : "sq len" ), "strand", "  E-value", " score", " bias", "description of target") < 0)
1566           ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1567         if (fprintf(ofp, "#%*s %*s %*s %*s %s %s %*s %*s %*s %*s %*s %6s %9s %6s %5s %s\n",
1568           tnamew-1, "-------------------", taccw, "----------", qnamew, "--------------------", qaccw, "----------", "-------", "-------", posw, "-------", posw, "-------",  posw, "-------", posw, "-------", posw, "-------", "------", "---------", "------", "-----", "---------------------") < 0)
1569           ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-per-sequence hit list: write failed");
1570       }
1571       else
1572       {
1573         if (fprintf(ofp, "#%*s %22s %22s %33s\n", tnamew+qnamew+taccw+qaccw+2, "", "--- full sequence ----", "--- best 1 domain ----", "--- domain number estimation ----") < 0)
1574           ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1575         if (fprintf(ofp, "#%-*s %-*s %-*s %-*s %9s %6s %5s %9s %6s %5s %5s %3s %3s %3s %3s %3s %3s %3s %s\n",
1576           tnamew-1, " target name",        taccw, "accession",  qnamew, "query name",           qaccw, "accession",  "  E-value", " score", " bias", "  E-value", " score", " bias", "exp", "reg", "clu", " ov", "env", "dom", "rep", "inc", "description of target") < 0)
1577           ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1578         if (fprintf(ofp, "#%*s %*s %*s %*s %9s %6s %5s %9s %6s %5s %5s %3s %3s %3s %3s %3s %3s %3s %s\n",
1579           tnamew-1, "-------------------", taccw, "----------", qnamew, "--------------------", qaccw, "----------", "---------", "------", "-----", "---------", "------", "-----", "---", "---", "---", "---", "---", "---", "---", "---", "---------------------") < 0)
1580           ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1581       }
1582   }
1583 
1584   for (h = 0; h < th->N; h++)
1585     if (th->hit[h]->flags & p7_IS_REPORTED)
1586     {
1587         d    = th->hit[h]->best_domain;
1588         if (pli->long_targets)
1589         {
1590             if (fprintf(ofp, "%-*s %-*s %-*s %-*s %7d %7d %*" PRId64 " %*" PRId64 " %*" PRId64 " %*" PRId64 " %*" PRId64 " %6s %9.2g %6.1f %5.1f  %s\n",
1591                 tnamew, th->hit[h]->name,
1592                 taccw,  th->hit[h]->acc ? th->hit[h]->acc : "-",
1593                 qnamew, qname,
1594                 qaccw,  ( (qacc != NULL && qacc[0] != '\0') ? qacc : "-"),
1595                 th->hit[h]->dcl[d].ad->hmmfrom,
1596                 th->hit[h]->dcl[d].ad->hmmto,
1597                 posw, th->hit[h]->dcl[d].iali,
1598                 posw, th->hit[h]->dcl[d].jali,
1599                 posw, th->hit[h]->dcl[d].ienv,
1600                 posw, th->hit[h]->dcl[d].jenv,
1601                 posw, th->hit[h]->dcl[0].ad->L,
1602                 (th->hit[h]->dcl[d].iali < th->hit[h]->dcl[d].jali ? "   +  "  :  "   -  "),
1603                 exp(th->hit[h]->lnP),
1604                 th->hit[h]->score,
1605                 th->hit[h]->dcl[d].dombias * eslCONST_LOG2R, /* convert NATS to BITS at last moment */
1606                 th->hit[h]->desc == NULL ? "-" :  th->hit[h]->desc ) < 0)
1607                   ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1608         }
1609         else
1610         {
1611                 if (fprintf(ofp, "%-*s %-*s %-*s %-*s %9.2g %6.1f %5.1f %9.2g %6.1f %5.1f %5.1f %3d %3d %3d %3d %3d %3d %3d %s\n",
1612                 tnamew, th->hit[h]->name,
1613                 taccw,  th->hit[h]->acc ? th->hit[h]->acc : "-",
1614                 qnamew, qname,
1615                 qaccw,  ( (qacc != NULL && qacc[0] != '\0') ? qacc : "-"),
1616                 exp(th->hit[h]->lnP) * pli->Z,
1617                 th->hit[h]->score,
1618                 th->hit[h]->pre_score - th->hit[h]->score, /* bias correction */
1619                 exp(th->hit[h]->dcl[d].lnP) * pli->Z,
1620                 th->hit[h]->dcl[d].bitscore,
1621                 th->hit[h]->dcl[d].dombias * eslCONST_LOG2R, /* convert NATS to BITS at last moment */
1622                 th->hit[h]->nexpected,
1623                 th->hit[h]->nregions,
1624                 th->hit[h]->nclustered,
1625                 th->hit[h]->noverlaps,
1626                 th->hit[h]->nenvelopes,
1627                 th->hit[h]->ndom,
1628                 th->hit[h]->nreported,
1629                 th->hit[h]->nincluded,
1630                 (th->hit[h]->desc == NULL ? "-" : th->hit[h]->desc)) < 0)
1631                   ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-sequence hit list: write failed");
1632         }
1633     }
1634   return eslOK;
1635 }
1636 
1637 
1638 /* Function:  p7_tophits_TabularDomains()
1639  * Synopsis:  Output parseable table of per-domain hits
1640  *
1641  * Purpose:   Output a parseable table of reportable per-domain hits
1642  *            in sorted tophits list <th> in an easily parsed ASCII
1643  *            tabular form to stream <ofp>, using final pipeline
1644  *            accounting stored in <pli>.
1645  *
1646  *            Designed to be concatenated for multiple queries and
1647  *            multiple top hits list.
1648  *
1649  * Returns:   <eslOK> on success.
1650  *
1651  * Throws:    <eslEWRITE> if a write to <ofp> fails; for example, if
1652  *            the disk fills up.
1653  */
1654 int
p7_tophits_TabularDomains(FILE * ofp,char * qname,char * qacc,P7_TOPHITS * th,P7_PIPELINE * pli,int show_header)1655 p7_tophits_TabularDomains(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli, int show_header)
1656 {
1657 
1658   int qnamew = ESL_MAX(20, strlen(qname));
1659   int tnamew = ESL_MAX(20, p7_tophits_GetMaxNameLength(th));
1660   int qaccw  = (qacc ? ESL_MAX(10, strlen(qacc)) : 10);
1661   int taccw  = ESL_MAX(10, p7_tophits_GetMaxAccessionLength(th));
1662   int tlen, qlen;
1663   int h,d,nd;
1664 
1665   if (show_header)
1666     {
1667          if (fprintf(ofp, "#%*s %22s %40s %11s %11s %11s\n", tnamew+qnamew-1+15+taccw+qaccw, "",                                   "--- full sequence ---",        "-------------- this domain -------------",                "hmm coord",      "ali coord",     "env coord") < 0)
1668             ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-domain hit list: write failed");
1669          if (fprintf(ofp, "#%-*s %-*s %5s %-*s %-*s %5s %9s %6s %5s %3s %3s %9s %9s %6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n",
1670             tnamew-1, " target name",        taccw, "accession",  "tlen",  qnamew, "query name",           qaccw, "accession",  "qlen",  "E-value",   "score",  "bias",  "#",   "of",  "c-Evalue",  "i-Evalue",  "score",  "bias",  "from",  "to",    "from",  "to",   "from",   "to",    "acc",  "description of target") < 0)
1671             ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-domain hit list: write failed");
1672          if (fprintf(ofp, "#%*s %*s %5s %*s %*s %5s %9s %6s %5s %3s %3s %9s %9s %6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n",
1673            tnamew-1, "-------------------", taccw, "----------", "-----", qnamew, "--------------------", qaccw, "----------", "-----", "---------", "------", "-----", "---", "---", "---------", "---------", "------", "-----", "-----", "-----", "-----", "-----", "-----", "-----", "----", "---------------------") < 0)
1674            ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-domain hit list: write failed");
1675     }
1676 
1677   for (h = 0; h < th->N; h++)
1678     if (th->hit[h]->flags & p7_IS_REPORTED)
1679     {
1680         nd = 0;
1681         for (d = 0; d < th->hit[h]->ndom; d++)
1682           if (th->hit[h]->dcl[d].is_reported)
1683           {
1684               nd++;
1685 
1686               /* in hmmsearch, targets are seqs and queries are HMMs;
1687                * in hmmscan, the reverse.  but in the ALIDISPLAY
1688                * structure, lengths L and M are for seq and HMMs, not
1689                * for query and target, so sort it out.
1690                */
1691               if (pli->mode == p7_SEARCH_SEQS) { qlen = th->hit[h]->dcl[d].ad->M; tlen = th->hit[h]->dcl[d].ad->L;  }
1692               else                             { qlen = th->hit[h]->dcl[d].ad->L; tlen = th->hit[h]->dcl[d].ad->M;  }
1693 
1694 
1695 
1696               if (fprintf(ofp, "%-*s %-*s %5d %-*s %-*s %5d %9.2g %6.1f %5.1f %3d %3d %9.2g %9.2g %6.1f %5.1f %5d %5d %5" PRId64 " %5" PRId64 " %5" PRId64 " %5" PRId64 " %4.2f %s\n",
1697                 tnamew, th->hit[h]->name,
1698                 taccw,  th->hit[h]->acc ? th->hit[h]->acc : "-",
1699                 tlen,
1700                 qnamew, qname,
1701                 qaccw,  ( (qacc != NULL && qacc[0] != '\0') ? qacc : "-"),
1702                 qlen,
1703                 exp(th->hit[h]->lnP) * pli->Z,
1704                 th->hit[h]->score,
1705                 th->hit[h]->pre_score - th->hit[h]->score, /* bias correction */
1706                 nd,
1707                 th->hit[h]->nreported,
1708                 exp(th->hit[h]->dcl[d].lnP) * pli->domZ,
1709                 exp(th->hit[h]->dcl[d].lnP) * pli->Z,
1710                 th->hit[h]->dcl[d].bitscore,
1711                 th->hit[h]->dcl[d].dombias * eslCONST_LOG2R, /* NATS to BITS at last moment */
1712                 th->hit[h]->dcl[d].ad->hmmfrom,
1713                 th->hit[h]->dcl[d].ad->hmmto,
1714                 th->hit[h]->dcl[d].ad->sqfrom,
1715                 th->hit[h]->dcl[d].ad->sqto,
1716                 th->hit[h]->dcl[d].ienv,
1717                 th->hit[h]->dcl[d].jenv,
1718                 (th->hit[h]->dcl[d].oasc / (1.0 + fabs((float) (th->hit[h]->dcl[d].jenv - th->hit[h]->dcl[d].ienv)))),
1719                 (th->hit[h]->desc ?  th->hit[h]->desc : "-")) < 0)
1720                   ESL_EXCEPTION_SYS(eslEWRITE, "tabular per-domain hit list: write failed");
1721 
1722           }
1723       }
1724   return eslOK;
1725 }
1726 
1727 
1728 /* Function:  p7_tophits_TabularXfam()
1729  * Synopsis:  Output parsable table(s) of hits, in format desired by Xfam.
1730  *
1731  * Purpose:   Output a parseable table of reportable hits in sorted
1732  *            tophits list <th> in an easily parsed ASCII tabular
1733  *            form to stream <ofp>, using final pipeline accounting
1734  *            stored in <pli>.
1735  *
1736  *            For long-target nucleotide queries, this will print the
1737  *            same hits as p7_tophits_TabularTargets(), but with the
1738  *            smaller number of (reordered) fields required by Dfam
1739  *            scripts.
1740  *
1741  *            For protein queries, this will print two tables:
1742  *            (a) per-sequence hits as presented by
1743  *                p7_tophits_TabularTargets(), but formatted for
1744  *                Pfam scripts;
1745  *            (b) per-domain hits, similar to those presented by
1746  *                p7_tophits_TabularDomains(), but sorted by
1747  *                score/e-value, and formated for Pfam scripts.
1748  *
1749  * Returns:   <eslOK> on success.
1750  *
1751  * Throws:    <eslEMEM> on allocation failure.
1752  *            <eslEWRITE> if a write to <ofp> fails; for example, if
1753  *            the disk fills up.
1754  */
1755 int
p7_tophits_TabularXfam(FILE * ofp,char * qname,char * qacc,P7_TOPHITS * th,P7_PIPELINE * pli)1756 p7_tophits_TabularXfam(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli)
1757 {
1758   P7_TOPHITS *domHitlist = NULL;
1759   P7_HIT     *domhit     = NULL;
1760   int         tnamew     = ESL_MAX(20, p7_tophits_GetMaxNameLength(th));
1761   int         taccw      = ESL_MAX(20, p7_tophits_GetMaxAccessionLength(th));
1762   int         qnamew     = ESL_MAX(20, strlen(qname));
1763   int         ndom       = 0;
1764   int         posw       = (pli->long_targets ? ESL_MAX(7, p7_tophits_GetMaxPositionLength(th)) : 0);
1765   int         h,d;
1766   int         status;
1767 
1768 
1769   if (pli->long_targets)
1770   {
1771     if (fprintf(ofp, "# hit scores\n# ----------\n#\n") < 0)
1772       ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1773     if (fprintf(ofp, "# %-*s %-*s %-*s %6s %9s %5s  %s  %s %6s %*s %*s %*s %*s %*s   %s\n",
1774     tnamew-1, "target name", taccw, "acc", qnamew, "query name", "bits", "  e-value", " bias", "hmm-st", "hmm-en", "strand", posw, "ali-st", posw, "ali-en", posw, "env-st", posw, "env-en", posw, ( pli->mode == p7_SCAN_MODELS ? "modlen" : "sq-len" ), "description of target") < 0)
1775       ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1776     if (fprintf(ofp, "# %-*s %-*s %-*s %6s %9s %5s %s %s %6s %*s %*s %*s %*s %*s   %s\n",
1777     tnamew-1, "-------------------", taccw, "-------------------", qnamew, "-------------------",  "------",  "---------", "-----", "-------", "-------", "------", posw, "-------", posw, "-------",  posw, "-------", posw, "-------", posw, "-------", "---------------------") < 0)
1778       ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1779 
1780     for (h = 0; h < th->N; h++)
1781       if (th->hit[h]->flags & p7_IS_REPORTED)
1782       {
1783           //d    = th->hit[h]->best_domain;
1784           if (fprintf(ofp, "%-*s  %-*s %-*s %6.1f %9.2g %5.1f %7d %7d %s %*" PRId64 " %*" PRId64 " %*" PRId64 " %*" PRId64 " %*" PRId64 "   %s\n",
1785 		      tnamew, th->hit[h]->name,
1786 		      taccw, ( pli->mode == p7_SCAN_MODELS ? (th->hit[h]->acc ? th->hit[h]->acc : "-") : ((qacc && qacc[0] != '\0') ? qacc : "-")),
1787 		      qnamew, qname,
1788 		      th->hit[h]->score,
1789 		      exp(th->hit[h]->lnP),
1790 		      th->hit[h]->dcl[0].dombias * eslCONST_LOG2R, /* convert nats to bits at last moment */
1791 		      th->hit[h]->dcl[0].ad->hmmfrom,
1792 		      th->hit[h]->dcl[0].ad->hmmto,
1793 		      (th->hit[h]->dcl[0].iali < th->hit[h]->dcl[0].jali ? "   +  "  :  "   -  "),
1794 		      posw, th->hit[h]->dcl[0].iali,
1795 		      posw, th->hit[h]->dcl[0].jali,
1796 		      posw, th->hit[h]->dcl[0].ienv,
1797 		      posw, th->hit[h]->dcl[0].jenv,
1798 		      posw, th->hit[h]->dcl[0].ad->L,
1799 		      th->hit[h]->desc == NULL ?  "-" : th->hit[h]->desc) < 0)
1800             ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1801       }
1802   }
1803   else
1804   {
1805       if (fprintf(ofp, "# Sequence scores\n# ---------------\n#\n") < 0)
1806         ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1807       if (fprintf(ofp, "# %-*s %6s %9s %3s %5s %5s    %s\n",
1808       tnamew-1, "name",  " bits", "  E-value", "n",  "exp", " bias", "description") < 0)
1809         ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1810       if (fprintf(ofp, "# %*s %6s %9s %3s %5s %5s    %s\n",
1811       tnamew-1, "-------------------",  "------", "---------","---", "-----",  "-----", "---------------------") < 0)
1812         ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1813 
1814       for (h = 0; h < th->N; h++)
1815       {
1816         if (th->hit[h]->flags & p7_IS_REPORTED)
1817         {
1818           if (fprintf(ofp, "%-*s  %6.1f %9.2g %3d %5.1f %5.1f    %s\n",
1819           tnamew, th->hit[h]->name,
1820           th->hit[h]->score,
1821           exp(th->hit[h]->lnP) * pli->Z,
1822           th->hit[h]->ndom,
1823           th->hit[h]->nexpected,
1824           th->hit[h]->pre_score - th->hit[h]->score, /* bias correction */
1825           (th->hit[h]->desc == NULL ? "-" : th->hit[h]->desc)) < 0)
1826             ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1827 
1828           for (d = 0; d < th->hit[h]->ndom; d++)
1829             if (th->hit[h]->dcl[d].is_reported)
1830               ndom ++;
1831         }
1832       }
1833       if (fprintf(ofp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1834 
1835       /* Need to sort the domains.  One way to do this is to re-use the hit sorting machinery,
1836        * so we create one "hit" for each domain, then hand it off to the sorter
1837        */
1838       if ((domHitlist  = p7_tophits_Create()) == NULL) return eslEMEM;
1839       for (h = 0; h < th->N; h++)
1840       {
1841         if (th->hit[h]->flags & p7_IS_REPORTED)
1842         {
1843           int ndomReported = 0;
1844           for (d = 0; d < th->hit[h]->ndom; d++)
1845           {
1846             if (th->hit[h]->dcl[d].is_reported)
1847             {
1848               p7_tophits_CreateNextHit(domHitlist, &domhit);
1849               ndomReported++;
1850               ESL_ALLOC(domhit->dcl, sizeof(P7_DOMAIN) );
1851 
1852               domhit->ndom       = ndomReported;  // re-using this variable to track the ordinal value of the domain in the original hit list that generated this pseudo-hit
1853               domhit->name       = th->hit[h]->name;
1854               domhit->desc       = th->hit[h]->desc;
1855               domhit->dcl[0]     = th->hit[h]->dcl[d];
1856               domhit->sortkey    = pli->inc_by_E ? -1.0 * th->hit[h]->dcl[d].lnP : th->hit[h]->dcl[d].bitscore;
1857             }
1858           }
1859         }
1860       }
1861       p7_tophits_SortBySortkey(domHitlist);
1862 
1863       // Now with this list of sorted "hits" (really domains)
1864       if (fprintf(ofp, "# Domain scores\n# -------------\n#\n") < 0)
1865         ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1866       if (fprintf(ofp, "# %-*s %6s %9s %5s %5s %6s %6s %6s %6s %6s %6s     %s\n",
1867       tnamew-1, " name",  "bits", "E-value", "hit", "bias",      "env-st",  "env-en",  "ali-st",  "ali-en",  "hmm-st",  "hmm-en",   "description") < 0)
1868         ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1869       if (fprintf(ofp, "# %*s %6s %9s %5s %5s %6s %6s %6s %6s %6s %6s      %s\n",
1870       tnamew-1, "-------------------",  "------", "---------", "-----", "-----", "------", "------", "------", "------", "------", "------", "---------------------") < 0)
1871         ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1872 
1873       for (h = 0; h < domHitlist->N; h++)
1874       {
1875         domhit = domHitlist->hit[h];
1876 
1877         if (fprintf(ofp, "%-*s  %6.1f %9.2g %5d %5.1f %6" PRId64 " %6" PRId64 " %6" PRId64 " %6" PRId64 " %6d %6d     %s\n",
1878               tnamew, domHitlist->hit[h]->name,
1879               domhit->dcl[0].bitscore,
1880               exp(domhit->dcl[0].lnP) * pli->Z, //i-Evalue
1881               domhit->ndom,
1882               domhit->dcl[0].dombias * eslCONST_LOG2R, // NATS to BITS at last moment
1883               domhit->dcl[0].ienv,
1884               domhit->dcl[0].jenv,
1885               domhit->dcl[0].ad->sqfrom,
1886               domhit->dcl[0].ad->sqto,
1887               domhit->dcl[0].ad->hmmfrom,
1888               domhit->dcl[0].ad->hmmto,
1889               (domhit->desc ?  domhit->desc : "-")) < 0)
1890                 ESL_XEXCEPTION_SYS(eslEWRITE, "xfam tabular output: write failed");
1891       }
1892       free (domHitlist->unsrt);
1893       free (domHitlist->hit);
1894       free (domHitlist);
1895   }
1896   return eslOK;
1897 
1898  ERROR:
1899   if (domHitlist)
1900   {
1901       free (domHitlist->unsrt);
1902       free (domHitlist->hit);
1903       free (domHitlist);
1904   }
1905   return status;
1906 }
1907 
1908 
1909 /* Function:  p7_tophits_AliScores()
1910  * Synopsis:  Output per-position scores for each position of each query/hit pair
1911  *
1912  * Purpose:   This depends on per-alignment-position scores having been
1913  *            previously computed, as in p7_pipeline_computeAliScores()
1914  *
1915  * Returns:   <eslOK> on success.
1916  *
1917  * Throws:    none
1918  */
1919 int
p7_tophits_AliScores(FILE * ofp,char * qname,P7_TOPHITS * th)1920 p7_tophits_AliScores(FILE *ofp, char *qname, P7_TOPHITS *th )
1921 {
1922   P7_HIT *hit;
1923   int h, i;
1924   float *scores;
1925 
1926   for (h = 0; h < th->N; h++) {
1927     hit = th->hit[h];
1928     if (hit->flags & p7_IS_REPORTED)
1929     {
1930       fprintf (ofp, "%s %s %" PRId64 " %" PRId64 " :", qname, hit->name, hit->dcl[0].iali, hit->dcl[0].jali);
1931 
1932       scores = hit->dcl[0].scores_per_pos;
1933       for (i=0; i<hit->dcl[0].ad->N; i++) {
1934         if (scores[i] == -eslINFINITY)
1935           fprintf (ofp, " >");
1936         else
1937           fprintf (ofp, " %.3f", scores[i] * eslCONST_LOG2R);
1938 
1939       }
1940       fprintf (ofp, "\n");
1941     }
1942 
1943   }
1944   return eslOK;
1945 
1946 }
1947 
1948 
1949 
1950 
1951 /* Function:  p7_tophits_TabularTail()
1952  * Synopsis:  Print a trailer on a tabular output file.
1953  *
1954  * Purpose:   Print some metadata as a trailer on a tabular output file:
1955  *            date/time, the program, HMMER3 version info, the pipeline mode (SCAN or SEARCH),
1956  *            the query and target filenames, a spoof commandline
1957  *            recording the entire program configuration, and
1958  *            a "fini!" that's useful for detecting successful
1959  *            output completion.
1960  *
1961  * Args:      ofp       - open tabular output file (either --tblout or --domtblout)
1962  *            progname  - "hmmscan", for example
1963  *            pipemode  - p7_SEARCH_SEQS | p7_SCAN_MODELS
1964  *            qfile     - name of query file, or '-' for stdin, or '[none]' if NULL
1965  *            tfile     - name of target file, or '-' for stdin, or '[none]' if NULL
1966  *            go        - program configuration; used to generate spoofed command line
1967  *
1968  * Returns:   <eslOK>.
1969  *
1970  * Throws:    <eslEMEM> on allocation failure.
1971  *            <eslESYS> if time() or ctime_r() system calls fail.
1972  *            <eslEWRITE> on write failure.
1973  *
1974  * Xref:      SRE:J7/54
1975  */
1976 int
p7_tophits_TabularTail(FILE * ofp,const char * progname,enum p7_pipemodes_e pipemode,const char * qfile,const char * tfile,const ESL_GETOPTS * go)1977 p7_tophits_TabularTail(FILE *ofp, const char *progname, enum p7_pipemodes_e pipemode, const char *qfile, const char *tfile, const ESL_GETOPTS *go)
1978 {
1979    time_t date           = time(NULL);
1980    char  *spoof_cmd      = NULL;
1981    char  *cwd            = NULL;
1982    char   timestamp[32];
1983    char   modestamp[16];
1984    int    status;
1985 
1986 
1987   if ((status = esl_opt_SpoofCmdline(go, &spoof_cmd)) != eslOK) goto ERROR;
1988   if (date == -1)                                               ESL_XEXCEPTION(eslESYS, "time() failed");
1989   if ((ctime_r(&date, timestamp)) == NULL)                      ESL_XEXCEPTION(eslESYS, "ctime_r() failed");
1990   switch (pipemode) {
1991     case p7_SEARCH_SEQS: strcpy(modestamp, "SEARCH"); break;
1992     case p7_SCAN_MODELS: strcpy(modestamp, "SCAN");   break;
1993     default:             ESL_EXCEPTION(eslEINCONCEIVABLE, "wait, what? no such pipemode");
1994   }
1995   esl_getcwd(&cwd);
1996 
1997   if (fprintf(ofp, "#\n") < 0)                                                                    ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
1998   if (fprintf(ofp, "# Program:         %s\n",      (progname == NULL) ? "[none]" : progname) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
1999   if (fprintf(ofp, "# Version:         %s (%s)\n", HMMER_VERSION, HMMER_DATE) < 0)                ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2000   if (fprintf(ofp, "# Pipeline mode:   %s\n",      modestamp) < 0)                                ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2001   if (fprintf(ofp, "# Query file:      %s\n",      (qfile    == NULL) ? "[none]" : qfile) < 0)    ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2002   if (fprintf(ofp, "# Target file:     %s\n",      (tfile    == NULL) ? "[none]" : tfile) < 0)    ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2003   if (fprintf(ofp, "# Option settings: %s\n",      spoof_cmd) < 0)                                ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2004   if (fprintf(ofp, "# Current dir:     %s\n",      (cwd      == NULL) ? "[unknown]" : cwd) < 0)   ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2005   if (fprintf(ofp, "# Date:            %s",        timestamp) < 0) /* timestamp ends in \n */     ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2006   if (fprintf(ofp, "# [ok]\n") < 0)                                                               ESL_XEXCEPTION_SYS(eslEWRITE, "tabular output tail, write failed");
2007 
2008   free(spoof_cmd);
2009   if (cwd) free(cwd);
2010   return eslOK;
2011 
2012  ERROR:
2013   if (spoof_cmd) free(spoof_cmd);
2014   if (cwd)       free(cwd);
2015   return status;
2016 }
2017 /*------------------- end, tabular output -----------------------*/
2018 
2019 
2020 
2021 
2022 /*****************************************************************
2023  * 4. Benchmark driver
2024  *****************************************************************/
2025 #ifdef p7TOPHITS_BENCHMARK
2026 /*
2027   gcc -o benchmark-tophits -std=gnu99 -g -O2 -I. -L. -I../easel -L../easel -Dp7TOPHITS_BENCHMARK p7_tophits.c -lhmmer -leasel -lm
2028   ./benchmark-tophits
2029 
2030   As of 28 Dec 07, shows 0.20u for 10 lists of 10,000 hits each (at least ~100x normal expectation),
2031   so we expect top hits list time to be negligible for typical hmmsearch/hmmscan runs.
2032 
2033   If needed, we do have opportunity for optimization, however - especially in memory handling.
2034  */
2035 #include "p7_config.h"
2036 
2037 #include "easel.h"
2038 #include "esl_getopts.h"
2039 #include "esl_stopwatch.h"
2040 #include "esl_random.h"
2041 
2042 #include "hmmer.h"
2043 
2044 static ESL_OPTIONS options[] = {
2045   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
2046   { "-h",        eslARG_NONE,   FALSE, NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
2047   { "-s",        eslARG_INT,     "42", NULL, NULL,  NULL,  NULL, NULL, "set random number seed to <n>",                    0 },
2048   { "-M",        eslARG_INT,     "10", NULL, NULL,  NULL,  NULL, NULL, "number of top hits lists to simulate and merge",   0 },
2049   { "-N",        eslARG_INT,  "10000", NULL, NULL,  NULL,  NULL, NULL, "number of top hits to simulate",                   0 },
2050   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2051 };
2052 static char usage[]  = "[-options]";
2053 static char banner[] = "benchmark driver for P7_TOPHITS";
2054 
2055 int
main(int argc,char ** argv)2056 main(int argc, char **argv)
2057 {
2058   ESL_GETOPTS    *go       = p7_CreateDefaultApp(options, 0, argc, argv, banner, usage);
2059   ESL_STOPWATCH  *w        = esl_stopwatch_Create();
2060   ESL_RANDOMNESS *r        = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
2061   int             N        = esl_opt_GetInteger(go, "-N");
2062   int             M        = esl_opt_GetInteger(go, "-M");
2063   P7_TOPHITS    **h        = NULL;
2064   double         *sortkeys = NULL;
2065   char            name[]   = "not_unique_name";
2066   char            acc[]    = "not_unique_acc";
2067   char            desc[]   = "Test description for the purposes of making the benchmark allocate space";
2068   int             i,j;
2069   int             status;
2070 
2071   /* prep work: generate our sort keys before starting to time anything    */
2072   ESL_ALLOC(h,        sizeof(P7_TOPHITS *) * M); /* allocate pointers for M lists */
2073   ESL_ALLOC(sortkeys, sizeof(double) * N * M);
2074   for (i = 0; i < N*M; i++) sortkeys[i] = esl_random(r);
2075 
2076   esl_stopwatch_Start(w);
2077 
2078   /* generate M "random" lists and sort them */
2079   for (j = 0; j < M; j++)
2080   {
2081       h[j] = p7_tophits_Create();
2082       for (i = 0; i < N; i++)
2083         p7_tophits_Add(h[j], name, acc, desc, sortkeys[j*N + i],
2084             (float) sortkeys[j*N+i], sortkeys[j*N+i],
2085             (float) sortkeys[j*N+i], sortkeys[j*N+i],
2086             i, i, N,
2087             i, i, N,
2088             i, N, NULL);
2089       p7_tophits_SortBySortkey(h[j]);
2090   }
2091   /* then merge them into one big list in h[0] */
2092   for (j = 1; j < M; j++)
2093   {
2094       p7_tophits_Merge(h[0], h[j]);
2095       p7_tophits_Destroy(h[j]);
2096   }
2097 
2098   esl_stopwatch_Stop(w);
2099 
2100   p7_tophits_Destroy(h[0]);
2101   status = eslOK;
2102  ERROR:
2103   esl_getopts_Destroy(go);
2104   esl_stopwatch_Destroy(w);
2105   esl_randomness_Destroy(r);
2106   if (sortkeys != NULL) free(sortkeys);
2107   if (h != NULL) free(h);
2108   return status;
2109 }
2110 #endif /*p7TOPHITS_BENCHMARK*/
2111 
2112 
2113 /*****************************************************************
2114  * 5. Test driver
2115  *****************************************************************/
2116 
2117 #ifdef p7TOPHITS_TESTDRIVE
2118 /*
2119   gcc -o tophits_utest -std=gnu99 -g -O2 -I. -L. -I../easel -L../easel -Dp7TOPHITS_TESTDRIVE p7_tophits.c -lhmmer -leasel -lm
2120   ./tophits_test
2121 */
2122 #include "p7_config.h"
2123 
2124 #include "easel.h"
2125 #include "esl_getopts.h"
2126 #include "esl_stopwatch.h"
2127 #include "esl_random.h"
2128 
2129 #include "hmmer.h"
2130 
2131 static ESL_OPTIONS options[] = {
2132   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
2133   { "-h",        eslARG_NONE,   FALSE, NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
2134   { "-s",        eslARG_INT,     "42", NULL, NULL,  NULL,  NULL, NULL, "set random number seed to <n>",                    0 },
2135   { "-N",        eslARG_INT,    "100", NULL, NULL,  NULL,  NULL, NULL, "number of top hits to simulate",                   0 },
2136   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2137 };
2138 
2139 static char usage[]  = "[-options]";
2140 static char banner[] = "test driver for P7_TOPHITS";
2141 
2142 int
main(int argc,char ** argv)2143 main(int argc, char **argv)
2144 {
2145   ESL_GETOPTS    *go       = p7_CreateDefaultApp(options, 0, argc, argv, banner, usage);
2146   ESL_RANDOMNESS *r        = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
2147   int             N        = esl_opt_GetInteger(go, "-N");
2148   P7_TOPHITS     *h1       = NULL;
2149   P7_TOPHITS     *h2       = NULL;
2150   P7_TOPHITS     *h3       = NULL;
2151   char            name[]   = "not_unique_name";
2152   char            acc[]    = "not_unique_acc";
2153   char            desc[]   = "Test description for the purposes of making the test driver allocate space";
2154   double          key;
2155   int             i;
2156 
2157   h1 = p7_tophits_Create();
2158   h2 = p7_tophits_Create();
2159   h3 = p7_tophits_Create();
2160 
2161   for (i = 0; i < N; i++)
2162   {
2163       key = esl_random(r);
2164       p7_tophits_Add(h1, name, acc, desc, key, (float) key, key, (float) key, key, i, i, N, i, i, N, 1, 1, NULL);
2165       key = 10.0 * esl_random(r);
2166       p7_tophits_Add(h2, name, acc, desc, key, (float) key, key, (float) key, key, i, i, N, i, i, N, 2, 2, NULL);
2167       key = 0.1 * esl_random(r);
2168       p7_tophits_Add(h3, name, acc, desc, key, (float) key, key, (float) key, key, i, i, N, i, i, N, 3, 3, NULL);
2169   }
2170   p7_tophits_Add(h1, "last",  NULL, NULL, -1.0, (float) key, key, (float) key, key, i, i, N, i, i, N, 1, 1, NULL);
2171   p7_tophits_Add(h1, "first", NULL, NULL, 20.0, (float) key, key, (float) key, key, i, i, N, i, i, N, 1, 1, NULL);
2172 
2173   p7_tophits_SortBySortkey(h1);
2174   if (strcmp(h1->hit[0]->name,   "first") != 0) esl_fatal("sort failed (top is %s = %f)", h1->hit[0]->name,   h1->hit[0]->sortkey);
2175   if (strcmp(h1->hit[N+1]->name, "last")  != 0) esl_fatal("sort failed (last is %s = %f)", h1->hit[N+1]->name, h1->hit[N+1]->sortkey);
2176 
2177   p7_tophits_Merge(h1, h2);
2178   if (strcmp(h1->hit[0]->name,     "first") != 0) esl_fatal("after merge 1, sort failed (top is %s = %f)", h1->hit[0]->name,     h1->hit[0]->sortkey);
2179   if (strcmp(h1->hit[2*N+1]->name, "last")  != 0) esl_fatal("after merge 1, sort failed (last is %s = %f)", h1->hit[2*N+1]->name, h1->hit[2*N+1]->sortkey);
2180 
2181   p7_tophits_Merge(h3, h1);
2182   if (strcmp(h3->hit[0]->name,     "first") != 0) esl_fatal("after merge 2, sort failed (top is %s = %f)", h3->hit[0]->name,     h3->hit[0]->sortkey);
2183   if (strcmp(h3->hit[3*N+1]->name, "last")  != 0) esl_fatal("after merge 2, sort failed (last is %s = %f)", h3->hit[3*N+1]->name,     h3->hit[3*N+1]->sortkey);
2184 
2185   if (p7_tophits_GetMaxNameLength(h3) != strlen(name)) esl_fatal("GetMaxNameLength() failed");
2186 
2187   p7_tophits_Destroy(h1);
2188   p7_tophits_Destroy(h2);
2189   p7_tophits_Destroy(h3);
2190   esl_randomness_Destroy(r);
2191   esl_getopts_Destroy(go);
2192   return eslOK;
2193 }
2194 #endif /*p7TOPHITS_TESTDRIVE*/
2195 
2196 
2197 
2198