1 /************************************************************
2  * HMMER - Biological sequence analysis with profile HMMs
3  * Copyright (C) 1992-1997 Sean R. Eddy
4  *
5  *   This source code is distributed under the terms of the
6  *   GNU General Public License. See the files COPYING and
7  *   GNULICENSE for details.
8  *
9  ************************************************************/
10 
11 /* tophits.c
12  *
13  * Routines for storing, sorting, displaying high scoring hits
14  * and alignments.
15  *
16  *****************************************************************************
17  *
18  * main API:
19  *
20  * AllocTophits()       - allocation
21  * FreeTophits()        - free'ing
22  * RegisterHit()        - put information about a hit in the list
23  * GetRankedHit()       - recovers information about a hit
24  * FullSortTophits()    - sorts the top H hits.
25  *
26  *****************************************************************************
27  * Brief example of use:
28  *
29  *   struct tophit_s   *yourhits;   // list of hits
30  *   struct fancyali_s *ali;        // (optional structure) alignment of a hit
31  *
32  *   yourhits = AllocTophits(200);
33  *   (for every hit in a search) {
34  *        if (do_alignments)
35  *           ali = Trace2FancyAli();  // You provide a function/structure here
36  *        if (score > threshold)
37  *           RegisterHit(yourhits, ...)
38  *     }
39  *
40  *   FullSortTophits(yourhits);      // Sort hits by evalue
41  *   for (i = 0; i < 100; i++)       // Recover hits out in ranked order
42  *     {
43  *       GetRankedHit(yourhits, i, ...);
44  *                                   // Presumably you'd print here...
45  *     }
46  *   FreeTophits(yourhits);
47  ***************************************************************************
48  *
49  * Estimated storage per hit:
50  *        coords:   16 bytes
51  *        scores:    8 bytes
52  *     name/desc:  192 bytes
53  *     alignment: 1000 bytes   total = ~1200 bytes with alignment;
54  *                                   = ~200 bytes without
55  *     Designed for: 10^5 hits (20 MB) or 10^4 alignments (10 MB)
56  */
57 
58 #include <string.h>
59 #include <float.h>
60 #include <limits.h>
61 
62 #include "structs.h"
63 #include "funcs.h"
64 
65 /* Function: AllocTophits()
66  *
67  * Purpose:  Allocate a struct tophit_s, for maintaining
68  *           a list of top-scoring hits in a database search.
69  *
70  * Args:     lumpsize - allocation lumpsize
71  *
72  * Return:   An allocated struct hit_s. Caller must free.
73  */
74 struct tophit_s *
AllocTophits(int lumpsize)75 AllocTophits(int lumpsize)
76 {
77   struct tophit_s *hitlist;
78 
79   hitlist        = MallocOrDie (sizeof(struct tophit_s));
80   hitlist->hit   = NULL;
81   hitlist->unsrt = MallocOrDie (lumpsize * sizeof(struct hit_s));
82   hitlist->alloc = lumpsize;
83   hitlist->num   = 0;
84   hitlist->lump  = lumpsize;
85   return hitlist;
86 }
87 void
GrowTophits(struct tophit_s * h)88 GrowTophits(struct tophit_s *h)
89 {
90   h->unsrt = ReallocOrDie(h->unsrt,(h->alloc + h->lump) * sizeof(struct hit_s));
91   h->alloc += h->lump;
92 }
93 void
FreeTophits(struct tophit_s * h)94 FreeTophits(struct tophit_s *h)
95 {
96   int pos;
97   for (pos = 0; pos < h->num; pos++)
98     {
99       if (h->unsrt[pos].ali  != NULL) FreeFancyAli(h->unsrt[pos].ali);
100       if (h->unsrt[pos].name != NULL) free(h->unsrt[pos].name);
101       if (h->unsrt[pos].desc != NULL) free(h->unsrt[pos].desc);
102     }
103   free(h->unsrt);
104   if (h->hit != NULL) free(h->hit);
105   free(h);
106 }
107 
108 struct fancyali_s *
AllocFancyAli(void)109 AllocFancyAli(void)
110 {
111   struct fancyali_s *ali;
112 
113   ali = MallocOrDie (sizeof(struct fancyali_s));
114   ali->rfline = ali->csline = ali->model = ali->mline = ali->aseq = NULL;
115   ali->query  = ali->target = NULL;
116   ali->sqfrom = ali->sqto   = 0;
117   return ali;
118 }
119 void
FreeFancyAli(struct fancyali_s * ali)120 FreeFancyAli(struct fancyali_s *ali)
121 {
122   if (ali != NULL) {
123     if (ali->rfline != NULL) free(ali->rfline);
124     if (ali->csline != NULL) free(ali->csline);
125     if (ali->model  != NULL) free(ali->model);
126     if (ali->mline  != NULL) free(ali->mline);
127     if (ali->aseq   != NULL) free(ali->aseq);
128     if (ali->query  != NULL) free(ali->query);
129     if (ali->target != NULL) free(ali->target);
130     free(ali);
131   }
132 }
133 
134 /* Function: RegisterHit()
135  *
136  * Purpose:  Add a new hit to a list of top hits.
137  *
138  *           "ali", if provided, is a pointer to allocated memory
139  *           for an alignment output structure.
140  *           Management is turned over to the top hits structure.
141  *           Caller should not free them; they will be free'd by
142  *           the FreeTophits() call.
143  *
144  *           In contrast, "name" and "desc" are copied, so caller
145  *           is still responsible for these.
146  *
147  *           Number of args is unwieldy.
148  *
149  * Args:     h        - active top hit list
150  *           key      - value to sort by: bigger is better
151  *           pvalue   - P-value of this hit
152  *           score    - score of this hit
153  *           motherp  - P-value of parent whole sequence
154  *           mothersc - score of parent whole sequence
155  *           name     - name of target sequence
156  *           desc     - description of target sequence
157  *           sqfrom   - 1..L pos in target seq  of start
158  *           sqto     - 1..L pos; sqfrom > sqto if rev comp
159  *           sqlen    - length of sequence, L
160  *           hmmfrom  - 0..M+1 pos in HMM of start
161  *           hmmto    - 0..M+1 pos in HMM of end
162  *           hmmlen   - length of HMM, M
163  *           domidx   - number of this domain
164  *           ndom     - total # of domains in sequence
165  *           ali      - optional printable alignment info
166  *
167  * Return:   (void)
168  *           hitlist is modified and possibly reallocated internally.
169  */
170 void
RegisterHit(struct tophit_s * h,double key,double pvalue,float score,double motherp,float mothersc,char * name,char * desc,int sqfrom,int sqto,int sqlen,int hmmfrom,int hmmto,int hmmlen,int domidx,int ndom,struct fancyali_s * ali)171 RegisterHit(struct tophit_s *h, double key,
172 	    double pvalue, float score, double motherp, float mothersc,
173 	    char *name, char *desc,
174 	    int sqfrom, int sqto, int sqlen,
175 	    int hmmfrom, int hmmto, int hmmlen,
176 	    int domidx, int ndom,
177 	    struct fancyali_s *ali)
178 {
179   /* Check to see if list is full and we must realloc.
180    */
181   if (h->num == h->alloc) GrowTophits(h);
182 
183   h->unsrt[h->num].name    = Strdup(name);
184   h->unsrt[h->num].desc    = Strdup(desc);
185   h->unsrt[h->num].sortkey = key;
186   h->unsrt[h->num].pvalue  = pvalue;
187   h->unsrt[h->num].score   = score;
188   h->unsrt[h->num].motherp = motherp;
189   h->unsrt[h->num].mothersc= mothersc;
190   h->unsrt[h->num].sqfrom  = sqfrom;
191   h->unsrt[h->num].sqto    = sqto;
192   h->unsrt[h->num].sqlen   = sqlen;
193   h->unsrt[h->num].hmmfrom = hmmfrom;
194   h->unsrt[h->num].hmmto   = hmmto;
195   h->unsrt[h->num].hmmlen  = hmmlen;
196   h->unsrt[h->num].domidx  = domidx;
197   h->unsrt[h->num].ndom    = ndom;
198   h->unsrt[h->num].ali     = ali;
199   h->num++;
200   return;
201 }
202 
203 /* Function: GetRankedHit()
204  * Date:     SRE, Tue Oct 28 10:06:48 1997 [Newton Institute, Cambridge UK]
205  *
206  * Purpose:  Recover the data from the i'th ranked hit.
207  *           Any of the data ptrs may be passed as NULL for fields
208  *           you don't want. hitlist must have been sorted first.
209  *
210  *           name, desc, and ali are returned as pointers, not copies;
211  *           don't free them!
212  */
213 void
GetRankedHit(struct tophit_s * h,int rank,double * r_pvalue,float * r_score,double * r_motherp,float * r_mothersc,char ** r_name,char ** r_desc,int * r_sqfrom,int * r_sqto,int * r_sqlen,int * r_hmmfrom,int * r_hmmto,int * r_hmmlen,int * r_domidx,int * r_ndom,struct fancyali_s ** r_ali)214 GetRankedHit(struct tophit_s *h, int rank,
215 	     double *r_pvalue, float *r_score,
216 	     double *r_motherp, float *r_mothersc,
217 	     char **r_name, char **r_desc,
218 	     int *r_sqfrom, int *r_sqto, int *r_sqlen,
219 	     int *r_hmmfrom, int *r_hmmto, int *r_hmmlen,
220 	     int *r_domidx, int *r_ndom,
221 	     struct fancyali_s **r_ali)
222 {
223   if (r_pvalue  != NULL) *r_pvalue  = h->hit[rank]->pvalue;
224   if (r_score   != NULL) *r_score   = h->hit[rank]->score;
225   if (r_motherp != NULL) *r_motherp = h->hit[rank]->motherp;
226   if (r_mothersc!= NULL) *r_mothersc= h->hit[rank]->mothersc;
227   if (r_name    != NULL) *r_name    = h->hit[rank]->name;
228   if (r_desc    != NULL) *r_desc    = h->hit[rank]->desc;
229   if (r_sqfrom  != NULL) *r_sqfrom  = h->hit[rank]->sqfrom;
230   if (r_sqto    != NULL) *r_sqto    = h->hit[rank]->sqto;
231   if (r_sqlen   != NULL) *r_sqlen   = h->hit[rank]->sqlen;
232   if (r_hmmfrom != NULL) *r_hmmfrom = h->hit[rank]->hmmfrom;
233   if (r_hmmto   != NULL) *r_hmmto   = h->hit[rank]->hmmto;
234   if (r_hmmlen  != NULL) *r_hmmlen  = h->hit[rank]->hmmlen;
235   if (r_domidx  != NULL) *r_domidx  = h->hit[rank]->domidx;
236   if (r_ndom    != NULL) *r_ndom    = h->hit[rank]->ndom;
237   if (r_ali     != NULL) *r_ali     = h->hit[rank]->ali;
238 }
239 
240 /* Function: TophitsMaxName()
241  *
242  * Purpose:  Returns the maximum name length in a top hits list;
243  *           doesn't need to be sorted yet.
244  */
245 int
TophitsMaxName(struct tophit_s * h)246 TophitsMaxName(struct tophit_s *h)
247 {
248   int i;
249   int len, maxlen;
250 
251   maxlen = 0;
252   for (i = 0; i < h->num; i++)
253     {
254       len = strlen(h->unsrt[i].name);
255       if (len > maxlen) maxlen = len;
256     }
257   return maxlen;
258 }
259 
260 /* Function: FullSortTophits()
261  *
262  * Purpose:  Completely sort the top hits list. Calls
263  *           qsort() to do the sorting, and uses
264  *           hit_comparison() to do the comparison.
265  *
266  * Args:     h - top hits structure
267  */
268 int
hit_comparison(const void * vh1,const void * vh2)269 hit_comparison(const void *vh1, const void *vh2)
270 {
271            /* don't ask. don't change. and, Don't Panic. */
272   struct hit_s *h1 = *((struct hit_s **) vh1);
273   struct hit_s *h2 = *((struct hit_s **) vh2);
274 
275   if      (h1->sortkey < h2->sortkey)  return  1;
276   else if (h1->sortkey > h2->sortkey)  return -1;
277   else if (h1->sortkey == h2->sortkey) return  0;
278   /*NOTREACHED*/
279   return 0;
280 }
281 void
FullSortTophits(struct tophit_s * h)282 FullSortTophits(struct tophit_s *h)
283 {
284   int i;
285 
286   /* If we don't have /any/ hits, then don't
287    * bother.
288    */
289   if (h->num == 0) return;
290 
291   /* Assign the ptrs in h->hit.
292    */
293   h->hit = MallocOrDie(h->num * sizeof(struct hit_s *));
294   for (i = 0; i < h->num; i++)
295     h->hit[i] = &(h->unsrt[i]);
296 
297   /* Sort the pointers. Don't bother if we've only got one.
298    */
299   if (h->num > 1)
300     qsort(h->hit, h->num, sizeof(struct hit_s *), hit_comparison);
301 }
302 
303 
304 
305 /* Function: TophitsReport()
306  * Date:     Thu Dec 18 13:19:18 1997
307  *
308  * Purpose:  Generate a printout summarizing how much
309  *           memory is used by a tophits structure,
310  *           how many hits are stored, and how much
311  *           waste there is from not knowing nseqs.
312  *
313  * Args:     h    - the sorted tophits list
314  *           E    - the cutoff in Evalue
315  *           nseq - the final number of seqs used for Eval
316  *
317  * Return:   (void)
318  *           Prints information on stdout
319  */
320 void
TophitsReport(struct tophit_s * h,double E,int nseq)321 TophitsReport(struct tophit_s *h, double E, int nseq)
322 {
323   int i;
324   int memused;
325   int x;
326   int n;
327 
328   /* Count up how much memory is used
329    * in the whole list.
330    */
331   memused = sizeof(struct hit_s) * h->alloc + sizeof(struct tophit_s);
332   for (i = 0; i < h->num; i++)
333     {
334       if (h->unsrt[i].name != NULL)
335 	memused += strlen(h->unsrt[i].name) + 1;
336       if (h->unsrt[i].desc != NULL)
337 	memused += strlen(h->unsrt[i].desc) + 1;
338       if (h->unsrt[i].ali != NULL)
339 	{
340 	  memused += sizeof(struct fancyali_s);
341 	  x = 0;
342 	  if (h->unsrt[i].ali->rfline != NULL) x++;
343 	  if (h->unsrt[i].ali->csline != NULL) x++;
344   	  if (h->unsrt[i].ali->model  != NULL) x++;
345 	  if (h->unsrt[i].ali->mline  != NULL) x++;
346 	  if (h->unsrt[i].ali->aseq   != NULL) x++;
347 	  memused += x * (h->unsrt[i].ali->len + 1);
348 
349 	  if (h->unsrt[i].ali->query  != NULL)
350 	    memused += strlen(h->unsrt[i].ali->query) + 1;
351 	  if (h->unsrt[i].ali->target != NULL)
352 	    memused += strlen(h->unsrt[i].ali->target) + 1;
353 	}
354     }
355 
356   /* Count how many hits actually satisfy the E cutoff.
357    */
358   n = 0;
359   for (i = 0; i < h->num; i++)
360     {
361       if (h->hit[i]->pvalue * (double) nseq >= E) break;
362       n++;
363     }
364 
365   /* Format and print a summary
366    */
367   printf("tophits_s report:\n");
368   printf("     Total hits:           %d\n", h->num);
369   printf("     Satisfying E cutoff:  %d\n", n);
370   printf("     Total memory:         %dK\n", memused / 1000);
371 }
372