1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1997 Sean R. Eddy
4 *
5 * This source code is distributed under the terms of the
6 * GNU General Public License. See the files COPYING and
7 * GNULICENSE for details.
8 *
9 ************************************************************/
10
11 /* tophits.c
12 *
13 * Routines for storing, sorting, displaying high scoring hits
14 * and alignments.
15 *
16 *****************************************************************************
17 *
18 * main API:
19 *
20 * AllocTophits() - allocation
21 * FreeTophits() - free'ing
22 * RegisterHit() - put information about a hit in the list
23 * GetRankedHit() - recovers information about a hit
24 * FullSortTophits() - sorts the top H hits.
25 *
26 *****************************************************************************
27 * Brief example of use:
28 *
29 * struct tophit_s *yourhits; // list of hits
30 * struct fancyali_s *ali; // (optional structure) alignment of a hit
31 *
32 * yourhits = AllocTophits(200);
33 * (for every hit in a search) {
34 * if (do_alignments)
35 * ali = Trace2FancyAli(); // You provide a function/structure here
36 * if (score > threshold)
37 * RegisterHit(yourhits, ...)
38 * }
39 *
40 * FullSortTophits(yourhits); // Sort hits by evalue
41 * for (i = 0; i < 100; i++) // Recover hits out in ranked order
42 * {
43 * GetRankedHit(yourhits, i, ...);
44 * // Presumably you'd print here...
45 * }
46 * FreeTophits(yourhits);
47 ***************************************************************************
48 *
49 * Estimated storage per hit:
50 * coords: 16 bytes
51 * scores: 8 bytes
52 * name/desc: 192 bytes
53 * alignment: 1000 bytes total = ~1200 bytes with alignment;
54 * = ~200 bytes without
55 * Designed for: 10^5 hits (20 MB) or 10^4 alignments (10 MB)
56 */
57
58 #include <string.h>
59 #include <float.h>
60 #include <limits.h>
61
62 #include "structs.h"
63 #include "funcs.h"
64
65 /* Function: AllocTophits()
66 *
67 * Purpose: Allocate a struct tophit_s, for maintaining
68 * a list of top-scoring hits in a database search.
69 *
70 * Args: lumpsize - allocation lumpsize
71 *
72 * Return: An allocated struct hit_s. Caller must free.
73 */
74 struct tophit_s *
AllocTophits(int lumpsize)75 AllocTophits(int lumpsize)
76 {
77 struct tophit_s *hitlist;
78
79 hitlist = MallocOrDie (sizeof(struct tophit_s));
80 hitlist->hit = NULL;
81 hitlist->unsrt = MallocOrDie (lumpsize * sizeof(struct hit_s));
82 hitlist->alloc = lumpsize;
83 hitlist->num = 0;
84 hitlist->lump = lumpsize;
85 return hitlist;
86 }
87 void
GrowTophits(struct tophit_s * h)88 GrowTophits(struct tophit_s *h)
89 {
90 h->unsrt = ReallocOrDie(h->unsrt,(h->alloc + h->lump) * sizeof(struct hit_s));
91 h->alloc += h->lump;
92 }
93 void
FreeTophits(struct tophit_s * h)94 FreeTophits(struct tophit_s *h)
95 {
96 int pos;
97 for (pos = 0; pos < h->num; pos++)
98 {
99 if (h->unsrt[pos].ali != NULL) FreeFancyAli(h->unsrt[pos].ali);
100 if (h->unsrt[pos].name != NULL) free(h->unsrt[pos].name);
101 if (h->unsrt[pos].desc != NULL) free(h->unsrt[pos].desc);
102 }
103 free(h->unsrt);
104 if (h->hit != NULL) free(h->hit);
105 free(h);
106 }
107
108 struct fancyali_s *
AllocFancyAli(void)109 AllocFancyAli(void)
110 {
111 struct fancyali_s *ali;
112
113 ali = MallocOrDie (sizeof(struct fancyali_s));
114 ali->rfline = ali->csline = ali->model = ali->mline = ali->aseq = NULL;
115 ali->query = ali->target = NULL;
116 ali->sqfrom = ali->sqto = 0;
117 return ali;
118 }
119 void
FreeFancyAli(struct fancyali_s * ali)120 FreeFancyAli(struct fancyali_s *ali)
121 {
122 if (ali != NULL) {
123 if (ali->rfline != NULL) free(ali->rfline);
124 if (ali->csline != NULL) free(ali->csline);
125 if (ali->model != NULL) free(ali->model);
126 if (ali->mline != NULL) free(ali->mline);
127 if (ali->aseq != NULL) free(ali->aseq);
128 if (ali->query != NULL) free(ali->query);
129 if (ali->target != NULL) free(ali->target);
130 free(ali);
131 }
132 }
133
134 /* Function: RegisterHit()
135 *
136 * Purpose: Add a new hit to a list of top hits.
137 *
138 * "ali", if provided, is a pointer to allocated memory
139 * for an alignment output structure.
140 * Management is turned over to the top hits structure.
141 * Caller should not free them; they will be free'd by
142 * the FreeTophits() call.
143 *
144 * In contrast, "name" and "desc" are copied, so caller
145 * is still responsible for these.
146 *
147 * Number of args is unwieldy.
148 *
149 * Args: h - active top hit list
150 * key - value to sort by: bigger is better
151 * pvalue - P-value of this hit
152 * score - score of this hit
153 * motherp - P-value of parent whole sequence
154 * mothersc - score of parent whole sequence
155 * name - name of target sequence
156 * desc - description of target sequence
157 * sqfrom - 1..L pos in target seq of start
158 * sqto - 1..L pos; sqfrom > sqto if rev comp
159 * sqlen - length of sequence, L
160 * hmmfrom - 0..M+1 pos in HMM of start
161 * hmmto - 0..M+1 pos in HMM of end
162 * hmmlen - length of HMM, M
163 * domidx - number of this domain
164 * ndom - total # of domains in sequence
165 * ali - optional printable alignment info
166 *
167 * Return: (void)
168 * hitlist is modified and possibly reallocated internally.
169 */
170 void
RegisterHit(struct tophit_s * h,double key,double pvalue,float score,double motherp,float mothersc,char * name,char * desc,int sqfrom,int sqto,int sqlen,int hmmfrom,int hmmto,int hmmlen,int domidx,int ndom,struct fancyali_s * ali)171 RegisterHit(struct tophit_s *h, double key,
172 double pvalue, float score, double motherp, float mothersc,
173 char *name, char *desc,
174 int sqfrom, int sqto, int sqlen,
175 int hmmfrom, int hmmto, int hmmlen,
176 int domidx, int ndom,
177 struct fancyali_s *ali)
178 {
179 /* Check to see if list is full and we must realloc.
180 */
181 if (h->num == h->alloc) GrowTophits(h);
182
183 h->unsrt[h->num].name = Strdup(name);
184 h->unsrt[h->num].desc = Strdup(desc);
185 h->unsrt[h->num].sortkey = key;
186 h->unsrt[h->num].pvalue = pvalue;
187 h->unsrt[h->num].score = score;
188 h->unsrt[h->num].motherp = motherp;
189 h->unsrt[h->num].mothersc= mothersc;
190 h->unsrt[h->num].sqfrom = sqfrom;
191 h->unsrt[h->num].sqto = sqto;
192 h->unsrt[h->num].sqlen = sqlen;
193 h->unsrt[h->num].hmmfrom = hmmfrom;
194 h->unsrt[h->num].hmmto = hmmto;
195 h->unsrt[h->num].hmmlen = hmmlen;
196 h->unsrt[h->num].domidx = domidx;
197 h->unsrt[h->num].ndom = ndom;
198 h->unsrt[h->num].ali = ali;
199 h->num++;
200 return;
201 }
202
203 /* Function: GetRankedHit()
204 * Date: SRE, Tue Oct 28 10:06:48 1997 [Newton Institute, Cambridge UK]
205 *
206 * Purpose: Recover the data from the i'th ranked hit.
207 * Any of the data ptrs may be passed as NULL for fields
208 * you don't want. hitlist must have been sorted first.
209 *
210 * name, desc, and ali are returned as pointers, not copies;
211 * don't free them!
212 */
213 void
GetRankedHit(struct tophit_s * h,int rank,double * r_pvalue,float * r_score,double * r_motherp,float * r_mothersc,char ** r_name,char ** r_desc,int * r_sqfrom,int * r_sqto,int * r_sqlen,int * r_hmmfrom,int * r_hmmto,int * r_hmmlen,int * r_domidx,int * r_ndom,struct fancyali_s ** r_ali)214 GetRankedHit(struct tophit_s *h, int rank,
215 double *r_pvalue, float *r_score,
216 double *r_motherp, float *r_mothersc,
217 char **r_name, char **r_desc,
218 int *r_sqfrom, int *r_sqto, int *r_sqlen,
219 int *r_hmmfrom, int *r_hmmto, int *r_hmmlen,
220 int *r_domidx, int *r_ndom,
221 struct fancyali_s **r_ali)
222 {
223 if (r_pvalue != NULL) *r_pvalue = h->hit[rank]->pvalue;
224 if (r_score != NULL) *r_score = h->hit[rank]->score;
225 if (r_motherp != NULL) *r_motherp = h->hit[rank]->motherp;
226 if (r_mothersc!= NULL) *r_mothersc= h->hit[rank]->mothersc;
227 if (r_name != NULL) *r_name = h->hit[rank]->name;
228 if (r_desc != NULL) *r_desc = h->hit[rank]->desc;
229 if (r_sqfrom != NULL) *r_sqfrom = h->hit[rank]->sqfrom;
230 if (r_sqto != NULL) *r_sqto = h->hit[rank]->sqto;
231 if (r_sqlen != NULL) *r_sqlen = h->hit[rank]->sqlen;
232 if (r_hmmfrom != NULL) *r_hmmfrom = h->hit[rank]->hmmfrom;
233 if (r_hmmto != NULL) *r_hmmto = h->hit[rank]->hmmto;
234 if (r_hmmlen != NULL) *r_hmmlen = h->hit[rank]->hmmlen;
235 if (r_domidx != NULL) *r_domidx = h->hit[rank]->domidx;
236 if (r_ndom != NULL) *r_ndom = h->hit[rank]->ndom;
237 if (r_ali != NULL) *r_ali = h->hit[rank]->ali;
238 }
239
240 /* Function: TophitsMaxName()
241 *
242 * Purpose: Returns the maximum name length in a top hits list;
243 * doesn't need to be sorted yet.
244 */
245 int
TophitsMaxName(struct tophit_s * h)246 TophitsMaxName(struct tophit_s *h)
247 {
248 int i;
249 int len, maxlen;
250
251 maxlen = 0;
252 for (i = 0; i < h->num; i++)
253 {
254 len = strlen(h->unsrt[i].name);
255 if (len > maxlen) maxlen = len;
256 }
257 return maxlen;
258 }
259
260 /* Function: FullSortTophits()
261 *
262 * Purpose: Completely sort the top hits list. Calls
263 * qsort() to do the sorting, and uses
264 * hit_comparison() to do the comparison.
265 *
266 * Args: h - top hits structure
267 */
268 int
hit_comparison(const void * vh1,const void * vh2)269 hit_comparison(const void *vh1, const void *vh2)
270 {
271 /* don't ask. don't change. and, Don't Panic. */
272 struct hit_s *h1 = *((struct hit_s **) vh1);
273 struct hit_s *h2 = *((struct hit_s **) vh2);
274
275 if (h1->sortkey < h2->sortkey) return 1;
276 else if (h1->sortkey > h2->sortkey) return -1;
277 else if (h1->sortkey == h2->sortkey) return 0;
278 /*NOTREACHED*/
279 return 0;
280 }
281 void
FullSortTophits(struct tophit_s * h)282 FullSortTophits(struct tophit_s *h)
283 {
284 int i;
285
286 /* If we don't have /any/ hits, then don't
287 * bother.
288 */
289 if (h->num == 0) return;
290
291 /* Assign the ptrs in h->hit.
292 */
293 h->hit = MallocOrDie(h->num * sizeof(struct hit_s *));
294 for (i = 0; i < h->num; i++)
295 h->hit[i] = &(h->unsrt[i]);
296
297 /* Sort the pointers. Don't bother if we've only got one.
298 */
299 if (h->num > 1)
300 qsort(h->hit, h->num, sizeof(struct hit_s *), hit_comparison);
301 }
302
303
304
305 /* Function: TophitsReport()
306 * Date: Thu Dec 18 13:19:18 1997
307 *
308 * Purpose: Generate a printout summarizing how much
309 * memory is used by a tophits structure,
310 * how many hits are stored, and how much
311 * waste there is from not knowing nseqs.
312 *
313 * Args: h - the sorted tophits list
314 * E - the cutoff in Evalue
315 * nseq - the final number of seqs used for Eval
316 *
317 * Return: (void)
318 * Prints information on stdout
319 */
320 void
TophitsReport(struct tophit_s * h,double E,int nseq)321 TophitsReport(struct tophit_s *h, double E, int nseq)
322 {
323 int i;
324 int memused;
325 int x;
326 int n;
327
328 /* Count up how much memory is used
329 * in the whole list.
330 */
331 memused = sizeof(struct hit_s) * h->alloc + sizeof(struct tophit_s);
332 for (i = 0; i < h->num; i++)
333 {
334 if (h->unsrt[i].name != NULL)
335 memused += strlen(h->unsrt[i].name) + 1;
336 if (h->unsrt[i].desc != NULL)
337 memused += strlen(h->unsrt[i].desc) + 1;
338 if (h->unsrt[i].ali != NULL)
339 {
340 memused += sizeof(struct fancyali_s);
341 x = 0;
342 if (h->unsrt[i].ali->rfline != NULL) x++;
343 if (h->unsrt[i].ali->csline != NULL) x++;
344 if (h->unsrt[i].ali->model != NULL) x++;
345 if (h->unsrt[i].ali->mline != NULL) x++;
346 if (h->unsrt[i].ali->aseq != NULL) x++;
347 memused += x * (h->unsrt[i].ali->len + 1);
348
349 if (h->unsrt[i].ali->query != NULL)
350 memused += strlen(h->unsrt[i].ali->query) + 1;
351 if (h->unsrt[i].ali->target != NULL)
352 memused += strlen(h->unsrt[i].ali->target) + 1;
353 }
354 }
355
356 /* Count how many hits actually satisfy the E cutoff.
357 */
358 n = 0;
359 for (i = 0; i < h->num; i++)
360 {
361 if (h->hit[i]->pvalue * (double) nseq >= E) break;
362 n++;
363 }
364
365 /* Format and print a summary
366 */
367 printf("tophits_s report:\n");
368 printf(" Total hits: %d\n", h->num);
369 printf(" Satisfying E cutoff: %d\n", n);
370 printf(" Total memory: %dK\n", memused / 1000);
371 }
372