1 /*	scalebest.c
2 
3 	This routine gives us the option to rescale a set of scores and
4 	leave them in an unused irelv
5 
6 */
7 
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <math.h>
11 
12 #define MAXHIST 50
13 #ifdef BIGMEM
14 #define MAX_LLEN 200
15 #else
16 #define MAX_LLEN 100
17 #endif
18 #define MAX_SSCORE 300
19 
20 extern int dnaseq;
21 extern int nlib, optall;
22 extern int histflg;
23 extern long *hist;
24 extern int histint, min_hist, max_hist, maxh;
25 float ks_dev;
26 int ks_df;
27 long num_db_entries, total_db_length, z_calls;
28 
29 #define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */
30 
31 #define LN_FACT 10.0
32 
33 int *llen_hist=NULL;
34 float *score_sums, *score2_sums, *score_var, *score_mu;
35 static int *h_len;
36 static double logK, lambda, rho, mu, rho2, mu2, mean_var, mean_length;
37 static int max_llen, min_llen, max_sscore;
38 static int max_dlen, min_dlen;
39 static int max_length, min_length, zero_scores;
40 static float var_cutoff;
41 static int fit_flag=1;
42 
43 #ifdef FASTA_BEST
44 struct beststr {
45 	int score;	/* pam score with segment optimization*/
46 	int score0;
47 	int gscore;
48 	int sscore;
49 	float zscore;
50 	float escore;
51 	int n1;
52 	long lseek;	/* position in library file */
53 	int dp;		/* diagonal of match */
54 	int start;	/* start of match in lib seq */
55 	int stop;	/* end of match in lib seq */
56 	int cont;	/* offset into sequence */
57 	int frame;
58 	int lib;
59 	};
60 #else
61 struct beststr {
62 	int score;	/* smith-waterman score */
63 	int sscore;	/* duplicate for compatibility with fasta */
64 	float zscore;
65 	float escore;
66 	int n1;
67 	long lseek;	/* position in library file */
68 	int cont;	/* offset into sequence */
69 	int frame;
70 	int lib;
71 #ifdef RES_STATS
72   char libstr[13];
73 #endif
74 	};
75 #endif
76 
77 float find_zm(), find_z();
78 
process_hist(n0,bptr,nbest)79 process_hist(n0,bptr, nbest)
80      int n0;
81 #ifndef FAR_PTR
82      struct beststr **bptr;
83 #else
84      struct beststr huge * huge * bptr;
85 #endif
86      int nbest;
87 {
88   int i, j, llen, hscore;
89   int n_trimmed;
90   float zs;
91 
92   if (nbest <= 5) {
93     mean_var = 500.0;
94     rho = mu = rho2 = mu2 = 0.0;
95     for (i=0; i<nbest; i++) bptr[i]->zscore = 50.0;
96     return;
97   }
98 
99   inithist();
100 
101   for (i = 0; i<nbest; i++)
102     addhist(bptr[i]->sscore,bptr[i]->n1);
103 
104   fit_llen();	/* now we have rho, mu, rho2, mu2, mean_var to set
105 		   the parameters for the histogram */
106 
107   if (fit_flag) {
108     n_trimmed=0;
109     for (i = 0; i < nbest; i++) {
110       zs = find_zm(bptr[i]->sscore,bptr[i]->n1);
111       if (zs < 0 || zs > 100 ) {
112 	prune_hist(bptr[i]->sscore,bptr[i]->n1);
113 	n_trimmed++;
114       }
115     }
116   }
117 /*
118   if (n_trimmed > 0)
119     printf("Trimmed %d entries with z > 5.0\n", n_trimmed);
120 */
121 
122   if (fit_flag) fit_llens();
123 
124   inithistz(MAXHIST);
125   for (i = 0; i < nbest; i++) {
126     bptr[i]->zscore = find_zm(bptr[i]->sscore,bptr[i]->n1);
127     addhistz(bptr[i]->zscore,bptr[i]->n1);
128   }
129 }
130 
131 
alloc_hist()132 alloc_hist()
133 {
134   if (llen_hist == NULL) {
135     llen_hist = (int *)calloc((size_t)(max_llen+1),sizeof(int));
136     score_sums = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
137     score2_sums = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
138     score_var = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
139     score_mu = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
140   }
141 }
142 
free_hist()143 free_hist()
144 {
145   if (llen_hist != NULL) {
146     free(score_mu);
147     free(score_var);
148     free(score2_sums);
149     free(score_sums);
150     free(llen_hist);
151     llen_hist=NULL;
152   }
153 }
154 
inithist()155 inithist()
156 {
157   int i;
158 
159   max_llen = MAX_LLEN;
160   max_sscore = MAX_SSCORE;
161   alloc_hist();
162 
163   max_dlen = 0;
164   min_dlen = MAX_LLEN;
165 
166   for (i = 0; i <= max_llen; i++) {
167     llen_hist[i] = 0;
168     score_sums[i] = score2_sums[i] = 0.0;
169   }
170   zero_scores = 0;
171   min_length = 10000;
172   max_length = 0;
173   num_db_entries = 0;
174   total_db_length = 0;
175 }
176 
addhist(score,length)177 addhist(score, length)
178      int score, length;
179 {
180   int llen;
181 
182   if ( score<=0 || length < LENGTH_CUTOFF) {
183     zero_scores++;
184     return;
185   }
186 
187   if (length > max_length) max_length = length;
188   if (length < min_length) min_length = length;
189   if (score > max_sscore) score = max_sscore;
190 
191   llen = (int)(LN_FACT*log((double)length)+0.5);
192 
193   if (llen < 0 ) llen = 0;
194   if (llen > max_llen) llen = max_llen;
195   if (llen > max_dlen) max_dlen = llen;
196   if (llen < min_dlen) min_dlen = llen;
197   llen_hist[llen]++;
198   score_sums[llen] += (float)score;
199   score2_sums[llen] += ((float)score * (float)score);
200 
201   num_db_entries++;
202   total_db_length += length;
203 }
204 
205 /* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */
206 
207 
inithistz(mh)208 inithistz(mh)
209      int mh;
210 {
211   min_hist = 20;
212   max_hist = 120;
213   z_calls = 0;
214   total_db_length = 0;
215   num_db_entries = 0;
216 
217   histint = (int)((float)(max_hist - min_hist + 2)/(float)mh+0.5);
218   maxh = (int)((float)(max_hist - min_hist + 2)/(float)histint+0.5) ;
219 
220   if ((hist=(long *)calloc((size_t)maxh,sizeof(long)))==NULL) {
221     fprintf(stderr," cannot allocate %d for histogram\n",maxh);
222     histflg = 0;
223   }
224 }
225 
addhistz(zs,length)226 addhistz(zs,length)
227      float zs;
228      int length;
229 {
230   int ih, zi;
231 
232   z_calls++;
233 
234   zi = (int)(zs + 0.5);
235 
236   if ((zi >= min_hist) && (zi <= max_hist)) {
237     num_db_entries++;
238     total_db_length += length;
239   }
240 
241   if (zi < min_hist) zi = min_hist;
242   if (zi > max_hist) zi = max_hist;
243 
244   ih = (zi - min_hist)/histint;
245 
246   hist[ih]++;
247 }
248 
prune_hist(score,length)249 prune_hist(score, length)
250      int score, length;
251 {
252   int llen;
253 
254   if (score <= 0 || length < LENGTH_CUTOFF) return;
255 
256   if (score > max_sscore) score = max_sscore;
257 
258   llen = (int)(LN_FACT*log((double)length)+0.5);
259 
260   if (llen < 0 ) llen = 0;
261   if (llen > max_llen) llen = max_llen;
262   llen_hist[llen]--;
263   score_sums[llen] -= (float)score;
264   score2_sums[llen] -= ((float)score * (float)score);
265 
266   num_db_entries--;
267   total_db_length -= length;
268 }
269 
270 
fit_llen()271 fit_llen()
272 {
273   int j;
274   long n, cell_size;
275   int ej, last_ej;
276   int n_size;
277   double mean, s, s2, x, y, y2, t, t2, u, u2, v, p, z;
278   double mean_x, mean_y, var_x, var_y, covar_xy;
279   double mean_y2, covar_xy2;
280   double ex, exl;
281 
282   double sum_x, sum_y, sum_x2, sum_xy, det, n_w;
283 
284 /* now fit scores to best linear function of log(n), using
285    simple linear regression */
286 
287   /*
288   for (min_llen=0; min_llen < max_llen; min_llen++)
289     if (llen_hist[min_llen]) break;
290   */
291   min_dlen--;
292 
293   for (j=min_dlen; j< max_dlen; j++)
294     if (llen_hist[j]>0) score_mu[j] = score_sums[j]/(float)llen_hist[j];
295 
296   for (n_size=0,j = min_dlen; j < max_dlen; j++) {
297     if (llen_hist[j] > 1) {
298       score_var[j] = score2_sums[j] -
299 	(float)llen_hist[j]*score_mu[j]*score_mu[j];
300       score_var[j] /= (float)(llen_hist[j]-1.0);
301       if (score_var[j] <= 1.0 ) score_var[j] = 1.0;
302       n_size++;
303     }
304   }
305 
306   n_w = 0.0;
307   sum_x = sum_y = sum_x2 = sum_xy = 0;
308   for (j = min_dlen; j < max_dlen; j++)
309     if (llen_hist[j] > 1) {
310       x = j + 0.5;
311       n_w += (float)llen_hist[j]/score_var[j];
312       sum_x +=   (float)llen_hist[j] * x / score_var[j] ;
313       sum_y += score_sums[j] / score_var[j];
314       sum_x2 +=  (float)llen_hist[j] * x * x /score_var[j];
315       sum_xy +=  x * score_sums[j]/score_var[j];
316     }
317 
318   if (n_size < 5 ) {
319     fit_flag=0;
320     rho = 0;
321     mu = sum_y/n_w;
322   }
323   else {
324     det = n_w * sum_x2 - sum_x * sum_x;
325     if (det > 0.001) {
326       rho = (n_w * sum_xy  - sum_x * sum_y)/det;
327       mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
328     }
329     else {
330       fit_flag = 0;
331       rho = 0;
332       mu = sum_y/n_w;
333     }
334   }
335 
336     /*   printf(" rho1/mu1: %.2f/%.2f\n",rho*LN_FACT,mu); */
337 
338   n = 0;
339   mean_x = mean_y = 0.0;
340   var_x = var_y = 0.0;
341   covar_xy = 0.0;
342 
343   for (j = min_dlen; j <= max_dlen; j++)
344     if (llen_hist[j] > 1 ) {
345       n += llen_hist[j];
346       x = (double)j + 0.5;
347       mean_x += (double)llen_hist[j] * x;
348       mean_y += (double)score_sums[j];
349       var_x += (double)llen_hist[j] * x * x;
350       var_y += (double)score2_sums[j];
351       covar_xy += x * (double)score_sums[j];
352     }
353   mean_x /= n; mean_y /= n;
354   var_x = var_x / (float)n - mean_x * mean_x;
355   var_y = var_y / (float)n - mean_y * mean_y;
356 
357   covar_xy = covar_xy / (float)n - mean_x * mean_y;
358 
359 /*
360   rho = covar_xy / var_x;
361   mu = mean_y - rho * mean_x;
362 */
363   mean_y2 = covar_xy2 = 0.0;
364   for (j = min_dlen; j <= max_dlen; j++)
365     if (llen_hist[j] > 1) {
366       x = (double)j + 0.5;
367       u = rho * x + mu;
368       y2 = score2_sums[j] - 2 * score_sums[j] * u + (float)llen_hist[j] * u * u;
369       mean_y2 += y2;
370       covar_xy2 += x * y2;
371     }
372 
373   mean_var = mean_y2 /= (float)n;
374   covar_xy2 = covar_xy2 / (float)n - mean_x * mean_y2;
375   if (fit_flag) {
376     rho2 = covar_xy2 / var_x;
377     mu2 = mean_y2 - rho2 * mean_x;
378   }
379   else {
380     rho2 = 0;
381     mu2 = mean_y2;
382   }
383 
384   if (rho2 < 0.0 )
385     z = (rho2 * LN_FACT*log((double)max_length) + mu2 > 0.0) ? max_length : exp((-1.0 - mu2 / rho2)/LN_FACT);
386   else z =  rho2 ? exp((1.0 - mu2 / rho2)/LN_FACT) : LENGTH_CUTOFF;
387   if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
388 
389   var_cutoff = rho2 * LN_FACT*log(z) + mu2;
390 
391 /*
392   fprintf(stderr,"\nMinimum allowed predicted variance (%0.2f) at n = %.0f\n",
393 	 var_cutoff,z);
394 
395   printf("\nVariance of residuals explained by log(n): %.2f\n",
396 	 covar_xy2 * covar_xy2 / var_x );
397 
398    printf("\nObserved best fits to log(n):\n");
399     printf(" mean = %.2f * log(n) + %.2f,   var = %.2f * log(n) + %.2f \n",
400     rho*LN_FACT, mu, rho2*LN_FACT, mu2);
401 
402   cell_size = n / 20;
403 
404   printf("\n\n                          S              z");
405   printf("\n\nLengths (n)   No.     Mean    Std        Mean    Std\n");
406 
407   last_ej = LENGTH_CUTOFF;
408   for (j = min_llen; j <= max_llen; ) {
409     for (s = s2 = t = t2 = u = u2 = n = 0;
410 	 n < cell_size && j <= max_llen; j++)
411       if (llen_hist[j] > 1) {
412 	n += llen_hist[j];
413 	s += score_sums[j];
414 	s2 += score2_sums[j];
415 
416 	y = j+0.5;
417 	x = rho * y + mu;
418 	v = rho2 * y + mu2;
419 	if (v < var_cutoff) v = var_cutoff;
420 	u += (score_sums[j] - (float)llen_hist[j] * x) / sqrt(v);
421 	u2 += (score2_sums[j] - 2 * x * score_sums[j] + (float)llen_hist[j] * x * x) /
422 	  v;
423       }
424     if (n > 0) {
425       mean = s / (float)n;
426       ej = (int)(exp((double)j/LN_FACT)+0.5);
427       printf("\n%3d - %4d:   %4d  %6.1f  %6.2f", last_ej, ej, n, mean,
428 	     sqrt(s2 / (float)n - mean * mean));
429       mean = u / (float)n;
430       printf("     %6.1f  %6.2f", mean, sqrt(u2 / n - mean * mean));
431       last_ej = ej;
432     }
433   }
434 */
435 }
436 
fit_llens()437 fit_llens()
438 {
439   int j;
440   long n, cell_size;
441   int ej, last_ej, n_u2;
442   double mean, s, s2, x, y, y2, t, t2, u, u2, v, p, z;
443   double mean_x, mean_y, var_x, var_y, covar_xy;
444   double mean_y2, covar_xy2;
445   double mean_u2, mean_3u2, mean_i3u2;
446   double sum_x, sum_y, sum_x2, sum_xy, det, n_w;
447 
448 /* now fit scores to best linear function of log(n), using
449    simple linear regression */
450 
451   /*
452   for (min_llen=0; min_llen < max_llen; min_llen++)
453     if (llen_hist[min_llen]) break;
454   min_llen--;
455   */
456 
457   for (j = min_dlen; j < max_dlen; j++) {
458     if (llen_hist[j] > 1) {
459       score_var[j] = score2_sums[j]/(float)llen_hist[j]
460 	- (score_sums[j]/(float)llen_hist[j])
461 	* (score_sums[j]/(float)llen_hist[j]);
462       score_var[j] /= (float)(llen_hist[j]-1.0);
463       if (score_var[j] <= 1.0 ) score_var[j] = 1.0;
464     }
465   }
466 
467   n_w = 0.0;
468   sum_x = sum_y = sum_x2 = sum_xy = 0;
469   for (j = min_dlen; j < max_dlen; j++)
470     if (llen_hist[j] > 1) {
471       x = j + 0.5;
472       n_w += (float)llen_hist[j]/score_var[j];
473       sum_x +=   (float)llen_hist[j] * x / score_var[j] ;
474       sum_y += score_sums[j] / score_var[j];
475       sum_x2 +=  (float)llen_hist[j] * x * x /score_var[j];
476       sum_xy +=  x * score_sums[j]/score_var[j];
477     }
478 
479   det = n_w * sum_x2 - sum_x * sum_x;
480   rho = (n_w * sum_xy  - sum_x * sum_y)/det;
481   mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
482 
483 /* printf(" rho1/mu1: %.2f/%.2f\n",rho*LN_FACT,mu); */
484 
485   n = 0;
486   mean_x = mean_y = mean_y2 = 0.0;
487   var_x = var_y = 0.0;
488   covar_xy = covar_xy2 = 0.0;
489 
490   for (j = min_dlen; j <= max_dlen; j++)
491     if (llen_hist[j] > 1 ) {
492       n += llen_hist[j];
493       x = (double)j + 0.5;
494       mean_x += (double)llen_hist[j] * x;
495       mean_y += (double)score_sums[j];
496       var_x += (double)llen_hist[j] * x * x;
497       var_y += (double)score2_sums[j];
498       covar_xy += x * (double)score_sums[j];
499     }
500   mean_x /= n; mean_y /= n;
501   var_x = var_x / (float)n - mean_x * mean_x;
502   var_y = var_y / (float)n - mean_y * mean_y;
503 
504   covar_xy = covar_xy / (float)n - mean_x * mean_y;
505 
506 /*  rho = covar_xy / var_x;
507   mu = mean_y - rho * mean_x;
508 */
509 
510   mean_y2 = covar_xy2 = 0.0;
511   for (j = min_dlen; j <= max_dlen; j++)
512     if (llen_hist[j] > 1) {
513       x = (double)j + 0.5;
514       u = rho * x + mu;
515       y2 = score2_sums[j] - 2 * score_sums[j] * u + (float)llen_hist[j] * u * u;
516       mean_y2 += y2;
517       covar_xy2 += x * y2;
518     }
519 
520   mean_y2 /= n;
521   covar_xy2 = covar_xy2 / (float)n - mean_x * mean_y2;
522   rho2 = covar_xy2 / var_x;
523   mu2 = mean_y2 - rho2 * mean_x;
524 
525   if (rho2 < 0.0 )
526     z = (rho2 * LN_FACT*log((double)max_length) + mu2 > 0.0) ? max_length : exp((-1.0 - mu2 / rho2)/LN_FACT);
527   else z =  rho2 ? exp((1.0 - mu2 / rho2)/LN_FACT) : LENGTH_CUTOFF;
528   if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
529 
530   var_cutoff = rho2*LN_FACT*log(z) + mu2;
531 /*
532   printf("\nMinimum allowed predicted variance (%0.2f) at n = %.0f\n",
533 	 var_cutoff,z);
534 */
535   mean_u2 = 0.0;
536   n_u2 = 0;
537   for ( j = min_dlen; j < max_dlen; j++) {
538     y = j+0.5;
539     x = rho * y + mu;
540     v = rho2 * y + mu2;
541     if (v < var_cutoff) v = var_cutoff;
542     if (llen_hist[j]> 1) {
543       u2 = score_var[j] =
544 	(score2_sums[j] - 2 * x * score_sums[j] + (float)llen_hist[j] * x * x)/v;
545       mean_u2 += u2;
546       n_u2++;
547     }
548     else score_var[j] = -1.0;
549   }
550 
551   mean_u2 /= (double)n_u2;
552 /*  printf(" average variance: %.2f\n",mean_u2); */
553   if (mean_u2 < 0.0) mean_u2 = -mean_u2;
554 
555   mean_3u2 = mean_u2*3.0;
556 
557   for (j = min_dlen; j < max_dlen; j++) {
558     if (llen_hist[j] <= 1) continue;
559     if (score_var[j] > mean_3u2) {
560 /*      printf(" removing %d %d %.2f\n",
561 	     j, (int)(exp((double)j/LN_FACT)-0.5),
562 	     score_var[j]); */
563       llen_hist[j] = 0;
564     }
565   }
566   fit_llen();
567 }
568 
find_z(score,length)569 float find_z(score, length)
570      int score, length;
571 {
572   float log_len, var, z;
573 
574   log_len = LN_FACT*log((double)(length));
575   var = rho2 * log_len + mu2;
576   if (var < var_cutoff) var = var_cutoff;
577   z = ((float)score - rho * log_len - mu) / sqrt(var);
578 
579   return (50.0 + 10.0*z);
580 }
581 
find_zm(score,length)582 float find_zm(score, length)
583      int score, length;
584 {
585   float log_len, var, z;
586 
587   if ( length < LENGTH_CUTOFF) return 0;
588 
589   log_len = LN_FACT*log((double)(length));
590 /*  var = rho2 * log_len + mu2;
591   if (var < var_cutoff) var = var_cutoff;
592 */
593   var = mean_var;
594   z = ((float)score - rho * log_len - mu) / sqrt(var);
595 
596   return (50.0 + z*10.0);
597 }
598 
599 /* computes E value for a given z value, assuming extreme value distribution */
z_to_E(zs)600 float z_to_E(zs)
601      float zs;
602 {
603   float e;
604 
605   if (num_db_entries < 5) return (float)num_db_entries;
606 
607   e = exp(-1.282555 * zs - .577216);
608   return (float)num_db_entries * (e > .01 ? 1.0 - exp(-e) : e);
609 }
610 
zs_to_E(zs,n1)611 float zs_to_E(zs,n1) /* computes E-value for a given z value,
612 		    assuming extreme value distribution */
613      float zs;
614      int n1;
615 {
616   float e, z, k;
617 
618   if (num_db_entries < 5) return 0.0;
619 
620   z = (zs - 50.0)/10.0;
621 
622   e = exp(-1.28255 * z - .577216);
623 
624 
625   if (dnaseq==1) k = (float)total_db_length /(float)n1;
626   else k = (float)num_db_entries;
627 
628 
629 /*  k = (float)num_db_entries;  */
630 
631   return k * (e > .01 ? 1.0 - exp(-e) : e);
632 }
633 
zs_to_Ec(zs)634 float zs_to_Ec(zs) /* computes 1.0 - E value for a given z value,
635 		    assuming extreme value distribution */
636      float zs;
637 {
638   float e, z;
639 
640   if (num_db_entries < 5) return 0.0;
641 
642   z = (zs - 50.0)/10.0;
643 
644   e =  exp(-1.28255 * z - .577216);
645   return (float)num_db_entries * (e > .01 ?  exp(-e) : 1.0 - e);
646 }
647 
648 /* above choice of parameters for Gumbel give mean of .000168, variance of
649 1.000857, so should be adequate approximation to z-score distribution when
650 behavior is "local" */
651 
vsort(v,s,n)652 vsort(v,s,n)
653 	float *v; int *s, n;
654 {
655   int gap, i, j;
656   float tmp;
657   int itmp;
658 
659   for (gap=n/2; gap>0; gap/=2)
660     for (i=gap; i<n; i++)
661       for (j=i-gap; j>=0; j -= gap) {
662 	if (v[j] >= v[j+gap]) break;
663 	tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
664 	itmp = s[j]; s[j]=s[j+gap]; s[j+gap]=itmp;
665       }
666 }
667 
calc_ks()668 calc_ks()
669 {
670   int i, cum_hl;
671   float x_tmp, cur_e, f_int;
672 
673   cum_hl = ks_df = 0;
674   ks_dev = 0.0;
675   f_int = (float)(5*histint + min_hist) + (float)histint/2.0;
676   for (i=0; i<6; i++) cum_hl += hist[i];
677   for (i=6; i< (90-min_hist)/histint; i++) {
678     cum_hl += hist[i];
679     if (hist[i]>0) {
680       f_int = (float)(i*histint + min_hist) + (float)histint/2.0;
681       cur_e = zs_to_Ec(f_int);
682       x_tmp = fabs((float)cum_hl - cur_e);
683       if (x_tmp > ks_dev) ks_dev = x_tmp;
684       ks_df++;
685     }
686   }
687   ks_dev /= (float)num_db_entries;
688 }
689