1 /*	scalebest.c
2 
3 	This routine gives us the option to rescale a set of scores and
4 	leave them in an unused irelv
5 
6 */
7 
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <math.h>
11 
12 #define MAXHIST 50
13 #ifdef BIGMEM
14 #define MAX_LLEN 200
15 #else
16 #define MAX_LLEN 100
17 #endif
18 #define MAX_SSCORE 300
19 
20 extern int dnaseq;
21 extern int nlib, optall;
22 extern int histflg;
23 extern long *hist;
24 extern int histint, min_hist, max_hist, maxh;
25 float ks_dev;
26 int ks_df;
27 long num_db_entries, total_db_length, z_calls;
28 
29 #define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */
30 
31 #define LN_FACT 10.0
32 
33 int *llen_hist=NULL;
34 float *score_sums, *score2_sums, *score_var;
35 static int *h_len;
36 float *score_var;
37 static double logK, lambda, rho, mu, rho2, mu2, mean_var, mean_length;
38 static int max_llen, min_llen, max_sscore;
39 static int max_dlen, min_dlen;
40 static int max_length, min_length, zero_scores;
41 static float var_cutoff;
42 static int fit_flag=1;
43 
44 #ifdef FASTA_BEST
45 struct beststr {
46 	int score;	/* pam score with segment optimization*/
47 	int score0;
48 	int gscore;
49 	int sscore;
50 	float zscore;
51 	float escore;
52 	int n1;
53 	long lseek;	/* position in library file */
54 	int dp;		/* diagonal of match */
55 	int start;	/* start of match in lib seq */
56 	int stop;	/* end of match in lib seq */
57 	int cont;	/* offset into sequence */
58 	int frame;
59 	int lib;
60 	};
61 #else
62 struct beststr {
63 	int score;	/* smith-waterman score */
64 	int sscore;	/* duplicate for compatibility with fasta */
65 	float zscore;
66 	float escore;
67 	int n1;
68 	long lseek;	/* position in library file */
69 	int cont;	/* offset into sequence */
70 	int frame;
71 	int lib;
72 #ifdef RES_STATS
73   char libstr[13];
74 #endif
75 	};
76 #endif
77 
78 float find_zm(), find_z();
79 
process_hist(n0,bptr,nbest)80 process_hist(n0,bptr, nbest)
81      int n0;
82 #ifndef FAR_PTR
83      struct beststr **bptr;
84 #else
85      struct beststr huge * huge * bptr;
86 #endif
87      int nbest;
88 {
89   int i, j, llen, hscore;
90   int n_trimmed;
91   float zs;
92 
93   if (nbest <= 5) {
94     mean_var = 500.0;
95     rho = mu = rho2 = mu2 = 0.0;
96     for (i=0; i<nbest; i++) bptr[i]->zscore = 50.0;
97     return;
98   }
99 
100   inithist();
101 
102   for (i = 0; i<nbest; i++)
103     addhist(bptr[i]->sscore,bptr[i]->n1);
104 
105   fit_llen();	/* now we have rho, mu, rho2, mu2, mean_var to set
106 		   the parameters for the histogram */
107 
108   if (fit_flag) {
109     n_trimmed=0;
110     for (i = 0; i < nbest; i++) {
111       zs = find_zm(bptr[i]->sscore,bptr[i]->n1);
112       if (zs < 0 || zs > 100 ) {
113 	prune_hist(bptr[i]->sscore,bptr[i]->n1);
114 	n_trimmed++;
115       }
116     }
117   }
118 /*
119   if (n_trimmed > 0)
120     printf("Trimmed %d entries with z > 5.0\n", n_trimmed);
121 */
122 
123   if (fit_flag) fit_llens();
124 
125   inithistz(MAXHIST);
126   for (i = 0; i < nbest; i++) {
127     bptr[i]->zscore = find_zm(bptr[i]->sscore,bptr[i]->n1);
128     addhistz(bptr[i]->zscore,bptr[i]->n1);
129   }
130 }
131 
132 
alloc_hist()133 alloc_hist()
134 {
135   if (llen_hist == NULL) {
136     llen_hist = (int *)calloc((size_t)(max_llen+1),sizeof(int));
137     score_sums = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
138     score2_sums = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
139     score_var = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
140   }
141 }
142 
free_hist()143 free_hist()
144 {
145   if (llen_hist != NULL) {
146     free(score_var);
147     free(score2_sums);
148     free(score_sums);
149     free(llen_hist);
150     llen_hist=NULL;
151   }
152 }
153 
inithist()154 inithist()
155 {
156   int i;
157 
158   max_llen = MAX_LLEN;
159   max_sscore = MAX_SSCORE;
160   alloc_hist();
161 
162   max_dlen = 0;
163   min_dlen = MAX_LLEN;
164 
165   for (i = 0; i <= max_llen; i++) {
166     llen_hist[i] = 0;
167     score_sums[i] = score2_sums[i] = 0.0;
168   }
169   zero_scores = 0;
170   min_length = 10000;
171   max_length = 0;
172   num_db_entries = 0;
173   total_db_length = 0;
174 }
175 
addhist(score,length)176 addhist(score, length)
177      int score, length;
178 {
179   int llen;
180 
181   if ( score<=0 || length < LENGTH_CUTOFF) {
182     zero_scores++;
183     return;
184   }
185 
186   if (length > max_length) max_length = length;
187   if (length < min_length) min_length = length;
188   if (score > max_sscore) score = max_sscore;
189 
190   llen = (int)(LN_FACT*log((double)length)+0.5);
191 
192   if (llen < 0 ) llen = 0;
193   if (llen > max_llen) llen = max_llen;
194   if (llen > max_dlen) max_dlen = llen;
195   if (llen < min_dlen) min_dlen = llen;
196   llen_hist[llen]++;
197   score_sums[llen] += (float)score;
198   score2_sums[llen] += ((float)score * (float)score);
199 
200   num_db_entries++;
201   total_db_length += length;
202 }
203 
204 /* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */
205 
206 
inithistz(mh)207 inithistz(mh)
208      int mh;
209 {
210   min_hist = 20;
211   max_hist = 120;
212   z_calls = 0;
213   total_db_length = 0;
214   num_db_entries = 0;
215 
216   histint = (int)((float)(max_hist - min_hist + 2)/(float)mh+0.5);
217   maxh = (int)((float)(max_hist - min_hist + 2)/(float)histint+0.5) ;
218 
219   if ((hist=(long *)calloc((size_t)maxh,sizeof(long)))==NULL) {
220     fprintf(stderr," cannot allocate %d for histogram\n",maxh);
221     histflg = 0;
222   }
223 }
224 
addhistz(zs,length)225 addhistz(zs,length)
226      float zs;
227      int length;
228 {
229   int ih, zi;
230 
231   z_calls++;
232 
233   zi = (int)(zs + 0.5);
234 
235   if ((zi >= min_hist) && (zi <= max_hist)) {
236     num_db_entries++;
237     total_db_length += length;
238   }
239 
240   if (zi < min_hist) zi = min_hist;
241   if (zi > max_hist) zi = max_hist;
242 
243   ih = (zi - min_hist)/histint;
244 
245   hist[ih]++;
246 }
247 
prune_hist(score,length)248 prune_hist(score, length)
249      int score, length;
250 {
251   int llen;
252 
253   if (score <= 0 || length < LENGTH_CUTOFF) return;
254 
255   if (score > max_sscore) score = max_sscore;
256 
257   llen = (int)(LN_FACT*log((double)length)+0.5);
258 
259   if (llen < 0 ) llen = 0;
260   if (llen > max_llen) llen = max_llen;
261   llen_hist[llen]--;
262   score_sums[llen] -= (float)score;
263   score2_sums[llen] -= ((float)score * (float)score);
264 
265   num_db_entries--;
266   total_db_length -= length;
267 }
268 
269 
fit_llen()270 fit_llen()
271 {
272   int j;
273   long n, cell_size;
274   int ej, last_ej;
275   int n_size;
276   double mean, s, s2, x, y, y2, t, t2, u, u2, v, p, z;
277   double mean_x, mean_y, var_x, var_y, covar_xy;
278   double mean_y2, covar_xy2;
279   double ex, exl;
280 
281   double sum_x, sum_y, sum_x2, sum_xy, det, n_w;
282 
283 /* now fit scores to best linear function of log(n), using
284    simple linear regression */
285 
286   /*
287   for (min_llen=0; min_llen < max_llen; min_llen++)
288     if (llen_hist[min_llen]) break;
289   */
290   min_dlen--;
291 
292   for (n_size=0,j = min_dlen; j < max_dlen; j++) {
293     if (llen_hist[j] > 1) {
294       score_var[j] = score2_sums[j]/(float)llen_hist[j]
295 	- (score_sums[j]/(float)llen_hist[j])
296 	* (score_sums[j]/(float)llen_hist[j]);
297       score_var[j] /= (float)(llen_hist[j]-1.0);
298       if (score_var[j] <= 1.0 ) score_var[j] = 1.0;
299       n_size++;
300     }
301   }
302 
303   n_w = 0.0;
304   sum_x = sum_y = sum_x2 = sum_xy = 0;
305   for (j = min_dlen; j < max_dlen; j++)
306     if (llen_hist[j] > 1) {
307       x = j + 0.5;
308       n_w += (float)llen_hist[j]/score_var[j];
309       sum_x +=   (float)llen_hist[j] * x / score_var[j] ;
310       sum_y += score_sums[j] / score_var[j];
311       sum_x2 +=  (float)llen_hist[j] * x * x /score_var[j];
312       sum_xy +=  x * score_sums[j]/score_var[j];
313     }
314 
315   if (n_size < 5 ) {
316     fit_flag=0;
317     rho = 0;
318     mu = sum_y/n_w;
319   }
320   else {
321     det = n_w * sum_x2 - sum_x * sum_x;
322     if (det > 0.001) {
323       rho = (n_w * sum_xy  - sum_x * sum_y)/det;
324       mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
325     }
326     else {
327       fit_flag = 0;
328       rho = 0;
329       mu = sum_y/n_w;
330     }
331   }
332 
333     /*   printf(" rho1/mu1: %.2f/%.2f\n",rho*LN_FACT,mu); */
334 
335   n = 0;
336   mean_x = mean_y = 0.0;
337   var_x = var_y = 0.0;
338   covar_xy = 0.0;
339 
340   for (j = min_dlen; j <= max_dlen; j++)
341     if (llen_hist[j] > 1 ) {
342       n += llen_hist[j];
343       x = (double)j + 0.5;
344       mean_x += (double)llen_hist[j] * x;
345       mean_y += (double)score_sums[j];
346       var_x += (double)llen_hist[j] * x * x;
347       var_y += (double)score2_sums[j];
348       covar_xy += x * (double)score_sums[j];
349     }
350   mean_x /= n; mean_y /= n;
351   var_x = var_x / (float)n - mean_x * mean_x;
352   var_y = var_y / (float)n - mean_y * mean_y;
353 
354   covar_xy = covar_xy / (float)n - mean_x * mean_y;
355 
356 /*
357   rho = covar_xy / var_x;
358   mu = mean_y - rho * mean_x;
359 */
360   mean_y2 = covar_xy2 = 0.0;
361   for (j = min_dlen; j <= max_dlen; j++)
362     if (llen_hist[j] > 1) {
363       x = (double)j + 0.5;
364       u = rho * x + mu;
365       y2 = score2_sums[j] - 2 * score_sums[j] * u + (float)llen_hist[j] * u * u;
366       mean_y2 += y2;
367       covar_xy2 += x * y2;
368     }
369 
370   mean_var = mean_y2 /= (float)n;
371   covar_xy2 = covar_xy2 / (float)n - mean_x * mean_y2;
372   if (fit_flag) {
373     rho2 = covar_xy2 / var_x;
374     mu2 = mean_y2 - rho2 * mean_x;
375   }
376   else {
377     rho2 = 0;
378     mu2 = mean_y2;
379   }
380 
381   if (rho2 < 0.0 )
382     z = (rho2 * LN_FACT*log((double)max_length) + mu2 > 0.0) ? max_length : exp((-1.0 - mu2 / rho2)/LN_FACT);
383   else z =  rho2 ? exp((1.0 - mu2 / rho2)/LN_FACT) : LENGTH_CUTOFF;
384   if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
385 
386   var_cutoff = rho2 * LN_FACT*log(z) + mu2;
387 
388 /*
389   fprintf(stderr,"\nMinimum allowed predicted variance (%0.2f) at n = %.0f\n",
390 	 var_cutoff,z);
391 
392   printf("\nVariance of residuals explained by log(n): %.2f\n",
393 	 covar_xy2 * covar_xy2 / var_x );
394 
395    printf("\nObserved best fits to log(n):\n");
396     printf(" mean = %.2f * log(n) + %.2f,   var = %.2f * log(n) + %.2f \n",
397     rho*LN_FACT, mu, rho2*LN_FACT, mu2);
398 
399   cell_size = n / 20;
400 
401   printf("\n\n                          S              z");
402   printf("\n\nLengths (n)   No.     Mean    Std        Mean    Std\n");
403 
404   last_ej = LENGTH_CUTOFF;
405   for (j = min_llen; j <= max_llen; ) {
406     for (s = s2 = t = t2 = u = u2 = n = 0;
407 	 n < cell_size && j <= max_llen; j++)
408       if (llen_hist[j] > 1) {
409 	n += llen_hist[j];
410 	s += score_sums[j];
411 	s2 += score2_sums[j];
412 
413 	y = j+0.5;
414 	x = rho * y + mu;
415 	v = rho2 * y + mu2;
416 	if (v < var_cutoff) v = var_cutoff;
417 	u += (score_sums[j] - (float)llen_hist[j] * x) / sqrt(v);
418 	u2 += (score2_sums[j] - 2 * x * score_sums[j] + (float)llen_hist[j] * x * x) /
419 	  v;
420       }
421     if (n > 0) {
422       mean = s / (float)n;
423       ej = (int)(exp((double)j/LN_FACT)+0.5);
424       printf("\n%3d - %4d:   %4d  %6.1f  %6.2f", last_ej, ej, n, mean,
425 	     sqrt(s2 / (float)n - mean * mean));
426       mean = u / (float)n;
427       printf("     %6.1f  %6.2f", mean, sqrt(u2 / n - mean * mean));
428       last_ej = ej;
429     }
430   }
431 */
432 }
433 
fit_llens()434 fit_llens()
435 {
436   int j;
437   long n, cell_size;
438   int ej, last_ej, n_u2;
439   double mean, s, s2, x, y, y2, t, t2, u, u2, v, p, z;
440   double mean_x, mean_y, var_x, var_y, covar_xy;
441   double mean_y2, covar_xy2;
442   double mean_u2, mean_3u2, mean_i3u2;
443   double sum_x, sum_y, sum_x2, sum_xy, det, n_w;
444 
445 /* now fit scores to best linear function of log(n), using
446    simple linear regression */
447 
448   /*
449   for (min_llen=0; min_llen < max_llen; min_llen++)
450     if (llen_hist[min_llen]) break;
451   min_llen--;
452   */
453 
454   for (j = min_dlen; j < max_dlen; j++) {
455     if (llen_hist[j] > 1) {
456       score_var[j] = score2_sums[j]/(float)llen_hist[j]
457 	- (score_sums[j]/(float)llen_hist[j])
458 	* (score_sums[j]/(float)llen_hist[j]);
459       score_var[j] /= (float)(llen_hist[j]-1.0);
460       if (score_var[j] <= 1.0 ) score_var[j] = 1.0;
461     }
462   }
463 
464   n_w = 0.0;
465   sum_x = sum_y = sum_x2 = sum_xy = 0;
466   for (j = min_dlen; j < max_dlen; j++)
467     if (llen_hist[j] > 1) {
468       x = j + 0.5;
469       n_w += (float)llen_hist[j]/score_var[j];
470       sum_x +=   (float)llen_hist[j] * x / score_var[j] ;
471       sum_y += score_sums[j] / score_var[j];
472       sum_x2 +=  (float)llen_hist[j] * x * x /score_var[j];
473       sum_xy +=  x * score_sums[j]/score_var[j];
474     }
475 
476   det = n_w * sum_x2 - sum_x * sum_x;
477   rho = (n_w * sum_xy  - sum_x * sum_y)/det;
478   mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
479 
480 /* printf(" rho1/mu1: %.2f/%.2f\n",rho*LN_FACT,mu); */
481 
482   n = 0;
483   mean_x = mean_y = mean_y2 = 0.0;
484   var_x = var_y = 0.0;
485   covar_xy = covar_xy2 = 0.0;
486 
487   for (j = min_dlen; j <= max_dlen; j++)
488     if (llen_hist[j] > 1 ) {
489       n += llen_hist[j];
490       x = (double)j + 0.5;
491       mean_x += (double)llen_hist[j] * x;
492       mean_y += (double)score_sums[j];
493       var_x += (double)llen_hist[j] * x * x;
494       var_y += (double)score2_sums[j];
495       covar_xy += x * (double)score_sums[j];
496     }
497   mean_x /= n; mean_y /= n;
498   var_x = var_x / (float)n - mean_x * mean_x;
499   var_y = var_y / (float)n - mean_y * mean_y;
500 
501   covar_xy = covar_xy / (float)n - mean_x * mean_y;
502 
503 /*  rho = covar_xy / var_x;
504   mu = mean_y - rho * mean_x;
505 */
506 
507   mean_y2 = covar_xy2 = 0.0;
508   for (j = min_dlen; j <= max_dlen; j++)
509     if (llen_hist[j] > 1) {
510       x = (double)j + 0.5;
511       u = rho * x + mu;
512       y2 = score2_sums[j] - 2 * score_sums[j] * u + (float)llen_hist[j] * u * u;
513       mean_y2 += y2;
514       covar_xy2 += x * y2;
515     }
516 
517   mean_y2 /= n;
518   covar_xy2 = covar_xy2 / (float)n - mean_x * mean_y2;
519   rho2 = covar_xy2 / var_x;
520   mu2 = mean_y2 - rho2 * mean_x;
521 
522   if (rho2 < 0.0 )
523     z = (rho2 * LN_FACT*log((double)max_length) + mu2 > 0.0) ? max_length : exp((-1.0 - mu2 / rho2)/LN_FACT);
524   else z =  rho2 ? exp((1.0 - mu2 / rho2)/LN_FACT) : LENGTH_CUTOFF;
525   if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
526 
527   var_cutoff = rho2*LN_FACT*log(z) + mu2;
528 /*
529   printf("\nMinimum allowed predicted variance (%0.2f) at n = %.0f\n",
530 	 var_cutoff,z);
531 */
532   mean_u2 = 0.0;
533   n_u2 = 0;
534   for ( j = min_dlen; j < max_dlen; j++) {
535     y = j+0.5;
536     x = rho * y + mu;
537     v = rho2 * y + mu2;
538     if (v < var_cutoff) v = var_cutoff;
539     if (llen_hist[j]> 1) {
540       u2 = score_var[j] =
541 	(score2_sums[j] - 2 * x * score_sums[j] + (float)llen_hist[j] * x * x)/v;
542       mean_u2 += u2;
543       n_u2++;
544     }
545     else score_var[j] = -1.0;
546   }
547 
548   mean_u2 /= (double)n_u2;
549 /*  printf(" average variance: %.2f\n",mean_u2); */
550   if (mean_u2 < 0.0) mean_u2 = -mean_u2;
551 
552   mean_3u2 = mean_u2*3.0;
553 
554   for (j = min_dlen; j < max_dlen; j++) {
555     if (llen_hist[j] <= 1) continue;
556     if (score_var[j] > mean_3u2) {
557 /*      printf(" removing %d %d %.2f\n",
558 	     j, (int)(exp((double)j/LN_FACT)-0.5),
559 	     score_var[j]); */
560       llen_hist[j] = 0;
561     }
562   }
563   fit_llen();
564 }
565 
find_z(score,length)566 float find_z(score, length)
567      int score, length;
568 {
569   float log_len, var, z;
570 
571   log_len = LN_FACT*log((double)(length));
572   var = rho2 * log_len + mu2;
573   if (var < var_cutoff) var = var_cutoff;
574   z = ((float)score - rho * log_len - mu) / sqrt(var);
575 
576   return (50.0 + 10.0*z);
577 }
578 
find_zm(score,length)579 float find_zm(score, length)
580      int score, length;
581 {
582   float log_len, var, z;
583 
584   if ( length < LENGTH_CUTOFF) return 0;
585 
586   log_len = LN_FACT*log((double)(length));
587 /*  var = rho2 * log_len + mu2;
588   if (var < var_cutoff) var = var_cutoff;
589 */
590   var = mean_var;
591   z = ((float)score - rho * log_len - mu) / sqrt(var);
592 
593   return (50.0 + z*10.0);
594 }
595 
596 /* computes E value for a given z value, assuming extreme value distribution */
z_to_E(zs)597 float z_to_E(zs)
598      float zs;
599 {
600   float e;
601 
602   if (num_db_entries < 5) return (float)num_db_entries;
603 
604   e = exp(-1.282555 * zs - .577216);
605   return (float)num_db_entries * (e > .01 ? 1.0 - exp(-e) : e);
606 }
607 
zs_to_E(zs,n1)608 float zs_to_E(zs,n1) /* computes E-value for a given z value,
609 		    assuming extreme value distribution */
610      float zs;
611      int n1;
612 {
613   float e, z, k;
614 
615   if (num_db_entries < 5) return 0.0;
616 
617   z = (zs - 50.0)/10.0;
618 
619   e = exp(-1.28255 * z - .577216);
620 
621 
622   if (dnaseq==1) k = (float)total_db_length /(float)n1;
623   else k = (float)num_db_entries;
624 
625 
626 /*  k = (float)num_db_entries;  */
627 
628   return k * (e > .01 ? 1.0 - exp(-e) : e);
629 }
630 
zs_to_Ec(zs)631 float zs_to_Ec(zs) /* computes 1.0 - E value for a given z value,
632 		    assuming extreme value distribution */
633      float zs;
634 {
635   float e, z;
636 
637   if (num_db_entries < 5) return 0.0;
638 
639   z = (zs - 50.0)/10.0;
640 
641   e =  exp(-1.28255 * z - .577216);
642   return (float)num_db_entries * (e > .01 ?  exp(-e) : 1.0 - e);
643 }
644 
645 /* above choice of parameters for Gumbel give mean of .000168, variance of
646 1.000857, so should be adequate approximation to z-score distribution when
647 behavior is "local" */
648 
vsort(v,s,n)649 vsort(v,s,n)
650 	float *v; int *s, n;
651 {
652   int gap, i, j;
653   float tmp;
654   int itmp;
655 
656   for (gap=n/2; gap>0; gap/=2)
657     for (i=gap; i<n; i++)
658       for (j=i-gap; j>=0; j -= gap) {
659 	if (v[j] >= v[j+gap]) break;
660 	tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
661 	itmp = s[j]; s[j]=s[j+gap]; s[j+gap]=itmp;
662       }
663 }
664 
calc_ks()665 calc_ks()
666 {
667   int i, cum_hl;
668   float x_tmp, cur_e, f_int;
669 
670   cum_hl = ks_df = 0;
671   ks_dev = 0.0;
672   f_int = (float)(5*histint + min_hist) + (float)histint/2.0;
673   for (i=0; i<6; i++) cum_hl += hist[i];
674   for (i=6; i< (90-min_hist)/histint; i++) {
675     cum_hl += hist[i];
676     if (hist[i]>0) {
677       f_int = (float)(i*histint + min_hist) + (float)histint/2.0;
678       cur_e = zs_to_Ec(f_int);
679       x_tmp = fabs((float)cum_hl - cur_e);
680       if (x_tmp > ks_dev) ks_dev = x_tmp;
681       ks_df++;
682     }
683   }
684   ks_dev /= (float)num_db_entries;
685 }
686