1 /* scalebest.c
2
3 This routine gives us the option to rescale a set of scores and
4 leave them in an unused irelv
5
6 */
7
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <math.h>
11
12 #define MAXHIST 50
13 #ifdef BIGMEM
14 #define MAX_LLEN 200
15 #else
16 #define MAX_LLEN 100
17 #endif
18 #define MAX_SSCORE 300
19
20 extern int dnaseq;
21 extern int nlib, optall;
22 extern int histflg;
23 extern long *hist;
24 extern int histint, min_hist, max_hist, maxh;
25 float ks_dev;
26 int ks_df;
27 long num_db_entries, total_db_length, z_calls;
28
29 #define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */
30
31 #define LN_FACT 10.0
32
33 int *llen_hist=NULL;
34 float *score_sums, *score2_sums, *score_var, *score_mu;
35 static int *h_len;
36 static double logK, lambda, rho, mu, rho2, mu2, mean_var, mean_length;
37 static int max_llen, min_llen, max_sscore;
38 static int max_dlen, min_dlen;
39 static int max_length, min_length, zero_scores;
40 static float var_cutoff;
41 static int fit_flag=1;
42
43 #ifdef FASTA_BEST
44 struct beststr {
45 int score; /* pam score with segment optimization*/
46 int score0;
47 int gscore;
48 int sscore;
49 float zscore;
50 float escore;
51 int n1;
52 long lseek; /* position in library file */
53 int dp; /* diagonal of match */
54 int start; /* start of match in lib seq */
55 int stop; /* end of match in lib seq */
56 int cont; /* offset into sequence */
57 int frame;
58 int lib;
59 };
60 #else
61 struct beststr {
62 int score; /* smith-waterman score */
63 int sscore; /* duplicate for compatibility with fasta */
64 float zscore;
65 float escore;
66 int n1;
67 long lseek; /* position in library file */
68 int cont; /* offset into sequence */
69 int frame;
70 int lib;
71 #ifdef RES_STATS
72 char libstr[13];
73 #endif
74 };
75 #endif
76
77 float find_zm(), find_z();
78
process_hist(n0,bptr,nbest)79 process_hist(n0,bptr, nbest)
80 int n0;
81 #ifndef FAR_PTR
82 struct beststr **bptr;
83 #else
84 struct beststr huge * huge * bptr;
85 #endif
86 int nbest;
87 {
88 int i, j, llen, hscore;
89 int n_trimmed;
90 float zs;
91
92 if (nbest <= 5) {
93 mean_var = 500.0;
94 rho = mu = rho2 = mu2 = 0.0;
95 for (i=0; i<nbest; i++) bptr[i]->zscore = 50.0;
96 return;
97 }
98
99 inithist();
100
101 for (i = 0; i<nbest; i++)
102 addhist(bptr[i]->sscore,bptr[i]->n1);
103
104 fit_llen(); /* now we have rho, mu, rho2, mu2, mean_var to set
105 the parameters for the histogram */
106
107 if (fit_flag) {
108 n_trimmed=0;
109 for (i = 0; i < nbest; i++) {
110 zs = find_zm(bptr[i]->sscore,bptr[i]->n1);
111 if (zs < 0 || zs > 100 ) {
112 prune_hist(bptr[i]->sscore,bptr[i]->n1);
113 n_trimmed++;
114 }
115 }
116 }
117 /*
118 if (n_trimmed > 0)
119 printf("Trimmed %d entries with z > 5.0\n", n_trimmed);
120 */
121
122 if (fit_flag) fit_llens();
123
124 inithistz(MAXHIST);
125 for (i = 0; i < nbest; i++) {
126 bptr[i]->zscore = find_zm(bptr[i]->sscore,bptr[i]->n1);
127 addhistz(bptr[i]->zscore,bptr[i]->n1);
128 }
129 }
130
131
alloc_hist()132 alloc_hist()
133 {
134 if (llen_hist == NULL) {
135 llen_hist = (int *)calloc((size_t)(max_llen+1),sizeof(int));
136 score_sums = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
137 score2_sums = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
138 score_var = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
139 score_mu = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
140 }
141 }
142
free_hist()143 free_hist()
144 {
145 if (llen_hist != NULL) {
146 free(score_mu);
147 free(score_var);
148 free(score2_sums);
149 free(score_sums);
150 free(llen_hist);
151 llen_hist=NULL;
152 }
153 }
154
inithist()155 inithist()
156 {
157 int i;
158
159 max_llen = MAX_LLEN;
160 max_sscore = MAX_SSCORE;
161 alloc_hist();
162
163 max_dlen = 0;
164 min_dlen = MAX_LLEN;
165
166 for (i = 0; i <= max_llen; i++) {
167 llen_hist[i] = 0;
168 score_sums[i] = score2_sums[i] = 0.0;
169 }
170 zero_scores = 0;
171 min_length = 10000;
172 max_length = 0;
173 num_db_entries = 0;
174 total_db_length = 0;
175 }
176
addhist(score,length)177 addhist(score, length)
178 int score, length;
179 {
180 int llen;
181
182 if ( score<=0 || length < LENGTH_CUTOFF) {
183 zero_scores++;
184 return;
185 }
186
187 if (length > max_length) max_length = length;
188 if (length < min_length) min_length = length;
189 if (score > max_sscore) score = max_sscore;
190
191 llen = (int)(LN_FACT*log((double)length)+0.5);
192
193 if (llen < 0 ) llen = 0;
194 if (llen > max_llen) llen = max_llen;
195 if (llen > max_dlen) max_dlen = llen;
196 if (llen < min_dlen) min_dlen = llen;
197 llen_hist[llen]++;
198 score_sums[llen] += (float)score;
199 score2_sums[llen] += ((float)score * (float)score);
200
201 num_db_entries++;
202 total_db_length += length;
203 }
204
205 /* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */
206
207
inithistz(mh)208 inithistz(mh)
209 int mh;
210 {
211 min_hist = 20;
212 max_hist = 120;
213 z_calls = 0;
214 total_db_length = 0;
215 num_db_entries = 0;
216
217 histint = (int)((float)(max_hist - min_hist + 2)/(float)mh+0.5);
218 maxh = (int)((float)(max_hist - min_hist + 2)/(float)histint+0.5) ;
219
220 if ((hist=(long *)calloc((size_t)maxh,sizeof(long)))==NULL) {
221 fprintf(stderr," cannot allocate %d for histogram\n",maxh);
222 histflg = 0;
223 }
224 }
225
addhistz(zs,length)226 addhistz(zs,length)
227 float zs;
228 int length;
229 {
230 int ih, zi;
231
232 z_calls++;
233
234 zi = (int)(zs + 0.5);
235
236 if ((zi >= min_hist) && (zi <= max_hist)) {
237 num_db_entries++;
238 total_db_length += length;
239 }
240
241 if (zi < min_hist) zi = min_hist;
242 if (zi > max_hist) zi = max_hist;
243
244 ih = (zi - min_hist)/histint;
245
246 hist[ih]++;
247 }
248
prune_hist(score,length)249 prune_hist(score, length)
250 int score, length;
251 {
252 int llen;
253
254 if (score <= 0 || length < LENGTH_CUTOFF) return;
255
256 if (score > max_sscore) score = max_sscore;
257
258 llen = (int)(LN_FACT*log((double)length)+0.5);
259
260 if (llen < 0 ) llen = 0;
261 if (llen > max_llen) llen = max_llen;
262 llen_hist[llen]--;
263 score_sums[llen] -= (float)score;
264 score2_sums[llen] -= ((float)score * (float)score);
265
266 num_db_entries--;
267 total_db_length -= length;
268 }
269
270
fit_llen()271 fit_llen()
272 {
273 int j;
274 long n, cell_size;
275 int ej, last_ej;
276 int n_size;
277 double mean, s, s2, x, y, y2, t, t2, u, u2, v, p, z;
278 double mean_x, mean_y, var_x, var_y, covar_xy;
279 double mean_y2, covar_xy2;
280 double ex, exl;
281
282 double sum_x, sum_y, sum_x2, sum_xy, det, n_w;
283
284 /* now fit scores to best linear function of log(n), using
285 simple linear regression */
286
287 /*
288 for (min_llen=0; min_llen < max_llen; min_llen++)
289 if (llen_hist[min_llen]) break;
290 */
291 min_dlen--;
292
293 for (j=min_dlen; j< max_dlen; j++)
294 if (llen_hist[j]>0) score_mu[j] = score_sums[j]/(float)llen_hist[j];
295
296 for (n_size=0,j = min_dlen; j < max_dlen; j++) {
297 if (llen_hist[j] > 1) {
298 score_var[j] = score2_sums[j] -
299 (float)llen_hist[j]*score_mu[j]*score_mu[j];
300 score_var[j] /= (float)(llen_hist[j]-1.0);
301 if (score_var[j] <= 1.0 ) score_var[j] = 1.0;
302 n_size++;
303 }
304 }
305
306 n_w = 0.0;
307 sum_x = sum_y = sum_x2 = sum_xy = 0;
308 for (j = min_dlen; j < max_dlen; j++)
309 if (llen_hist[j] > 1) {
310 x = j + 0.5;
311 n_w += (float)llen_hist[j]/score_var[j];
312 sum_x += (float)llen_hist[j] * x / score_var[j] ;
313 sum_y += score_sums[j] / score_var[j];
314 sum_x2 += (float)llen_hist[j] * x * x /score_var[j];
315 sum_xy += x * score_sums[j]/score_var[j];
316 }
317
318 if (n_size < 5 ) {
319 fit_flag=0;
320 rho = 0;
321 mu = sum_y/n_w;
322 }
323 else {
324 det = n_w * sum_x2 - sum_x * sum_x;
325 if (det > 0.001) {
326 rho = (n_w * sum_xy - sum_x * sum_y)/det;
327 mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
328 }
329 else {
330 fit_flag = 0;
331 rho = 0;
332 mu = sum_y/n_w;
333 }
334 }
335
336 /* printf(" rho1/mu1: %.2f/%.2f\n",rho*LN_FACT,mu); */
337
338 n = 0;
339 mean_x = mean_y = 0.0;
340 var_x = var_y = 0.0;
341 covar_xy = 0.0;
342
343 for (j = min_dlen; j <= max_dlen; j++)
344 if (llen_hist[j] > 1 ) {
345 n += llen_hist[j];
346 x = (double)j + 0.5;
347 mean_x += (double)llen_hist[j] * x;
348 mean_y += (double)score_sums[j];
349 var_x += (double)llen_hist[j] * x * x;
350 var_y += (double)score2_sums[j];
351 covar_xy += x * (double)score_sums[j];
352 }
353 mean_x /= n; mean_y /= n;
354 var_x = var_x / (float)n - mean_x * mean_x;
355 var_y = var_y / (float)n - mean_y * mean_y;
356
357 covar_xy = covar_xy / (float)n - mean_x * mean_y;
358
359 /*
360 rho = covar_xy / var_x;
361 mu = mean_y - rho * mean_x;
362 */
363 mean_y2 = covar_xy2 = 0.0;
364 for (j = min_dlen; j <= max_dlen; j++)
365 if (llen_hist[j] > 1) {
366 x = (double)j + 0.5;
367 u = rho * x + mu;
368 y2 = score2_sums[j] - 2 * score_sums[j] * u + (float)llen_hist[j] * u * u;
369 mean_y2 += y2;
370 covar_xy2 += x * y2;
371 }
372
373 mean_var = mean_y2 /= (float)n;
374 covar_xy2 = covar_xy2 / (float)n - mean_x * mean_y2;
375 if (fit_flag) {
376 rho2 = covar_xy2 / var_x;
377 mu2 = mean_y2 - rho2 * mean_x;
378 }
379 else {
380 rho2 = 0;
381 mu2 = mean_y2;
382 }
383
384 if (rho2 < 0.0 )
385 z = (rho2 * LN_FACT*log((double)max_length) + mu2 > 0.0) ? max_length : exp((-1.0 - mu2 / rho2)/LN_FACT);
386 else z = rho2 ? exp((1.0 - mu2 / rho2)/LN_FACT) : LENGTH_CUTOFF;
387 if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
388
389 var_cutoff = rho2 * LN_FACT*log(z) + mu2;
390
391 /*
392 fprintf(stderr,"\nMinimum allowed predicted variance (%0.2f) at n = %.0f\n",
393 var_cutoff,z);
394
395 printf("\nVariance of residuals explained by log(n): %.2f\n",
396 covar_xy2 * covar_xy2 / var_x );
397
398 printf("\nObserved best fits to log(n):\n");
399 printf(" mean = %.2f * log(n) + %.2f, var = %.2f * log(n) + %.2f \n",
400 rho*LN_FACT, mu, rho2*LN_FACT, mu2);
401
402 cell_size = n / 20;
403
404 printf("\n\n S z");
405 printf("\n\nLengths (n) No. Mean Std Mean Std\n");
406
407 last_ej = LENGTH_CUTOFF;
408 for (j = min_llen; j <= max_llen; ) {
409 for (s = s2 = t = t2 = u = u2 = n = 0;
410 n < cell_size && j <= max_llen; j++)
411 if (llen_hist[j] > 1) {
412 n += llen_hist[j];
413 s += score_sums[j];
414 s2 += score2_sums[j];
415
416 y = j+0.5;
417 x = rho * y + mu;
418 v = rho2 * y + mu2;
419 if (v < var_cutoff) v = var_cutoff;
420 u += (score_sums[j] - (float)llen_hist[j] * x) / sqrt(v);
421 u2 += (score2_sums[j] - 2 * x * score_sums[j] + (float)llen_hist[j] * x * x) /
422 v;
423 }
424 if (n > 0) {
425 mean = s / (float)n;
426 ej = (int)(exp((double)j/LN_FACT)+0.5);
427 printf("\n%3d - %4d: %4d %6.1f %6.2f", last_ej, ej, n, mean,
428 sqrt(s2 / (float)n - mean * mean));
429 mean = u / (float)n;
430 printf(" %6.1f %6.2f", mean, sqrt(u2 / n - mean * mean));
431 last_ej = ej;
432 }
433 }
434 */
435 }
436
fit_llens()437 fit_llens()
438 {
439 int j;
440 long n, cell_size;
441 int ej, last_ej, n_u2;
442 double mean, s, s2, x, y, y2, t, t2, u, u2, v, p, z;
443 double mean_x, mean_y, var_x, var_y, covar_xy;
444 double mean_y2, covar_xy2;
445 double mean_u2, mean_3u2, mean_i3u2;
446 double sum_x, sum_y, sum_x2, sum_xy, det, n_w;
447
448 /* now fit scores to best linear function of log(n), using
449 simple linear regression */
450
451 /*
452 for (min_llen=0; min_llen < max_llen; min_llen++)
453 if (llen_hist[min_llen]) break;
454 min_llen--;
455 */
456
457 for (j = min_dlen; j < max_dlen; j++) {
458 if (llen_hist[j] > 1) {
459 score_var[j] = score2_sums[j]/(float)llen_hist[j]
460 - (score_sums[j]/(float)llen_hist[j])
461 * (score_sums[j]/(float)llen_hist[j]);
462 score_var[j] /= (float)(llen_hist[j]-1.0);
463 if (score_var[j] <= 1.0 ) score_var[j] = 1.0;
464 }
465 }
466
467 n_w = 0.0;
468 sum_x = sum_y = sum_x2 = sum_xy = 0;
469 for (j = min_dlen; j < max_dlen; j++)
470 if (llen_hist[j] > 1) {
471 x = j + 0.5;
472 n_w += (float)llen_hist[j]/score_var[j];
473 sum_x += (float)llen_hist[j] * x / score_var[j] ;
474 sum_y += score_sums[j] / score_var[j];
475 sum_x2 += (float)llen_hist[j] * x * x /score_var[j];
476 sum_xy += x * score_sums[j]/score_var[j];
477 }
478
479 det = n_w * sum_x2 - sum_x * sum_x;
480 rho = (n_w * sum_xy - sum_x * sum_y)/det;
481 mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
482
483 /* printf(" rho1/mu1: %.2f/%.2f\n",rho*LN_FACT,mu); */
484
485 n = 0;
486 mean_x = mean_y = mean_y2 = 0.0;
487 var_x = var_y = 0.0;
488 covar_xy = covar_xy2 = 0.0;
489
490 for (j = min_dlen; j <= max_dlen; j++)
491 if (llen_hist[j] > 1 ) {
492 n += llen_hist[j];
493 x = (double)j + 0.5;
494 mean_x += (double)llen_hist[j] * x;
495 mean_y += (double)score_sums[j];
496 var_x += (double)llen_hist[j] * x * x;
497 var_y += (double)score2_sums[j];
498 covar_xy += x * (double)score_sums[j];
499 }
500 mean_x /= n; mean_y /= n;
501 var_x = var_x / (float)n - mean_x * mean_x;
502 var_y = var_y / (float)n - mean_y * mean_y;
503
504 covar_xy = covar_xy / (float)n - mean_x * mean_y;
505
506 /* rho = covar_xy / var_x;
507 mu = mean_y - rho * mean_x;
508 */
509
510 mean_y2 = covar_xy2 = 0.0;
511 for (j = min_dlen; j <= max_dlen; j++)
512 if (llen_hist[j] > 1) {
513 x = (double)j + 0.5;
514 u = rho * x + mu;
515 y2 = score2_sums[j] - 2 * score_sums[j] * u + (float)llen_hist[j] * u * u;
516 mean_y2 += y2;
517 covar_xy2 += x * y2;
518 }
519
520 mean_y2 /= n;
521 covar_xy2 = covar_xy2 / (float)n - mean_x * mean_y2;
522 rho2 = covar_xy2 / var_x;
523 mu2 = mean_y2 - rho2 * mean_x;
524
525 if (rho2 < 0.0 )
526 z = (rho2 * LN_FACT*log((double)max_length) + mu2 > 0.0) ? max_length : exp((-1.0 - mu2 / rho2)/LN_FACT);
527 else z = rho2 ? exp((1.0 - mu2 / rho2)/LN_FACT) : LENGTH_CUTOFF;
528 if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
529
530 var_cutoff = rho2*LN_FACT*log(z) + mu2;
531 /*
532 printf("\nMinimum allowed predicted variance (%0.2f) at n = %.0f\n",
533 var_cutoff,z);
534 */
535 mean_u2 = 0.0;
536 n_u2 = 0;
537 for ( j = min_dlen; j < max_dlen; j++) {
538 y = j+0.5;
539 x = rho * y + mu;
540 v = rho2 * y + mu2;
541 if (v < var_cutoff) v = var_cutoff;
542 if (llen_hist[j]> 1) {
543 u2 = score_var[j] =
544 (score2_sums[j] - 2 * x * score_sums[j] + (float)llen_hist[j] * x * x)/v;
545 mean_u2 += u2;
546 n_u2++;
547 }
548 else score_var[j] = -1.0;
549 }
550
551 mean_u2 /= (double)n_u2;
552 /* printf(" average variance: %.2f\n",mean_u2); */
553 if (mean_u2 < 0.0) mean_u2 = -mean_u2;
554
555 mean_3u2 = mean_u2*3.0;
556
557 for (j = min_dlen; j < max_dlen; j++) {
558 if (llen_hist[j] <= 1) continue;
559 if (score_var[j] > mean_3u2) {
560 /* printf(" removing %d %d %.2f\n",
561 j, (int)(exp((double)j/LN_FACT)-0.5),
562 score_var[j]); */
563 llen_hist[j] = 0;
564 }
565 }
566 fit_llen();
567 }
568
find_z(score,length)569 float find_z(score, length)
570 int score, length;
571 {
572 float log_len, var, z;
573
574 log_len = LN_FACT*log((double)(length));
575 var = rho2 * log_len + mu2;
576 if (var < var_cutoff) var = var_cutoff;
577 z = ((float)score - rho * log_len - mu) / sqrt(var);
578
579 return (50.0 + 10.0*z);
580 }
581
find_zm(score,length)582 float find_zm(score, length)
583 int score, length;
584 {
585 float log_len, var, z;
586
587 if ( length < LENGTH_CUTOFF) return 0;
588
589 log_len = LN_FACT*log((double)(length));
590 /* var = rho2 * log_len + mu2;
591 if (var < var_cutoff) var = var_cutoff;
592 */
593 var = mean_var;
594 z = ((float)score - rho * log_len - mu) / sqrt(var);
595
596 return (50.0 + z*10.0);
597 }
598
599 /* computes E value for a given z value, assuming extreme value distribution */
z_to_E(zs)600 float z_to_E(zs)
601 float zs;
602 {
603 float e;
604
605 if (num_db_entries < 5) return (float)num_db_entries;
606
607 e = exp(-1.282555 * zs - .577216);
608 return (float)num_db_entries * (e > .01 ? 1.0 - exp(-e) : e);
609 }
610
zs_to_E(zs,n1)611 float zs_to_E(zs,n1) /* computes E-value for a given z value,
612 assuming extreme value distribution */
613 float zs;
614 int n1;
615 {
616 float e, z, k;
617
618 if (num_db_entries < 5) return 0.0;
619
620 z = (zs - 50.0)/10.0;
621
622 e = exp(-1.28255 * z - .577216);
623
624
625 if (dnaseq==1) k = (float)total_db_length /(float)n1;
626 else k = (float)num_db_entries;
627
628
629 /* k = (float)num_db_entries; */
630
631 return k * (e > .01 ? 1.0 - exp(-e) : e);
632 }
633
zs_to_Ec(zs)634 float zs_to_Ec(zs) /* computes 1.0 - E value for a given z value,
635 assuming extreme value distribution */
636 float zs;
637 {
638 float e, z;
639
640 if (num_db_entries < 5) return 0.0;
641
642 z = (zs - 50.0)/10.0;
643
644 e = exp(-1.28255 * z - .577216);
645 return (float)num_db_entries * (e > .01 ? exp(-e) : 1.0 - e);
646 }
647
648 /* above choice of parameters for Gumbel give mean of .000168, variance of
649 1.000857, so should be adequate approximation to z-score distribution when
650 behavior is "local" */
651
vsort(v,s,n)652 vsort(v,s,n)
653 float *v; int *s, n;
654 {
655 int gap, i, j;
656 float tmp;
657 int itmp;
658
659 for (gap=n/2; gap>0; gap/=2)
660 for (i=gap; i<n; i++)
661 for (j=i-gap; j>=0; j -= gap) {
662 if (v[j] >= v[j+gap]) break;
663 tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
664 itmp = s[j]; s[j]=s[j+gap]; s[j+gap]=itmp;
665 }
666 }
667
calc_ks()668 calc_ks()
669 {
670 int i, cum_hl;
671 float x_tmp, cur_e, f_int;
672
673 cum_hl = ks_df = 0;
674 ks_dev = 0.0;
675 f_int = (float)(5*histint + min_hist) + (float)histint/2.0;
676 for (i=0; i<6; i++) cum_hl += hist[i];
677 for (i=6; i< (90-min_hist)/histint; i++) {
678 cum_hl += hist[i];
679 if (hist[i]>0) {
680 f_int = (float)(i*histint + min_hist) + (float)histint/2.0;
681 cur_e = zs_to_Ec(f_int);
682 x_tmp = fabs((float)cum_hl - cur_e);
683 if (x_tmp > ks_dev) ks_dev = x_tmp;
684 ks_df++;
685 }
686 }
687 ks_dev /= (float)num_db_entries;
688 }
689