1 /* scalebest.c
2
3 This routine gives us the option to rescale a set of scores and
4 leave them in an unused irelv
5
6 */
7
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <math.h>
11
12 #define MAXHIST 50
13 #ifdef BIGMEM
14 #define MAX_LLEN 200
15 #else
16 #define MAX_LLEN 100
17 #endif
18 #define MAX_SSCORE 300
19
20 extern int dnaseq;
21 extern int nlib, optall;
22 extern int histflg;
23 extern long *hist;
24 extern int histint, min_hist, max_hist, maxh;
25 float ks_dev;
26 int ks_df;
27 long num_db_entries, total_db_length, z_calls;
28
29 #define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */
30
31 #define LN_FACT 10.0
32
33 int *llen_hist=NULL;
34 float *score_sums, *score2_sums, *score_var;
35 static int *h_len;
36 float *score_var;
37 static double logK, lambda, rho, mu, rho2, mu2, mean_var, mean_length;
38 static int max_llen, min_llen, max_sscore;
39 static int max_dlen, min_dlen;
40 static int max_length, min_length, zero_scores;
41 static float var_cutoff;
42 static int fit_flag=1;
43
44 #ifdef FASTA_BEST
45 struct beststr {
46 int score; /* pam score with segment optimization*/
47 int score0;
48 int gscore;
49 int sscore;
50 float zscore;
51 float escore;
52 int n1;
53 long lseek; /* position in library file */
54 int dp; /* diagonal of match */
55 int start; /* start of match in lib seq */
56 int stop; /* end of match in lib seq */
57 int cont; /* offset into sequence */
58 int frame;
59 int lib;
60 };
61 #else
62 struct beststr {
63 int score; /* smith-waterman score */
64 int sscore; /* duplicate for compatibility with fasta */
65 float zscore;
66 float escore;
67 int n1;
68 long lseek; /* position in library file */
69 int cont; /* offset into sequence */
70 int frame;
71 int lib;
72 #ifdef RES_STATS
73 char libstr[13];
74 #endif
75 };
76 #endif
77
78 float find_zm(), find_z();
79
process_hist(n0,bptr,nbest)80 process_hist(n0,bptr, nbest)
81 int n0;
82 #ifndef FAR_PTR
83 struct beststr **bptr;
84 #else
85 struct beststr huge * huge * bptr;
86 #endif
87 int nbest;
88 {
89 int i, j, llen, hscore;
90 int n_trimmed;
91 float zs;
92
93 if (nbest <= 5) {
94 mean_var = 500.0;
95 rho = mu = rho2 = mu2 = 0.0;
96 for (i=0; i<nbest; i++) bptr[i]->zscore = 50.0;
97 return;
98 }
99
100 inithist();
101
102 for (i = 0; i<nbest; i++)
103 addhist(bptr[i]->sscore,bptr[i]->n1);
104
105 fit_llen(); /* now we have rho, mu, rho2, mu2, mean_var to set
106 the parameters for the histogram */
107
108 if (fit_flag) {
109 n_trimmed=0;
110 for (i = 0; i < nbest; i++) {
111 zs = find_zm(bptr[i]->sscore,bptr[i]->n1);
112 if (zs < 0 || zs > 100 ) {
113 prune_hist(bptr[i]->sscore,bptr[i]->n1);
114 n_trimmed++;
115 }
116 }
117 }
118 /*
119 if (n_trimmed > 0)
120 printf("Trimmed %d entries with z > 5.0\n", n_trimmed);
121 */
122
123 if (fit_flag) fit_llens();
124
125 inithistz(MAXHIST);
126 for (i = 0; i < nbest; i++) {
127 bptr[i]->zscore = find_zm(bptr[i]->sscore,bptr[i]->n1);
128 addhistz(bptr[i]->zscore,bptr[i]->n1);
129 }
130 }
131
132
alloc_hist()133 alloc_hist()
134 {
135 if (llen_hist == NULL) {
136 llen_hist = (int *)calloc((size_t)(max_llen+1),sizeof(int));
137 score_sums = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
138 score2_sums = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
139 score_var = (float *)calloc((size_t)(max_llen + 1),sizeof(float));
140 }
141 }
142
free_hist()143 free_hist()
144 {
145 if (llen_hist != NULL) {
146 free(score_var);
147 free(score2_sums);
148 free(score_sums);
149 free(llen_hist);
150 llen_hist=NULL;
151 }
152 }
153
inithist()154 inithist()
155 {
156 int i;
157
158 max_llen = MAX_LLEN;
159 max_sscore = MAX_SSCORE;
160 alloc_hist();
161
162 max_dlen = 0;
163 min_dlen = MAX_LLEN;
164
165 for (i = 0; i <= max_llen; i++) {
166 llen_hist[i] = 0;
167 score_sums[i] = score2_sums[i] = 0.0;
168 }
169 zero_scores = 0;
170 min_length = 10000;
171 max_length = 0;
172 num_db_entries = 0;
173 total_db_length = 0;
174 }
175
addhist(score,length)176 addhist(score, length)
177 int score, length;
178 {
179 int llen;
180
181 if ( score<=0 || length < LENGTH_CUTOFF) {
182 zero_scores++;
183 return;
184 }
185
186 if (length > max_length) max_length = length;
187 if (length < min_length) min_length = length;
188 if (score > max_sscore) score = max_sscore;
189
190 llen = (int)(LN_FACT*log((double)length)+0.5);
191
192 if (llen < 0 ) llen = 0;
193 if (llen > max_llen) llen = max_llen;
194 if (llen > max_dlen) max_dlen = llen;
195 if (llen < min_dlen) min_dlen = llen;
196 llen_hist[llen]++;
197 score_sums[llen] += (float)score;
198 score2_sums[llen] += ((float)score * (float)score);
199
200 num_db_entries++;
201 total_db_length += length;
202 }
203
204 /* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */
205
206
inithistz(mh)207 inithistz(mh)
208 int mh;
209 {
210 min_hist = 20;
211 max_hist = 120;
212 z_calls = 0;
213 total_db_length = 0;
214 num_db_entries = 0;
215
216 histint = (int)((float)(max_hist - min_hist + 2)/(float)mh+0.5);
217 maxh = (int)((float)(max_hist - min_hist + 2)/(float)histint+0.5) ;
218
219 if ((hist=(long *)calloc((size_t)maxh,sizeof(long)))==NULL) {
220 fprintf(stderr," cannot allocate %d for histogram\n",maxh);
221 histflg = 0;
222 }
223 }
224
addhistz(zs,length)225 addhistz(zs,length)
226 float zs;
227 int length;
228 {
229 int ih, zi;
230
231 z_calls++;
232
233 zi = (int)(zs + 0.5);
234
235 if ((zi >= min_hist) && (zi <= max_hist)) {
236 num_db_entries++;
237 total_db_length += length;
238 }
239
240 if (zi < min_hist) zi = min_hist;
241 if (zi > max_hist) zi = max_hist;
242
243 ih = (zi - min_hist)/histint;
244
245 hist[ih]++;
246 }
247
prune_hist(score,length)248 prune_hist(score, length)
249 int score, length;
250 {
251 int llen;
252
253 if (score <= 0 || length < LENGTH_CUTOFF) return;
254
255 if (score > max_sscore) score = max_sscore;
256
257 llen = (int)(LN_FACT*log((double)length)+0.5);
258
259 if (llen < 0 ) llen = 0;
260 if (llen > max_llen) llen = max_llen;
261 llen_hist[llen]--;
262 score_sums[llen] -= (float)score;
263 score2_sums[llen] -= ((float)score * (float)score);
264
265 num_db_entries--;
266 total_db_length -= length;
267 }
268
269
fit_llen()270 fit_llen()
271 {
272 int j;
273 long n, cell_size;
274 int ej, last_ej;
275 int n_size;
276 double mean, s, s2, x, y, y2, t, t2, u, u2, v, p, z;
277 double mean_x, mean_y, var_x, var_y, covar_xy;
278 double mean_y2, covar_xy2;
279 double ex, exl;
280
281 double sum_x, sum_y, sum_x2, sum_xy, det, n_w;
282
283 /* now fit scores to best linear function of log(n), using
284 simple linear regression */
285
286 /*
287 for (min_llen=0; min_llen < max_llen; min_llen++)
288 if (llen_hist[min_llen]) break;
289 */
290 min_dlen--;
291
292 for (n_size=0,j = min_dlen; j < max_dlen; j++) {
293 if (llen_hist[j] > 1) {
294 score_var[j] = score2_sums[j]/(float)llen_hist[j]
295 - (score_sums[j]/(float)llen_hist[j])
296 * (score_sums[j]/(float)llen_hist[j]);
297 score_var[j] /= (float)(llen_hist[j]-1.0);
298 if (score_var[j] <= 1.0 ) score_var[j] = 1.0;
299 n_size++;
300 }
301 }
302
303 n_w = 0.0;
304 sum_x = sum_y = sum_x2 = sum_xy = 0;
305 for (j = min_dlen; j < max_dlen; j++)
306 if (llen_hist[j] > 1) {
307 x = j + 0.5;
308 n_w += (float)llen_hist[j]/score_var[j];
309 sum_x += (float)llen_hist[j] * x / score_var[j] ;
310 sum_y += score_sums[j] / score_var[j];
311 sum_x2 += (float)llen_hist[j] * x * x /score_var[j];
312 sum_xy += x * score_sums[j]/score_var[j];
313 }
314
315 if (n_size < 5 ) {
316 fit_flag=0;
317 rho = 0;
318 mu = sum_y/n_w;
319 }
320 else {
321 det = n_w * sum_x2 - sum_x * sum_x;
322 if (det > 0.001) {
323 rho = (n_w * sum_xy - sum_x * sum_y)/det;
324 mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
325 }
326 else {
327 fit_flag = 0;
328 rho = 0;
329 mu = sum_y/n_w;
330 }
331 }
332
333 /* printf(" rho1/mu1: %.2f/%.2f\n",rho*LN_FACT,mu); */
334
335 n = 0;
336 mean_x = mean_y = 0.0;
337 var_x = var_y = 0.0;
338 covar_xy = 0.0;
339
340 for (j = min_dlen; j <= max_dlen; j++)
341 if (llen_hist[j] > 1 ) {
342 n += llen_hist[j];
343 x = (double)j + 0.5;
344 mean_x += (double)llen_hist[j] * x;
345 mean_y += (double)score_sums[j];
346 var_x += (double)llen_hist[j] * x * x;
347 var_y += (double)score2_sums[j];
348 covar_xy += x * (double)score_sums[j];
349 }
350 mean_x /= n; mean_y /= n;
351 var_x = var_x / (float)n - mean_x * mean_x;
352 var_y = var_y / (float)n - mean_y * mean_y;
353
354 covar_xy = covar_xy / (float)n - mean_x * mean_y;
355
356 /*
357 rho = covar_xy / var_x;
358 mu = mean_y - rho * mean_x;
359 */
360 mean_y2 = covar_xy2 = 0.0;
361 for (j = min_dlen; j <= max_dlen; j++)
362 if (llen_hist[j] > 1) {
363 x = (double)j + 0.5;
364 u = rho * x + mu;
365 y2 = score2_sums[j] - 2 * score_sums[j] * u + (float)llen_hist[j] * u * u;
366 mean_y2 += y2;
367 covar_xy2 += x * y2;
368 }
369
370 mean_var = mean_y2 /= (float)n;
371 covar_xy2 = covar_xy2 / (float)n - mean_x * mean_y2;
372 if (fit_flag) {
373 rho2 = covar_xy2 / var_x;
374 mu2 = mean_y2 - rho2 * mean_x;
375 }
376 else {
377 rho2 = 0;
378 mu2 = mean_y2;
379 }
380
381 if (rho2 < 0.0 )
382 z = (rho2 * LN_FACT*log((double)max_length) + mu2 > 0.0) ? max_length : exp((-1.0 - mu2 / rho2)/LN_FACT);
383 else z = rho2 ? exp((1.0 - mu2 / rho2)/LN_FACT) : LENGTH_CUTOFF;
384 if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
385
386 var_cutoff = rho2 * LN_FACT*log(z) + mu2;
387
388 /*
389 fprintf(stderr,"\nMinimum allowed predicted variance (%0.2f) at n = %.0f\n",
390 var_cutoff,z);
391
392 printf("\nVariance of residuals explained by log(n): %.2f\n",
393 covar_xy2 * covar_xy2 / var_x );
394
395 printf("\nObserved best fits to log(n):\n");
396 printf(" mean = %.2f * log(n) + %.2f, var = %.2f * log(n) + %.2f \n",
397 rho*LN_FACT, mu, rho2*LN_FACT, mu2);
398
399 cell_size = n / 20;
400
401 printf("\n\n S z");
402 printf("\n\nLengths (n) No. Mean Std Mean Std\n");
403
404 last_ej = LENGTH_CUTOFF;
405 for (j = min_llen; j <= max_llen; ) {
406 for (s = s2 = t = t2 = u = u2 = n = 0;
407 n < cell_size && j <= max_llen; j++)
408 if (llen_hist[j] > 1) {
409 n += llen_hist[j];
410 s += score_sums[j];
411 s2 += score2_sums[j];
412
413 y = j+0.5;
414 x = rho * y + mu;
415 v = rho2 * y + mu2;
416 if (v < var_cutoff) v = var_cutoff;
417 u += (score_sums[j] - (float)llen_hist[j] * x) / sqrt(v);
418 u2 += (score2_sums[j] - 2 * x * score_sums[j] + (float)llen_hist[j] * x * x) /
419 v;
420 }
421 if (n > 0) {
422 mean = s / (float)n;
423 ej = (int)(exp((double)j/LN_FACT)+0.5);
424 printf("\n%3d - %4d: %4d %6.1f %6.2f", last_ej, ej, n, mean,
425 sqrt(s2 / (float)n - mean * mean));
426 mean = u / (float)n;
427 printf(" %6.1f %6.2f", mean, sqrt(u2 / n - mean * mean));
428 last_ej = ej;
429 }
430 }
431 */
432 }
433
fit_llens()434 fit_llens()
435 {
436 int j;
437 long n, cell_size;
438 int ej, last_ej, n_u2;
439 double mean, s, s2, x, y, y2, t, t2, u, u2, v, p, z;
440 double mean_x, mean_y, var_x, var_y, covar_xy;
441 double mean_y2, covar_xy2;
442 double mean_u2, mean_3u2, mean_i3u2;
443 double sum_x, sum_y, sum_x2, sum_xy, det, n_w;
444
445 /* now fit scores to best linear function of log(n), using
446 simple linear regression */
447
448 /*
449 for (min_llen=0; min_llen < max_llen; min_llen++)
450 if (llen_hist[min_llen]) break;
451 min_llen--;
452 */
453
454 for (j = min_dlen; j < max_dlen; j++) {
455 if (llen_hist[j] > 1) {
456 score_var[j] = score2_sums[j]/(float)llen_hist[j]
457 - (score_sums[j]/(float)llen_hist[j])
458 * (score_sums[j]/(float)llen_hist[j]);
459 score_var[j] /= (float)(llen_hist[j]-1.0);
460 if (score_var[j] <= 1.0 ) score_var[j] = 1.0;
461 }
462 }
463
464 n_w = 0.0;
465 sum_x = sum_y = sum_x2 = sum_xy = 0;
466 for (j = min_dlen; j < max_dlen; j++)
467 if (llen_hist[j] > 1) {
468 x = j + 0.5;
469 n_w += (float)llen_hist[j]/score_var[j];
470 sum_x += (float)llen_hist[j] * x / score_var[j] ;
471 sum_y += score_sums[j] / score_var[j];
472 sum_x2 += (float)llen_hist[j] * x * x /score_var[j];
473 sum_xy += x * score_sums[j]/score_var[j];
474 }
475
476 det = n_w * sum_x2 - sum_x * sum_x;
477 rho = (n_w * sum_xy - sum_x * sum_y)/det;
478 mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
479
480 /* printf(" rho1/mu1: %.2f/%.2f\n",rho*LN_FACT,mu); */
481
482 n = 0;
483 mean_x = mean_y = mean_y2 = 0.0;
484 var_x = var_y = 0.0;
485 covar_xy = covar_xy2 = 0.0;
486
487 for (j = min_dlen; j <= max_dlen; j++)
488 if (llen_hist[j] > 1 ) {
489 n += llen_hist[j];
490 x = (double)j + 0.5;
491 mean_x += (double)llen_hist[j] * x;
492 mean_y += (double)score_sums[j];
493 var_x += (double)llen_hist[j] * x * x;
494 var_y += (double)score2_sums[j];
495 covar_xy += x * (double)score_sums[j];
496 }
497 mean_x /= n; mean_y /= n;
498 var_x = var_x / (float)n - mean_x * mean_x;
499 var_y = var_y / (float)n - mean_y * mean_y;
500
501 covar_xy = covar_xy / (float)n - mean_x * mean_y;
502
503 /* rho = covar_xy / var_x;
504 mu = mean_y - rho * mean_x;
505 */
506
507 mean_y2 = covar_xy2 = 0.0;
508 for (j = min_dlen; j <= max_dlen; j++)
509 if (llen_hist[j] > 1) {
510 x = (double)j + 0.5;
511 u = rho * x + mu;
512 y2 = score2_sums[j] - 2 * score_sums[j] * u + (float)llen_hist[j] * u * u;
513 mean_y2 += y2;
514 covar_xy2 += x * y2;
515 }
516
517 mean_y2 /= n;
518 covar_xy2 = covar_xy2 / (float)n - mean_x * mean_y2;
519 rho2 = covar_xy2 / var_x;
520 mu2 = mean_y2 - rho2 * mean_x;
521
522 if (rho2 < 0.0 )
523 z = (rho2 * LN_FACT*log((double)max_length) + mu2 > 0.0) ? max_length : exp((-1.0 - mu2 / rho2)/LN_FACT);
524 else z = rho2 ? exp((1.0 - mu2 / rho2)/LN_FACT) : LENGTH_CUTOFF;
525 if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
526
527 var_cutoff = rho2*LN_FACT*log(z) + mu2;
528 /*
529 printf("\nMinimum allowed predicted variance (%0.2f) at n = %.0f\n",
530 var_cutoff,z);
531 */
532 mean_u2 = 0.0;
533 n_u2 = 0;
534 for ( j = min_dlen; j < max_dlen; j++) {
535 y = j+0.5;
536 x = rho * y + mu;
537 v = rho2 * y + mu2;
538 if (v < var_cutoff) v = var_cutoff;
539 if (llen_hist[j]> 1) {
540 u2 = score_var[j] =
541 (score2_sums[j] - 2 * x * score_sums[j] + (float)llen_hist[j] * x * x)/v;
542 mean_u2 += u2;
543 n_u2++;
544 }
545 else score_var[j] = -1.0;
546 }
547
548 mean_u2 /= (double)n_u2;
549 /* printf(" average variance: %.2f\n",mean_u2); */
550 if (mean_u2 < 0.0) mean_u2 = -mean_u2;
551
552 mean_3u2 = mean_u2*3.0;
553
554 for (j = min_dlen; j < max_dlen; j++) {
555 if (llen_hist[j] <= 1) continue;
556 if (score_var[j] > mean_3u2) {
557 /* printf(" removing %d %d %.2f\n",
558 j, (int)(exp((double)j/LN_FACT)-0.5),
559 score_var[j]); */
560 llen_hist[j] = 0;
561 }
562 }
563 fit_llen();
564 }
565
find_z(score,length)566 float find_z(score, length)
567 int score, length;
568 {
569 float log_len, var, z;
570
571 log_len = LN_FACT*log((double)(length));
572 var = rho2 * log_len + mu2;
573 if (var < var_cutoff) var = var_cutoff;
574 z = ((float)score - rho * log_len - mu) / sqrt(var);
575
576 return (50.0 + 10.0*z);
577 }
578
find_zm(score,length)579 float find_zm(score, length)
580 int score, length;
581 {
582 float log_len, var, z;
583
584 if ( length < LENGTH_CUTOFF) return 0;
585
586 log_len = LN_FACT*log((double)(length));
587 /* var = rho2 * log_len + mu2;
588 if (var < var_cutoff) var = var_cutoff;
589 */
590 var = mean_var;
591 z = ((float)score - rho * log_len - mu) / sqrt(var);
592
593 return (50.0 + z*10.0);
594 }
595
596 /* computes E value for a given z value, assuming extreme value distribution */
z_to_E(zs)597 float z_to_E(zs)
598 float zs;
599 {
600 float e;
601
602 if (num_db_entries < 5) return (float)num_db_entries;
603
604 e = exp(-1.282555 * zs - .577216);
605 return (float)num_db_entries * (e > .01 ? 1.0 - exp(-e) : e);
606 }
607
zs_to_E(zs,n1)608 float zs_to_E(zs,n1) /* computes E-value for a given z value,
609 assuming extreme value distribution */
610 float zs;
611 int n1;
612 {
613 float e, z, k;
614
615 if (num_db_entries < 5) return 0.0;
616
617 z = (zs - 50.0)/10.0;
618
619 e = exp(-1.28255 * z - .577216);
620
621
622 if (dnaseq==1) k = (float)total_db_length /(float)n1;
623 else k = (float)num_db_entries;
624
625
626 /* k = (float)num_db_entries; */
627
628 return k * (e > .01 ? 1.0 - exp(-e) : e);
629 }
630
zs_to_Ec(zs)631 float zs_to_Ec(zs) /* computes 1.0 - E value for a given z value,
632 assuming extreme value distribution */
633 float zs;
634 {
635 float e, z;
636
637 if (num_db_entries < 5) return 0.0;
638
639 z = (zs - 50.0)/10.0;
640
641 e = exp(-1.28255 * z - .577216);
642 return (float)num_db_entries * (e > .01 ? exp(-e) : 1.0 - e);
643 }
644
645 /* above choice of parameters for Gumbel give mean of .000168, variance of
646 1.000857, so should be adequate approximation to z-score distribution when
647 behavior is "local" */
648
vsort(v,s,n)649 vsort(v,s,n)
650 float *v; int *s, n;
651 {
652 int gap, i, j;
653 float tmp;
654 int itmp;
655
656 for (gap=n/2; gap>0; gap/=2)
657 for (i=gap; i<n; i++)
658 for (j=i-gap; j>=0; j -= gap) {
659 if (v[j] >= v[j+gap]) break;
660 tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
661 itmp = s[j]; s[j]=s[j+gap]; s[j+gap]=itmp;
662 }
663 }
664
calc_ks()665 calc_ks()
666 {
667 int i, cum_hl;
668 float x_tmp, cur_e, f_int;
669
670 cum_hl = ks_df = 0;
671 ks_dev = 0.0;
672 f_int = (float)(5*histint + min_hist) + (float)histint/2.0;
673 for (i=0; i<6; i++) cum_hl += hist[i];
674 for (i=6; i< (90-min_hist)/histint; i++) {
675 cum_hl += hist[i];
676 if (hist[i]>0) {
677 f_int = (float)(i*histint + min_hist) + (float)histint/2.0;
678 cur_e = zs_to_Ec(f_int);
679 x_tmp = fabs((float)cum_hl - cur_e);
680 if (x_tmp > ks_dev) ks_dev = x_tmp;
681 ks_df++;
682 }
683 }
684 ks_dev /= (float)num_db_entries;
685 }
686