1 /*   ==========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ==========================================================================*/
24 
25 /** @file unified_pvalues.c
26  * Procedures for computing a "composition" p-value of a match, and
27  * for computing a unified p-value combining the customary alignment
28  * p-value and the new composition p-value.
29  *
30  * @author Yi-Kuo Yu, Alejandro Schaffer, Mike Gertz
31  */
32 
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <math.h>
37 
38 #include <algo/blast/composition_adjustment/composition_constants.h>
39 #include <algo/blast/composition_adjustment/unified_pvalues.h>
40 
41 /** @{ The next two constants are used as thresholds in one test of whether
42  *     to use compositional p-values */
43 #define LENGTH_MIN             40
44 #define LENGTH_MAX             4000
45 /** @} */
46 
47 #define LAMBDA_BIN_SIZE      0.001      /**< bin size when discretizing lambda
48                                              distribution function */
49 #define LAMBDA_BIN_TOTAL       565      /**< total number of bins for the
50                                              lambda distribution function */
51 /** Marks the boundary of the left tail of the lambda distribution;
52     values of lambda that lie in the left tail are treated all same */
53 #define LOW_LAMBDA_BIN_CUT     35
54 
55 
56 /**
57  * P_lambda_table is a discretized version of the empirical cumulative
58  * distribution function for the statistical parameter lambda for
59  * pairs of unrelated sequences in the Astral data set*/
60 static double
61 P_lambda_table[LAMBDA_BIN_TOTAL] =
62     { 1.247750e-07, 1.247750e-07, 1.247750e-07, 1.247750e-07, 1.247750e-07,
63       1.247750e-07, 1.247750e-07, 1.247750e-07, 1.247750e-07, 1.247750e-07,
64       1.247750e-07, 1.247750e-07, 2.495500e-07,
65       2.495500e-07, 2.495500e-07, 2.495500e-07, 2.495500e-07, 2.495500e-07,
66       2.495500e-07, 2.495500e-07, 2.495500e-07,
67       2.495500e-07, 3.743250e-07, 4.991000e-07, 4.991000e-07, 4.991000e-07,
68       4.991000e-07, 7.486490e-07, 7.486490e-07,
69       7.486490e-07, 7.486490e-07, 8.734240e-07, 9.981990e-07, 9.981990e-07,
70       1.122974e-06, 1.122974e-06, 1.372523e-06,
71       1.497298e-06, 1.622073e-06, 1.871622e-06, 1.871622e-06, 1.996397e-06,
72       2.370721e-06, 2.620270e-06, 2.620270e-06,
73       2.620270e-06, 2.745045e-06, 2.745045e-06, 2.745045e-06, 2.745045e-06,
74       2.869820e-06, 3.493695e-06, 3.493695e-06,
75       3.493695e-06, 3.868019e-06, 4.117568e-06, 4.367117e-06, 4.866216e-06,
76       4.866216e-06, 5.240540e-06, 5.240540e-06,
77       5.864415e-06, 6.363514e-06, 6.737838e-06, 7.236937e-06, 7.860812e-06,
78       8.110361e-06, 8.484685e-06, 9.233334e-06,
79       9.857209e-06, 1.035631e-05, 1.060586e-05, 1.085541e-05, 1.210316e-05,
80       1.272703e-05, 1.322613e-05, 1.409955e-05,
81       1.509775e-05, 1.647027e-05, 1.784279e-05, 1.859144e-05, 1.921532e-05,
82       2.008874e-05, 2.133649e-05, 2.245946e-05,
83       2.333288e-05, 2.445585e-05, 2.557882e-05, 2.732567e-05, 2.832387e-05,
84       3.019549e-05, 3.231666e-05, 3.468738e-05,
85       3.568558e-05, 3.718288e-05, 3.942882e-05, 4.292251e-05, 4.479413e-05,
86       4.666575e-05, 4.903647e-05, 5.140719e-05,
87       5.352836e-05, 5.539998e-05, 5.714683e-05, 5.964232e-05, 6.263691e-05,
88       6.550673e-05, 6.825177e-05, 7.124636e-05,
89       7.411618e-05, 7.611258e-05, 7.910717e-05, 8.235131e-05, 8.534590e-05,
90       8.971302e-05, 9.457923e-05, 9.907112e-05,
91       1.028144e-04, 1.068072e-04, 1.124220e-04, 1.150423e-04, 1.202828e-04,
92       1.241509e-04, 1.297657e-04, 1.352558e-04,
93       1.408707e-04, 1.474838e-04, 1.535977e-04, 1.605851e-04, 1.679469e-04,
94       1.766811e-04, 1.849162e-04, 1.943991e-04,
95       2.023847e-04, 2.111190e-04, 2.208514e-04, 2.298352e-04, 2.390685e-04,
96       2.500487e-04, 2.599059e-04, 2.705118e-04,
97       2.826149e-04, 2.953419e-04, 3.058230e-04, 3.209207e-04, 3.336477e-04,
98       3.469986e-04, 3.607238e-04, 3.758215e-04,
99       3.915431e-04, 4.100098e-04, 4.283517e-04, 4.454458e-04, 4.661584e-04,
100       4.845003e-04, 5.032165e-04, 5.194372e-04,
101       5.399003e-04, 5.642314e-04, 5.905589e-04, 6.173855e-04, 6.407184e-04,
102       6.646751e-04, 6.896300e-04, 7.199503e-04,
103       7.487733e-04, 7.780954e-04, 8.119093e-04, 8.442260e-04, 8.782895e-04,
104       9.142246e-04, 9.507836e-04, 9.890894e-04,
105       1.025898e-03, 1.060710e-03, 1.102010e-03, 1.141065e-03, 1.179870e-03,
106       1.225413e-03, 1.268086e-03, 1.313254e-03,
107       1.362665e-03, 1.413199e-03, 1.465479e-03, 1.514516e-03, 1.568668e-03,
108       1.620325e-03, 1.678595e-03, 1.738861e-03,
109       1.800375e-03, 1.863511e-03, 1.933760e-03, 1.999641e-03, 2.066271e-03,
110       2.136269e-03, 2.208639e-03, 2.282256e-03,
111       2.363235e-03, 2.451451e-03, 2.535050e-03, 2.626385e-03, 2.715973e-03,
112       2.811676e-03, 2.906879e-03, 3.008695e-03,
113       3.108390e-03, 3.220687e-03, 3.337351e-03, 3.458507e-03, 3.587275e-03,
114       3.717165e-03, 3.853419e-03, 3.998407e-03,
115       4.150882e-03, 4.307349e-03, 4.470305e-03, 4.650105e-03, 4.835520e-03,
116       5.028047e-03, 5.239291e-03, 5.459394e-03,
117       5.699835e-03, 5.944019e-03, 6.201679e-03, 6.472939e-03, 6.767532e-03,
118       7.067490e-03, 7.402760e-03, 7.779704e-03,
119       8.163261e-03, 8.581007e-03, 9.043672e-03, 9.538778e-03, 1.006046e-02,
120       1.061571e-02, 1.121351e-02, 1.187918e-02,
121       1.259427e-02, 1.336213e-02, 1.420611e-02, 1.510486e-02, 1.609220e-02,
122       1.716439e-02, 1.831930e-02, 1.961309e-02,
123       2.100770e-02, 2.254767e-02, 2.423026e-02, 2.604985e-02, 2.803414e-02,
124       3.023304e-02, 3.261836e-02, 3.527531e-02,
125       3.818206e-02, 4.134298e-02, 4.483305e-02, 4.870456e-02, 5.296474e-02,
126       5.765964e-02, 6.281384e-02, 6.850944e-02,
127       7.481781e-02, 8.173895e-02, 8.937667e-02, 9.774769e-02, 1.069522e-01,
128       1.170782e-01, 1.282548e-01, 1.405027e-01,
129       1.539673e-01, 1.686191e-01, 1.846574e-01, 2.020771e-01, 2.210459e-01,
130       2.415374e-01, 2.637095e-01, 2.872893e-01,
131       3.124426e-01, 3.391945e-01, 3.673232e-01, 3.966229e-01, 4.269345e-01,
132       4.580085e-01, 4.895437e-01, 5.211873e-01,
133       5.525534e-01, 5.832051e-01, 6.129409e-01, 6.414552e-01, 6.685366e-01,
134       6.941538e-01, 7.180993e-01, 7.403272e-01,
135       7.608830e-01, 7.798100e-01, 7.972520e-01, 8.132446e-01, 8.277589e-01,
136       8.410031e-01, 8.531792e-01, 8.642644e-01,
137       8.743145e-01, 8.835475e-01, 8.919562e-01, 8.996102e-01, 9.066574e-01,
138       9.130520e-01, 9.189736e-01, 9.244128e-01,
139       9.293928e-01, 9.339722e-01, 9.381913e-01, 9.421451e-01, 9.457657e-01,
140       9.491230e-01, 9.522394e-01, 9.551589e-01,
141       9.578458e-01, 9.603525e-01, 9.626876e-01, 9.648771e-01, 9.669361e-01,
142       9.688645e-01, 9.706878e-01, 9.723798e-01,
143       9.739466e-01, 9.754310e-01, 9.768135e-01, 9.781311e-01, 9.793539e-01,
144       9.805091e-01, 9.816204e-01, 9.826387e-01,
145       9.835868e-01, 9.844858e-01, 9.853414e-01, 9.861397e-01, 9.869005e-01,
146       9.876234e-01, 9.882862e-01, 9.889344e-01,
147       9.895357e-01, 9.901063e-01, 9.906435e-01, 9.911373e-01, 9.916119e-01,
148       9.920576e-01, 9.924838e-01, 9.928730e-01,
149       9.932470e-01, 9.935931e-01, 9.939352e-01, 9.942430e-01, 9.945419e-01,
150       9.948185e-01, 9.950818e-01, 9.953442e-01,
151       9.955881e-01, 9.958246e-01, 9.960354e-01, 9.962523e-01, 9.964441e-01,
152       9.966252e-01, 9.967933e-01, 9.969595e-01,
153       9.971159e-01, 9.972685e-01, 9.974102e-01, 9.975482e-01, 9.976761e-01,
154       9.977899e-01, 9.978979e-01, 9.979996e-01,
155       9.980933e-01, 9.981938e-01, 9.982869e-01, 9.983711e-01, 9.984526e-01,
156       9.985349e-01, 9.986089e-01, 9.986793e-01,
157       9.987437e-01, 9.988027e-01, 9.988587e-01, 9.989131e-01, 9.989656e-01,
158       9.990185e-01, 9.990667e-01, 9.991120e-01,
159       9.991549e-01, 9.991993e-01, 9.992385e-01, 9.992718e-01, 9.993088e-01,
160       9.993393e-01, 9.993723e-01, 9.993982e-01,
161       9.994278e-01, 9.994549e-01, 9.994825e-01, 9.995078e-01, 9.995336e-01,
162       9.995536e-01, 9.995732e-01, 9.995946e-01,
163       9.996155e-01, 9.996358e-01, 9.996535e-01, 9.996697e-01, 9.996842e-01,
164       9.996983e-01, 9.997115e-01, 9.997263e-01,
165       9.997404e-01, 9.997520e-01, 9.997612e-01, 9.997721e-01, 9.997834e-01,
166       9.997917e-01, 9.998010e-01, 9.998097e-01,
167       9.998197e-01, 9.998272e-01, 9.998347e-01, 9.998427e-01, 9.998504e-01,
168       9.998564e-01, 9.998629e-01, 9.998688e-01,
169       9.998745e-01, 9.998795e-01, 9.998834e-01, 9.998882e-01, 9.998937e-01,
170       9.998978e-01, 9.999021e-01, 9.999067e-01,
171       9.999094e-01, 9.999134e-01, 9.999174e-01, 9.999209e-01, 9.999240e-01,
172       9.999275e-01, 9.999312e-01, 9.999336e-01,
173       9.999374e-01, 9.999390e-01, 9.999415e-01, 9.999435e-01, 9.999452e-01,
174       9.999480e-01, 9.999506e-01, 9.999517e-01,
175       9.999541e-01, 9.999555e-01, 9.999570e-01, 9.999581e-01, 9.999595e-01,
176       9.999608e-01, 9.999623e-01, 9.999636e-01,
177       9.999655e-01, 9.999665e-01, 9.999680e-01, 9.999691e-01, 9.999702e-01,
178       9.999712e-01, 9.999718e-01, 9.999733e-01,
179       9.999739e-01, 9.999751e-01, 9.999761e-01, 9.999771e-01, 9.999782e-01,
180       9.999792e-01, 9.999801e-01, 9.999807e-01,
181       9.999813e-01, 9.999816e-01, 9.999826e-01, 9.999831e-01, 9.999834e-01,
182       9.999836e-01, 9.999838e-01, 9.999841e-01,
183       9.999846e-01, 9.999854e-01, 9.999856e-01, 9.999859e-01, 9.999870e-01,
184       9.999874e-01, 9.999882e-01, 9.999885e-01,
185       9.999890e-01, 9.999892e-01, 9.999892e-01, 9.999893e-01, 9.999895e-01,
186       9.999900e-01, 9.999904e-01, 9.999907e-01,
187       9.999909e-01, 9.999914e-01, 9.999917e-01, 9.999923e-01, 9.999923e-01,
188       9.999925e-01, 9.999928e-01, 9.999932e-01,
189       9.999934e-01, 9.999937e-01, 9.999939e-01, 9.999942e-01, 9.999948e-01,
190       9.999950e-01, 9.999950e-01, 9.999952e-01,
191       9.999954e-01, 9.999955e-01, 9.999957e-01, 9.999957e-01, 9.999959e-01,
192       9.999959e-01, 9.999960e-01, 9.999960e-01,
193       9.999962e-01, 9.999963e-01, 9.999965e-01, 9.999965e-01, 9.999967e-01,
194       9.999967e-01, 9.999968e-01, 9.999973e-01,
195       9.999973e-01, 9.999974e-01, 9.999975e-01, 9.999977e-01, 9.999977e-01,
196       9.999977e-01, 9.999977e-01, 9.999979e-01,
197       9.999979e-01, 9.999979e-01, 9.999982e-01, 9.999983e-01, 9.999984e-01,
198       9.999985e-01, 9.999985e-01, 9.999985e-01,
199       9.999985e-01, 9.999985e-01, 9.999985e-01, 9.999985e-01, 9.999985e-01,
200       9.999987e-01, 9.999987e-01, 9.999988e-01,
201       9.999988e-01, 9.999988e-01, 9.999988e-01, 9.999989e-01, 9.999989e-01,
202       9.999989e-01, 9.999989e-01, 9.999992e-01
203     };
204 
205 
206 /* Documented in unified_pvalues.h */
207 double
Blast_CompositionPvalue(double lambda)208 Blast_CompositionPvalue(double lambda)
209 {
210     /* Let initialUScore be lambda, scaled and shifted to fit the
211      * bins of P_lambda_table. */
212     double initialUScore = (lambda - COMPO_MIN_LAMBDA) / LAMBDA_BIN_SIZE;
213 
214     if (initialUScore < LOW_LAMBDA_BIN_CUT) {
215         /* Values of lambda less than
216          *     LOW_LAMBDA_BIN_CUT * LAMBDA_BIN_SIZE + COMPO_MIN_LAMBDA
217          * all receive the same p-value.
218          */
219         return P_lambda_table[LOW_LAMBDA_BIN_CUT];
220     } else if (initialUScore > LAMBDA_BIN_TOTAL - 1) {
221         /* Values of initialUScore strictly larger than the largest
222          * value in the table all receive p-value 1.0 */
223         return 1.0;
224     } else {
225         /* floor of initialUScore is the bin in the table */
226         int bin = ((int) initialUScore);
227         if (bin == LAMBDA_BIN_TOTAL - 1) {
228             /* In the unlikely event that bin is exactly the
229              * largest item in the table, just return that value. */
230             return P_lambda_table[LAMBDA_BIN_TOTAL - 1];
231         } else {
232             /* Return a value using linear interpolation; delta is the
233              * difference between the discrete table bin and the
234              * actual desired value */
235             double delta = initialUScore - bin;
236             return (1.0 - delta) * P_lambda_table[bin] +
237                 delta * P_lambda_table[bin + 1];
238         }
239     }
240 }
241 
242 
243 /* Documented in unified_pvalues.h */
244 double
Blast_Overall_P_Value(double p_comp,double p_alignment)245 Blast_Overall_P_Value(double p_comp,
246                       double p_alignment)
247 {
248     double product;               /* the product of the two input p-values */
249 
250     product = p_comp * p_alignment;
251     if (product > 0) {
252         return product * (1.0 - log(product));
253     } else {
254         /* product is either correctly 0, in which case the overall P
255          * is also corectly zero; or incorrectly nonpositive (possibly
256          * NaN), in which case we return the nonsense value so as not
257          * to hide the error */
258         return product;
259     }
260 }
261