1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "Binomial_Bound.H"
19 #include "sqStore.H"
20
21 #undef COMPUTE_IN_LOG_SPACE
22
23 // Determined by EDIT_DIST_PROB_BOUND
24 #define NORMAL_DISTRIB_THOLD 3.62
25
26 // Probability limit to "band" edit-distance calculation
27 // Determines NORMAL_DISTRIB_THOLD
28 #define EDIT_DIST_PROB_BOUND 1e-4
29
30
31
32 // Return the smallest n >= Start s.t.
33 // prob [>= e errors in n binomial trials (p = error prob)] > EDIT_DIST_PROB_BOUND
34 //
35 int
Binomial_Bound(int e,double p,int Start)36 Binomial_Bound(int e, double p, int Start) {
37 double Normal_Z, Mu_Power, Factorial, Poisson_Coeff;
38 double q, Sum, P_Power, Q_Power, X;
39 int k, n, Bin_Coeff, Ct;
40
41 q = 1.0 - p;
42 if (Start < e)
43 Start = e;
44
45 for (n = Start; n < AS_MAX_READLEN; n ++) {
46 if (n <= 35) {
47 Sum = 0.0;
48 Bin_Coeff = 1;
49 Ct = 0;
50 P_Power = 1.0;
51 Q_Power = pow (q, n);
52
53 for (k = 0; k < e && 1.0 - Sum > EDIT_DIST_PROB_BOUND; k ++) {
54 X = Bin_Coeff * P_Power * Q_Power;
55 Sum += X;
56 Bin_Coeff *= n - Ct;
57 Bin_Coeff /= ++ Ct;
58 P_Power *= p;
59 Q_Power /= q;
60 }
61
62 if (1.0 - Sum > EDIT_DIST_PROB_BOUND)
63 return(n);
64
65 } else {
66 Normal_Z = (e - 0.5 - n * p) / sqrt (n * p * q);
67 if (Normal_Z <= NORMAL_DISTRIB_THOLD)
68 return n;
69
70 #ifndef COMPUTE_IN_LOG_SPACE
71 Sum = 0.0;
72 Mu_Power = 1.0;
73 Factorial = 1.0;
74 Poisson_Coeff = exp (- n * p);
75 for (k = 0; k < e; k ++) {
76 Sum += Mu_Power * Poisson_Coeff / Factorial;
77 Mu_Power *= n * p;
78 Factorial *= k + 1;
79 }
80 #else
81 Sum = 0.0;
82 Mu_Power = 0.0;
83 Factorial = 0.0;
84 Poisson_Coeff = - n * p;
85 for (k = 0; k < e; k ++) {
86 Sum += exp(Mu_Power + Poisson_Coeff - Factorial);
87 Mu_Power += log(n * p);
88 Factorial = lgamma(k + 1);
89 }
90 #endif
91
92 if (1.0 - Sum > EDIT_DIST_PROB_BOUND)
93 return(n);
94 }
95 }
96
97 return(AS_MAX_READLEN);
98 }
99
100
101
102
103 void
Initialize_Match_Limit(int32 * ml,double maxErate,int32 maxErrors)104 Initialize_Match_Limit(int32 *ml, double maxErate, int32 maxErrors) {
105 int32 e = 0;
106 int32 s = 1;
107 int32 l = std::min(maxErrors, 2000); // Compute the first 2000 values; set to maxErrors to do no estimation
108
109 // The number of errors that are ignored in setting probability bound for terminating alignment
110 // extensions in edit distance calculations
111 int32 ERRORS_FOR_FREE = 1;
112
113 // Free errors.
114
115 while (e <= ERRORS_FOR_FREE)
116 ml[e++] = 0;
117
118 // Compute the actual limits. This is _VERY_ expensive for longer reads. BITS=17 is about all
119 // it can support.
120
121 #ifdef DUMP_MATCH_LIMIT
122 l = maxErrors; // For showing deviations
123 #endif
124
125 while (e < l) {
126 s = Binomial_Bound(e - ERRORS_FOR_FREE, maxErate, s);
127 ml[e] = s - 1;
128
129 assert(ml[e] >= ml[e-1]);
130
131 //if ((e % 100) == 0)
132 // fprintf(stderr, " %8.4f%% - %8d / %8d\r", 100.0 * e / maxErrors, e, maxErrors);
133
134 e++;
135 }
136
137 // Estimate the remaining limits. using a linear function based on a precomputed slope.
138 // prefixEditDistance-matchLimitGenerate computes the data values for a bunch of error rates.
139 // These are used to compute the slope of a line from the [2000] point through the [max] point.
140 // These slopes fit, almost exactly, an a/x+b curve, and that curve is used to compute the slope
141 // for any error rate.
142 //
143 #if AS_MAX_READLEN_BITS == 17
144 double sl = 0.962830901135531 / maxErate + 0.096810267016486;
145 #endif
146
147 #if AS_MAX_READLEN_BITS == 18
148 double sl = 0.964368146781421 / maxErate + 0.118101522100597;
149 #endif
150
151 #if AS_MAX_READLEN_BITS == 19
152 double sl = 0.963823337297648 / maxErate + 0.156091528250625;
153 #endif
154
155 #if AS_MAX_READLEN_BITS == 20
156 double sl = 0.971023157863311 / maxErate + 0.154425731746994;
157 #endif
158
159 #if AS_MAX_READLEN_BITS == 21
160 double sl = 0.982064188397525 / maxErate + 0.067835741959926;
161 #endif
162
163 #if AS_MAX_READLEN_BITS == 22
164 double sl = 0.986446300363063 / maxErate + 0.052358358862826;
165 #endif
166
167 #if AS_MAX_READLEN_BITS == 23
168 double sl = 0.989923769842693 / maxErate + 0.0395372203695468;
169 #endif
170
171 #if AS_MAX_READLEN_BITS == 24
172 double sl = 0.992440299290478 / maxErate + 0.036791522317757;
173 #endif
174
175 // And the first value.
176 double vl = ml[e-1] + sl;
177
178 while (e < maxErrors) {
179 ml[e] = (int32)ceil(vl);
180 vl += sl;
181 e++;
182 }
183
184 #ifdef DUMP_MATCH_LIMIT
185 FILE *F = fopen("values-new.dat", "w");
186 for (int32 e=0; e<maxErrors; e++)
187 fprintf(F, "%d %d\n", e, Edit_Match_Limit[e]);
188 AS_UTL_closeFile(F, "values-new.dat");
189
190 fprintf(stderr, "values-orig.dat and values-new.dat dumped. exiting.\n");
191 exit(1);
192 #endif
193
194 }
195
196