1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "Binomial_Bound.H"
19 #include "sqStore.H"
20 
21 #undef COMPUTE_IN_LOG_SPACE
22 
23 //  Determined by  EDIT_DIST_PROB_BOUND
24 #define  NORMAL_DISTRIB_THOLD    3.62
25 
26 //  Probability limit to "band" edit-distance calculation
27 //  Determines  NORMAL_DISTRIB_THOLD
28 #define  EDIT_DIST_PROB_BOUND    1e-4
29 
30 
31 
32 //  Return the smallest  n >= Start  s.t.
33 //    prob [>= e  errors in  n  binomial trials (p = error prob)] > EDIT_DIST_PROB_BOUND
34 //
35 int
Binomial_Bound(int e,double p,int Start)36 Binomial_Bound(int e, double p, int Start) {
37   double  Normal_Z, Mu_Power, Factorial, Poisson_Coeff;
38   double  q, Sum, P_Power, Q_Power, X;
39   int  k, n, Bin_Coeff, Ct;
40 
41   q = 1.0 - p;
42   if (Start < e)
43     Start = e;
44 
45   for (n = Start;  n < AS_MAX_READLEN;  n ++) {
46     if (n <= 35) {
47       Sum = 0.0;
48       Bin_Coeff = 1;
49       Ct = 0;
50       P_Power = 1.0;
51       Q_Power = pow (q, n);
52 
53       for (k = 0;  k < e && 1.0 - Sum > EDIT_DIST_PROB_BOUND;  k ++) {
54         X = Bin_Coeff * P_Power * Q_Power;
55         Sum += X;
56         Bin_Coeff *= n - Ct;
57         Bin_Coeff /= ++ Ct;
58         P_Power *= p;
59         Q_Power /= q;
60       }
61 
62       if (1.0 - Sum > EDIT_DIST_PROB_BOUND)
63         return(n);
64 
65     } else {
66       Normal_Z = (e - 0.5 - n * p) / sqrt (n * p * q);
67       if (Normal_Z <= NORMAL_DISTRIB_THOLD)
68         return  n;
69 
70 #ifndef COMPUTE_IN_LOG_SPACE
71       Sum = 0.0;
72       Mu_Power = 1.0;
73       Factorial = 1.0;
74       Poisson_Coeff = exp (- n * p);
75       for (k = 0;  k < e;  k ++) {
76         Sum += Mu_Power * Poisson_Coeff / Factorial;
77         Mu_Power *= n * p;
78         Factorial *= k + 1;
79       }
80 #else
81       Sum = 0.0;
82       Mu_Power = 0.0;
83       Factorial = 0.0;
84       Poisson_Coeff = - n * p;
85       for (k = 0;  k < e;  k ++) {
86         Sum += exp(Mu_Power + Poisson_Coeff - Factorial);
87         Mu_Power += log(n * p);
88         Factorial = lgamma(k + 1);
89       }
90 #endif
91 
92       if (1.0 - Sum > EDIT_DIST_PROB_BOUND)
93         return(n);
94     }
95   }
96 
97   return(AS_MAX_READLEN);
98 }
99 
100 
101 
102 
103 void
Initialize_Match_Limit(int32 * ml,double maxErate,int32 maxErrors)104 Initialize_Match_Limit(int32 *ml, double maxErate, int32 maxErrors) {
105   int32 e = 0;
106   int32 s = 1;
107   int32 l = std::min(maxErrors, 2000);  //  Compute the first 2000 values; set to maxErrors to do no estimation
108 
109   //  The number of errors that are ignored in setting probability bound for terminating alignment
110   //  extensions in edit distance calculations
111   int32 ERRORS_FOR_FREE = 1;
112 
113   //  Free errors.
114 
115   while (e <= ERRORS_FOR_FREE)
116     ml[e++] = 0;
117 
118   //  Compute the actual limits.  This is _VERY_ expensive for longer reads.  BITS=17 is about all
119   //  it can support.
120 
121 #ifdef DUMP_MATCH_LIMIT
122   l = maxErrors;  //  For showing deviations
123 #endif
124 
125   while (e < l) {
126     s = Binomial_Bound(e - ERRORS_FOR_FREE, maxErate, s);
127     ml[e] = s - 1;
128 
129     assert(ml[e] >= ml[e-1]);
130 
131     //if ((e % 100) == 0)
132     //  fprintf(stderr, " %8.4f%% - %8d / %8d\r", 100.0 * e / maxErrors, e, maxErrors);
133 
134     e++;
135   }
136 
137   //  Estimate the remaining limits.  using a linear function based on a precomputed slope.
138   //  prefixEditDistance-matchLimitGenerate computes the data values for a bunch of error rates.
139   //  These are used to compute the slope of a line from the [2000] point through the [max] point.
140   //  These slopes fit, almost exactly, an a/x+b curve, and that curve is used to compute the slope
141   //  for any error rate.
142   //
143 #if AS_MAX_READLEN_BITS == 17
144   double sl = 0.962830901135531 / maxErate + 0.096810267016486;
145 #endif
146 
147 #if AS_MAX_READLEN_BITS == 18
148   double sl = 0.964368146781421 / maxErate + 0.118101522100597;
149 #endif
150 
151 #if AS_MAX_READLEN_BITS == 19
152   double sl = 0.963823337297648 / maxErate + 0.156091528250625;
153 #endif
154 
155 #if AS_MAX_READLEN_BITS == 20
156   double sl = 0.971023157863311 / maxErate + 0.154425731746994;
157 #endif
158 
159 #if AS_MAX_READLEN_BITS == 21
160   double sl = 0.982064188397525 / maxErate + 0.067835741959926;
161 #endif
162 
163 #if AS_MAX_READLEN_BITS == 22
164   double sl = 0.986446300363063 / maxErate + 0.052358358862826;
165 #endif
166 
167 #if AS_MAX_READLEN_BITS == 23
168   double sl = 0.989923769842693 / maxErate + 0.0395372203695468;
169 #endif
170 
171 #if AS_MAX_READLEN_BITS == 24
172   double sl = 0.992440299290478 / maxErate + 0.036791522317757;
173 #endif
174 
175   //  And the first value.
176   double  vl = ml[e-1] + sl;
177 
178   while (e < maxErrors) {
179     ml[e] = (int32)ceil(vl);
180     vl += sl;
181     e++;
182   }
183 
184 #ifdef DUMP_MATCH_LIMIT
185   FILE *F = fopen("values-new.dat", "w");
186   for (int32 e=0; e<maxErrors; e++)
187     fprintf(F, "%d %d\n", e, Edit_Match_Limit[e]);
188   AS_UTL_closeFile(F, "values-new.dat");
189 
190   fprintf(stderr, "values-orig.dat and values-new.dat dumped.  exiting.\n");
191   exit(1);
192 #endif
193 
194 }
195 
196