1
2 /******************************************************************************
3 *
4 * This file is part of meryl-utility, a collection of miscellaneous code
5 * used by Meryl, Canu and others.
6 *
7 * This software is based on:
8 * 'Canu' v2.0 (https://github.com/marbl/canu)
9 * which is based on:
10 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
11 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
12 *
13 * Except as indicated otherwise, this is a 'United States Government Work',
14 * and is released in the public domain.
15 *
16 * File 'README.licenses' in the root directory of this distribution
17 * contains full conditions and disclaimers.
18 */
19
20 #ifndef STDDEV_H
21 #define STDDEV_H
22
23 #include "types.H"
24 #include "arrays.H"
25
26 #include <vector>
27 #include <algorithm>
28
29
30 // Online mean and std.dev calculation.
31 // B. P. Welford, Technometrics, Vol 4, No 3, Aug 1962 pp 419-420.
32 // http://www2.in.tu-clausthal.de/~zach/teaching/info_literatur/Welford.pdf
33 // Also presented in Knuth Vol 2 (3rd Ed.) pp 232.
34 //
35 template<typename TT>
36 class stdDev {
37 public:
38 stdDev(double mn=0.0, double sn=0.0, uint32 nn=0) {
39 _mn = mn;
40 _sn = sn;
41 _nn = nn;
42 };
43
~stdDev()44 ~stdDev() {
45 };
46
insert(TT val)47 void insert(TT val) {
48 double m0 = _mn;
49 double s0 = _sn;
50 uint32 n0 = _nn + 1;
51
52 if (_nn == 0x7fffffff)
53 fprintf(stderr, "ERROR: stdDev is full; can't insert() new value.\n"), exit(1);
54
55 if (_nn & 0x80000000)
56 fprintf(stderr, "ERROR: stdDev has been finalized; can't insert() new value.\n"), exit(1);
57
58 _mn = m0 + (val - m0) / n0;
59 _sn = s0 + (val - m0) * (val - _mn);
60 _nn = n0;
61 };
62
remove(TT val)63 void remove(TT val) {
64 uint32 n0 = _nn - 1;
65 double m0 = (n0 == 0) ? (0) : ((_nn * _mn - val) / n0);
66 double s0 = _sn - (val - m0) * (val - _mn);
67
68 if (n0 == 0) m0 = 0.0; // Reset mean and variance to zero when we can.
69 if (n0 <= 1) s0 = 0.0; // See tests/stddevTest.C testStability() for details.
70
71 if (s0 < 0.0) // Assume negative values are due to stability problems,
72 s0 = 0.0; // and not mismatched insert() and delete() values.
73 if (-1e-10 <= m0 && m0 <= 1e-10)
74 m0 = 0.0;
75
76 if (_nn == 0)
77 fprintf(stderr, "ERROR: stdDev has no data; can't remove() old value.\n"), exit(1);
78
79 if (_nn & 0x80000000)
80 fprintf(stderr, "ERROR: stdDev has been finalized; can't remove() old value.\n"), exit(1);
81
82 _nn = n0;
83 _mn = m0;
84 _sn = s0;
85 };
86
finalize(void)87 void finalize(void) {
88 _sn = stddev();
89 _nn |= 0x80000000;
90 };
91
size(void)92 uint32 size(void) {
93 return(_nn & 0x7fffffff);
94 };
95
mean(void)96 double mean(void) {
97 return(_mn);
98 };
99
variance(void)100 double variance(void) {
101 if (_nn & 0x80000000)
102 return(_sn * _sn);
103 else
104 return((_nn < 2) ? (0.0) : (_sn / (_nn-1)));
105 };
106
stddev(void)107 double stddev(void) {
108 if (_nn & 0x80000000)
109 return(_sn);
110 else
111 return(sqrt(variance()));
112 };
113
114 private:
115 double _mn; // mean
116 double _sn; // "sum of variances"
117 uint32 _nn; // number of items in the set
118 };
119
120
121
122
123 // Offline mean and std.dev calculation. Filters outliers.
124 // Does not work well with unsigned types. The 'smallest' compute can underflow.
125 //
126 template<typename TT>
127 void
128 computeStdDev(TT *dist, uint64 distLen, double &mean, double &stddev, bool isSorted=false) {
129 mean = 0;
130 stddev = 0;
131
132 if (distLen == 0)
133 return;
134
135 // Sort the values. Lets us approximate the stddev for filtering out outliers.
136
137 if (isSorted == false)
138 std::sort(dist, dist + distLen);
139
140 // Approximate the stddev to filter out outliers. This is done by assuming we're normally
141 // distributed, finding the values that would represent 1 standard deviation (about 68.27% of the
142 // data), and using that to find the 5 std.dev. limits.
143
144 TT median = dist[1 * distLen / 2];
145
146 TT oneThird = dist[1 * distLen / 3];
147 TT twoThird = dist[2 * distLen / 3];
148
149 TT approxStd = max(median - oneThird, twoThird - median);
150
151 TT biggest = median + approxStd * 5;
152 TT smallest = median - approxStd * 5;
153
154 fprintf(stderr, "computeStdDev %d %d %d %d %d %d\n", median, oneThird, twoThird, approxStd, biggest, smallest);
155
156 // Now, compute the number of samples within our bounds. And find the mean, too.
157
158 size_t numSamples = 0;
159
160 for (size_t x=0; x<distLen; x++)
161 if ((smallest <= dist[x]) &&
162 (dist[x] <= biggest)) {
163 numSamples += 1;
164 mean += dist[x];
165 }
166
167 if (numSamples == 0)
168 return;
169
170 mean = mean / numSamples;
171
172 // Use the standard std.dev. algorithm, tossing out the outliers.
173
174 for (uint64 x=0; x<distLen; x++)
175 if ((smallest <= dist[x]) &&
176 (dist[x] <= biggest))
177 stddev += (dist[x] - mean) * (dist[x] - mean);
178
179 if (numSamples > 1)
180 stddev = sqrt(stddev / (numSamples - 1));
181 };
182
183 template<typename TT>
184 void
185 computeStdDev(std::vector<TT> dist, double &mean, double &stddev, bool isSorted=false) {
186 computeStdDev(dist.data(), dist.size(), mean, stddev, isSorted);
187 }
188
189
190
191 // Compute the mode. Once the values are sorted, we just need to scan the list and remember the
192 // most common value.
193 //
194 template<typename TT>
195 void
196 computeMode(TT *dist, uint64 distLen, TT &mode, bool isSorted=false) {
197 mode = 0;
198
199 if (distLen == 0)
200 return;
201
202 if (isSorted == false)
203 std::sort(dist, dist + distLen);
204
205 uint32 modeCnt = 0;
206 TT modeVal = 0;
207
208 uint32 modeTmpCnt = 0;
209 TT modeTmpVal = 0;
210
211 for (uint64 x=0; x<distLen; x++) {
212 if (dist[x] != modeTmpVal) {
213 if (modeCnt < modeTmpCnt) {
214 modeCnt = modeTmpCnt;
215 modeVal = modeTmpVal;
216 }
217
218 modeTmpCnt = 1;
219 modeTmpVal = dist[x];
220 }
221
222 modeTmpCnt++;
223 }
224
225 if (modeCnt < modeTmpCnt) {
226 modeCnt = modeTmpCnt;
227 modeVal = modeTmpVal;
228 }
229
230 mode = modeVal;
231 }
232
233 template<typename TT>
234 void
235 computeMode(std::vector<TT> dist, TT &mode, bool isSorted=false) {
236 computeMode(dist.data(), dist.size(), mode, isSorted);
237 }
238
239
240
241 template<typename TT>
242 void
243 computeMedian(TT *dist, uint64 distLen, TT &median, bool isSorted=false) {
244 median = 0;
245
246 if (distLen == 0)
247 return;
248
249 if (isSorted == false)
250 std::sort(dist, dist + distLen);
251
252 if (distLen % 2 == 0)
253 median = (dist[distLen / 2 - 1] + dist[distLen / 2]) / 2;
254 else
255 median = dist[distLen / 2];
256 }
257
258 template<typename TT>
259 void
260 computeMedian(std::vector<TT> dist, TT &median, bool isSorted=false) {
261 computeMedian(dist.data(), dist.size(), median, isSorted);
262 }
263
264 // Compute the median and median absolute deviation. Sort the values to find the median, then
265 // build a new vector of |median - x| and find the median of that.
266 //
267 template<typename TT>
268 void
269 computeMedianAbsoluteDeviation(TT *dist, uint64 distLen, TT &median, TT &mad, bool isSorted=false) {
270 median = 0;
271 mad = 0;
272
273 if (distLen == 0)
274 return;
275
276 if (isSorted == false)
277 std::sort(dist, dist + distLen);
278
279 computeMedian(dist, distLen, median, true);
280
281 std::vector<TT> m;
282
283 for (uint64 ii=0; ii<distLen; ii++) {
284 if (dist[ii] < median)
285 m.push_back(median - dist[ii]);
286 else
287 m.push_back(dist[ii] - median);
288 }
289
290 std::sort(m.begin(), m.end());
291
292 mad = m[ m.size()/2 ];
293 };
294
295 template<typename TT>
296 void
297 computeMedianAbsoluteDeviation(std::vector<TT> dist, TT &median, TT &mad, bool isSorted=false) {
298 computeMedianAbsoluteDeviation(dist.data(), dist.size(), median, mad, isSorted);
299 }
300
301
302
303 template<typename TT>
304 TT
computeExponentialMovingAverage(TT alpha,TT ema,TT value)305 computeExponentialMovingAverage(TT alpha, TT ema, TT value) {
306 assert(0.0 <= alpha);
307 assert(alpha <= 1.0);
308
309 return(alpha * value + (1 - alpha) * ema);
310 };
311
312
313
314
315
316
317
318 class histogramStatistics {
319 public:
histogramStatistics()320 histogramStatistics() {
321 _histogramAlloc = 1024 * 1024;
322 _histogramMax = 0;
323 _histogram = new uint64 [_histogramAlloc];
324
325 memset(_histogram, 0, sizeof(uint64) * _histogramAlloc);
326
327 _finalized = false;
328
329 clearStatistics();
330 };
~histogramStatistics()331 ~histogramStatistics() {
332 delete [] _histogram;
333 };
334
335 void add(uint64 data, uint32 count=1) {
336 while (_histogramAlloc <= data)
337 resizeArray(_histogram, _histogramMax+1, _histogramAlloc, _histogramAlloc * 2, _raAct::copyData | _raAct::clearNew);
338
339 if (_histogramMax < data)
340 _histogramMax = data;
341
342 _histogram[data] += count;
343 _finalized = false;
344 };
345
346
numberOfObjects(void)347 uint64 numberOfObjects(void) { finalizeData(); return(_numObjs); };
348
mean(void)349 double mean(void) { finalizeData(); return(_mean); };
stddev(void)350 double stddev(void) { finalizeData(); return(_stddev); };
351
median(void)352 uint64 median(void) { finalizeData(); return(_median); };
mad(void)353 uint64 mad(void) { finalizeData(); return(_mad); };
354
clearStatistics(void)355 void clearStatistics(void) {
356 _numObjs = 0;
357
358 _mean = 0.0;
359 _stddev = 0.0;
360
361 _mode = 0;
362
363 _median = 0;
364 _mad = 0;
365 };
366
finalizeData(void)367 void finalizeData(void) {
368
369 if (_finalized == true)
370 return;
371
372 // Cheat sheet for this function.
373 // ii is the value of a sample item
374 // _histogram[ii] is how many of each item we have
375 // So:
376 // a) Something like '_histogram[ii] * f(ii)' we're just adding
377 // the contributions of each object.
378 // b) Pretend '_histogram[ii]' is 1 and the usual algorithms
379 // should appear.
380
381 clearStatistics();
382
383 // Compute number of objects
384
385 for (uint64 ii=0; ii <= _histogramMax; ii++)
386 _numObjs += _histogram[ii];
387
388 // Compute mean and stddev
389
390 for (uint64 ii=0; ii <= _histogramMax; ii++)
391 _mean += ii * _histogram[ii];
392
393 if (_numObjs > 1)
394 _mean /= _numObjs;
395
396 for (uint64 ii=0; ii <= _histogramMax; ii++)
397 _stddev += _histogram[ii] * (ii - _mean) * (ii - _mean);
398
399 if (_numObjs > 1)
400 _stddev = sqrt(_stddev / (_numObjs - 1));
401
402 // Compute mode, pick the highest to break ties
403
404 for (uint64 ii=0; ii <= _histogramMax; ii++)
405 if (_histogram[ii] > _histogram[_mode])
406 _mode = ii;
407
408 // Compute median and mad
409 //
410 // The MAD is the 'median of the absolute deviations from the set median'.
411 // We can compute this by making another histogram collection of data,
412 // this time of the absolute deviations. Then just find the median of
413 // that, as above.
414
415 for (uint64 ii=0; ii <= _histogramMax; ii++) {
416 _median += _histogram[ii]; // Sum the number of objects we've seen.
417
418 if (_median >= _numObjs / 2) { // If we've seen half of them, set the
419 _median = ii; // median to the value we're at and
420 break; // stop.
421 }
422 }
423
424 uint64 maddatamax = _histogramMax + 1; // Needs every value. Consider [0]=big, [n]=1.
425 uint64 *maddata = new uint64 [maddatamax]; // Deviation from median will be n - 0.
426
427 for (uint64 ii=0; ii < maddatamax; ii++)
428 maddata[ii] = 0;
429
430 for (uint64 ii=0; ii <= _histogramMax; ii++) {
431 if (_histogram[ii] > 0) {
432 uint64 deviation = (ii < _median) ? (_median - ii) : (ii - _median);
433
434 if (deviation >= maddatamax) {
435 fprintf(stderr, "finalizeData()-- Failed at ii=%lu for _histogramMax=%lu\n", ii, _histogramMax);
436 fprintf(stderr, "finalizeData()-- _median=%lu\n", _median);
437 fprintf(stderr, "finalizeData()-- deviation=%lu\n", deviation);
438 }
439 assert(deviation < maddatamax);
440
441 maddata[deviation] += _histogram[ii];
442 }
443 }
444
445 for (uint64 ii=0; ii <= _histogramMax; ii++) {
446 _mad += maddata[ii];
447
448 if (_mad >= _numObjs / 2) {
449 _mad = ii;
450 break;
451 }
452 }
453
454 // And, done
455
456 delete [] maddata;
457
458 _finalized = true;
459 };
460
histogram(uint64 ii)461 uint64 histogram(uint64 ii) {
462 return(_histogram[ii]);
463 };
histogramMax(void)464 uint64 histogramMax(void) {
465 return(_histogramMax);
466 };
467
writeHistogram(FILE * F,char * label)468 void writeHistogram(FILE *F, char *label) {
469 fprintf(F, "#%s\tquantity\n", label);
470
471 for (uint64 ii=0; ii <= _histogramMax; ii++)
472 fprintf(F, F_U64"\t" F_U64 "\n", ii, _histogram[ii]);
473 };
474
475
476 private:
477 bool _finalized;
478
479 uint64 _histogramAlloc; // Maximum allocated value
480 uint64 _histogramMax; // Maximum valid value
481 uint64 *_histogram;
482
483 uint64 _numObjs;
484
485 double _mean;
486 double _stddev;
487
488 uint64 _mode;
489
490 uint64 _median;
491 uint64 _mad;
492 };
493
494
495
496
497
498 #endif // STDDEV_H
499