1 
2 /******************************************************************************
3  *
4  *  This file is part of meryl-utility, a collection of miscellaneous code
5  *  used by Meryl, Canu and others.
6  *
7  *  This software is based on:
8  *    'Canu' v2.0              (https://github.com/marbl/canu)
9  *  which is based on:
10  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
11  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
12  *
13  *  Except as indicated otherwise, this is a 'United States Government Work',
14  *  and is released in the public domain.
15  *
16  *  File 'README.licenses' in the root directory of this distribution
17  *  contains full conditions and disclaimers.
18  */
19 
20 #ifndef STDDEV_H
21 #define STDDEV_H
22 
23 #include "types.H"
24 #include "arrays.H"
25 
26 #include <vector>
27 #include <algorithm>
28 
29 
30 //  Online mean and std.dev calculation.
31 //  B. P. Welford, Technometrics, Vol 4, No 3, Aug 1962 pp 419-420.
32 //    http://www2.in.tu-clausthal.de/~zach/teaching/info_literatur/Welford.pdf
33 //  Also presented in Knuth Vol 2 (3rd Ed.) pp 232.
34 //
35 template<typename TT>
36 class stdDev {
37 public:
38   stdDev(double mn=0.0, double sn=0.0, uint32 nn=0) {
39     _mn = mn;
40     _sn = sn;
41     _nn = nn;
42   };
43 
~stdDev()44   ~stdDev() {
45   };
46 
insert(TT val)47   void     insert(TT val) {
48     double m0 = _mn;
49     double s0 = _sn;
50     uint32 n0 = _nn + 1;
51 
52     if (_nn == 0x7fffffff)
53       fprintf(stderr, "ERROR: stdDev is full; can't insert() new value.\n"), exit(1);
54 
55     if (_nn & 0x80000000)
56       fprintf(stderr, "ERROR: stdDev has been finalized; can't insert() new value.\n"), exit(1);
57 
58     _mn = m0 + (val - m0) / n0;
59     _sn = s0 + (val - m0) * (val - _mn);
60     _nn = n0;
61   };
62 
remove(TT val)63   void     remove(TT val) {
64     uint32 n0 = _nn - 1;
65     double m0 = (n0 == 0) ? (0) : ((_nn * _mn - val) / n0);
66     double s0 = _sn - (val - m0) * (val - _mn);
67 
68     if (n0 == 0)   m0 = 0.0;   //  Reset mean and variance to zero when we can.
69     if (n0 <= 1)   s0 = 0.0;   //  See tests/stddevTest.C testStability() for details.
70 
71     if (s0 < 0.0)              //  Assume negative values are due to stability problems,
72       s0 = 0.0;                //  and not mismatched insert() and delete() values.
73     if (-1e-10 <= m0 && m0 <= 1e-10)
74       m0 = 0.0;
75 
76     if (_nn == 0)
77       fprintf(stderr, "ERROR: stdDev has no data; can't remove() old value.\n"), exit(1);
78 
79     if (_nn & 0x80000000)
80       fprintf(stderr, "ERROR: stdDev has been finalized; can't remove() old value.\n"), exit(1);
81 
82     _nn = n0;
83     _mn = m0;
84     _sn = s0;
85   };
86 
finalize(void)87   void     finalize(void) {
88     _sn  = stddev();
89     _nn  |= 0x80000000;
90   };
91 
size(void)92   uint32   size(void) {
93     return(_nn & 0x7fffffff);
94   };
95 
mean(void)96   double   mean(void) {
97     return(_mn);
98   };
99 
variance(void)100   double   variance(void) {
101     if (_nn & 0x80000000)
102       return(_sn * _sn);
103     else
104       return((_nn < 2) ? (0.0) : (_sn / (_nn-1)));
105   };
106 
stddev(void)107   double   stddev(void) {
108     if (_nn & 0x80000000)
109       return(_sn);
110     else
111       return(sqrt(variance()));
112   };
113 
114 private:
115   double   _mn;  //  mean
116   double   _sn;  //  "sum of variances"
117   uint32   _nn;  //  number of items in the set
118 };
119 
120 
121 
122 
123 //  Offline mean and std.dev calculation.  Filters outliers.
124 //  Does not work well with unsigned types.  The 'smallest' compute can underflow.
125 //
126 template<typename TT>
127 void
128 computeStdDev(TT *dist, uint64 distLen, double &mean, double &stddev, bool isSorted=false) {
129   mean   = 0;
130   stddev = 0;
131 
132   if (distLen == 0)
133     return;
134 
135   //  Sort the values.  Lets us approximate the stddev for filtering out outliers.
136 
137   if (isSorted == false)
138     std::sort(dist, dist + distLen);
139 
140   //  Approximate the stddev to filter out outliers.  This is done by assuming we're normally
141   //  distributed, finding the values that would represent 1 standard deviation (about 68.27% of the
142   //  data), and using that to find the 5 std.dev. limits.
143 
144   TT median     = dist[1 * distLen / 2];
145 
146   TT oneThird   = dist[1 * distLen / 3];
147   TT twoThird   = dist[2 * distLen / 3];
148 
149   TT approxStd  = max(median - oneThird, twoThird - median);
150 
151   TT biggest    = median + approxStd * 5;
152   TT smallest   = median - approxStd * 5;
153 
154   fprintf(stderr, "computeStdDev  %d %d %d %d %d %d\n", median, oneThird, twoThird, approxStd, biggest, smallest);
155 
156   //  Now, compute the number of samples within our bounds.  And find the mean, too.
157 
158   size_t  numSamples = 0;
159 
160   for (size_t x=0; x<distLen; x++)
161     if ((smallest  <= dist[x]) &&
162         (dist[x]   <= biggest)) {
163       numSamples += 1;
164       mean       += dist[x];
165     }
166 
167   if (numSamples == 0)
168     return;
169 
170   mean   = mean / numSamples;
171 
172   //  Use the standard std.dev. algorithm, tossing out the outliers.
173 
174   for (uint64 x=0; x<distLen; x++)
175     if ((smallest  <= dist[x]) &&
176         (dist[x]   <= biggest))
177       stddev += (dist[x] - mean) * (dist[x] - mean);
178 
179   if (numSamples > 1)
180     stddev = sqrt(stddev / (numSamples - 1));
181 };
182 
183 template<typename TT>
184 void
185 computeStdDev(std::vector<TT> dist, double &mean, double &stddev, bool isSorted=false) {
186   computeStdDev(dist.data(), dist.size(), mean, stddev, isSorted);
187 }
188 
189 
190 
191 //  Compute the mode.  Once the values are sorted, we just need to scan the list and remember the
192 //  most common value.
193 //
194 template<typename TT>
195 void
196 computeMode(TT *dist, uint64 distLen, TT &mode, bool isSorted=false) {
197   mode = 0;
198 
199   if (distLen == 0)
200     return;
201 
202   if (isSorted == false)
203     std::sort(dist, dist + distLen);
204 
205   uint32  modeCnt = 0;
206   TT      modeVal = 0;
207 
208   uint32  modeTmpCnt = 0;
209   TT      modeTmpVal = 0;
210 
211   for (uint64 x=0; x<distLen; x++) {
212     if (dist[x] != modeTmpVal) {
213       if (modeCnt < modeTmpCnt) {
214         modeCnt = modeTmpCnt;
215         modeVal = modeTmpVal;
216       }
217 
218       modeTmpCnt = 1;
219       modeTmpVal = dist[x];
220     }
221 
222     modeTmpCnt++;
223   }
224 
225   if (modeCnt < modeTmpCnt) {
226     modeCnt = modeTmpCnt;
227     modeVal = modeTmpVal;
228   }
229 
230   mode = modeVal;
231 }
232 
233 template<typename TT>
234 void
235 computeMode(std::vector<TT> dist, TT &mode, bool isSorted=false) {
236   computeMode(dist.data(), dist.size(), mode, isSorted);
237 }
238 
239 
240 
241 template<typename TT>
242 void
243 computeMedian(TT *dist, uint64 distLen, TT &median, bool isSorted=false) {
244   median = 0;
245 
246   if (distLen == 0)
247     return;
248 
249   if (isSorted == false)
250     std::sort(dist, dist + distLen);
251 
252   if (distLen % 2 == 0)
253      median = (dist[distLen / 2 - 1] + dist[distLen / 2]) / 2;
254   else
255      median = dist[distLen / 2];
256 }
257 
258 template<typename TT>
259 void
260 computeMedian(std::vector<TT> dist, TT &median, bool isSorted=false) {
261   computeMedian(dist.data(), dist.size(), median, isSorted);
262 }
263 
264 //  Compute the median and median absolute deviation.  Sort the values to find the median, then
265 //  build a new vector of |median - x| and find the median of that.
266 //
267 template<typename TT>
268 void
269 computeMedianAbsoluteDeviation(TT *dist, uint64 distLen, TT &median, TT &mad, bool isSorted=false) {
270   median = 0;
271   mad    = 0;
272 
273   if (distLen == 0)
274     return;
275 
276   if (isSorted == false)
277     std::sort(dist, dist + distLen);
278 
279   computeMedian(dist, distLen, median, true);
280 
281   std::vector<TT>  m;
282 
283   for (uint64 ii=0; ii<distLen; ii++) {
284     if (dist[ii] < median)
285       m.push_back(median - dist[ii]);
286     else
287       m.push_back(dist[ii] - median);
288   }
289 
290   std::sort(m.begin(), m.end());
291 
292   mad = m[ m.size()/2 ];
293 };
294 
295 template<typename TT>
296 void
297 computeMedianAbsoluteDeviation(std::vector<TT> dist, TT &median, TT &mad, bool isSorted=false) {
298   computeMedianAbsoluteDeviation(dist.data(), dist.size(), median, mad, isSorted);
299 }
300 
301 
302 
303 template<typename TT>
304 TT
computeExponentialMovingAverage(TT alpha,TT ema,TT value)305 computeExponentialMovingAverage(TT alpha, TT ema, TT value) {
306   assert(0.0   <= alpha);
307   assert(alpha <= 1.0);
308 
309   return(alpha * value + (1 - alpha) * ema);
310 };
311 
312 
313 
314 
315 
316 
317 
318 class histogramStatistics {
319 public:
histogramStatistics()320   histogramStatistics() {
321     _histogramAlloc = 1024 * 1024;
322     _histogramMax = 0;
323     _histogram    = new uint64 [_histogramAlloc];
324 
325     memset(_histogram, 0, sizeof(uint64) * _histogramAlloc);
326 
327     _finalized = false;
328 
329     clearStatistics();
330   };
~histogramStatistics()331   ~histogramStatistics() {
332     delete [] _histogram;
333   };
334 
335   void               add(uint64 data, uint32 count=1) {
336     while (_histogramAlloc <= data)
337       resizeArray(_histogram, _histogramMax+1, _histogramAlloc, _histogramAlloc * 2, _raAct::copyData | _raAct::clearNew);
338 
339     if (_histogramMax < data)
340       _histogramMax = data;
341 
342     _histogram[data] += count;
343     _finalized = false;
344   };
345 
346 
numberOfObjects(void)347   uint64             numberOfObjects(void)  { finalizeData(); return(_numObjs);  };
348 
mean(void)349   double             mean(void)             { finalizeData(); return(_mean);     };
stddev(void)350   double             stddev(void)           { finalizeData(); return(_stddev);   };
351 
median(void)352   uint64             median(void)           { finalizeData(); return(_median);   };
mad(void)353   uint64             mad(void)              { finalizeData(); return(_mad);      };
354 
clearStatistics(void)355   void               clearStatistics(void) {
356     _numObjs  = 0;
357 
358     _mean     = 0.0;
359     _stddev   = 0.0;
360 
361     _mode     = 0;
362 
363     _median   = 0;
364     _mad      = 0;
365   };
366 
finalizeData(void)367   void               finalizeData(void) {
368 
369     if (_finalized == true)
370       return;
371 
372     //  Cheat sheet for this function.
373     //    ii is the value of a sample item
374     //    _histogram[ii] is how many of each item we have
375     //  So:
376     //  a)  Something like '_histogram[ii] * f(ii)' we're just adding
377     //      the contributions of each object.
378     //  b)  Pretend '_histogram[ii]' is 1 and the usual algorithms
379     //      should appear.
380 
381     clearStatistics();
382 
383     //  Compute number of objects
384 
385     for (uint64 ii=0; ii <= _histogramMax; ii++)
386       _numObjs += _histogram[ii];
387 
388     //  Compute mean and stddev
389 
390     for (uint64 ii=0; ii <= _histogramMax; ii++)
391       _mean += ii * _histogram[ii];
392 
393     if (_numObjs > 1)
394       _mean /= _numObjs;
395 
396     for (uint64 ii=0; ii <= _histogramMax; ii++)
397       _stddev += _histogram[ii] * (ii - _mean) * (ii - _mean);
398 
399     if (_numObjs > 1)
400       _stddev = sqrt(_stddev / (_numObjs - 1));
401 
402     //  Compute mode, pick the highest to break ties
403 
404     for (uint64 ii=0; ii <= _histogramMax; ii++)
405       if (_histogram[ii] > _histogram[_mode])
406         _mode = ii;
407 
408     //  Compute median and mad
409     //
410     //  The MAD is the 'median of the absolute deviations from the set median'.
411     //  We can compute this by making another histogram collection of data,
412     //  this time of the absolute deviations.  Then just find the median of
413     //  that, as above.
414 
415     for (uint64 ii=0; ii <= _histogramMax; ii++) {
416       _median += _histogram[ii];       //  Sum the number of objects we've seen.
417 
418       if (_median >= _numObjs / 2) {   //  If we've seen half of them, set the
419         _median = ii;                  //  median to the value we're at and
420         break;                         //  stop.
421       }
422     }
423 
424     uint64   maddatamax = _histogramMax + 1;        //  Needs every value.  Consider [0]=big, [n]=1.
425     uint64  *maddata    = new uint64 [maddatamax];  //  Deviation from median will be n - 0.
426 
427     for (uint64 ii=0; ii < maddatamax; ii++)
428       maddata[ii] = 0;
429 
430     for (uint64 ii=0; ii <= _histogramMax; ii++) {
431       if (_histogram[ii] > 0) {
432         uint64  deviation = (ii < _median) ? (_median - ii) : (ii - _median);
433 
434         if (deviation >= maddatamax) {
435           fprintf(stderr, "finalizeData()--  Failed at ii=%lu for _histogramMax=%lu\n", ii, _histogramMax);
436           fprintf(stderr, "finalizeData()--            _median=%lu\n", _median);
437           fprintf(stderr, "finalizeData()--            deviation=%lu\n", deviation);
438         }
439         assert(deviation < maddatamax);
440 
441         maddata[deviation] += _histogram[ii];
442       }
443     }
444 
445     for (uint64 ii=0; ii <= _histogramMax; ii++) {
446       _mad += maddata[ii];
447 
448       if (_mad >= _numObjs / 2) {
449         _mad = ii;
450         break;
451       }
452     }
453 
454     //  And, done
455 
456     delete [] maddata;
457 
458     _finalized = true;
459   };
460 
histogram(uint64 ii)461   uint64           histogram(uint64 ii) {
462     return(_histogram[ii]);
463   };
histogramMax(void)464   uint64           histogramMax(void) {
465     return(_histogramMax);
466   };
467 
writeHistogram(FILE * F,char * label)468   void             writeHistogram(FILE *F, char *label) {
469     fprintf(F, "#%s\tquantity\n", label);
470 
471     for (uint64 ii=0; ii <= _histogramMax; ii++)
472       fprintf(F, F_U64"\t" F_U64 "\n", ii, _histogram[ii]);
473   };
474 
475 
476 private:
477   bool            _finalized;
478 
479   uint64          _histogramAlloc;  //  Maximum allocated value
480   uint64          _histogramMax;    //  Maximum valid value
481   uint64         *_histogram;
482 
483   uint64          _numObjs;
484 
485   double          _mean;
486   double          _stddev;
487 
488   uint64          _mode;
489 
490   uint64          _median;
491   uint64          _mad;
492 };
493 
494 
495 
496 
497 
498 #endif  //  STDDEV_H
499