1#
2#
3#            Nim's Runtime Library
4#        (c) Copyright 2015 Nim contributors
5#
6#    See the file "copying.txt", included in this
7#    distribution, for details about the copyright.
8#
9
10## Statistical analysis framework for performing
11## basic statistical analysis of data.
12## The data is analysed in a single pass, when it
13## is pushed to a `RunningStat` or `RunningRegress` object.
14##
15## `RunningStat` calculates for a single data set
16## - n (data count)
17## - min (smallest value)
18## - max (largest value)
19## - sum
20## - mean
21## - variance
22## - varianceS (sample variance)
23## - standardDeviation
24## - standardDeviationS (sample standard deviation)
25## - skewness (the third statistical moment)
26## - kurtosis (the fourth statistical moment)
27##
28## `RunningRegress` calculates for two sets of data
29## - n (data count)
30## - slope
31## - intercept
32## - correlation
33##
34## Procs are provided to calculate statistics on `openArray`s.
35##
36## However, if more than a single statistical calculation is required, it is more
37## efficient to push the data once to a `RunningStat` object and then
38## call the numerous statistical procs for the `RunningStat` object:
39
40runnableExamples:
41  from std/math import almostEqual
42
43  template `~=`(a, b: float): bool = almostEqual(a, b)
44
45  var statistics: RunningStat  # must be var
46  statistics.push(@[1.0, 2.0, 1.0, 4.0, 1.0, 4.0, 1.0, 2.0])
47  doAssert statistics.n == 8
48  doAssert statistics.mean() ~= 2.0
49  doAssert statistics.variance() ~= 1.5
50  doAssert statistics.varianceS() ~= 1.714285714285715
51  doAssert statistics.skewness() ~= 0.8164965809277261
52  doAssert statistics.skewnessS() ~= 1.018350154434631
53  doAssert statistics.kurtosis() ~= -1.0
54  doAssert statistics.kurtosisS() ~= -0.7000000000000008
55
56from math import FloatClass, sqrt, pow, round
57
58{.push debugger: off.} # the user does not want to trace a part
59                       # of the standard library!
60{.push checks: off, line_dir: off, stack_trace: off.}
61
62type
63  RunningStat* = object           ## An accumulator for statistical data.
64    n*: int                       ## amount of pushed data
65    min*, max*, sum*: float       ## self-explaining
66    mom1, mom2, mom3, mom4: float ## statistical moments, mom1 is mean
67
68  RunningRegress* = object ## An accumulator for regression calculations.
69    n*: int                ## amount of pushed data
70    x_stats*: RunningStat  ## stats for the first set of data
71    y_stats*: RunningStat  ## stats for the second set of data
72    s_xy: float            ## accumulated data for combined xy
73
74# ----------- RunningStat --------------------------
75
76proc clear*(s: var RunningStat) =
77  ## Resets `s`.
78  s.n = 0
79  s.min = 0.0
80  s.max = 0.0
81  s.sum = 0.0
82  s.mom1 = 0.0
83  s.mom2 = 0.0
84  s.mom3 = 0.0
85  s.mom4 = 0.0
86
87proc push*(s: var RunningStat, x: float) =
88  ## Pushes a value `x` for processing.
89  if s.n == 0:
90    s.min = x
91    s.max = x
92  else:
93    if s.min > x: s.min = x
94    if s.max < x: s.max = x
95  inc(s.n)
96  # See Knuth TAOCP vol 2, 3rd edition, page 232
97  s.sum += x
98  let n = toFloat(s.n)
99  let delta = x - s.mom1
100  let delta_n = delta / toFloat(s.n)
101  let delta_n2 = delta_n * delta_n
102  let term1 = delta * delta_n * toFloat(s.n - 1)
103  s.mom4 += term1 * delta_n2 * (n*n - 3*n + 3) +
104              6*delta_n2*s.mom2 - 4*delta_n*s.mom3
105  s.mom3 += term1 * delta_n * (n - 2) - 3*delta_n*s.mom2
106  s.mom2 += term1
107  s.mom1 += delta_n
108
109proc push*(s: var RunningStat, x: int) =
110  ## Pushes a value `x` for processing.
111  ##
112  ## `x` is simply converted to `float`
113  ## and the other push operation is called.
114  s.push(toFloat(x))
115
116proc push*(s: var RunningStat, x: openArray[float|int]) =
117  ## Pushes all values of `x` for processing.
118  ##
119  ## Int values of `x` are simply converted to `float` and
120  ## the other push operation is called.
121  for val in x:
122    s.push(val)
123
124proc mean*(s: RunningStat): float =
125  ## Computes the current mean of `s`.
126  result = s.mom1
127
128proc variance*(s: RunningStat): float =
129  ## Computes the current population variance of `s`.
130  result = s.mom2 / toFloat(s.n)
131
132proc varianceS*(s: RunningStat): float =
133  ## Computes the current sample variance of `s`.
134  if s.n > 1: result = s.mom2 / toFloat(s.n - 1)
135
136proc standardDeviation*(s: RunningStat): float =
137  ## Computes the current population standard deviation of `s`.
138  result = sqrt(variance(s))
139
140proc standardDeviationS*(s: RunningStat): float =
141  ## Computes the current sample standard deviation of `s`.
142  result = sqrt(varianceS(s))
143
144proc skewness*(s: RunningStat): float =
145  ## Computes the current population skewness of `s`.
146  result = sqrt(toFloat(s.n)) * s.mom3 / pow(s.mom2, 1.5)
147
148proc skewnessS*(s: RunningStat): float =
149  ## Computes the current sample skewness of `s`.
150  let s2 = skewness(s)
151  result = sqrt(toFloat(s.n*(s.n-1)))*s2 / toFloat(s.n-2)
152
153proc kurtosis*(s: RunningStat): float =
154  ## Computes the current population kurtosis of `s`.
155  result = toFloat(s.n) * s.mom4 / (s.mom2 * s.mom2) - 3.0
156
157proc kurtosisS*(s: RunningStat): float =
158  ## Computes the current sample kurtosis of `s`.
159  result = toFloat(s.n-1) / toFloat((s.n-2)*(s.n-3)) *
160              (toFloat(s.n+1)*kurtosis(s) + 6)
161
162proc `+`*(a, b: RunningStat): RunningStat =
163  ## Combines two `RunningStat`s.
164  ##
165  ## Useful when performing parallel analysis of data series
166  ## and needing to re-combine parallel result sets.
167  result.clear()
168  result.n = a.n + b.n
169
170  let delta = b.mom1 - a.mom1
171  let delta2 = delta*delta
172  let delta3 = delta*delta2
173  let delta4 = delta2*delta2
174  let n = toFloat(result.n)
175
176  result.mom1 = (a.n.float*a.mom1 + b.n.float*b.mom1) / n
177  result.mom2 = a.mom2 + b.mom2 + delta2 * a.n.float * b.n.float / n
178  result.mom3 = a.mom3 + b.mom3 +
179                delta3 * a.n.float * b.n.float * (a.n.float - b.n.float)/(n*n);
180  result.mom3 += 3.0*delta * (a.n.float*b.mom2 - b.n.float*a.mom2) / n
181  result.mom4 = a.mom4 + b.mom4 +
182            delta4*a.n.float*b.n.float * toFloat(a.n*a.n - a.n*b.n + b.n*b.n) /
183                (n*n*n)
184  result.mom4 += 6.0*delta2 * (a.n.float*a.n.float*b.mom2 + b.n.float*b.n.float*a.mom2) /
185                (n*n) +
186                4.0*delta*(a.n.float*b.mom3 - b.n.float*a.mom3) / n
187  result.max = max(a.max, b.max)
188  result.min = min(a.min, b.min)
189
190proc `+=`*(a: var RunningStat, b: RunningStat) {.inline.} =
191  ## Adds the `RunningStat` `b` to `a`.
192  a = a + b
193
194proc `$`*(a: RunningStat): string =
195  ## Produces a string representation of the `RunningStat`. The exact
196  ## format is currently unspecified and subject to change. Currently
197  ## it contains:
198  ##
199  ## - the number of probes
200  ## - min, max values
201  ## - sum, mean and standard deviation.
202  result = "RunningStat(\n"
203  result.add "  number of probes: " & $a.n & "\n"
204  result.add "  max: " & $a.max & "\n"
205  result.add "  min: " & $a.min & "\n"
206  result.add "  sum: " & $a.sum & "\n"
207  result.add "  mean: " & $a.mean & "\n"
208  result.add "  std deviation: " & $a.standardDeviation & "\n"
209  result.add ")"
210
211# ---------------------- standalone array/seq stats ---------------------
212
213proc mean*[T](x: openArray[T]): float =
214  ## Computes the mean of `x`.
215  var rs: RunningStat
216  rs.push(x)
217  result = rs.mean()
218
219proc variance*[T](x: openArray[T]): float =
220  ## Computes the population variance of `x`.
221  var rs: RunningStat
222  rs.push(x)
223  result = rs.variance()
224
225proc varianceS*[T](x: openArray[T]): float =
226  ## Computes the sample variance of `x`.
227  var rs: RunningStat
228  rs.push(x)
229  result = rs.varianceS()
230
231proc standardDeviation*[T](x: openArray[T]): float =
232  ## Computes the population standard deviation of `x`.
233  var rs: RunningStat
234  rs.push(x)
235  result = rs.standardDeviation()
236
237proc standardDeviationS*[T](x: openArray[T]): float =
238  ## Computes the sample standard deviation of `x`.
239  var rs: RunningStat
240  rs.push(x)
241  result = rs.standardDeviationS()
242
243proc skewness*[T](x: openArray[T]): float =
244  ## Computes the population skewness of `x`.
245  var rs: RunningStat
246  rs.push(x)
247  result = rs.skewness()
248
249proc skewnessS*[T](x: openArray[T]): float =
250  ## Computes the sample skewness of `x`.
251  var rs: RunningStat
252  rs.push(x)
253  result = rs.skewnessS()
254
255proc kurtosis*[T](x: openArray[T]): float =
256  ## Computes the population kurtosis of `x`.
257  var rs: RunningStat
258  rs.push(x)
259  result = rs.kurtosis()
260
261proc kurtosisS*[T](x: openArray[T]): float =
262  ## Computes the sample kurtosis of `x`.
263  var rs: RunningStat
264  rs.push(x)
265  result = rs.kurtosisS()
266
267# ---------------------- Running Regression -----------------------------
268
269proc clear*(r: var RunningRegress) =
270  ## Resets `r`.
271  r.x_stats.clear()
272  r.y_stats.clear()
273  r.s_xy = 0.0
274  r.n = 0
275
276proc push*(r: var RunningRegress, x, y: float) =
277  ## Pushes two values `x` and `y` for processing.
278  r.s_xy += (r.x_stats.mean() - x)*(r.y_stats.mean() - y) *
279                toFloat(r.n) / toFloat(r.n + 1)
280  r.x_stats.push(x)
281  r.y_stats.push(y)
282  inc(r.n)
283
284proc push*(r: var RunningRegress, x, y: int) {.inline.} =
285  ## Pushes two values `x` and `y` for processing.
286  ##
287  ## `x` and `y` are converted to `float`
288  ## and the other push operation is called.
289  r.push(toFloat(x), toFloat(y))
290
291proc push*(r: var RunningRegress, x, y: openArray[float|int]) =
292  ## Pushes two sets of values `x` and `y` for processing.
293  assert(x.len == y.len)
294  for i in 0..<x.len:
295    r.push(x[i], y[i])
296
297proc slope*(r: RunningRegress): float =
298  ## Computes the current slope of `r`.
299  let s_xx = r.x_stats.varianceS()*toFloat(r.n - 1)
300  result = r.s_xy / s_xx
301
302proc intercept*(r: RunningRegress): float =
303  ## Computes the current intercept of `r`.
304  result = r.y_stats.mean() - r.slope()*r.x_stats.mean()
305
306proc correlation*(r: RunningRegress): float =
307  ## Computes the current correlation of the two data
308  ## sets pushed into `r`.
309  let t = r.x_stats.standardDeviation() * r.y_stats.standardDeviation()
310  result = r.s_xy / (toFloat(r.n) * t)
311
312proc `+`*(a, b: RunningRegress): RunningRegress =
313  ## Combines two `RunningRegress` objects.
314  ##
315  ## Useful when performing parallel analysis of data series
316  ## and needing to re-combine parallel result sets
317  result.clear()
318  result.x_stats = a.x_stats + b.x_stats
319  result.y_stats = a.y_stats + b.y_stats
320  result.n = a.n + b.n
321
322  let delta_x = b.x_stats.mean() - a.x_stats.mean()
323  let delta_y = b.y_stats.mean() - a.y_stats.mean()
324  result.s_xy = a.s_xy + b.s_xy +
325      toFloat(a.n*b.n)*delta_x*delta_y/toFloat(result.n)
326
327proc `+=`*(a: var RunningRegress, b: RunningRegress) =
328  ## Adds the `RunningRegress` `b` to `a`.
329  a = a + b
330
331{.pop.}
332{.pop.}
333