1# 2# 3# Nim's Runtime Library 4# (c) Copyright 2015 Nim contributors 5# 6# See the file "copying.txt", included in this 7# distribution, for details about the copyright. 8# 9 10## Statistical analysis framework for performing 11## basic statistical analysis of data. 12## The data is analysed in a single pass, when it 13## is pushed to a `RunningStat` or `RunningRegress` object. 14## 15## `RunningStat` calculates for a single data set 16## - n (data count) 17## - min (smallest value) 18## - max (largest value) 19## - sum 20## - mean 21## - variance 22## - varianceS (sample variance) 23## - standardDeviation 24## - standardDeviationS (sample standard deviation) 25## - skewness (the third statistical moment) 26## - kurtosis (the fourth statistical moment) 27## 28## `RunningRegress` calculates for two sets of data 29## - n (data count) 30## - slope 31## - intercept 32## - correlation 33## 34## Procs are provided to calculate statistics on `openArray`s. 35## 36## However, if more than a single statistical calculation is required, it is more 37## efficient to push the data once to a `RunningStat` object and then 38## call the numerous statistical procs for the `RunningStat` object: 39 40runnableExamples: 41 from std/math import almostEqual 42 43 template `~=`(a, b: float): bool = almostEqual(a, b) 44 45 var statistics: RunningStat # must be var 46 statistics.push(@[1.0, 2.0, 1.0, 4.0, 1.0, 4.0, 1.0, 2.0]) 47 doAssert statistics.n == 8 48 doAssert statistics.mean() ~= 2.0 49 doAssert statistics.variance() ~= 1.5 50 doAssert statistics.varianceS() ~= 1.714285714285715 51 doAssert statistics.skewness() ~= 0.8164965809277261 52 doAssert statistics.skewnessS() ~= 1.018350154434631 53 doAssert statistics.kurtosis() ~= -1.0 54 doAssert statistics.kurtosisS() ~= -0.7000000000000008 55 56from math import FloatClass, sqrt, pow, round 57 58{.push debugger: off.} # the user does not want to trace a part 59 # of the standard library! 60{.push checks: off, line_dir: off, stack_trace: off.} 61 62type 63 RunningStat* = object ## An accumulator for statistical data. 64 n*: int ## amount of pushed data 65 min*, max*, sum*: float ## self-explaining 66 mom1, mom2, mom3, mom4: float ## statistical moments, mom1 is mean 67 68 RunningRegress* = object ## An accumulator for regression calculations. 69 n*: int ## amount of pushed data 70 x_stats*: RunningStat ## stats for the first set of data 71 y_stats*: RunningStat ## stats for the second set of data 72 s_xy: float ## accumulated data for combined xy 73 74# ----------- RunningStat -------------------------- 75 76proc clear*(s: var RunningStat) = 77 ## Resets `s`. 78 s.n = 0 79 s.min = 0.0 80 s.max = 0.0 81 s.sum = 0.0 82 s.mom1 = 0.0 83 s.mom2 = 0.0 84 s.mom3 = 0.0 85 s.mom4 = 0.0 86 87proc push*(s: var RunningStat, x: float) = 88 ## Pushes a value `x` for processing. 89 if s.n == 0: 90 s.min = x 91 s.max = x 92 else: 93 if s.min > x: s.min = x 94 if s.max < x: s.max = x 95 inc(s.n) 96 # See Knuth TAOCP vol 2, 3rd edition, page 232 97 s.sum += x 98 let n = toFloat(s.n) 99 let delta = x - s.mom1 100 let delta_n = delta / toFloat(s.n) 101 let delta_n2 = delta_n * delta_n 102 let term1 = delta * delta_n * toFloat(s.n - 1) 103 s.mom4 += term1 * delta_n2 * (n*n - 3*n + 3) + 104 6*delta_n2*s.mom2 - 4*delta_n*s.mom3 105 s.mom3 += term1 * delta_n * (n - 2) - 3*delta_n*s.mom2 106 s.mom2 += term1 107 s.mom1 += delta_n 108 109proc push*(s: var RunningStat, x: int) = 110 ## Pushes a value `x` for processing. 111 ## 112 ## `x` is simply converted to `float` 113 ## and the other push operation is called. 114 s.push(toFloat(x)) 115 116proc push*(s: var RunningStat, x: openArray[float|int]) = 117 ## Pushes all values of `x` for processing. 118 ## 119 ## Int values of `x` are simply converted to `float` and 120 ## the other push operation is called. 121 for val in x: 122 s.push(val) 123 124proc mean*(s: RunningStat): float = 125 ## Computes the current mean of `s`. 126 result = s.mom1 127 128proc variance*(s: RunningStat): float = 129 ## Computes the current population variance of `s`. 130 result = s.mom2 / toFloat(s.n) 131 132proc varianceS*(s: RunningStat): float = 133 ## Computes the current sample variance of `s`. 134 if s.n > 1: result = s.mom2 / toFloat(s.n - 1) 135 136proc standardDeviation*(s: RunningStat): float = 137 ## Computes the current population standard deviation of `s`. 138 result = sqrt(variance(s)) 139 140proc standardDeviationS*(s: RunningStat): float = 141 ## Computes the current sample standard deviation of `s`. 142 result = sqrt(varianceS(s)) 143 144proc skewness*(s: RunningStat): float = 145 ## Computes the current population skewness of `s`. 146 result = sqrt(toFloat(s.n)) * s.mom3 / pow(s.mom2, 1.5) 147 148proc skewnessS*(s: RunningStat): float = 149 ## Computes the current sample skewness of `s`. 150 let s2 = skewness(s) 151 result = sqrt(toFloat(s.n*(s.n-1)))*s2 / toFloat(s.n-2) 152 153proc kurtosis*(s: RunningStat): float = 154 ## Computes the current population kurtosis of `s`. 155 result = toFloat(s.n) * s.mom4 / (s.mom2 * s.mom2) - 3.0 156 157proc kurtosisS*(s: RunningStat): float = 158 ## Computes the current sample kurtosis of `s`. 159 result = toFloat(s.n-1) / toFloat((s.n-2)*(s.n-3)) * 160 (toFloat(s.n+1)*kurtosis(s) + 6) 161 162proc `+`*(a, b: RunningStat): RunningStat = 163 ## Combines two `RunningStat`s. 164 ## 165 ## Useful when performing parallel analysis of data series 166 ## and needing to re-combine parallel result sets. 167 result.clear() 168 result.n = a.n + b.n 169 170 let delta = b.mom1 - a.mom1 171 let delta2 = delta*delta 172 let delta3 = delta*delta2 173 let delta4 = delta2*delta2 174 let n = toFloat(result.n) 175 176 result.mom1 = (a.n.float*a.mom1 + b.n.float*b.mom1) / n 177 result.mom2 = a.mom2 + b.mom2 + delta2 * a.n.float * b.n.float / n 178 result.mom3 = a.mom3 + b.mom3 + 179 delta3 * a.n.float * b.n.float * (a.n.float - b.n.float)/(n*n); 180 result.mom3 += 3.0*delta * (a.n.float*b.mom2 - b.n.float*a.mom2) / n 181 result.mom4 = a.mom4 + b.mom4 + 182 delta4*a.n.float*b.n.float * toFloat(a.n*a.n - a.n*b.n + b.n*b.n) / 183 (n*n*n) 184 result.mom4 += 6.0*delta2 * (a.n.float*a.n.float*b.mom2 + b.n.float*b.n.float*a.mom2) / 185 (n*n) + 186 4.0*delta*(a.n.float*b.mom3 - b.n.float*a.mom3) / n 187 result.max = max(a.max, b.max) 188 result.min = min(a.min, b.min) 189 190proc `+=`*(a: var RunningStat, b: RunningStat) {.inline.} = 191 ## Adds the `RunningStat` `b` to `a`. 192 a = a + b 193 194proc `$`*(a: RunningStat): string = 195 ## Produces a string representation of the `RunningStat`. The exact 196 ## format is currently unspecified and subject to change. Currently 197 ## it contains: 198 ## 199 ## - the number of probes 200 ## - min, max values 201 ## - sum, mean and standard deviation. 202 result = "RunningStat(\n" 203 result.add " number of probes: " & $a.n & "\n" 204 result.add " max: " & $a.max & "\n" 205 result.add " min: " & $a.min & "\n" 206 result.add " sum: " & $a.sum & "\n" 207 result.add " mean: " & $a.mean & "\n" 208 result.add " std deviation: " & $a.standardDeviation & "\n" 209 result.add ")" 210 211# ---------------------- standalone array/seq stats --------------------- 212 213proc mean*[T](x: openArray[T]): float = 214 ## Computes the mean of `x`. 215 var rs: RunningStat 216 rs.push(x) 217 result = rs.mean() 218 219proc variance*[T](x: openArray[T]): float = 220 ## Computes the population variance of `x`. 221 var rs: RunningStat 222 rs.push(x) 223 result = rs.variance() 224 225proc varianceS*[T](x: openArray[T]): float = 226 ## Computes the sample variance of `x`. 227 var rs: RunningStat 228 rs.push(x) 229 result = rs.varianceS() 230 231proc standardDeviation*[T](x: openArray[T]): float = 232 ## Computes the population standard deviation of `x`. 233 var rs: RunningStat 234 rs.push(x) 235 result = rs.standardDeviation() 236 237proc standardDeviationS*[T](x: openArray[T]): float = 238 ## Computes the sample standard deviation of `x`. 239 var rs: RunningStat 240 rs.push(x) 241 result = rs.standardDeviationS() 242 243proc skewness*[T](x: openArray[T]): float = 244 ## Computes the population skewness of `x`. 245 var rs: RunningStat 246 rs.push(x) 247 result = rs.skewness() 248 249proc skewnessS*[T](x: openArray[T]): float = 250 ## Computes the sample skewness of `x`. 251 var rs: RunningStat 252 rs.push(x) 253 result = rs.skewnessS() 254 255proc kurtosis*[T](x: openArray[T]): float = 256 ## Computes the population kurtosis of `x`. 257 var rs: RunningStat 258 rs.push(x) 259 result = rs.kurtosis() 260 261proc kurtosisS*[T](x: openArray[T]): float = 262 ## Computes the sample kurtosis of `x`. 263 var rs: RunningStat 264 rs.push(x) 265 result = rs.kurtosisS() 266 267# ---------------------- Running Regression ----------------------------- 268 269proc clear*(r: var RunningRegress) = 270 ## Resets `r`. 271 r.x_stats.clear() 272 r.y_stats.clear() 273 r.s_xy = 0.0 274 r.n = 0 275 276proc push*(r: var RunningRegress, x, y: float) = 277 ## Pushes two values `x` and `y` for processing. 278 r.s_xy += (r.x_stats.mean() - x)*(r.y_stats.mean() - y) * 279 toFloat(r.n) / toFloat(r.n + 1) 280 r.x_stats.push(x) 281 r.y_stats.push(y) 282 inc(r.n) 283 284proc push*(r: var RunningRegress, x, y: int) {.inline.} = 285 ## Pushes two values `x` and `y` for processing. 286 ## 287 ## `x` and `y` are converted to `float` 288 ## and the other push operation is called. 289 r.push(toFloat(x), toFloat(y)) 290 291proc push*(r: var RunningRegress, x, y: openArray[float|int]) = 292 ## Pushes two sets of values `x` and `y` for processing. 293 assert(x.len == y.len) 294 for i in 0..<x.len: 295 r.push(x[i], y[i]) 296 297proc slope*(r: RunningRegress): float = 298 ## Computes the current slope of `r`. 299 let s_xx = r.x_stats.varianceS()*toFloat(r.n - 1) 300 result = r.s_xy / s_xx 301 302proc intercept*(r: RunningRegress): float = 303 ## Computes the current intercept of `r`. 304 result = r.y_stats.mean() - r.slope()*r.x_stats.mean() 305 306proc correlation*(r: RunningRegress): float = 307 ## Computes the current correlation of the two data 308 ## sets pushed into `r`. 309 let t = r.x_stats.standardDeviation() * r.y_stats.standardDeviation() 310 result = r.s_xy / (toFloat(r.n) * t) 311 312proc `+`*(a, b: RunningRegress): RunningRegress = 313 ## Combines two `RunningRegress` objects. 314 ## 315 ## Useful when performing parallel analysis of data series 316 ## and needing to re-combine parallel result sets 317 result.clear() 318 result.x_stats = a.x_stats + b.x_stats 319 result.y_stats = a.y_stats + b.y_stats 320 result.n = a.n + b.n 321 322 let delta_x = b.x_stats.mean() - a.x_stats.mean() 323 let delta_y = b.y_stats.mean() - a.y_stats.mean() 324 result.s_xy = a.s_xy + b.s_xy + 325 toFloat(a.n*b.n)*delta_x*delta_y/toFloat(result.n) 326 327proc `+=`*(a: var RunningRegress, b: RunningRegress) = 328 ## Adds the `RunningRegress` `b` to `a`. 329 a = a + b 330 331{.pop.} 332{.pop.} 333