"""
Kolmogorov-Smirnov : get the statistics distribution
====================================================
"""
# %%

# %%
# In this example, we draw the Kolmogorov-Smirnov distribution for a sample size 10.
# We want to test the hypothesis that this sample has the `Uniform(0, 1)`
# distribution.
# The K.S. distribution is first plotted in the case where the
# parameters of the uniform distribution are known.
# Then we plot the distribution when the parameters of the uniform
# distribution are estimated from the sample.
#
# *Reference* : Hovhannes Keutelian, "The Kolmogorov-Smirnov test when parameters are estimated from data", 30 April 1991, Fermilab
#
# Note: There is a sign error in the paper; the equation:
# `D[i]=max(abs(S+step),D[i])` must be replaced with `D[i]=max(abs(S-step),D[i])`.

# %%
import openturns as ot
import openturns.viewer as viewer
# Import pyplot directly: the `pylab` interface is deprecated in matplotlib.
from matplotlib import pyplot as plt
ot.Log.Show(ot.Log.NONE)

# %%
x = [0.9374, 0.7629, 0.4771, 0.5111, 0.8701,
     0.0684, 0.7375, 0.5615, 0.2835, 0.2508]
sample = ot.Sample([[xi] for xi in x])

# %%
samplesize = sample.getSize()
samplesize

# %%
# Plot the empirical distribution function.

# %%
graph = ot.UserDefined(sample).drawCDF()
graph.setLegends(["Sample"])
curve = ot.Curve([0, 1], [0, 1])
curve.setLegend("Uniform")
graph.add(curve)
graph.setXTitle("X")
graph.setTitle("Cumulated distribution function")
view = viewer.View(graph)


# %%
# The computeKSStatistics function computes the Kolmogorov-Smirnov distance between the sample and the distribution. The following function is for teaching purposes only: use `FittingTest` for real applications.

# %%
def computeKSStatistics(sample, distribution):
    """
    Compute the Kolmogorov-Smirnov distance between a sample and a distribution.

    sample : an ot.Sample, the observations
    distribution : an ot.Distribution, the tested distribution

    Returns the maximum absolute deviation between the empirical CDF
    of the sample and the CDF of the distribution.
    """
    sample = sample.sort()
    n = sample.getSize()
    D = 0.0
    for i in range(n):
        F = distribution.computeCDF(sample[i])
        # The empirical CDF jumps from i/n to (i+1)/n at sample[i]:
        # check the deviation on both sides of the jump.
        D = max(F - float(i) / n, float(i + 1) / n - F, D)
    return D


# %%
dist = ot.Uniform(0, 1)
dist

# %%
computeKSStatistics(sample, dist)


# %%
# The following function generates a sample of K.S. distances when the tested distribution is the `Uniform(0,1)` distribution.

# %%
def generateKSSampleKnownParameters(nrepeat, samplesize):
    """
    Generate a sample of KS distances for the known Uniform(0, 1) distribution.

    nrepeat : Number of repetitions, size of the table
    samplesize : the size of each sample to generate from the Uniform distribution
    """
    dist = ot.Uniform(0, 1)
    D = ot.Sample(nrepeat, 1)
    for i in range(nrepeat):
        sample = dist.getSample(samplesize)
        D[i, 0] = computeKSStatistics(sample, dist)
    return D


# %%
# Generate a sample of KS distances.

# %%
nrepeat = 10000  # Size of the KS distances sample
sampleD = generateKSSampleKnownParameters(nrepeat, samplesize)


# %%
# Compute exact Kolmogorov CDF.

# %%
def pKolmogorovPy(x):
    """
    Evaluate the exact Kolmogorov CDF at the point x[0].

    The sample size is taken from the module-level `samplesize` variable.
    """
    y = ot.DistFunc_pKolmogorov(samplesize, x[0])
    return [y]


# %%
pKolmogorov = ot.PythonFunction(1, 1, pKolmogorovPy)


# %%
def dKolmogorov(x, samplesize):
    """
    Compute Kolmogorov PDF for given x.

    x : an array, the points where the PDF must be evaluated
    samplesize : the size of the sample (kept for backward compatibility;
        the size actually used is the module-level `samplesize` captured
        by `pKolmogorovPy`)

    Reference
    Numerical Derivatives in Scilab, Michael Baudin, May 2009
    """
    n = x.getSize()
    y = ot.Sample(n, 1)
    for i in range(n):
        # The PDF is the derivative of the CDF, evaluated here through
        # the gradient of the wrapped CDF function.
        y[i, 0] = pKolmogorov.gradient(x[i])[0, 0]
    return y


# %%
def linearSample(xmin, xmax, npoints):
    """Returns a sample created from a regular grid
    from xmin to xmax with npoints points."""
    step = (xmax - xmin) / (npoints - 1)
    rg = ot.RegularGrid(xmin, step, npoints)
    vertices = rg.getVertices()
    return vertices


# %%
n = 1000  # Number of points in the plot
s = linearSample(0.001, 0.999, n)
y = dKolmogorov(s, samplesize)

# %%
curve = ot.Curve(s, y)
curve.setLegend("Exact distribution")
graph = ot.HistogramFactory().build(sampleD).drawPDF()
graph.setLegends(["Empirical distribution"])
graph.add(curve)
graph.setTitle("Kolmogorov-Smirnov distribution (known parameters)")
graph.setXTitle("KS-Statistics")
view = viewer.View(graph)


# %%
# Known parameters versus estimated parameters
# --------------------------------------------

# %%
# The following function generates a sample of K.S. distances when the tested distribution is the `Uniform(a,b)` distribution, where the `a` and `b` parameters are estimated from the sample.

# %%
def generateKSSampleEstimatedParameters(nrepeat, samplesize):
    """
    Generate a sample of KS distances with estimated Uniform parameters.

    nrepeat : Number of repetitions, size of the table
    samplesize : the size of each sample to generate from the Uniform distribution
    """
    distfactory = ot.UniformFactory()
    refdist = ot.Uniform(0, 1)
    D = ot.Sample(nrepeat, 1)
    for i in range(nrepeat):
        sample = refdist.getSample(samplesize)
        # The tested distribution has its parameters fitted on the very
        # sample it is tested against.
        trialdist = distfactory.build(sample)
        D[i, 0] = computeKSStatistics(sample, trialdist)
    return D


# %%
# Generate a sample of KS distances.
# %%
sampleDP = generateKSSampleEstimatedParameters(nrepeat, samplesize)

# %%
# Overlay the kernel-smoothed densities of the two KS-distance samples:
# the one with known parameters and the one with estimated parameters.
graphP = ot.KernelSmoothing().build(sampleDP).drawPDF()
graphP.setLegends(["Estimated parameters"])
graphP.setColors(["blue"])
graph = ot.KernelSmoothing().build(sampleD).drawPDF()
graph.setLegends(["Known parameters"])
graph.add(graphP)
graph.setTitle("Kolmogorov-Smirnov distribution")
graph.setXTitle("KS-Statistics")
view = viewer.View(graph)
plt.show()

# %%
# We see that the distribution of the KS distances when the parameters are
# estimated is shifted towards the left: smaller distances occur more often.
# This is a consequence of the fact that the estimated parameters tend to make
# the estimated distribution closer to the empirical sample.