1"""
2Kolmogorov-Smirnov : get the statistics distribution
3====================================================
4"""
5# %%
6
7# %%
8# In this example, we draw the Kolmogorov-Smirnov distribution for a sample size 10.
9# We want to test the hypothesis that this sample has the `Uniform(0, 1)`
10# distribution.
11# The K.S. distribution is first plotted in the case where the
12# parameters of the uniform distribution are known.
13# Then we plot the distribution when the parameters of the uniform
14# distribution are estimated from the sample.
15#
16# *Reference* : Hovhannes Keutelian, "The Kolmogorov-Smirnov test when parameters are estimated from data", 30 April 1991, Fermilab
17#
18# Note: There is a sign error in the paper; the equation:
19# `D[i]=max(abs(S+step),D[i])` must be replaced with `D[i]=max(abs(S-step),D[i])`.
20
21# %%
22import openturns as ot
23import openturns.viewer as viewer
24from matplotlib import pylab as plt
25ot.Log.Show(ot.Log.NONE)
26
27# %%
# A fixed sample of 10 observations in [0, 1], stored as a 1-d ot.Sample
# (one row per observation).
x = [0.9374, 0.7629, 0.4771, 0.5111, 0.8701,
     0.0684, 0.7375, 0.5615, 0.2835, 0.2508]
sample = ot.Sample([[xi] for xi in x])

# %%
samplesize = sample.getSize()
samplesize
35
36# %%
37# Plot the empirical distribution function.
38
39# %%
# Empirical CDF of the sample: a step function built from the observations.
graph = ot.UserDefined(sample).drawCDF()
graph.setLegends(["Sample"])
# CDF of the Uniform(0, 1) distribution: the straight line from (0, 0) to (1, 1).
curve = ot.Curve([0, 1], [0, 1])
curve.setLegend("Uniform")
graph.add(curve)
graph.setXTitle("X")
graph.setTitle("Cumulated distribution function")
view = viewer.View(graph)
48
49
50# %%
# The `computeKSStatistics` function computes the Kolmogorov-Smirnov distance between the sample and the distribution. The following function is for teaching purposes only: use `FittingTest` for real applications.
52
53# %%
def computeKSStatistics(sample, distribution):
    """
    Compute the Kolmogorov-Smirnov distance between a sample and a distribution.

    For teaching purposes only: use `FittingTest` for real applications.

    Parameters
    ----------
    sample : ot.Sample
        A 1-d sample.
    distribution : ot.Distribution
        The tested (fully specified) distribution.

    Returns
    -------
    D : float
        The KS distance: the maximum absolute difference between the
        empirical CDF of the sample and the CDF of the distribution.
    """
    sample = sample.sort()
    n = sample.getSize()
    D = 0.0
    for i in range(n):
        # CDF of the tested distribution at the i-th smallest observation.
        F = distribution.computeCDF(sample[i])
        # Gaps to the empirical CDF just before and just after its jump at
        # the i-th observation (the empirical CDF steps from i/n to (i+1)/n).
        Fminus = F - float(i) / n
        Fplus = float(i + 1) / n - F
        D = max(Fminus, Fplus, D)
    return D
69
70
71# %%
# The tested distribution: Uniform(0, 1) with known (fixed) parameters.
dist = ot.Uniform(0, 1)
dist

# %%
# KS distance between the fixed sample and the Uniform(0, 1) distribution.
computeKSStatistics(sample, dist)
77
78
79# %%
80# The following function generates a sample of K.S. distances when the tested distribution is the `Uniform(0,1)` distribution.
81
82# %%
def generateKSSampleKnownParameters(nrepeat, samplesize):
    """
    Return a sample of KS distances to the Uniform(0, 1) distribution
    with known (i.e. not estimated) parameters.

    nrepeat : Number of repetitions, size of the returned sample
    samplesize : the size of each sample to generate from the Uniform distribution
    """
    tested = ot.Uniform(0, 1)
    distances = ot.Sample(nrepeat, 1)
    for rep in range(nrepeat):
        # Draw a fresh sample and measure its distance to the exact distribution.
        distances[rep, 0] = computeKSStatistics(tested.getSample(samplesize), tested)
    return distances
94
95
96# %%
97# Generate a sample of KS distances.
98
99# %%
nrepeat = 10000  # Size of the KS distances sample
# Monte-Carlo sample of KS distances with known parameters.
sampleD = generateKSSampleKnownParameters(nrepeat, samplesize)
102
103
104# %%
105# Compute exact Kolmogorov CDF.
106
107# %%
def pKolmogorovPy(x):
    """Evaluate the exact Kolmogorov CDF at the point x[0].

    Reads the sample size from the module-level `samplesize` variable.
    """
    return [ot.DistFunc_pKolmogorov(samplesize, x[0])]
111
112
113# %%
# Wrap the CDF into an ot.Function so that it can be differentiated later.
pKolmogorov = ot.PythonFunction(1, 1, pKolmogorovPy)
115
116
117# %%
def dKolmogorov(x, samplesize):
    """
    Compute Kolmogorov PDF for given x.

    x : an array, the points where the PDF must be evaluated
    samplesize : the size of the sample
        NOTE(review): this parameter is not read in the body; the sample size
        actually used is the one captured by the global `pKolmogorov` function.
    Reference
    Numerical Derivatives in Scilab, Michael Baudin, May 2009
    """
    npoints = x.getSize()
    pdf = ot.Sample(npoints, 1)
    # The PDF is the derivative of the CDF, obtained by finite-difference
    # differentiation of the PythonFunction wrapping the exact Kolmogorov CDF.
    for k in range(npoints):
        pdf[k, 0] = pKolmogorov.gradient(x[k])[0, 0]
    return pdf
131
132
133# %%
def linearSample(xmin, xmax, npoints):
    """Return a sample made of a regular grid of `npoints` points
    going from `xmin` to `xmax` (both endpoints included)."""
    spacing = (xmax - xmin) / (npoints - 1)
    grid = ot.RegularGrid(xmin, spacing, npoints)
    return grid.getVertices()
141
142
143# %%
n = 1000  # Number of points in the plot
# Evaluation grid strictly inside (0, 1) to avoid the distribution's bounds.
s = linearSample(0.001, 0.999, n)
# Exact Kolmogorov PDF on the grid.
y = dKolmogorov(s, samplesize)

# %%
# Overlay the exact PDF on the histogram of the simulated KS distances.
curve = ot.Curve(s, y)
curve.setLegend("Exact distribution")
graph = ot.HistogramFactory().build(sampleD).drawPDF()
graph.setLegends(["Empirical distribution"])
graph.add(curve)
graph.setTitle("Kolmogorov-Smirnov distribution (known parameters)")
graph.setXTitle("KS-Statistics")
view = viewer.View(graph)
157
158
159# %%
160# Known parameters versus estimated parameters
161# --------------------------------------------
162
163# %%
164# The following function generates a sample of K.S. distances when the tested distribution is the `Uniform(a,b)` distribution, where the `a` and `b` parameters are estimated from the sample.
165
166# %%
def generateKSSampleEstimatedParameters(nrepeat, samplesize):
    """
    Return a sample of KS distances when the Uniform(a, b) parameters
    are estimated from each generated sample.

    nrepeat : Number of repetitions, size of the returned sample
    samplesize : the size of each sample to generate from the Uniform distribution
    """
    factory = ot.UniformFactory()
    generator = ot.Uniform(0, 1)
    distances = ot.Sample(nrepeat, 1)
    for rep in range(nrepeat):
        data = generator.getSample(samplesize)
        # Fit a Uniform distribution on the data, then measure the distance
        # between the data and its own fit.
        fitted = factory.build(data)
        distances[rep, 0] = computeKSStatistics(data, fitted)
    return distances
180
181
182# %%
183# Generate a sample of KS distances.
184
185# %%
# Monte-Carlo sample of KS distances with parameters estimated from each sample.
sampleDP = generateKSSampleEstimatedParameters(nrepeat, samplesize)

# %%
# Compare the two KS-distance distributions with kernel-smoothed PDFs.
graph = ot.KernelSmoothing().build(sampleD).drawPDF()
graph.setLegends(["Known parameters"])
graphP = ot.KernelSmoothing().build(sampleDP).drawPDF()
graphP.setLegends(["Estimated parameters"])
graphP.setColors(["blue"])
graph.add(graphP)
graph.setTitle("Kolmogorov-Smirnov distribution")
graph.setXTitle("KS-Statistics")
view = viewer.View(graph)
plt.show()
199
200# %%
201# We see that the distribution of the KS distances when the parameters are estimated is shifted towards the left: smaller distances occur more often. This is a consequence of the fact that the estimated parameters tend to make the estimated distribution closer to the empirical sample.
202