1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2011 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include "language/stats/ks-one-sample.h"
20
21 #include <gsl/gsl_cdf.h>
22 #include <math.h>
23 #include <stdlib.h>
24
25
26 #include "math/sort.h"
27 #include "data/case.h"
28 #include "data/casereader.h"
29 #include "data/dataset.h"
30 #include "data/dictionary.h"
31 #include "data/format.h"
32 #include "data/value-labels.h"
33 #include "data/variable.h"
34 #include "language/stats/freq.h"
35 #include "language/stats/npar.h"
36 #include "libpspp/array.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/compiler.h"
40 #include "libpspp/hash-functions.h"
41 #include "libpspp/message.h"
42 #include "libpspp/misc.h"
43 #include "output/pivot-table.h"
44
45 #include "gl/xalloc.h"
46
47 #include "gettext.h"
48 #define N_(msgid) msgid
49 #define _(msgid) gettext (msgid)
50
51
52 /* The per test variable statistics */
53 struct ks
54 {
55 double obs_cc;
56
57 double test_min ;
58 double test_max;
59 double mu;
60 double sigma;
61
62 double diff_pos;
63 double diff_neg;
64
65 double ssq;
66 double sum;
67 };
68
69 typedef double theoretical (const struct ks *ks, double x);
70 typedef theoretical *theoreticalfp;
71
72 static double
theoretical_uniform(const struct ks * ks,double x)73 theoretical_uniform (const struct ks *ks, double x)
74 {
75 return gsl_cdf_flat_P (x, ks->test_min, ks->test_max);
76 }
77
78 static double
theoretical_normal(const struct ks * ks,double x)79 theoretical_normal (const struct ks *ks, double x)
80 {
81 return gsl_cdf_gaussian_P (x - ks->mu, ks->sigma);
82 }
83
84 static double
theoretical_poisson(const struct ks * ks,double x)85 theoretical_poisson (const struct ks *ks, double x)
86 {
87 return gsl_cdf_poisson_P (x, ks->mu);
88 }
89
90 static double
theoretical_exponential(const struct ks * ks,double x)91 theoretical_exponential (const struct ks *ks, double x)
92 {
93 return gsl_cdf_exponential_P (x, 1/ks->mu);
94 }
95
96
97 static const theoreticalfp theoreticalf[4] =
98 {
99 theoretical_normal,
100 theoretical_uniform,
101 theoretical_poisson,
102 theoretical_exponential
103 };
104
105 /*
106 Return the assymptotic approximation to the significance of Z
107 */
108 static double
ks_asymp_sig(double z)109 ks_asymp_sig (double z)
110 {
111 if (z < 0.27)
112 return 1;
113
114 if (z >= 3.1)
115 return 0;
116
117 if (z < 1)
118 {
119 double q = exp (-1.233701 * pow (z, -2));
120 return 1 - 2.506628 * (q + pow (q, 9) + pow (q, 25))/ z ;
121 }
122 else
123 {
124 double q = exp (-2 * z * z);
125 return 2 * (q - pow (q, 4) + pow (q, 9) - pow (q, 16))/ z ;
126 }
127 }
128
129 static void show_results (const struct ks *, const struct ks_one_sample_test *, const struct fmt_spec *);
130
131
132 void
ks_one_sample_execute(const struct dataset * ds,struct casereader * input,enum mv_class exclude,const struct npar_test * test,bool x UNUSED,double y UNUSED)133 ks_one_sample_execute (const struct dataset *ds,
134 struct casereader *input,
135 enum mv_class exclude,
136 const struct npar_test *test,
137 bool x UNUSED, double y UNUSED)
138 {
139 const struct dictionary *dict = dataset_dict (ds);
140 const struct ks_one_sample_test *kst = UP_CAST (test, const struct ks_one_sample_test, parent.parent);
141 const struct one_sample_test *ost = &kst->parent;
142 struct ccase *c;
143 const struct fmt_spec *wfmt = dict_get_weight_format (dict);
144 bool warn = true;
145 int v;
146 struct casereader *r = casereader_clone (input);
147
148 struct ks *ks = XCALLOC (ost->n_vars, struct ks);
149
150 for (v = 0; v < ost->n_vars; ++v)
151 {
152 ks[v].obs_cc = 0;
153 ks[v].test_min = DBL_MAX;
154 ks[v].test_max = -DBL_MAX;
155 ks[v].diff_pos = -DBL_MAX;
156 ks[v].diff_neg = DBL_MAX;
157 ks[v].sum = 0;
158 ks[v].ssq = 0;
159 }
160
161 for (; (c = casereader_read (r)) != NULL; case_unref (c))
162 {
163 const double weight = dict_get_case_weight (dict, c, &warn);
164
165 for (v = 0; v < ost->n_vars; ++v)
166 {
167 const struct variable *var = ost->vars[v];
168 const union value *val = case_data (c, var);
169
170 if (var_is_value_missing (var, val, exclude))
171 continue;
172
173 minimize (&ks[v].test_min, val->f);
174 maximize (&ks[v].test_max, val->f);
175
176 ks[v].obs_cc += weight;
177 ks[v].sum += val->f;
178 ks[v].ssq += pow2 (val->f);
179 }
180 }
181 casereader_destroy (r);
182
183 for (v = 0; v < ost->n_vars; ++v)
184 {
185 const struct variable *var = ost->vars[v];
186 double cc = 0;
187 double prev_empirical = 0;
188
189 switch (kst->dist)
190 {
191 case KS_UNIFORM:
192 if (kst->p[0] != SYSMIS)
193 ks[v].test_min = kst->p[0];
194
195 if (kst->p[1] != SYSMIS)
196 ks[v].test_max = kst->p[1];
197 break;
198 case KS_NORMAL:
199 if (kst->p[0] != SYSMIS)
200 ks[v].mu = kst->p[0];
201 else
202 ks[v].mu = ks[v].sum / ks[v].obs_cc;
203
204 if (kst->p[1] != SYSMIS)
205 ks[v].sigma = kst->p[1];
206 else
207 {
208 ks[v].sigma = ks[v].ssq - pow2 (ks[v].sum) / ks[v].obs_cc;
209 ks[v].sigma /= ks[v].obs_cc - 1;
210 ks[v].sigma = sqrt (ks[v].sigma);
211 }
212
213 break;
214 case KS_POISSON:
215 case KS_EXPONENTIAL:
216 if (kst->p[0] != SYSMIS)
217 ks[v].mu = ks[v].sigma = kst->p[0];
218 else
219 ks[v].mu = ks[v].sigma = ks[v].sum / ks[v].obs_cc;
220 break;
221 default:
222 NOT_REACHED ();
223 }
224
225 r = sort_execute_1var (casereader_clone (input), var);
226 for (; (c = casereader_read (r)) != NULL; case_unref (c))
227 {
228 double theoretical, empirical;
229 double d, dp;
230 const double weight = dict_get_case_weight (dict, c, &warn);
231 const union value *val = case_data (c, var);
232
233 if (var_is_value_missing (var, val, exclude))
234 continue;
235
236 cc += weight;
237
238 empirical = cc / ks[v].obs_cc;
239
240 theoretical = theoreticalf[kst->dist] (&ks[v], val->f);
241
242 d = empirical - theoretical;
243 dp = prev_empirical - theoretical;
244
245 if (d > 0)
246 maximize (&ks[v].diff_pos, d);
247 else
248 minimize (&ks[v].diff_neg, d);
249
250 if (dp > 0)
251 maximize (&ks[v].diff_pos, dp);
252 else
253 minimize (&ks[v].diff_neg, dp);
254
255 prev_empirical = empirical;
256 }
257
258 casereader_destroy (r);
259 }
260
261 show_results (ks, kst, wfmt);
262
263 free (ks);
264 casereader_destroy (input);
265 }
266
267
268 static void
show_results(const struct ks * ks,const struct ks_one_sample_test * kst,const struct fmt_spec * wfmt)269 show_results (const struct ks *ks,
270 const struct ks_one_sample_test *kst,
271 const struct fmt_spec *wfmt)
272 {
273 struct pivot_table *table = pivot_table_create (
274 N_("One-Sample Kolmogorov-Smirnov Test"));
275 pivot_table_set_weight_format (table, wfmt);
276
277 struct pivot_dimension *statistics = pivot_dimension_create (
278 table, PIVOT_AXIS_ROW, N_("Statistics"),
279 N_("N"), PIVOT_RC_COUNT);
280
281 switch (kst->dist)
282 {
283 case KS_UNIFORM:
284 pivot_category_create_group (statistics->root, N_("Uniform Parameters"),
285 N_("Minimum"), N_("Maximum"));
286 break;
287
288 case KS_NORMAL:
289 pivot_category_create_group (statistics->root, N_("Normal Parameters"),
290 N_("Mean"), N_("Std. Deviation"));
291 break;
292
293 case KS_POISSON:
294 pivot_category_create_group (statistics->root, N_("Poisson Parameters"),
295 N_("Lambda"));
296 break;
297
298 case KS_EXPONENTIAL:
299 pivot_category_create_group (statistics->root,
300 N_("Exponential Parameters"), N_("Scale"));
301 break;
302
303 default:
304 NOT_REACHED ();
305 }
306
307 pivot_category_create_group (
308 statistics->root, N_("Most Extreme Differences"),
309 N_("Absolute"), N_("Positive"), N_("Negative"));
310
311 pivot_category_create_leaves (
312 statistics->root, N_("Kolmogorov-Smirnov Z"),
313 _("Asymp. Sig. (2-tailed)"), PIVOT_RC_SIGNIFICANCE);
314
315 struct pivot_dimension *variables = pivot_dimension_create (
316 table, PIVOT_AXIS_COLUMN, N_("Variables"));
317
318 for (size_t i = 0; i < kst->parent.n_vars; ++i)
319 {
320 int col = pivot_category_create_leaf (
321 variables->root, pivot_value_new_variable (kst->parent.vars[i]));
322
323 double values[10];
324 size_t n = 0;
325
326 values[n++] = ks[i].obs_cc;
327
328 switch (kst->dist)
329 {
330 case KS_UNIFORM:
331 values[n++] = ks[i].test_min;
332 values[n++] = ks[i].test_max;
333 break;
334
335 case KS_NORMAL:
336 values[n++] = ks[i].mu;
337 values[n++] = ks[i].sigma;
338 break;
339
340 case KS_POISSON:
341 case KS_EXPONENTIAL:
342 values[n++] = ks[i].mu;
343 break;
344
345 default:
346 NOT_REACHED ();
347 }
348
349 double abs = ks[i].diff_pos;
350 maximize (&abs, -ks[i].diff_neg);
351
352 double z = sqrt (ks[i].obs_cc) * abs;
353
354 values[n++] = abs;
355 values[n++] = ks[i].diff_pos;
356 values[n++] = ks[i].diff_neg;
357 values[n++] = z;
358 values[n++] = ks_asymp_sig (z);
359
360 for (size_t j = 0; j < n; j++)
361 pivot_table_put2 (table, j, col, pivot_value_new_number (values[j]));
362 }
363
364 pivot_table_submit (table);
365 }
366