1 /* PSPP - a program for statistical analysis. -*-c-*-
2 Copyright (C) 2010, 2011, 2014 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include <config.h>
19
20 #include "language/stats/runs.h"
21
22 #include <float.h>
23 #include <gsl/gsl_cdf.h>
24 #include <math.h>
25
26 #include "data/casegrouper.h"
27 #include "data/casereader.h"
28 #include "data/casewriter.h"
29 #include "data/dataset.h"
30 #include "data/dictionary.h"
31 #include "data/format.h"
32 #include "data/subcase.h"
33 #include "data/variable.h"
34 #include "libpspp/message.h"
35 #include "libpspp/misc.h"
36 #include "math/percentiles.h"
37 #include "math/sort.h"
38 #include "output/pivot-table.h"
39
40 #include "gettext.h"
41 #define N_(msgid) msgid
42 #define _(msgid) gettext (msgid)
43
44
45 struct run_state
46 {
47 /* The value used to dichotimise the data */
48 double cutpoint;
49
50 /* The number of cases not less than cutpoint */
51 double np;
52
53 /* The number of cases less than cutpoint */
54 double nn;
55
56 /* The sum of np and nn */
57 double n;
58
59 /* The number of runs */
60 long runs;
61
62 /* The sign of the last case seen */
63 short last_sign;
64 };
65
66
67
68 /* Return the Z statistic representing the assympototic
69 distribution of the number of runs */
70 static double
runs_statistic(const struct run_state * rs)71 runs_statistic (const struct run_state *rs)
72 {
73 double z;
74 double sigma;
75 double mu = 2 * rs->np * rs->nn;
76 mu /= rs->np + rs->nn;
77 mu += 1.0;
78
79 z = rs->runs - mu;
80
81 if (rs->n < 50)
82 {
83 if (z <= -0.5)
84 z += 0.5;
85 else if (z >= 0.5)
86 z -= 0.5;
87 else
88 return 0;
89 }
90
91 sigma = 2 * rs->np * rs->nn;
92 sigma *= 2 * rs->np * rs->nn - rs->nn - rs->np;
93 sigma /= pow2 (rs->np + rs->nn);
94 sigma /= rs->np + rs->nn - 1.0;
95 sigma = sqrt (sigma);
96
97 z /= sigma;
98
99 return z;
100 }
101
102 static void show_runs_result (const struct runs_test *, const struct run_state *, const struct dictionary *);
103
104 void
runs_execute(const struct dataset * ds,struct casereader * input,enum mv_class exclude,const struct npar_test * test,bool exact UNUSED,double timer UNUSED)105 runs_execute (const struct dataset *ds,
106 struct casereader *input,
107 enum mv_class exclude,
108 const struct npar_test *test,
109 bool exact UNUSED,
110 double timer UNUSED)
111 {
112 int v;
113 struct ccase *c;
114 const struct dictionary *dict = dataset_dict (ds);
115 const struct variable *weight = dict_get_weight (dict);
116
117 struct one_sample_test *otp = UP_CAST (test, struct one_sample_test, parent);
118 struct runs_test *rt = UP_CAST (otp, struct runs_test, parent);
119 struct run_state *rs = XCALLOC (otp->n_vars, struct run_state);
120
121 switch (rt->cp_mode)
122 {
123 case CP_MODE:
124 {
125 for (v = 0; v < otp->n_vars; ++v)
126 {
127 bool multimodal = false;
128 struct run_state *run = &rs[v];
129 double last_cc;
130 struct casereader *group = NULL;
131 struct casegrouper *grouper;
132 struct casereader *reader = casereader_clone (input);
133 const struct variable *var = otp->vars[v];
134
135 reader = sort_execute_1var (reader, var);
136
137 grouper = casegrouper_create_vars (reader, &var, 1);
138 last_cc = SYSMIS;
139 while (casegrouper_get_next_group (grouper, &group))
140 {
141 double x = SYSMIS;
142 double cc = 0.0;
143 struct ccase *c;
144 for (; (c = casereader_read (group)); case_unref (c))
145 {
146 const double w = weight ? case_data (c, weight)->f: 1.0;
147 const union value *val = case_data (c, var);
148 if (var_is_value_missing (var, val, exclude))
149 continue;
150 x = val->f;
151 cc += w;
152 }
153
154 if (cc > last_cc)
155 {
156 run->cutpoint = x;
157 }
158 else if (cc == last_cc)
159 {
160 multimodal = true;
161 if (x > run->cutpoint)
162 run->cutpoint = x;
163 }
164 last_cc = cc;
165 casereader_destroy (group);
166 }
167 casegrouper_destroy (grouper);
168 if (multimodal)
169 msg (MW, _("Multiple modes exist for variable `%s'. "
170 "Using %.*g as the threshold value."),
171 var_get_name (var), DBL_DIG + 1, run->cutpoint);
172 }
173 }
174 break;
175 case CP_MEDIAN:
176 {
177 for (v = 0; v < otp->n_vars; ++v)
178 {
179 double cc = 0.0;
180 struct ccase *c;
181 struct run_state *run = &rs[v];
182 struct casereader *reader = casereader_clone (input);
183 const struct variable *var = otp->vars[v];
184 struct casewriter *writer;
185 struct percentile *median;
186 struct order_stats *os;
187 struct subcase sc;
188 subcase_init_var (&sc, var, SC_ASCEND);
189 writer = sort_create_writer (&sc, casereader_get_proto (reader));
190
191 for (; (c = casereader_read (reader));)
192 {
193 const union value *val = case_data (c, var);
194 const double w = weight ? case_data (c, weight)->f: 1.0;
195 if (var_is_value_missing (var, val, exclude))
196 {
197 case_unref (c);
198 continue;
199 }
200
201 cc += w;
202 casewriter_write (writer, c);
203 }
204 subcase_destroy (&sc);
205 casereader_destroy (reader);
206 reader = casewriter_make_reader (writer);
207
208 median = percentile_create (0.5, cc);
209 os = &median->parent;
210
211 order_stats_accumulate (&os, 1,
212 reader,
213 weight,
214 var,
215 exclude);
216
217 run->cutpoint = percentile_calculate (median, PC_HAVERAGE);
218 statistic_destroy (&median->parent.parent);
219 }
220 }
221 break;
222 case CP_MEAN:
223 {
224 struct casereader *reader = casereader_clone (input);
225 for (; (c = casereader_read (reader)); case_unref (c))
226 {
227 const double w = weight ? case_data (c, weight)->f: 1.0;
228 for (v = 0; v < otp->n_vars; ++v)
229 {
230 const struct variable *var = otp->vars[v];
231 const union value *val = case_data (c, var);
232 const double x = val->f;
233 struct run_state *run = &rs[v];
234
235 if (var_is_value_missing (var, val, exclude))
236 continue;
237
238 run->cutpoint += x * w;
239 run->n += w;
240 }
241 }
242 casereader_destroy (reader);
243 for (v = 0; v < otp->n_vars; ++v)
244 {
245 struct run_state *run = &rs[v];
246 run->cutpoint /= run->n;
247 }
248 }
249 break;
250 case CP_CUSTOM:
251 {
252 for (v = 0; v < otp->n_vars; ++v)
253 {
254 struct run_state *run = &rs[v];
255 run->cutpoint = rt->cutpoint;
256 }
257 }
258 break;
259 }
260
261 for (; (c = casereader_read (input)); case_unref (c))
262 {
263 const double w = weight ? case_data (c, weight)->f: 1.0;
264
265 for (v = 0; v < otp->n_vars; ++v)
266 {
267 struct run_state *run = &rs[v];
268 const struct variable *var = otp->vars[v];
269 const union value *val = case_data (c, var);
270 double x = val->f;
271 double d = x - run->cutpoint;
272 short sign = 0;
273
274 if (var_is_value_missing (var, val, exclude))
275 continue;
276
277 if (d >= 0)
278 {
279 sign = +1;
280 run->np += w;
281 }
282 else
283 {
284 sign = -1;
285 run->nn += w;
286 }
287
288 if (sign != run->last_sign)
289 run->runs++;
290
291 run->last_sign = sign;
292 }
293 }
294 casereader_destroy (input);
295
296 for (v = 0; v < otp->n_vars; ++v)
297 {
298 struct run_state *run = &rs[v];
299 run->n = run->np + run->nn;
300 }
301
302 show_runs_result (rt, rs, dict);
303
304 free (rs);
305 }
306
307
308
309 static void
show_runs_result(const struct runs_test * rt,const struct run_state * rs,const struct dictionary * dict)310 show_runs_result (const struct runs_test *rt, const struct run_state *rs, const struct dictionary *dict)
311 {
312 const struct one_sample_test *otp = &rt->parent;
313
314 struct pivot_table *table = pivot_table_create (N_("Runs Test"));
315 pivot_table_set_weight_var (table, dict_get_weight (dict));
316
317 pivot_dimension_create (
318 table, PIVOT_AXIS_ROW, N_("Statistics"),
319 (rt->cp_mode == CP_CUSTOM ? N_("Test Value")
320 : rt->cp_mode == CP_MODE ? N_("Test Value (mode)")
321 : rt->cp_mode == CP_MEAN ? N_("Test Value (mean)")
322 : N_("Test Value (median)")), PIVOT_RC_OTHER,
323 N_("Cases < Test Value"), PIVOT_RC_COUNT,
324 N_("Cases ≥ Test Value"), PIVOT_RC_COUNT,
325 N_("Total Cases"), PIVOT_RC_COUNT,
326 N_("Number of Runs"), PIVOT_RC_INTEGER,
327 N_("Z"), PIVOT_RC_OTHER,
328 N_("Asymp. Sig. (2-tailed)"), PIVOT_RC_SIGNIFICANCE);
329
330 struct pivot_dimension *variables = pivot_dimension_create (
331 table, PIVOT_AXIS_COLUMN, N_("Variable"));
332
333 for (size_t i = 0 ; i < otp->n_vars; ++i)
334 {
335 const struct run_state *run = &rs[i];
336
337 int col = pivot_category_create_leaf (
338 variables->root, pivot_value_new_variable (otp->vars[i]));
339
340 double z = runs_statistic (run);
341
342 double rows[] = {
343 run->cutpoint,
344 run->nn,
345 run->np,
346 run->n,
347 run->runs,
348 z,
349 2.0 * (1.0 - gsl_cdf_ugaussian_P (fabs (z))),
350 };
351
352 for (int row = 0; row < sizeof rows / sizeof *rows; row++)
353 pivot_table_put2 (table, row, col, pivot_value_new_number (rows[row]));
354 }
355
356 pivot_table_submit (table);
357 }
358