1 /* PSPP - a program for statistical analysis. -*-c-*-
2    Copyright (C) 2010, 2011, 2014 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17 
18 #include <config.h>
19 
20 #include "language/stats/runs.h"
21 
22 #include <float.h>
23 #include <gsl/gsl_cdf.h>
24 #include <math.h>
25 
26 #include "data/casegrouper.h"
27 #include "data/casereader.h"
28 #include "data/casewriter.h"
29 #include "data/dataset.h"
30 #include "data/dictionary.h"
31 #include "data/format.h"
32 #include "data/subcase.h"
33 #include "data/variable.h"
34 #include "libpspp/message.h"
35 #include "libpspp/misc.h"
36 #include "math/percentiles.h"
37 #include "math/sort.h"
38 #include "output/pivot-table.h"
39 
40 #include "gettext.h"
41 #define N_(msgid) msgid
42 #define _(msgid) gettext (msgid)
43 
44 
45 struct run_state
46 {
47   /* The value used to dichotimise the data */
48   double cutpoint;
49 
50   /* The number of cases not less than cutpoint */
51   double np;
52 
53   /* The number of cases less than cutpoint */
54   double nn;
55 
56   /* The sum of np and nn */
57   double n;
58 
59   /* The number of runs */
60   long runs;
61 
62   /* The sign of the last case seen */
63   short last_sign;
64 };
65 
66 
67 
68 /* Return the Z statistic representing the assympototic
69    distribution of the number of runs */
70 static double
runs_statistic(const struct run_state * rs)71 runs_statistic (const struct run_state *rs)
72 {
73   double z;
74   double sigma;
75   double mu  = 2 * rs->np * rs->nn;
76   mu /= rs->np + rs->nn;
77   mu += 1.0;
78 
79   z = rs->runs - mu;
80 
81   if (rs->n < 50)
82     {
83       if (z <= -0.5)
84 	z += 0.5;
85       else if (z >= 0.5)
86 	z -= 0.5;
87       else
88 	return 0;
89     }
90 
91   sigma = 2 * rs->np * rs->nn;
92   sigma *= 2 * rs->np * rs->nn - rs->nn - rs->np;
93   sigma /= pow2 (rs->np + rs->nn);
94   sigma /= rs->np + rs->nn - 1.0;
95   sigma = sqrt (sigma);
96 
97   z /= sigma;
98 
99   return z;
100 }
101 
102 static void show_runs_result (const struct runs_test *, const struct run_state *, const struct dictionary *);
103 
104 void
runs_execute(const struct dataset * ds,struct casereader * input,enum mv_class exclude,const struct npar_test * test,bool exact UNUSED,double timer UNUSED)105 runs_execute (const struct dataset *ds,
106 	      struct casereader *input,
107 	      enum mv_class exclude,
108 	      const struct npar_test *test,
109 	      bool exact UNUSED,
110 	      double timer UNUSED)
111 {
112   int v;
113   struct ccase *c;
114   const struct dictionary *dict = dataset_dict (ds);
115   const struct variable *weight = dict_get_weight (dict);
116 
117   struct one_sample_test *otp = UP_CAST (test, struct one_sample_test, parent);
118   struct runs_test *rt = UP_CAST (otp, struct runs_test, parent);
119   struct run_state *rs = XCALLOC (otp->n_vars,  struct run_state);
120 
121   switch  (rt->cp_mode)
122     {
123     case CP_MODE:
124       {
125 	for (v = 0; v < otp->n_vars; ++v)
126 	  {
127 	    bool multimodal = false;
128 	    struct run_state *run = &rs[v];
129 	    double last_cc;
130 	    struct casereader *group = NULL;
131 	    struct casegrouper *grouper;
132 	    struct casereader *reader = casereader_clone (input);
133 	    const struct variable *var = otp->vars[v];
134 
135 	    reader = sort_execute_1var (reader, var);
136 
137 	    grouper = casegrouper_create_vars (reader, &var, 1);
138 	    last_cc = SYSMIS;
139 	    while (casegrouper_get_next_group (grouper, &group))
140 	      {
141 		double x = SYSMIS;
142 		double cc = 0.0;
143 		struct ccase *c;
144 		for (; (c = casereader_read (group)); case_unref (c))
145 		  {
146 		    const double w = weight ? case_data (c, weight)->f: 1.0;
147 		    const union value *val = case_data (c, var);
148 		    if (var_is_value_missing (var, val, exclude))
149 		      continue;
150 		    x = val->f;
151 		    cc += w;
152 		  }
153 
154 		if (cc > last_cc)
155 		  {
156 		    run->cutpoint = x;
157 		  }
158 		else if (cc == last_cc)
159 		  {
160 		    multimodal = true;
161 		    if (x > run->cutpoint)
162 		      run->cutpoint = x;
163 		  }
164 		last_cc = cc;
165 		casereader_destroy (group);
166 	      }
167 	    casegrouper_destroy (grouper);
168 	    if (multimodal)
169 	      msg (MW, _("Multiple modes exist for variable `%s'.  "
170                          "Using %.*g as the threshold value."),
171 		   var_get_name (var), DBL_DIG + 1, run->cutpoint);
172 	  }
173       }
174       break;
175     case CP_MEDIAN:
176       {
177 	for (v = 0; v < otp->n_vars; ++v)
178 	  {
179 	    double cc = 0.0;
180 	    struct ccase *c;
181 	    struct run_state *run = &rs[v];
182 	    struct casereader *reader = casereader_clone (input);
183 	    const struct variable *var = otp->vars[v];
184 	    struct casewriter *writer;
185 	    struct percentile *median;
186 	    struct order_stats *os;
187 	    struct subcase sc;
188 	    subcase_init_var (&sc, var, SC_ASCEND);
189 	    writer = sort_create_writer (&sc, casereader_get_proto (reader));
190 
191  	    for (; (c = casereader_read (reader));)
192 	      {
193 		const union value *val = case_data (c, var);
194 		const double w = weight ? case_data (c, weight)->f: 1.0;
195 		if (var_is_value_missing (var, val, exclude))
196 		  {
197 		    case_unref (c);
198 		    continue;
199 		  }
200 
201 		cc += w;
202 		casewriter_write (writer, c);
203 	      }
204 	    subcase_destroy (&sc);
205 	    casereader_destroy (reader);
206 	    reader = casewriter_make_reader (writer);
207 
208 	    median = percentile_create (0.5, cc);
209 	    os = &median->parent;
210 
211 	    order_stats_accumulate (&os, 1,
212 				    reader,
213 				    weight,
214 				    var,
215 				    exclude);
216 
217 	    run->cutpoint = percentile_calculate (median, PC_HAVERAGE);
218 	    statistic_destroy (&median->parent.parent);
219 	  }
220       }
221       break;
222     case CP_MEAN:
223       {
224 	struct casereader *reader = casereader_clone (input);
225 	for (; (c = casereader_read (reader)); case_unref (c))
226 	  {
227 	    const double w = weight ? case_data (c, weight)->f: 1.0;
228 	    for (v = 0; v < otp->n_vars; ++v)
229 	      {
230 		const struct variable *var = otp->vars[v];
231 		const union value *val = case_data (c, var);
232 		const double x = val->f;
233 		struct run_state *run = &rs[v];
234 
235 		if (var_is_value_missing (var, val, exclude))
236 		  continue;
237 
238 		run->cutpoint += x * w;
239 		run->n += w;
240 	      }
241 	  }
242 	casereader_destroy (reader);
243 	for (v = 0; v < otp->n_vars; ++v)
244 	  {
245 	    struct run_state *run = &rs[v];
246 	    run->cutpoint /= run->n;
247 	  }
248       }
249       break;
250     case CP_CUSTOM:
251       {
252       for (v = 0; v < otp->n_vars; ++v)
253 	{
254 	  struct run_state *run = &rs[v];
255 	  run->cutpoint = rt->cutpoint;
256 	}
257       }
258       break;
259     }
260 
261   for (; (c = casereader_read (input)); case_unref (c))
262     {
263       const double w = weight ? case_data (c, weight)->f: 1.0;
264 
265       for (v = 0; v < otp->n_vars; ++v)
266 	{
267 	  struct run_state *run = &rs[v];
268 	  const struct variable *var = otp->vars[v];
269 	  const union value *val = case_data (c, var);
270 	  double x = val->f;
271 	  double d = x - run->cutpoint;
272 	  short sign = 0;
273 
274 	  if (var_is_value_missing (var, val, exclude))
275 	    continue;
276 
277 	  if (d >= 0)
278 	    {
279 	      sign = +1;
280 	      run->np += w;
281 	    }
282 	  else
283 	    {
284 	      sign = -1;
285 	      run->nn += w;
286 	    }
287 
288 	  if (sign != run->last_sign)
289 	    run->runs++;
290 
291 	  run->last_sign = sign;
292 	}
293     }
294   casereader_destroy (input);
295 
296   for (v = 0; v < otp->n_vars; ++v)
297     {
298       struct run_state *run = &rs[v];
299       run->n = run->np + run->nn;
300     }
301 
302   show_runs_result (rt, rs, dict);
303 
304   free (rs);
305 }
306 
307 
308 
309 static void
show_runs_result(const struct runs_test * rt,const struct run_state * rs,const struct dictionary * dict)310 show_runs_result (const struct runs_test *rt, const struct run_state *rs, const struct dictionary *dict)
311 {
312   const struct one_sample_test *otp = &rt->parent;
313 
314   struct pivot_table *table = pivot_table_create (N_("Runs Test"));
315   pivot_table_set_weight_var (table, dict_get_weight (dict));
316 
317   pivot_dimension_create (
318     table, PIVOT_AXIS_ROW, N_("Statistics"),
319     (rt->cp_mode == CP_CUSTOM ? N_("Test Value")
320      : rt->cp_mode == CP_MODE ? N_("Test Value (mode)")
321      : rt->cp_mode == CP_MEAN ? N_("Test Value (mean)")
322      : N_("Test Value (median)")), PIVOT_RC_OTHER,
323     N_("Cases < Test Value"), PIVOT_RC_COUNT,
324     N_("Cases ≥ Test Value"), PIVOT_RC_COUNT,
325     N_("Total Cases"), PIVOT_RC_COUNT,
326     N_("Number of Runs"), PIVOT_RC_INTEGER,
327     N_("Z"), PIVOT_RC_OTHER,
328     N_("Asymp. Sig. (2-tailed)"), PIVOT_RC_SIGNIFICANCE);
329 
330   struct pivot_dimension *variables = pivot_dimension_create (
331     table, PIVOT_AXIS_COLUMN, N_("Variable"));
332 
333   for (size_t i = 0 ; i < otp->n_vars; ++i)
334     {
335       const struct run_state *run = &rs[i];
336 
337       int col = pivot_category_create_leaf (
338         variables->root, pivot_value_new_variable (otp->vars[i]));
339 
340       double z = runs_statistic (run);
341 
342       double rows[] = {
343         run->cutpoint,
344         run->nn,
345         run->np,
346         run->n,
347         run->runs,
348         z,
349         2.0 * (1.0 - gsl_cdf_ugaussian_P (fabs (z))),
350       };
351 
352       for (int row = 0; row < sizeof rows / sizeof *rows; row++)
353         pivot_table_put2 (table, row, col, pivot_value_new_number (rows[row]));
354     }
355 
356   pivot_table_submit (table);
357 }
358