1 /* PSPP - a program for statistical analysis.
2    Copyright (C) 2011 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17 
18 #include <config.h>
19 #include "median.h"
20 
21 #include <gsl/gsl_cdf.h>
22 
23 #include "data/format.h"
24 
25 
26 #include "data/variable.h"
27 #include "data/case.h"
28 #include "data/dictionary.h"
29 #include "data/dataset.h"
30 #include "data/casereader.h"
31 #include "data/casewriter.h"
32 #include "data/subcase.h"
33 #include "data/value.h"
34 
35 #include "math/percentiles.h"
36 #include "math/sort.h"
37 
38 #include "libpspp/cast.h"
39 #include "libpspp/hmap.h"
40 #include "libpspp/array.h"
41 #include "libpspp/str.h"
42 #include "libpspp/misc.h"
43 
44 #include "output/pivot-table.h"
45 
46 #include "gettext.h"
47 #define N_(msgid) msgid
48 #define _(msgid) gettext (msgid)
49 
50 
51 struct val_node
52 {
53   struct hmap_node node;
54   union value val;
55   casenumber le;
56   casenumber gt;
57 };
58 
59 struct results
60 {
61   const struct variable *var;
62   struct val_node **sorted_array;
63   double n;
64   double median;
65   double chisq;
66 };
67 
68 
69 
70 static int
71 val_node_cmp_3way (const void *a_, const void *b_, const void *aux)
72 {
73   const struct variable *indep_var = aux;
74   const struct val_node *const *a = a_;
75   const struct val_node *const *b = b_;
76 
77   return value_compare_3way (&(*a)->val, &(*b)->val, var_get_width (indep_var));
78 }
79 
80 static void
81 show_frequencies (const struct n_sample_test *nst, const struct results *results,  int n_vals, const struct dictionary *);
82 
83 static void
84 show_test_statistics (const struct n_sample_test *nst, const struct results *results, int, const struct dictionary *);
85 
86 
87 static struct val_node *
88 find_value (const struct hmap *map, const union value *val,
89 	    const struct variable *var)
90 {
91   struct val_node *foo = NULL;
92   size_t hash = value_hash (val, var_get_width (var), 0);
93   HMAP_FOR_EACH_WITH_HASH (foo, struct val_node, node, hash, map)
94     if (value_equal (val, &foo->val, var_get_width (var)))
95       break;
96 
97   return foo;
98 }
99 
100 void
101 median_execute (const struct dataset *ds,
102 		struct casereader *input,
103 		enum mv_class exclude,
104 		const struct npar_test *test,
105 		bool exact UNUSED,
106 		double timer UNUSED)
107 {
108   const struct dictionary *dict = dataset_dict (ds);
109   const struct variable *wvar = dict_get_weight (dict);
110   bool warn = true;
111   int v;
112   const struct median_test *mt = UP_CAST (test, const struct median_test,
113 					  parent.parent);
114 
115   const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test,
116 					  parent);
117 
118   const bool n_sample_test = (value_compare_3way (&nst->val2, &nst->val1,
119 				       var_get_width (nst->indep_var)) > 0);
120 
121   struct results *results = XCALLOC (nst->n_vars,  struct results);
122   int n_vals = 0;
123   for (v = 0; v < nst->n_vars; ++v)
124     {
125       double count = 0;
126       double cc = 0;
127       double median = mt->median;
128       const struct variable *var = nst->vars[v];
129       struct ccase *c;
130       struct hmap map = HMAP_INITIALIZER (map);
131       struct casereader *r = casereader_clone (input);
132 
133 
134 
135       if (n_sample_test == false)
136 	{
137 	  struct val_node *vn = xzalloc (sizeof *vn);
138 	  value_clone (&vn->val,  &nst->val1, var_get_width (nst->indep_var));
139 	  hmap_insert (&map, &vn->node, value_hash (&nst->val1,
140 					    var_get_width (nst->indep_var), 0));
141 
142 	  vn = xzalloc (sizeof *vn);
143 	  value_clone (&vn->val,  &nst->val2, var_get_width (nst->indep_var));
144 	  hmap_insert (&map, &vn->node, value_hash (&nst->val2,
145 					    var_get_width (nst->indep_var), 0));
146 	}
147 
148       if (median == SYSMIS)
149 	{
150 	  struct percentile *ptl;
151 	  struct order_stats *os;
152 
153 	  struct casereader *rr;
154 	  struct subcase sc;
155 	  struct casewriter *writer;
156 	  subcase_init_var (&sc, var, SC_ASCEND);
157 	  rr = casereader_clone (r);
158 	  writer = sort_create_writer (&sc, casereader_get_proto (rr));
159 
160 	  for (; (c = casereader_read (rr)) != NULL;)
161 	    {
162 	      if (var_is_value_missing (var, case_data (c, var), exclude))
163 		{
164 		  case_unref (c);
165 		  continue;
166 		}
167 
168 	      cc += dict_get_case_weight (dict, c, &warn);
169 	      casewriter_write (writer, c);
170 	    }
171 	  subcase_destroy (&sc);
172 	  casereader_destroy (rr);
173 
174 	  rr = casewriter_make_reader (writer);
175 
176 	  ptl = percentile_create (0.5, cc);
177 	  os = &ptl->parent;
178 
179 	  order_stats_accumulate (&os, 1,
180 				  rr,
181 				  wvar,
182 				  var,
183 				  exclude);
184 
185 	  median = percentile_calculate (ptl, PC_HAVERAGE);
186 	  statistic_destroy (&ptl->parent.parent);
187 	}
188 
189       results[v].median = median;
190 
191 
192       for (; (c = casereader_read (r)) != NULL; case_unref (c))
193 	{
194 	  struct val_node *vn ;
195 	  const double weight = dict_get_case_weight (dict, c, &warn);
196 	  const union value *val = case_data (c, var);
197 	  const union value *indep_val = case_data (c, nst->indep_var);
198 
199 	  if (var_is_value_missing (var, case_data (c, var), exclude))
200 	    {
201 	      continue;
202 	    }
203 
204 	  if (n_sample_test)
205 	    {
206 	      int width = var_get_width (nst->indep_var);
207 	      /* Ignore out of range values */
208 	      if (
209 		  value_compare_3way (indep_val, &nst->val1, width) < 0
210 		||
211 		  value_compare_3way (indep_val, &nst->val2, width) > 0
212 		)
213 		{
214 		  continue;
215 		}
216 	    }
217 
218 	  vn = find_value (&map, indep_val, nst->indep_var);
219 	  if (vn == NULL)
220 	    {
221 	      if (n_sample_test == true)
222 		{
223 		  int width = var_get_width (nst->indep_var);
224 		  vn = xzalloc (sizeof *vn);
225 		  value_clone (&vn->val,  indep_val, width);
226 
227 		  hmap_insert (&map, &vn->node, value_hash (indep_val, width, 0));
228 		}
229 	      else
230 		{
231 		  continue;
232 		}
233 	    }
234 
235 	  if (val->f <= median)
236 	    vn->le += weight;
237 	  else
238 	    vn->gt += weight;
239 
240 	  count += weight;
241 	}
242       casereader_destroy (r);
243 
244       {
245 	int x = 0;
246 	struct val_node *vn = NULL;
247 	double r_0 = 0;
248 	double r_1 = 0;
249 	HMAP_FOR_EACH (vn, struct val_node, node, &map)
250 	  {
251 	    r_0 += vn->le;
252 	    r_1 += vn->gt;
253 	  }
254 
255 	results[v].n = count;
256 	results[v].sorted_array = xcalloc (hmap_count (&map), sizeof (void*));
257 	results[v].var = var;
258 
259 	HMAP_FOR_EACH (vn, struct val_node, node, &map)
260 	  {
261 	    double e_0j = r_0 * (vn->le + vn->gt) / count;
262 	    double e_1j = r_1 * (vn->le + vn->gt) / count;
263 
264 	    results[v].chisq += pow2 (vn->le - e_0j) / e_0j;
265 	    results[v].chisq += pow2 (vn->gt - e_1j) / e_1j;
266 
267 	    results[v].sorted_array[x++] = vn;
268 	  }
269 
270 	n_vals = x;
271 	hmap_destroy (&map);
272 
273 	sort (results[v].sorted_array, x, sizeof (*results[v].sorted_array),
274 	      val_node_cmp_3way, nst->indep_var);
275 
276       }
277     }
278 
279   casereader_destroy (input);
280 
281   show_frequencies (nst, results,  n_vals, dict);
282   show_test_statistics (nst, results, n_vals, dict);
283 
284   for (v = 0; v < nst->n_vars; ++v)
285     {
286       int i;
287       const struct results *rs = results + v;
288 
289       for (i = 0; i < n_vals; ++i)
290 	{
291 	  struct val_node *vn = rs->sorted_array[i];
292 	  value_destroy (&vn->val, var_get_width (nst->indep_var));
293 	  free (vn);
294 	}
295       free (rs->sorted_array);
296     }
297   free (results);
298 }
299 
300 
301 
302 static void
303 show_frequencies (const struct n_sample_test *nst, const struct results *results,  int n_vals, const struct dictionary *dict)
304 {
305   struct pivot_table *table = pivot_table_create (N_("Frequencies"));
306   pivot_table_set_weight_var (table, dict_get_weight (dict));
307 
308   struct pivot_dimension *indep = pivot_dimension_create__ (
309     table, PIVOT_AXIS_COLUMN, pivot_value_new_variable (nst->indep_var));
310   indep->root->show_label = true;
311   for (int i = 0; i < n_vals; ++i)
312     pivot_category_create_leaf_rc (
313       indep->root, pivot_value_new_var_value (
314         nst->indep_var, &results->sorted_array[i]->val), PIVOT_RC_COUNT);
315   pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Statistics"),
316                           N_("> Median"), N_("≤ Median"));
317 
318   struct pivot_dimension *dep = pivot_dimension_create (
319     table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
320 
321   for (int v = 0; v < nst->n_vars; ++v)
322     {
323       const struct results *rs = &results[v];
324 
325       int dep_idx = pivot_category_create_leaf (
326         dep->root, pivot_value_new_variable (rs->var));
327 
328       for (int indep_idx = 0; indep_idx < n_vals; indep_idx++)
329 	{
330 	  const struct val_node *vn = rs->sorted_array[indep_idx];
331           pivot_table_put3 (table, indep_idx, 0, dep_idx,
332                             pivot_value_new_number (vn->gt));
333           pivot_table_put3 (table, indep_idx, 1, dep_idx,
334                             pivot_value_new_number (vn->le));
335 	}
336     }
337 
338   pivot_table_submit (table);
339 }
340 
341 static void
342 show_test_statistics (const struct n_sample_test *nst,
343 		      const struct results *results,
344 		      int n_vals,
345 		      const struct dictionary *dict)
346 {
347   struct pivot_table *table = pivot_table_create (N_("Test Statistics"));
348   pivot_table_set_weight_var (table, dict_get_weight (dict));
349 
350   pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
351                           N_("N"), PIVOT_RC_COUNT,
352                           N_("Median"),
353                           N_("Chi-Square"), PIVOT_RC_OTHER,
354                           N_("df"), PIVOT_RC_COUNT,
355                           N_("Asymp. Sig."), PIVOT_RC_SIGNIFICANCE);
356 
357   struct pivot_dimension *variables = pivot_dimension_create (
358     table, PIVOT_AXIS_ROW, N_("Variables"));
359 
360   for (int v = 0; v < nst->n_vars; ++v)
361     {
362       double df = n_vals - 1;
363       const struct results *rs = &results[v];
364 
365       int var_idx = pivot_category_create_leaf (
366         variables->root, pivot_value_new_variable (rs->var));
367 
368       double entries[] = {
369         rs->n,
370         rs->median,
371         rs->chisq,
372         df,
373         gsl_cdf_chisq_Q (rs->chisq, df),
374       };
375       for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
376         {
377           struct pivot_value *value
378             = pivot_value_new_number (entries[i]);
379           if (i == 1)
380             value->numeric.format = *var_get_print_format (rs->var);
381           pivot_table_put2 (table, i, var_idx, value);
382         }
383     }
384 
385   pivot_table_submit (table);
386 }
387