1 /* PSPP - a program for statistical analysis.
2    Copyright (C) 2010, 2011 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
16 
17 #include <config.h>
18 
19 #include "language/stats/mann-whitney.h"
20 
21 #include <gsl/gsl_cdf.h>
22 
23 #include "data/case.h"
24 #include "data/casereader.h"
25 #include "data/dataset.h"
26 #include "data/dictionary.h"
27 #include "data/format.h"
28 #include "data/variable.h"
29 #include "libpspp/cast.h"
30 #include "libpspp/misc.h"
31 #include "math/sort.h"
32 #include "output/pivot-table.h"
33 
34 #include "gettext.h"
35 #define N_(msgid) msgid
36 #define _(msgid) gettext (msgid)
37 
38 /* Calculates the adjustment necessary for tie compensation */
39 static void
distinct_callback(double v UNUSED,casenumber t,double w UNUSED,void * aux)40 distinct_callback (double v UNUSED, casenumber t, double w UNUSED, void *aux)
41 {
42   double *tiebreaker = aux;
43 
44   *tiebreaker += (pow3 (t) - t) / 12.0;
45 }
46 
47 struct mw
48 {
49   double rank_sum[2];
50   double n[2];
51 
52   double u;  /* The Mann-Whitney U statistic */
53   double w;  /* The Wilcoxon Rank Sum W statistic */
54   double z;
55 };
56 
57 static void show_ranks_box (const struct n_sample_test *, const struct mw *);
58 static void show_statistics_box (const struct n_sample_test *,
59                                  const struct mw *);
60 
61 
62 
63 static bool
belongs_to_test(const struct ccase * c,void * aux)64 belongs_to_test (const struct ccase *c, void *aux)
65 {
66   const struct n_sample_test *nst = aux;
67 
68   const union value *group = case_data (c, nst->indep_var);
69   const size_t group_var_width = var_get_width (nst->indep_var);
70 
71   if (value_equal (group, &nst->val1, group_var_width))
72     return true;
73 
74   if (value_equal (group, &nst->val2, group_var_width))
75     return true;
76 
77   return false;
78 }
79 
80 
81 
82 void
mann_whitney_execute(const struct dataset * ds,struct casereader * input,enum mv_class exclude,const struct npar_test * test,bool exact UNUSED,double timer UNUSED)83 mann_whitney_execute (const struct dataset *ds,
84 		      struct casereader *input,
85 		      enum mv_class exclude,
86 		      const struct npar_test *test,
87 		      bool exact UNUSED,
88 		      double timer UNUSED)
89 {
90   int i;
91   const struct dictionary *dict = dataset_dict (ds);
92   const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test, parent);
93 
94   const struct caseproto *proto = casereader_get_proto (input);
95   size_t rank_idx = caseproto_get_n_widths (proto);
96 
97   struct mw *mw = XCALLOC (nst->n_vars,  struct mw);
98 
99   for (i = 0; i < nst->n_vars; ++i)
100     {
101       double tiebreaker = 0.0;
102       bool warn = true;
103       enum rank_error rerr = 0;
104       struct casereader *rr;
105       struct ccase *c;
106       const struct variable *var = nst->vars[i];
107 
108       struct casereader *reader =
109 	casereader_create_filter_func (casereader_clone (input),
110 				       belongs_to_test,
111 				       NULL,
112 				       CONST_CAST (struct n_sample_test *, nst),
113 				       NULL);
114 
115       reader = casereader_create_filter_missing (reader, &var, 1,
116 						 exclude,
117 						 NULL, NULL);
118 
119       reader = sort_execute_1var (reader, var);
120 
121       rr = casereader_create_append_rank (reader, var,
122 					  dict_get_weight (dict),
123 					  &rerr,
124 					  distinct_callback, &tiebreaker);
125 
126       for (; (c = casereader_read (rr)); case_unref (c))
127 	{
128 	  const union value *group = case_data (c, nst->indep_var);
129 	  const size_t group_var_width = var_get_width (nst->indep_var);
130 	  const double rank = case_data_idx (c, rank_idx)->f;
131 
132 	  if (value_equal (group, &nst->val1, group_var_width))
133 	    {
134 	      mw[i].rank_sum[0] += rank;
135 	      mw[i].n[0] += dict_get_case_weight (dict, c, &warn);
136 	    }
137 	  else if (value_equal (group, &nst->val2, group_var_width))
138 	    {
139 	      mw[i].rank_sum[1] += rank;
140 	      mw[i].n[1] += dict_get_case_weight (dict, c, &warn);
141 	    }
142 	}
143       casereader_destroy (rr);
144 
145       {
146 	double n;
147 	double denominator;
148 	struct mw *mwv = &mw[i];
149 
150 	mwv->u = mwv->n[0] * mwv->n[1] ;
151 	mwv->u += mwv->n[0] * (mwv->n[0] + 1) / 2.0;
152 	mwv->u -= mwv->rank_sum[0];
153 
154 	mwv->w = mwv->rank_sum[1];
155 	if (mwv->u > mwv->n[0] * mwv->n[1] / 2.0)
156 	  {
157 	    mwv->u =  mwv->n[0] * mwv->n[1] - mwv->u;
158 	    mwv->w = mwv->rank_sum[0];
159 	  }
160 	mwv->z = mwv->u - mwv->n[0] * mwv->n[1] / 2.0;
161 	n = mwv->n[0] + mwv->n[1];
162 	denominator = pow3(n) - n;
163 	denominator /= 12;
164 	denominator -= tiebreaker;
165 	denominator *= mwv->n[0] * mwv->n[1];
166 	denominator /= n * (n - 1);
167 
168 	mwv->z /= sqrt (denominator);
169       }
170     }
171   casereader_destroy (input);
172 
173   show_ranks_box (nst, mw);
174   show_statistics_box (nst, mw);
175 
176   free (mw);
177 }
178 
179 static void
show_ranks_box(const struct n_sample_test * nst,const struct mw * mwv)180 show_ranks_box (const struct n_sample_test *nst, const struct mw *mwv)
181 {
182   struct pivot_table *table = pivot_table_create (N_("Ranks"));
183 
184   pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
185                           N_("N"), PIVOT_RC_COUNT,
186                           N_("Mean Rank"), PIVOT_RC_OTHER,
187                           N_("Sum of Ranks"), PIVOT_RC_OTHER);
188 
189   struct pivot_dimension *indep = pivot_dimension_create__ (
190     table, PIVOT_AXIS_ROW, pivot_value_new_variable (nst->indep_var));
191   pivot_category_create_leaf (indep->root,
192                               pivot_value_new_var_value (nst->indep_var,
193                                                          &nst->val1));
194   pivot_category_create_leaf (indep->root,
195                               pivot_value_new_var_value (nst->indep_var,
196                                                          &nst->val2));
197   pivot_category_create_leaves (indep->root, N_("Total"));
198 
199   struct pivot_dimension *dep = pivot_dimension_create (
200     table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
201 
202   for (size_t i = 0 ; i < nst->n_vars ; ++i)
203     {
204       const struct mw *mw = &mwv[i];
205 
206       int dep_idx = pivot_category_create_leaf (
207         dep->root, pivot_value_new_variable (nst->vars[i]));
208 
209       struct entry
210         {
211           int stat_idx;
212           int indep_idx;
213           double x;
214         }
215       entries[] = {
216         /* N. */
217         { 0, 0, mw->n[0] },
218         { 0, 1, mw->n[1] },
219         { 0, 2, mw->n[0] + mw->n[1] },
220 
221         /* Mean Rank. */
222         { 1, 0, mw->rank_sum[0] / mw->n[0] },
223         { 1, 1, mw->rank_sum[1] / mw->n[1] },
224 
225         /* Sum of Ranks. */
226         { 2, 0, mw->rank_sum[0] },
227         { 2, 1, mw->rank_sum[1] },
228       };
229 
230       for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
231         {
232           const struct entry *e = &entries[j];
233           pivot_table_put3 (table, e->stat_idx, e->indep_idx, dep_idx,
234                             pivot_value_new_number (e->x));
235         }
236     }
237 
238   pivot_table_submit (table);
239 }
240 
241 static void
show_statistics_box(const struct n_sample_test * nst,const struct mw * mwv)242 show_statistics_box (const struct n_sample_test *nst, const struct mw *mwv)
243 {
244   struct pivot_table *table = pivot_table_create (N_("Test Statistics"));
245 
246   pivot_dimension_create (
247     table, PIVOT_AXIS_COLUMN, N_("Statistics"),
248     _("Mann-Whitney U"), PIVOT_RC_OTHER,
249     _("Wilcoxon W"), PIVOT_RC_OTHER,
250     _("Z"), PIVOT_RC_OTHER,
251     _("Asymp. Sig. (2-tailed)"), PIVOT_RC_SIGNIFICANCE);
252 
253   struct pivot_dimension *variables = pivot_dimension_create (
254     table, PIVOT_AXIS_ROW, N_("Variables"));
255 
256   for (size_t i = 0 ; i < nst->n_vars ; ++i)
257     {
258       const struct mw *mw = &mwv[i];
259 
260       int row = pivot_category_create_leaf (
261         variables->root, pivot_value_new_variable (nst->vars[i]));
262 
263       double entries[] = {
264         mw->u,
265         mw->w,
266         mw->z,
267         2.0 * gsl_cdf_ugaussian_P (mw->z),
268       };
269       for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
270         pivot_table_put2 (table, i, row, pivot_value_new_number (entries[i]));
271     }
272 
273   pivot_table_submit (table);
274 }
275