1 /* PSPP - a program for statistical analysis. 2 Copyright (C) 2011 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation, either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. 16 */ 17 18 #include <config.h> 19 #include "median.h" 20 21 #include <gsl/gsl_cdf.h> 22 23 #include "data/format.h" 24 25 26 #include "data/variable.h" 27 #include "data/case.h" 28 #include "data/dictionary.h" 29 #include "data/dataset.h" 30 #include "data/casereader.h" 31 #include "data/casewriter.h" 32 #include "data/subcase.h" 33 #include "data/value.h" 34 35 #include "math/percentiles.h" 36 #include "math/sort.h" 37 38 #include "libpspp/cast.h" 39 #include "libpspp/hmap.h" 40 #include "libpspp/array.h" 41 #include "libpspp/str.h" 42 #include "libpspp/misc.h" 43 44 #include "output/pivot-table.h" 45 46 #include "gettext.h" 47 #define N_(msgid) msgid 48 #define _(msgid) gettext (msgid) 49 50 51 struct val_node 52 { 53 struct hmap_node node; 54 union value val; 55 casenumber le; 56 casenumber gt; 57 }; 58 59 struct results 60 { 61 const struct variable *var; 62 struct val_node **sorted_array; 63 double n; 64 double median; 65 double chisq; 66 }; 67 68 69 70 static int 71 val_node_cmp_3way (const void *a_, const void *b_, const void *aux) 72 { 73 const struct variable *indep_var = aux; 74 const struct val_node *const *a = a_; 75 const struct val_node *const *b = b_; 76 77 return value_compare_3way (&(*a)->val, &(*b)->val, var_get_width (indep_var)); 78 } 79 80 static void 81 show_frequencies (const struct n_sample_test *nst, const struct results *results, int n_vals, const struct dictionary *); 82 83 static void 84 show_test_statistics (const struct n_sample_test *nst, const struct results *results, int, const struct dictionary *); 85 86 87 static struct val_node * 88 find_value (const struct hmap *map, const union value *val, 89 const struct variable *var) 90 { 91 struct val_node *foo = NULL; 92 size_t hash = value_hash (val, var_get_width (var), 0); 93 HMAP_FOR_EACH_WITH_HASH (foo, struct val_node, node, hash, map) 94 if (value_equal (val, &foo->val, var_get_width (var))) 95 break; 96 97 return foo; 98 } 99 100 void 101 median_execute (const struct dataset *ds, 102 struct casereader *input, 103 enum mv_class exclude, 104 const struct npar_test *test, 105 bool exact UNUSED, 106 double timer UNUSED) 107 { 108 const struct dictionary *dict = dataset_dict (ds); 109 const struct variable *wvar = dict_get_weight (dict); 110 bool warn = true; 111 int v; 112 const struct median_test *mt = UP_CAST (test, const struct median_test, 113 parent.parent); 114 115 const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test, 116 parent); 117 118 const bool n_sample_test = (value_compare_3way (&nst->val2, &nst->val1, 119 var_get_width (nst->indep_var)) > 0); 120 121 struct results *results = XCALLOC (nst->n_vars, struct results); 122 int n_vals = 0; 123 for (v = 0; v < nst->n_vars; ++v) 124 { 125 double count = 0; 126 double cc = 0; 127 double median = mt->median; 128 const struct variable *var = nst->vars[v]; 129 struct ccase *c; 130 struct hmap map = HMAP_INITIALIZER (map); 131 struct casereader *r = casereader_clone (input); 132 133 134 135 if (n_sample_test == false) 136 { 137 struct val_node *vn = xzalloc (sizeof *vn); 138 value_clone (&vn->val, &nst->val1, var_get_width (nst->indep_var)); 139 hmap_insert (&map, &vn->node, value_hash (&nst->val1, 140 var_get_width (nst->indep_var), 0)); 141 142 vn = xzalloc (sizeof *vn); 143 value_clone (&vn->val, &nst->val2, var_get_width (nst->indep_var)); 144 hmap_insert (&map, &vn->node, value_hash (&nst->val2, 145 var_get_width (nst->indep_var), 0)); 146 } 147 148 if (median == SYSMIS) 149 { 150 struct percentile *ptl; 151 struct order_stats *os; 152 153 struct casereader *rr; 154 struct subcase sc; 155 struct casewriter *writer; 156 subcase_init_var (&sc, var, SC_ASCEND); 157 rr = casereader_clone (r); 158 writer = sort_create_writer (&sc, casereader_get_proto (rr)); 159 160 for (; (c = casereader_read (rr)) != NULL;) 161 { 162 if (var_is_value_missing (var, case_data (c, var), exclude)) 163 { 164 case_unref (c); 165 continue; 166 } 167 168 cc += dict_get_case_weight (dict, c, &warn); 169 casewriter_write (writer, c); 170 } 171 subcase_destroy (&sc); 172 casereader_destroy (rr); 173 174 rr = casewriter_make_reader (writer); 175 176 ptl = percentile_create (0.5, cc); 177 os = &ptl->parent; 178 179 order_stats_accumulate (&os, 1, 180 rr, 181 wvar, 182 var, 183 exclude); 184 185 median = percentile_calculate (ptl, PC_HAVERAGE); 186 statistic_destroy (&ptl->parent.parent); 187 } 188 189 results[v].median = median; 190 191 192 for (; (c = casereader_read (r)) != NULL; case_unref (c)) 193 { 194 struct val_node *vn ; 195 const double weight = dict_get_case_weight (dict, c, &warn); 196 const union value *val = case_data (c, var); 197 const union value *indep_val = case_data (c, nst->indep_var); 198 199 if (var_is_value_missing (var, case_data (c, var), exclude)) 200 { 201 continue; 202 } 203 204 if (n_sample_test) 205 { 206 int width = var_get_width (nst->indep_var); 207 /* Ignore out of range values */ 208 if ( 209 value_compare_3way (indep_val, &nst->val1, width) < 0 210 || 211 value_compare_3way (indep_val, &nst->val2, width) > 0 212 ) 213 { 214 continue; 215 } 216 } 217 218 vn = find_value (&map, indep_val, nst->indep_var); 219 if (vn == NULL) 220 { 221 if (n_sample_test == true) 222 { 223 int width = var_get_width (nst->indep_var); 224 vn = xzalloc (sizeof *vn); 225 value_clone (&vn->val, indep_val, width); 226 227 hmap_insert (&map, &vn->node, value_hash (indep_val, width, 0)); 228 } 229 else 230 { 231 continue; 232 } 233 } 234 235 if (val->f <= median) 236 vn->le += weight; 237 else 238 vn->gt += weight; 239 240 count += weight; 241 } 242 casereader_destroy (r); 243 244 { 245 int x = 0; 246 struct val_node *vn = NULL; 247 double r_0 = 0; 248 double r_1 = 0; 249 HMAP_FOR_EACH (vn, struct val_node, node, &map) 250 { 251 r_0 += vn->le; 252 r_1 += vn->gt; 253 } 254 255 results[v].n = count; 256 results[v].sorted_array = xcalloc (hmap_count (&map), sizeof (void*)); 257 results[v].var = var; 258 259 HMAP_FOR_EACH (vn, struct val_node, node, &map) 260 { 261 double e_0j = r_0 * (vn->le + vn->gt) / count; 262 double e_1j = r_1 * (vn->le + vn->gt) / count; 263 264 results[v].chisq += pow2 (vn->le - e_0j) / e_0j; 265 results[v].chisq += pow2 (vn->gt - e_1j) / e_1j; 266 267 results[v].sorted_array[x++] = vn; 268 } 269 270 n_vals = x; 271 hmap_destroy (&map); 272 273 sort (results[v].sorted_array, x, sizeof (*results[v].sorted_array), 274 val_node_cmp_3way, nst->indep_var); 275 276 } 277 } 278 279 casereader_destroy (input); 280 281 show_frequencies (nst, results, n_vals, dict); 282 show_test_statistics (nst, results, n_vals, dict); 283 284 for (v = 0; v < nst->n_vars; ++v) 285 { 286 int i; 287 const struct results *rs = results + v; 288 289 for (i = 0; i < n_vals; ++i) 290 { 291 struct val_node *vn = rs->sorted_array[i]; 292 value_destroy (&vn->val, var_get_width (nst->indep_var)); 293 free (vn); 294 } 295 free (rs->sorted_array); 296 } 297 free (results); 298 } 299 300 301 302 static void 303 show_frequencies (const struct n_sample_test *nst, const struct results *results, int n_vals, const struct dictionary *dict) 304 { 305 struct pivot_table *table = pivot_table_create (N_("Frequencies")); 306 pivot_table_set_weight_var (table, dict_get_weight (dict)); 307 308 struct pivot_dimension *indep = pivot_dimension_create__ ( 309 table, PIVOT_AXIS_COLUMN, pivot_value_new_variable (nst->indep_var)); 310 indep->root->show_label = true; 311 for (int i = 0; i < n_vals; ++i) 312 pivot_category_create_leaf_rc ( 313 indep->root, pivot_value_new_var_value ( 314 nst->indep_var, &results->sorted_array[i]->val), PIVOT_RC_COUNT); 315 pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Statistics"), 316 N_("> Median"), N_("≤ Median")); 317 318 struct pivot_dimension *dep = pivot_dimension_create ( 319 table, PIVOT_AXIS_ROW, N_("Dependent Variables")); 320 321 for (int v = 0; v < nst->n_vars; ++v) 322 { 323 const struct results *rs = &results[v]; 324 325 int dep_idx = pivot_category_create_leaf ( 326 dep->root, pivot_value_new_variable (rs->var)); 327 328 for (int indep_idx = 0; indep_idx < n_vals; indep_idx++) 329 { 330 const struct val_node *vn = rs->sorted_array[indep_idx]; 331 pivot_table_put3 (table, indep_idx, 0, dep_idx, 332 pivot_value_new_number (vn->gt)); 333 pivot_table_put3 (table, indep_idx, 1, dep_idx, 334 pivot_value_new_number (vn->le)); 335 } 336 } 337 338 pivot_table_submit (table); 339 } 340 341 static void 342 show_test_statistics (const struct n_sample_test *nst, 343 const struct results *results, 344 int n_vals, 345 const struct dictionary *dict) 346 { 347 struct pivot_table *table = pivot_table_create (N_("Test Statistics")); 348 pivot_table_set_weight_var (table, dict_get_weight (dict)); 349 350 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"), 351 N_("N"), PIVOT_RC_COUNT, 352 N_("Median"), 353 N_("Chi-Square"), PIVOT_RC_OTHER, 354 N_("df"), PIVOT_RC_COUNT, 355 N_("Asymp. Sig."), PIVOT_RC_SIGNIFICANCE); 356 357 struct pivot_dimension *variables = pivot_dimension_create ( 358 table, PIVOT_AXIS_ROW, N_("Variables")); 359 360 for (int v = 0; v < nst->n_vars; ++v) 361 { 362 double df = n_vals - 1; 363 const struct results *rs = &results[v]; 364 365 int var_idx = pivot_category_create_leaf ( 366 variables->root, pivot_value_new_variable (rs->var)); 367 368 double entries[] = { 369 rs->n, 370 rs->median, 371 rs->chisq, 372 df, 373 gsl_cdf_chisq_Q (rs->chisq, df), 374 }; 375 for (size_t i = 0; i < sizeof entries / sizeof *entries; i++) 376 { 377 struct pivot_value *value 378 = pivot_value_new_number (entries[i]); 379 if (i == 1) 380 value->numeric.format = *var_get_print_format (rs->var); 381 pivot_table_put2 (table, i, var_idx, value); 382 } 383 } 384 385 pivot_table_submit (table); 386 } 387