1 /*
2 PSPP - a program for statistical analysis.
3 Copyright (C) 2012, 2013, 2016, 2019 Free Software Foundation, Inc.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include <config.h>
20
21 #include <math.h>
22 #include <gsl/gsl_cdf.h>
23
24 #include "libpspp/assertion.h"
25 #include "libpspp/message.h"
26 #include "libpspp/pool.h"
27
28
29 #include "data/dataset.h"
30 #include "data/dictionary.h"
31 #include "data/casegrouper.h"
32 #include "data/casereader.h"
33 #include "data/casewriter.h"
34 #include "data/caseproto.h"
35 #include "data/subcase.h"
36
37
38 #include "data/format.h"
39
40 #include "math/interaction.h"
41 #include "math/box-whisker.h"
42 #include "math/categoricals.h"
43 #include "math/chart-geometry.h"
44 #include "math/histogram.h"
45 #include "math/moments.h"
46 #include "math/np.h"
47 #include "math/sort.h"
48 #include "math/order-stats.h"
49 #include "math/percentiles.h"
50 #include "math/shapiro-wilk.h"
51 #include "math/tukey-hinges.h"
52 #include "math/trimmed-mean.h"
53
54 #include "output/charts/boxplot.h"
55 #include "output/charts/np-plot.h"
56 #include "output/charts/spreadlevel-plot.h"
57 #include "output/charts/plot-hist.h"
58
59 #include "language/command.h"
60 #include "language/lexer/lexer.h"
61 #include "language/lexer/value-parser.h"
62 #include "language/lexer/variable-parser.h"
63
64 #include "output/pivot-table.h"
65
66 #include "gettext.h"
67 #define _(msgid) gettext (msgid)
68 #define N_(msgid) msgid
69
70 static void
append_value_name(const struct variable * var,const union value * val,struct string * str)71 append_value_name (const struct variable *var, const union value *val, struct string *str)
72 {
73 var_append_value_name (var, val, str);
74 if (var_is_value_missing (var, val, MV_ANY))
75 ds_put_cstr (str, _(" (missing)"));
76 }
77
78 enum bp_mode
79 {
80 BP_GROUPS,
81 BP_VARIABLES
82 };
83
84
85 /* Indices for the ex_proto member (below) */
86 enum
87 {
88 EX_VAL, /* value */
89 EX_ID, /* identity */
90 EX_WT /* weight */
91 };
92
93
94 #define PLOT_HISTOGRAM 0x1
95 #define PLOT_BOXPLOT 0x2
96 #define PLOT_NPPLOT 0x4
97 #define PLOT_SPREADLEVEL 0x8
98
99 struct examine
100 {
101 struct pool *pool;
102
103 /* A caseproto used to contain the data subsets under examination,
104 see (enum above) */
105 struct caseproto *ex_proto;
106
107 size_t n_dep_vars;
108 const struct variable **dep_vars;
109
110 size_t n_iacts;
111 struct interaction **iacts;
112
113 enum mv_class dep_excl;
114 enum mv_class fctr_excl;
115
116 const struct dictionary *dict;
117
118 struct categoricals *cats;
119
120 /* how many extremities to display */
121 int disp_extremes;
122 int calc_extremes;
123 bool descriptives;
124
125 double conf;
126
127 bool missing_pw;
128
129 /* The case index of the ID value (or -1) if not applicable */
130 size_t id_idx;
131 int id_width;
132
133 enum pc_alg pc_alg;
134 double *ptiles;
135 size_t n_percentiles;
136
137 unsigned int plot;
138 int sl_power;
139
140 enum bp_mode boxplot_mode;
141
142 const struct variable *id_var;
143
144 const struct variable *wv;
145 };
146
147 struct extremity
148 {
149 /* The value of this extremity */
150 double val;
151
152 /* Either the casenumber or the value of the variable specified
153 by the /ID subcommand which corresponds to this extremity */
154 union value identity;
155 };
156
157 struct exploratory_stats
158 {
159 double missing;
160 double non_missing;
161
162 struct moments *mom;
163
164 /* Most operations need a sorted reader/writer */
165 struct casewriter *sorted_writer;
166 struct casereader *sorted_reader;
167
168 struct extremity *minima;
169 struct extremity *maxima;
170
171 /*
172 Minimum should alway equal mimima[0].val.
173 Likewise, maximum should alway equal maxima[0].val.
174 This redundancy exists as an optimisation effort.
175 Some statistics (eg histogram) require early calculation
176 of the min and max
177 */
178 double minimum;
179 double maximum;
180
181 struct trimmed_mean *trimmed_mean;
182 struct percentile *quartiles[3];
183 struct percentile **percentiles;
184 struct shapiro_wilk *shapiro_wilk;
185
186 struct tukey_hinges *hinges;
187
188 /* The data for the NP Plots */
189 struct np *np;
190
191 struct histogram *histogram;
192
193 /* The data for the box plots */
194 struct box_whisker *box_whisker;
195
196 /* Total weight */
197 double cc;
198
199 /* The minimum weight */
200 double cmin;
201 };
202
203 static void
show_boxplot_grouped(const struct examine * cmd,int iact_idx)204 show_boxplot_grouped (const struct examine *cmd, int iact_idx)
205 {
206 int v;
207
208 const struct interaction *iact = cmd->iacts[iact_idx];
209 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
210
211 for (v = 0; v < cmd->n_dep_vars; ++v)
212 {
213 double y_min = DBL_MAX;
214 double y_max = -DBL_MAX;
215 int grp;
216 struct boxplot *boxplot;
217 struct string title;
218 ds_init_empty (&title);
219
220 if (iact->n_vars > 0)
221 {
222 struct string istr;
223 ds_init_empty (&istr);
224 interaction_to_string (iact, &istr);
225 ds_put_format (&title, _("Boxplot of %s vs. %s"),
226 var_to_string (cmd->dep_vars[v]),
227 ds_cstr (&istr));
228 ds_destroy (&istr);
229 }
230 else
231 ds_put_format (&title, _("Boxplot of %s"), var_to_string (cmd->dep_vars[v]));
232
233 for (grp = 0; grp < n_cats; ++grp)
234 {
235 const struct exploratory_stats *es =
236 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
237
238 if (y_min > es[v].minimum)
239 y_min = es[v].minimum;
240
241 if (y_max < es[v].maximum)
242 y_max = es[v].maximum;
243 }
244
245 boxplot = boxplot_create (y_min, y_max, ds_cstr (&title));
246
247 ds_destroy (&title);
248
249 for (grp = 0; grp < n_cats; ++grp)
250 {
251 int ivar_idx;
252 struct string label;
253
254 const struct ccase *c =
255 categoricals_get_case_by_category_real (cmd->cats, iact_idx, grp);
256
257 struct exploratory_stats *es =
258 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
259
260 ds_init_empty (&label);
261 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
262 {
263 struct string l;
264 const struct variable *ivar = iact->vars[ivar_idx];
265 const union value *val = case_data (c, ivar);
266 ds_init_empty (&l);
267
268 append_value_name (ivar, val, &l);
269 ds_ltrim (&l, ss_cstr (" "));
270
271 ds_put_substring (&label, l.ss);
272 if (ivar_idx < iact->n_vars - 1)
273 ds_put_cstr (&label, "; ");
274
275 ds_destroy (&l);
276 }
277
278 boxplot_add_box (boxplot, es[v].box_whisker, ds_cstr (&label));
279 es[v].box_whisker = NULL;
280
281 ds_destroy (&label);
282 }
283
284 boxplot_submit (boxplot);
285 }
286 }
287
288 static void
show_boxplot_variabled(const struct examine * cmd,int iact_idx)289 show_boxplot_variabled (const struct examine *cmd, int iact_idx)
290 {
291 int grp;
292 const struct interaction *iact = cmd->iacts[iact_idx];
293 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
294
295 for (grp = 0; grp < n_cats; ++grp)
296 {
297 struct boxplot *boxplot;
298 int v;
299 double y_min = DBL_MAX;
300 double y_max = -DBL_MAX;
301
302 const struct ccase *c =
303 categoricals_get_case_by_category_real (cmd->cats, iact_idx, grp);
304
305 struct string title;
306 ds_init_empty (&title);
307
308 for (v = 0; v < cmd->n_dep_vars; ++v)
309 {
310 const struct exploratory_stats *es =
311 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
312
313 if (y_min > es[v].minimum)
314 y_min = es[v].minimum;
315
316 if (y_max < es[v].maximum)
317 y_max = es[v].maximum;
318 }
319
320 if (iact->n_vars == 0)
321 ds_put_format (&title, _("Boxplot"));
322 else
323 {
324 int ivar_idx;
325 struct string label;
326 ds_init_empty (&label);
327 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
328 {
329 const struct variable *ivar = iact->vars[ivar_idx];
330 const union value *val = case_data (c, ivar);
331
332 ds_put_cstr (&label, var_to_string (ivar));
333 ds_put_cstr (&label, " = ");
334 append_value_name (ivar, val, &label);
335 ds_put_cstr (&label, "; ");
336 }
337
338 ds_put_format (&title, _("Boxplot of %s"),
339 ds_cstr (&label));
340
341 ds_destroy (&label);
342 }
343
344 boxplot = boxplot_create (y_min, y_max, ds_cstr (&title));
345
346 ds_destroy (&title);
347
348 for (v = 0; v < cmd->n_dep_vars; ++v)
349 {
350 struct exploratory_stats *es =
351 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
352
353 boxplot_add_box (boxplot, es[v].box_whisker,
354 var_to_string (cmd->dep_vars[v]));
355 es[v].box_whisker = NULL;
356 }
357
358 boxplot_submit (boxplot);
359 }
360 }
361
362
363 static void
show_npplot(const struct examine * cmd,int iact_idx)364 show_npplot (const struct examine *cmd, int iact_idx)
365 {
366 const struct interaction *iact = cmd->iacts[iact_idx];
367 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
368
369 int v;
370
371 for (v = 0; v < cmd->n_dep_vars; ++v)
372 {
373 int grp;
374 for (grp = 0; grp < n_cats; ++grp)
375 {
376 struct chart_item *npp, *dnpp;
377 struct casereader *reader;
378 struct np *np;
379
380 int ivar_idx;
381 const struct ccase *c =
382 categoricals_get_case_by_category_real (cmd->cats,
383 iact_idx, grp);
384
385 const struct exploratory_stats *es =
386 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
387
388 struct string label;
389 ds_init_cstr (&label,
390 var_to_string (cmd->dep_vars[v]));
391
392 if (iact->n_vars > 0)
393 {
394 ds_put_cstr (&label, " (");
395 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
396 {
397 const struct variable *ivar = iact->vars[ivar_idx];
398 const union value *val = case_data (c, ivar);
399
400 ds_put_cstr (&label, var_to_string (ivar));
401 ds_put_cstr (&label, " = ");
402 append_value_name (ivar, val, &label);
403 ds_put_cstr (&label, "; ");
404
405 }
406 ds_put_cstr (&label, ")");
407 }
408
409 np = es[v].np;
410 reader = casewriter_make_reader (np->writer);
411 np->writer = NULL;
412
413 npp = np_plot_create (np, reader, ds_cstr (&label));
414 dnpp = dnp_plot_create (np, reader, ds_cstr (&label));
415
416 if (npp == NULL || dnpp == NULL)
417 {
418 msg (MW, _("Not creating NP plot because data set is empty."));
419 chart_item_unref (npp);
420 chart_item_unref (dnpp);
421 }
422 else
423 {
424 chart_item_submit (npp);
425 chart_item_submit (dnpp);
426 }
427 casereader_destroy (reader);
428
429 ds_destroy (&label);
430 }
431 }
432 }
433
434 static void
show_spreadlevel(const struct examine * cmd,int iact_idx)435 show_spreadlevel (const struct examine *cmd, int iact_idx)
436 {
437 const struct interaction *iact = cmd->iacts[iact_idx];
438 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
439
440 int v;
441
442 /* Spreadlevel when there are no levels is not useful */
443 if (iact->n_vars == 0)
444 return;
445
446 for (v = 0; v < cmd->n_dep_vars; ++v)
447 {
448 int grp;
449 struct chart_item *sl;
450
451 struct string label;
452 ds_init_cstr (&label,
453 var_to_string (cmd->dep_vars[v]));
454
455 if (iact->n_vars > 0)
456 {
457 ds_put_cstr (&label, " (");
458 interaction_to_string (iact, &label);
459 ds_put_cstr (&label, ")");
460 }
461
462 sl = spreadlevel_plot_create (ds_cstr (&label), cmd->sl_power);
463
464 for (grp = 0; grp < n_cats; ++grp)
465 {
466 const struct exploratory_stats *es =
467 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
468
469 double median = percentile_calculate (es[v].quartiles[1], cmd->pc_alg);
470
471 double iqr = percentile_calculate (es[v].quartiles[2], cmd->pc_alg) -
472 percentile_calculate (es[v].quartiles[0], cmd->pc_alg);
473
474 spreadlevel_plot_add (sl, iqr, median);
475 }
476
477 if (sl == NULL)
478 msg (MW, _("Not creating spreadlevel chart for %s"), ds_cstr (&label));
479 else
480 chart_item_submit (sl);
481
482 ds_destroy (&label);
483 }
484 }
485
486
487 static void
show_histogram(const struct examine * cmd,int iact_idx)488 show_histogram (const struct examine *cmd, int iact_idx)
489 {
490 const struct interaction *iact = cmd->iacts[iact_idx];
491 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
492
493 int v;
494
495 for (v = 0; v < cmd->n_dep_vars; ++v)
496 {
497 int grp;
498 for (grp = 0; grp < n_cats; ++grp)
499 {
500 double n, mean, var;
501 int ivar_idx;
502 const struct ccase *c =
503 categoricals_get_case_by_category_real (cmd->cats,
504 iact_idx, grp);
505
506 const struct exploratory_stats *es =
507 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
508
509 struct string label;
510
511 if (es[v].histogram == NULL)
512 continue;
513
514 ds_init_cstr (&label,
515 var_to_string (cmd->dep_vars[v]));
516
517 if (iact->n_vars > 0)
518 {
519 ds_put_cstr (&label, " (");
520 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
521 {
522 const struct variable *ivar = iact->vars[ivar_idx];
523 const union value *val = case_data (c, ivar);
524
525 ds_put_cstr (&label, var_to_string (ivar));
526 ds_put_cstr (&label, " = ");
527 append_value_name (ivar, val, &label);
528 ds_put_cstr (&label, "; ");
529
530 }
531 ds_put_cstr (&label, ")");
532 }
533
534
535 moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL);
536
537 chart_item_submit
538 (histogram_chart_create (es[v].histogram->gsl_hist,
539 ds_cstr (&label), n, mean,
540 sqrt (var), false));
541
542
543 ds_destroy (&label);
544 }
545 }
546 }
547
548 static struct pivot_value *
new_value_with_missing_footnote(const struct variable * var,const union value * value,struct pivot_footnote * missing_footnote)549 new_value_with_missing_footnote (const struct variable *var,
550 const union value *value,
551 struct pivot_footnote *missing_footnote)
552 {
553 struct pivot_value *pv = pivot_value_new_var_value (var, value);
554 if (var_is_value_missing (var, value, MV_USER))
555 pivot_value_add_footnote (pv, missing_footnote);
556 return pv;
557 }
558
559 static void
create_interaction_dimensions(struct pivot_table * table,const struct categoricals * cats,const struct interaction * iact,struct pivot_footnote * missing_footnote)560 create_interaction_dimensions (struct pivot_table *table,
561 const struct categoricals *cats,
562 const struct interaction *iact,
563 struct pivot_footnote *missing_footnote)
564 {
565 for (size_t i = iact->n_vars; i-- > 0;)
566 {
567 const struct variable *var = iact->vars[i];
568 struct pivot_dimension *d = pivot_dimension_create__ (
569 table, PIVOT_AXIS_ROW, pivot_value_new_variable (var));
570 d->root->show_label = true;
571
572 size_t n;
573 union value *values = categoricals_get_var_values (cats, var, &n);
574 for (size_t j = 0; j < n; j++)
575 pivot_category_create_leaf (
576 d->root, new_value_with_missing_footnote (var, &values[j],
577 missing_footnote));
578 }
579 }
580
581 static struct pivot_footnote *
create_missing_footnote(struct pivot_table * table)582 create_missing_footnote (struct pivot_table *table)
583 {
584 return pivot_table_create_footnote (
585 table, pivot_value_new_text (N_("User-missing value.")));
586 }
587
588 static void
percentiles_report(const struct examine * cmd,int iact_idx)589 percentiles_report (const struct examine *cmd, int iact_idx)
590 {
591 struct pivot_table *table = pivot_table_create (N_("Percentiles"));
592 table->omit_empty = true;
593
594 struct pivot_dimension *percentiles = pivot_dimension_create (
595 table, PIVOT_AXIS_COLUMN, N_("Percentiles"));
596 percentiles->root->show_label = true;
597 for (int i = 0; i < cmd->n_percentiles; ++i)
598 pivot_category_create_leaf (
599 percentiles->root,
600 pivot_value_new_user_text_nocopy (xasprintf ("%g", cmd->ptiles[i])));
601
602 pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Statistics"),
603 N_("Weighted Average"), N_("Tukey's Hinges"));
604
605 const struct interaction *iact = cmd->iacts[iact_idx];
606 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
607 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
608
609 struct pivot_dimension *dep_dim = pivot_dimension_create (
610 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
611
612 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
613
614 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
615 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
616 {
617 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
618 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
619
620 for (size_t i = 0; i < n_cats; ++i)
621 {
622 for (size_t j = 0; j < iact->n_vars; j++)
623 {
624 int idx = categoricals_get_value_index_by_category_real (
625 cmd->cats, iact_idx, i, j);
626 indexes[table->n_dimensions - 2 - j] = idx;
627 }
628
629 const struct exploratory_stats *ess
630 = categoricals_get_user_data_by_category_real (
631 cmd->cats, iact_idx, i);
632 const struct exploratory_stats *es = ess + v;
633
634 double hinges[3];
635 tukey_hinges_calculate (es->hinges, hinges);
636
637 for (size_t pc_idx = 0; pc_idx < cmd->n_percentiles; ++pc_idx)
638 {
639 indexes[0] = pc_idx;
640
641 indexes[1] = 0;
642 double value = percentile_calculate (es->percentiles[pc_idx],
643 cmd->pc_alg);
644 pivot_table_put (table, indexes, table->n_dimensions,
645 pivot_value_new_number (value));
646
647 double hinge = (cmd->ptiles[pc_idx] == 25.0 ? hinges[0]
648 : cmd->ptiles[pc_idx] == 50.0 ? hinges[1]
649 : cmd->ptiles[pc_idx] == 75.0 ? hinges[2]
650 : SYSMIS);
651 if (hinge != SYSMIS)
652 {
653 indexes[1] = 1;
654 pivot_table_put (table, indexes, table->n_dimensions,
655 pivot_value_new_number (hinge));
656 }
657 }
658 }
659
660 }
661 free (indexes);
662
663 pivot_table_submit (table);
664 }
665
666 static void
normality_report(const struct examine * cmd,int iact_idx)667 normality_report (const struct examine *cmd, int iact_idx)
668 {
669 struct pivot_table *table = pivot_table_create (N_("Tests of Normality"));
670 table->omit_empty = true;
671
672 struct pivot_dimension *test =
673 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Shapiro-Wilk"),
674 N_("Statistic"),
675 N_("df"), PIVOT_RC_COUNT,
676 N_("Sig."));
677
678 test->root->show_label = true;
679
680 const struct interaction *iact = cmd->iacts[iact_idx];
681 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
682 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
683
684 struct pivot_dimension *dep_dim = pivot_dimension_create (
685 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
686
687 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
688
689 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
690 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
691 {
692 indexes[table->n_dimensions - 1] =
693 pivot_category_create_leaf (dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
694
695 for (size_t i = 0; i < n_cats; ++i)
696 {
697 indexes[1] = i;
698
699 const struct exploratory_stats *es
700 = categoricals_get_user_data_by_category_real (
701 cmd->cats, iact_idx, i);
702
703 struct shapiro_wilk *sw = es[v].shapiro_wilk;
704
705 if (sw == NULL)
706 continue;
707
708 double w = shapiro_wilk_calculate (sw);
709
710 int j = 0;
711 indexes[0] = j;
712
713 pivot_table_put (table, indexes, table->n_dimensions,
714 pivot_value_new_number (w));
715
716 indexes[0] = ++j;
717 pivot_table_put (table, indexes, table->n_dimensions,
718 pivot_value_new_number (sw->n));
719
720 indexes[0] = ++j;
721 pivot_table_put (table, indexes, table->n_dimensions,
722 pivot_value_new_number (shapiro_wilk_significance (sw->n, w)));
723 }
724 }
725
726 free (indexes);
727
728 pivot_table_submit (table);
729 }
730
731
732 static void
descriptives_report(const struct examine * cmd,int iact_idx)733 descriptives_report (const struct examine *cmd, int iact_idx)
734 {
735 struct pivot_table *table = pivot_table_create (N_("Descriptives"));
736 table->omit_empty = true;
737
738 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Aspect"),
739 N_("Statistic"), N_("Std. Error"));
740
741 struct pivot_dimension *statistics = pivot_dimension_create (
742 table, PIVOT_AXIS_ROW, N_("Statistics"), N_("Mean"));
743 struct pivot_category *interval = pivot_category_create_group__ (
744 statistics->root,
745 pivot_value_new_text_format (N_("%g%% Confidence Interval for Mean"),
746 cmd->conf * 100.0));
747 pivot_category_create_leaves (interval, N_("Lower Bound"),
748 N_("Upper Bound"));
749 pivot_category_create_leaves (
750 statistics->root, N_("5% Trimmed Mean"), N_("Median"), N_("Variance"),
751 N_("Std. Deviation"), N_("Minimum"), N_("Maximum"), N_("Range"),
752 N_("Interquartile Range"), N_("Skewness"), N_("Kurtosis"));
753
754 const struct interaction *iact = cmd->iacts[iact_idx];
755 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
756 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
757
758 struct pivot_dimension *dep_dim = pivot_dimension_create (
759 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
760
761 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
762
763 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
764 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
765 {
766 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
767 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
768
769 for (size_t i = 0; i < n_cats; ++i)
770 {
771 for (size_t j = 0; j < iact->n_vars; j++)
772 {
773 int idx = categoricals_get_value_index_by_category_real (
774 cmd->cats, iact_idx, i, j);
775 indexes[table->n_dimensions - 2 - j] = idx;
776 }
777
778 const struct exploratory_stats *ess
779 = categoricals_get_user_data_by_category_real (cmd->cats,
780 iact_idx, i);
781 const struct exploratory_stats *es = ess + v;
782
783 double m0, m1, m2, m3, m4;
784 moments_calculate (es->mom, &m0, &m1, &m2, &m3, &m4);
785 double tval = gsl_cdf_tdist_Qinv ((1.0 - cmd->conf) / 2.0, m0 - 1.0);
786
787 struct entry
788 {
789 int stat_idx;
790 int aspect_idx;
791 double x;
792 }
793 entries[] = {
794 { 0, 0, m1 },
795 { 0, 1, calc_semean (m2, m0) },
796 { 1, 0, m1 - tval * calc_semean (m2, m0) },
797 { 2, 0, m1 + tval * calc_semean (m2, m0) },
798 { 3, 0, trimmed_mean_calculate (es->trimmed_mean) },
799 { 4, 0, percentile_calculate (es->quartiles[1], cmd->pc_alg) },
800 { 5, 0, m2 },
801 { 6, 0, sqrt (m2) },
802 { 7, 0, es->minima[0].val },
803 { 8, 0, es->maxima[0].val },
804 { 9, 0, es->maxima[0].val - es->minima[0].val },
805 { 10, 0, (percentile_calculate (es->quartiles[2], cmd->pc_alg) -
806 percentile_calculate (es->quartiles[0], cmd->pc_alg)) },
807 { 11, 0, m3 },
808 { 11, 1, calc_seskew (m0) },
809 { 12, 0, m4 },
810 { 12, 1, calc_sekurt (m0) },
811 };
812 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
813 {
814 const struct entry *e = &entries[j];
815 indexes[0] = e->aspect_idx;
816 indexes[1] = e->stat_idx;
817 pivot_table_put (table, indexes, table->n_dimensions,
818 pivot_value_new_number (e->x));
819 }
820 }
821 }
822
823 free (indexes);
824
825 pivot_table_submit (table);
826 }
827
828
829 static void
extremes_report(const struct examine * cmd,int iact_idx)830 extremes_report (const struct examine *cmd, int iact_idx)
831 {
832 struct pivot_table *table = pivot_table_create (N_("Extreme Values"));
833 table->omit_empty = true;
834
835 struct pivot_dimension *statistics = pivot_dimension_create (
836 table, PIVOT_AXIS_COLUMN, N_("Statistics"));
837 pivot_category_create_leaf (statistics->root,
838 (cmd->id_var
839 ? pivot_value_new_variable (cmd->id_var)
840 : pivot_value_new_text (N_("Case Number"))));
841 pivot_category_create_leaves (statistics->root, N_("Value"));
842
843 struct pivot_dimension *order = pivot_dimension_create (
844 table, PIVOT_AXIS_ROW, N_("Order"));
845 for (size_t i = 0; i < cmd->disp_extremes; i++)
846 pivot_category_create_leaf (order->root, pivot_value_new_integer (i + 1));
847
848 pivot_dimension_create (table, PIVOT_AXIS_ROW,
849 /* TRANSLATORS: This is a noun, not an adjective. */
850 N_("Extreme"),
851 N_("Highest"), N_("Lowest"));
852
853 const struct interaction *iact = cmd->iacts[iact_idx];
854 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
855 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
856
857 struct pivot_dimension *dep_dim = pivot_dimension_create (
858 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
859
860 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
861
862 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
863 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
864 {
865 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
866 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
867
868 for (size_t i = 0; i < n_cats; ++i)
869 {
870 for (size_t j = 0; j < iact->n_vars; j++)
871 {
872 int idx = categoricals_get_value_index_by_category_real (
873 cmd->cats, iact_idx, i, j);
874 indexes[table->n_dimensions - 2 - j] = idx;
875 }
876
877 const struct exploratory_stats *ess
878 = categoricals_get_user_data_by_category_real (cmd->cats,
879 iact_idx, i);
880 const struct exploratory_stats *es = ess + v;
881
882 for (int e = 0 ; e < cmd->disp_extremes; ++e)
883 {
884 indexes[1] = e;
885
886 for (size_t j = 0; j < 2; j++)
887 {
888 const struct extremity *extremity
889 = j ? &es->minima[e] : &es->maxima[e];
890 indexes[2] = j;
891
892 indexes[0] = 0;
893 pivot_table_put (
894 table, indexes, table->n_dimensions,
895 (cmd->id_var
896 ? new_value_with_missing_footnote (cmd->id_var,
897 &extremity->identity,
898 missing_footnote)
899 : pivot_value_new_integer (extremity->identity.f)));
900
901 indexes[0] = 1;
902 union value val = { .f = extremity->val };
903 pivot_table_put (
904 table, indexes, table->n_dimensions,
905 new_value_with_missing_footnote (cmd->dep_vars[v], &val,
906 missing_footnote));
907 }
908 }
909 }
910 }
911 free (indexes);
912
913 pivot_table_submit (table);
914 }
915
916
917 static void
summary_report(const struct examine * cmd,int iact_idx)918 summary_report (const struct examine *cmd, int iact_idx)
919 {
920 struct pivot_table *table = pivot_table_create (
921 N_("Case Processing Summary"));
922 table->omit_empty = true;
923 pivot_table_set_weight_var (table, dict_get_weight (cmd->dict));
924
925 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
926 N_("N"), PIVOT_RC_COUNT,
927 N_("Percent"), PIVOT_RC_PERCENT);
928 struct pivot_dimension *cases = pivot_dimension_create (
929 table, PIVOT_AXIS_COLUMN, N_("Cases"), N_("Valid"), N_("Missing"),
930 N_("Total"));
931 cases->root->show_label = true;
932
933 const struct interaction *iact = cmd->iacts[iact_idx];
934 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
935 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
936
937 struct pivot_dimension *dep_dim = pivot_dimension_create (
938 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
939
940 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
941
942 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
943 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
944 {
945 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
946 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
947
948 for (size_t i = 0; i < n_cats; ++i)
949 {
950 for (size_t j = 0; j < iact->n_vars; j++)
951 {
952 int idx = categoricals_get_value_index_by_category_real (
953 cmd->cats, iact_idx, i, j);
954 indexes[table->n_dimensions - 2 - j] = idx;
955 }
956
957 const struct exploratory_stats *es
958 = categoricals_get_user_data_by_category_real (
959 cmd->cats, iact_idx, i);
960
961 double total = es[v].missing + es[v].non_missing;
962 struct entry
963 {
964 int stat_idx;
965 int case_idx;
966 double x;
967 }
968 entries[] = {
969 { 0, 0, es[v].non_missing },
970 { 1, 0, 100.0 * es[v].non_missing / total },
971 { 0, 1, es[v].missing },
972 { 1, 1, 100.0 * es[v].missing / total },
973 { 0, 2, total },
974 { 1, 2, 100.0 },
975 };
976 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
977 {
978 const struct entry *e = &entries[j];
979 indexes[0] = e->stat_idx;
980 indexes[1] = e->case_idx;
981 pivot_table_put (table, indexes, table->n_dimensions,
982 pivot_value_new_number (e->x));
983 }
984 }
985 }
986
987 free (indexes);
988
989 pivot_table_submit (table);
990 }
991
992 /* Attempt to parse an interaction from LEXER */
993 static struct interaction *
parse_interaction(struct lexer * lexer,struct examine * ex)994 parse_interaction (struct lexer *lexer, struct examine *ex)
995 {
996 const struct variable *v = NULL;
997 struct interaction *iact = NULL;
998
999 if (lex_match_variable (lexer, ex->dict, &v))
1000 {
1001 iact = interaction_create (v);
1002
1003 while (lex_match (lexer, T_BY))
1004 {
1005 if (!lex_match_variable (lexer, ex->dict, &v))
1006 {
1007 interaction_destroy (iact);
1008 return NULL;
1009 }
1010 interaction_add_variable (iact, v);
1011 }
1012 lex_match (lexer, T_COMMA);
1013 }
1014
1015 return iact;
1016 }
1017
1018
1019 static void *
create_n(const void * aux1,void * aux2 UNUSED)1020 create_n (const void *aux1, void *aux2 UNUSED)
1021 {
1022 int v;
1023
1024 const struct examine *examine = aux1;
1025 struct exploratory_stats *es = pool_calloc (examine->pool, examine->n_dep_vars, sizeof (*es));
1026 struct subcase ordering;
1027 subcase_init (&ordering, 0, 0, SC_ASCEND);
1028
1029 for (v = 0; v < examine->n_dep_vars; v++)
1030 {
1031 es[v].sorted_writer = sort_create_writer (&ordering, examine->ex_proto);
1032 es[v].sorted_reader = NULL;
1033
1034 es[v].mom = moments_create (MOMENT_KURTOSIS);
1035 es[v].cmin = DBL_MAX;
1036
1037 es[v].maximum = -DBL_MAX;
1038 es[v].minimum = DBL_MAX;
1039 }
1040
1041 subcase_destroy (&ordering);
1042 return es;
1043 }
1044
1045 static void
update_n(const void * aux1,void * aux2 UNUSED,void * user_data,const struct ccase * c,double weight)1046 update_n (const void *aux1, void *aux2 UNUSED, void *user_data,
1047 const struct ccase *c, double weight)
1048 {
1049 int v;
1050 const struct examine *examine = aux1;
1051 struct exploratory_stats *es = user_data;
1052
1053 bool this_case_is_missing = false;
1054 /* LISTWISE missing must be dealt with here */
1055 if (!examine->missing_pw)
1056 {
1057 for (v = 0; v < examine->n_dep_vars; v++)
1058 {
1059 const struct variable *var = examine->dep_vars[v];
1060
1061 if (var_is_value_missing (var, case_data (c, var), examine->dep_excl))
1062 {
1063 es[v].missing += weight;
1064 this_case_is_missing = true;
1065 }
1066 }
1067 }
1068
1069 if (this_case_is_missing)
1070 return;
1071
1072 for (v = 0; v < examine->n_dep_vars; v++)
1073 {
1074 struct ccase *outcase ;
1075 const struct variable *var = examine->dep_vars[v];
1076 const double x = case_data (c, var)->f;
1077
1078 if (var_is_value_missing (var, case_data (c, var), examine->dep_excl))
1079 {
1080 es[v].missing += weight;
1081 continue;
1082 }
1083
1084 outcase = case_create (examine->ex_proto);
1085
1086 if (x > es[v].maximum)
1087 es[v].maximum = x;
1088
1089 if (x < es[v].minimum)
1090 es[v].minimum = x;
1091
1092 es[v].non_missing += weight;
1093
1094 moments_pass_one (es[v].mom, x, weight);
1095
1096 /* Save the value and the ID to the writer */
1097 assert (examine->id_idx != -1);
1098 case_data_rw_idx (outcase, EX_VAL)->f = x;
1099 value_copy (case_data_rw_idx (outcase, EX_ID),
1100 case_data_idx (c, examine->id_idx), examine->id_width);
1101
1102 case_data_rw_idx (outcase, EX_WT)->f = weight;
1103
1104 es[v].cc += weight;
1105
1106 if (es[v].cmin > weight)
1107 es[v].cmin = weight;
1108
1109 casewriter_write (es[v].sorted_writer, outcase);
1110 }
1111 }
1112
1113 static void
calculate_n(const void * aux1,void * aux2 UNUSED,void * user_data)1114 calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data)
1115 {
1116 int v;
1117 const struct examine *examine = aux1;
1118 struct exploratory_stats *es = user_data;
1119
1120 for (v = 0; v < examine->n_dep_vars; v++)
1121 {
1122 int i;
1123 casenumber imin = 0;
1124 casenumber imax;
1125 struct casereader *reader;
1126 struct ccase *c;
1127
1128 if (examine->plot & PLOT_HISTOGRAM && es[v].non_missing > 0)
1129 {
1130 /* Sturges Rule */
1131 double bin_width = fabs (es[v].minimum - es[v].maximum)
1132 / (1 + log2 (es[v].cc))
1133 ;
1134
1135 es[v].histogram =
1136 histogram_create (bin_width, es[v].minimum, es[v].maximum);
1137 }
1138
1139 es[v].sorted_reader = casewriter_make_reader (es[v].sorted_writer);
1140 es[v].sorted_writer = NULL;
1141
1142 imax = casereader_get_case_cnt (es[v].sorted_reader);
1143
1144 es[v].maxima = pool_calloc (examine->pool, examine->calc_extremes, sizeof (*es[v].maxima));
1145 es[v].minima = pool_calloc (examine->pool, examine->calc_extremes, sizeof (*es[v].minima));
1146 for (i = 0; i < examine->calc_extremes; ++i)
1147 {
1148 value_init_pool (examine->pool, &es[v].maxima[i].identity, examine->id_width) ;
1149 value_init_pool (examine->pool, &es[v].minima[i].identity, examine->id_width) ;
1150 }
1151
1152 bool warn = true;
1153 for (reader = casereader_clone (es[v].sorted_reader);
1154 (c = casereader_read (reader)) != NULL; case_unref (c))
1155 {
1156 const double val = case_data_idx (c, EX_VAL)->f;
1157 double wt = case_data_idx (c, EX_WT)->f;
1158 wt = var_force_valid_weight (examine->wv, wt, &warn);
1159
1160 moments_pass_two (es[v].mom, val, wt);
1161
1162 if (es[v].histogram)
1163 histogram_add (es[v].histogram, val, wt);
1164
1165 if (imin < examine->calc_extremes)
1166 {
1167 int x;
1168 for (x = imin; x < examine->calc_extremes; ++x)
1169 {
1170 struct extremity *min = &es[v].minima[x];
1171 min->val = val;
1172 value_copy (&min->identity, case_data_idx (c, EX_ID), examine->id_width);
1173 }
1174 imin ++;
1175 }
1176
1177 imax --;
1178 if (imax < examine->calc_extremes)
1179 {
1180 int x;
1181
1182 for (x = imax; x < imax + 1; ++x)
1183 {
1184 struct extremity *max;
1185
1186 if (x >= examine->calc_extremes)
1187 break;
1188
1189 max = &es[v].maxima[x];
1190 max->val = val;
1191 value_copy (&max->identity, case_data_idx (c, EX_ID), examine->id_width);
1192 }
1193 }
1194 }
1195 casereader_destroy (reader);
1196
1197 if (examine->calc_extremes > 0 && es[v].non_missing > 0)
1198 {
1199 assert (es[v].minima[0].val == es[v].minimum);
1200 assert (es[v].maxima[0].val == es[v].maximum);
1201 }
1202
1203 {
1204 const int n_os = 5 + examine->n_percentiles;
1205 struct order_stats **os ;
1206 es[v].percentiles = pool_calloc (examine->pool, examine->n_percentiles, sizeof (*es[v].percentiles));
1207
1208 es[v].trimmed_mean = trimmed_mean_create (es[v].cc, 0.05);
1209 es[v].shapiro_wilk = NULL;
1210
1211 os = xcalloc (n_os, sizeof *os);
1212 os[0] = &es[v].trimmed_mean->parent;
1213
1214 es[v].quartiles[0] = percentile_create (0.25, es[v].cc);
1215 es[v].quartiles[1] = percentile_create (0.5, es[v].cc);
1216 es[v].quartiles[2] = percentile_create (0.75, es[v].cc);
1217
1218 os[1] = &es[v].quartiles[0]->parent;
1219 os[2] = &es[v].quartiles[1]->parent;
1220 os[3] = &es[v].quartiles[2]->parent;
1221
1222 es[v].hinges = tukey_hinges_create (es[v].cc, es[v].cmin);
1223 os[4] = &es[v].hinges->parent;
1224
1225 for (i = 0; i < examine->n_percentiles; ++i)
1226 {
1227 es[v].percentiles[i] = percentile_create (examine->ptiles[i] / 100.00, es[v].cc);
1228 os[5 + i] = &es[v].percentiles[i]->parent;
1229 }
1230
1231 order_stats_accumulate_idx (os, n_os,
1232 casereader_clone (es[v].sorted_reader),
1233 EX_WT, EX_VAL);
1234
1235 free (os);
1236 }
1237
1238 if (examine->plot & PLOT_BOXPLOT)
1239 {
1240 struct order_stats *os;
1241
1242 es[v].box_whisker = box_whisker_create (es[v].hinges,
1243 EX_ID, examine->id_var);
1244
1245 os = &es[v].box_whisker->parent;
1246 order_stats_accumulate_idx (&os, 1,
1247 casereader_clone (es[v].sorted_reader),
1248 EX_WT, EX_VAL);
1249 }
1250
1251 if (examine->plot)
1252 {
1253 double mean;
1254
1255 moments_calculate (es[v].mom, NULL, &mean, NULL, NULL, NULL);
1256
1257 es[v].shapiro_wilk = shapiro_wilk_create (es[v].non_missing, mean);
1258
1259 if (es[v].shapiro_wilk)
1260 {
1261 struct order_stats *os = &es[v].shapiro_wilk->parent;
1262 order_stats_accumulate_idx (&os, 1,
1263 casereader_clone (es[v].sorted_reader),
1264 EX_WT, EX_VAL);
1265 }
1266 }
1267
1268 if (examine->plot & PLOT_NPPLOT)
1269 {
1270 double n, mean, var;
1271 struct order_stats *os;
1272
1273 moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL);
1274
1275 es[v].np = np_create (n, mean, var);
1276
1277 os = &es[v].np->parent;
1278
1279 order_stats_accumulate_idx (&os, 1,
1280 casereader_clone (es[v].sorted_reader),
1281 EX_WT, EX_VAL);
1282 }
1283
1284 }
1285 }
1286
1287 static void
cleanup_exploratory_stats(struct examine * cmd)1288 cleanup_exploratory_stats (struct examine *cmd)
1289 {
1290 int i;
1291 for (i = 0; i < cmd->n_iacts; ++i)
1292 {
1293 int v;
1294 const size_t n_cats = categoricals_n_count (cmd->cats, i);
1295
1296 for (v = 0; v < cmd->n_dep_vars; ++v)
1297 {
1298 int grp;
1299 for (grp = 0; grp < n_cats; ++grp)
1300 {
1301 int q;
1302 const struct exploratory_stats *es =
1303 categoricals_get_user_data_by_category_real (cmd->cats, i, grp);
1304
1305 struct order_stats *os = &es[v].hinges->parent;
1306 struct statistic *stat = &os->parent;
1307 stat->destroy (stat);
1308
1309 for (q = 0; q < 3 ; q++)
1310 {
1311 os = &es[v].quartiles[q]->parent;
1312 stat = &os->parent;
1313 stat->destroy (stat);
1314 }
1315
1316 for (q = 0; q < cmd->n_percentiles ; q++)
1317 {
1318 os = &es[v].percentiles[q]->parent;
1319 stat = &os->parent;
1320 stat->destroy (stat);
1321 }
1322
1323 if (es[v].shapiro_wilk)
1324 {
1325 stat = &es[v].shapiro_wilk->parent.parent;
1326 stat->destroy (stat);
1327 }
1328
1329 os = &es[v].trimmed_mean->parent;
1330 stat = &os->parent;
1331 stat->destroy (stat);
1332
1333 os = &es[v].np->parent;
1334 if (os)
1335 {
1336 stat = &os->parent;
1337 stat->destroy (stat);
1338 }
1339
1340 statistic_destroy (&es[v].histogram->parent);
1341 moments_destroy (es[v].mom);
1342
1343 if (es[v].box_whisker)
1344 {
1345 stat = &es[v].box_whisker->parent.parent;
1346 stat->destroy (stat);
1347 }
1348
1349 casereader_destroy (es[v].sorted_reader);
1350 }
1351 }
1352 }
1353 }
1354
1355
1356 static void
run_examine(struct examine * cmd,struct casereader * input)1357 run_examine (struct examine *cmd, struct casereader *input)
1358 {
1359 int i;
1360 struct ccase *c;
1361 struct casereader *reader;
1362
1363 struct payload payload;
1364 payload.create = create_n;
1365 payload.update = update_n;
1366 payload.calculate = calculate_n;
1367 payload.destroy = NULL;
1368
1369 cmd->wv = dict_get_weight (cmd->dict);
1370
1371 cmd->cats
1372 = categoricals_create (cmd->iacts, cmd->n_iacts, cmd->wv, cmd->fctr_excl);
1373
1374 categoricals_set_payload (cmd->cats, &payload, cmd, NULL);
1375
1376 if (cmd->id_var == NULL)
1377 {
1378 struct ccase *c = casereader_peek (input, 0);
1379
1380 cmd->id_idx = case_get_value_cnt (c);
1381 input = casereader_create_arithmetic_sequence (input, 1.0, 1.0);
1382
1383 case_unref (c);
1384 }
1385
1386 for (reader = input;
1387 (c = casereader_read (reader)) != NULL; case_unref (c))
1388 {
1389 categoricals_update (cmd->cats, c);
1390 }
1391 casereader_destroy (reader);
1392 categoricals_done (cmd->cats);
1393
1394 for (i = 0; i < cmd->n_iacts; ++i)
1395 {
1396 summary_report (cmd, i);
1397
1398 const size_t n_cats = categoricals_n_count (cmd->cats, i);
1399 if (n_cats == 0)
1400 continue;
1401
1402 if (cmd->disp_extremes > 0)
1403 extremes_report (cmd, i);
1404
1405 if (cmd->n_percentiles > 0)
1406 percentiles_report (cmd, i);
1407
1408 if (cmd->plot & PLOT_BOXPLOT)
1409 {
1410 switch (cmd->boxplot_mode)
1411 {
1412 case BP_GROUPS:
1413 show_boxplot_grouped (cmd, i);
1414 break;
1415 case BP_VARIABLES:
1416 show_boxplot_variabled (cmd, i);
1417 break;
1418 default:
1419 NOT_REACHED ();
1420 break;
1421 }
1422 }
1423
1424 if (cmd->plot & PLOT_HISTOGRAM)
1425 show_histogram (cmd, i);
1426
1427 if (cmd->plot & PLOT_NPPLOT)
1428 show_npplot (cmd, i);
1429
1430 if (cmd->plot & PLOT_SPREADLEVEL)
1431 show_spreadlevel (cmd, i);
1432
1433 if (cmd->descriptives)
1434 descriptives_report (cmd, i);
1435
1436 if (cmd->plot)
1437 normality_report (cmd, i);
1438 }
1439
1440 cleanup_exploratory_stats (cmd);
1441 categoricals_destroy (cmd->cats);
1442 }
1443
1444
1445 int
cmd_examine(struct lexer * lexer,struct dataset * ds)1446 cmd_examine (struct lexer *lexer, struct dataset *ds)
1447 {
1448 int i;
1449 bool nototals_seen = false;
1450 bool totals_seen = false;
1451
1452 struct interaction **iacts_mem = NULL;
1453 struct examine examine;
1454 bool percentiles_seen = false;
1455
1456 examine.missing_pw = false;
1457 examine.disp_extremes = 0;
1458 examine.calc_extremes = 0;
1459 examine.descriptives = false;
1460 examine.conf = 0.95;
1461 examine.pc_alg = PC_HAVERAGE;
1462 examine.ptiles = NULL;
1463 examine.n_percentiles = 0;
1464 examine.id_idx = -1;
1465 examine.id_width = 0;
1466 examine.id_var = NULL;
1467 examine.boxplot_mode = BP_GROUPS;
1468
1469 examine.ex_proto = caseproto_create ();
1470
1471 examine.pool = pool_create ();
1472
1473 /* Allocate space for the first interaction.
1474 This is interaction is an empty one (for the totals).
1475 If no totals are requested, we will simply ignore this
1476 interaction.
1477 */
1478 examine.n_iacts = 1;
1479 examine.iacts = iacts_mem = pool_zalloc (examine.pool, sizeof (struct interaction *));
1480 examine.iacts[0] = interaction_create (NULL);
1481
1482 examine.dep_excl = MV_ANY;
1483 examine.fctr_excl = MV_ANY;
1484 examine.plot = 0;
1485 examine.sl_power = 0;
1486 examine.dep_vars = NULL;
1487 examine.n_dep_vars = 0;
1488 examine.dict = dataset_dict (ds);
1489
1490 /* Accept an optional, completely pointless "/VARIABLES=" */
1491 lex_match (lexer, T_SLASH);
1492 if (lex_match_id (lexer, "VARIABLES"))
1493 {
1494 if (! lex_force_match (lexer, T_EQUALS))
1495 goto error;
1496 }
1497
1498 if (!parse_variables_const (lexer, examine.dict,
1499 &examine.dep_vars, &examine.n_dep_vars,
1500 PV_NO_DUPLICATE | PV_NUMERIC))
1501 goto error;
1502
1503 if (lex_match (lexer, T_BY))
1504 {
1505 struct interaction *iact = NULL;
1506 do
1507 {
1508 iact = parse_interaction (lexer, &examine);
1509 if (iact)
1510 {
1511 examine.n_iacts++;
1512 iacts_mem =
1513 pool_nrealloc (examine.pool, iacts_mem,
1514 examine.n_iacts,
1515 sizeof (*iacts_mem));
1516
1517 iacts_mem[examine.n_iacts - 1] = iact;
1518 }
1519 }
1520 while (iact);
1521 }
1522
1523
1524 while (lex_token (lexer) != T_ENDCMD)
1525 {
1526 lex_match (lexer, T_SLASH);
1527
1528 if (lex_match_id (lexer, "STATISTICS"))
1529 {
1530 lex_match (lexer, T_EQUALS);
1531
1532 while (lex_token (lexer) != T_ENDCMD
1533 && lex_token (lexer) != T_SLASH)
1534 {
1535 if (lex_match_id (lexer, "DESCRIPTIVES"))
1536 {
1537 examine.descriptives = true;
1538 }
1539 else if (lex_match_id (lexer, "EXTREME"))
1540 {
1541 int extr = 5;
1542 if (lex_match (lexer, T_LPAREN))
1543 {
1544 if (!lex_force_int (lexer))
1545 goto error;
1546 extr = lex_integer (lexer);
1547
1548 if (extr < 0)
1549 {
1550 msg (MW, _("%s may not be negative. Using default value (%g)."), "EXTREME", 5.0);
1551 extr = 5;
1552 }
1553
1554 lex_get (lexer);
1555 if (! lex_force_match (lexer, T_RPAREN))
1556 goto error;
1557 }
1558 examine.disp_extremes = extr;
1559 }
1560 else if (lex_match_id (lexer, "NONE"))
1561 {
1562 }
1563 else if (lex_match (lexer, T_ALL))
1564 {
1565 if (examine.disp_extremes == 0)
1566 examine.disp_extremes = 5;
1567 }
1568 else
1569 {
1570 lex_error (lexer, NULL);
1571 goto error;
1572 }
1573 }
1574 }
1575 else if (lex_match_id (lexer, "PERCENTILES"))
1576 {
1577 percentiles_seen = true;
1578 if (lex_match (lexer, T_LPAREN))
1579 {
1580 while (lex_is_number (lexer))
1581 {
1582 double p = lex_number (lexer);
1583
1584 if (p <= 0 || p >= 100.0)
1585 {
1586 lex_error (lexer,
1587 _("Percentiles must lie in the range (0, 100)"));
1588 goto error;
1589 }
1590
1591 examine.n_percentiles++;
1592 examine.ptiles =
1593 xrealloc (examine.ptiles,
1594 sizeof (*examine.ptiles) *
1595 examine.n_percentiles);
1596
1597 examine.ptiles[examine.n_percentiles - 1] = p;
1598
1599 lex_get (lexer);
1600 lex_match (lexer, T_COMMA);
1601 }
1602 if (!lex_force_match (lexer, T_RPAREN))
1603 goto error;
1604 }
1605
1606 lex_match (lexer, T_EQUALS);
1607
1608 while (lex_token (lexer) != T_ENDCMD
1609 && lex_token (lexer) != T_SLASH)
1610 {
1611 if (lex_match_id (lexer, "HAVERAGE"))
1612 {
1613 examine.pc_alg = PC_HAVERAGE;
1614 }
1615 else if (lex_match_id (lexer, "WAVERAGE"))
1616 {
1617 examine.pc_alg = PC_WAVERAGE;
1618 }
1619 else if (lex_match_id (lexer, "ROUND"))
1620 {
1621 examine.pc_alg = PC_ROUND;
1622 }
1623 else if (lex_match_id (lexer, "EMPIRICAL"))
1624 {
1625 examine.pc_alg = PC_EMPIRICAL;
1626 }
1627 else if (lex_match_id (lexer, "AEMPIRICAL"))
1628 {
1629 examine.pc_alg = PC_AEMPIRICAL;
1630 }
1631 else if (lex_match_id (lexer, "NONE"))
1632 {
1633 examine.pc_alg = PC_NONE;
1634 }
1635 else
1636 {
1637 lex_error (lexer, NULL);
1638 goto error;
1639 }
1640 }
1641 }
1642 else if (lex_match_id (lexer, "TOTAL"))
1643 {
1644 totals_seen = true;
1645 }
1646 else if (lex_match_id (lexer, "NOTOTAL"))
1647 {
1648 nototals_seen = true;
1649 }
1650 else if (lex_match_id (lexer, "MISSING"))
1651 {
1652 lex_match (lexer, T_EQUALS);
1653
1654 while (lex_token (lexer) != T_ENDCMD
1655 && lex_token (lexer) != T_SLASH)
1656 {
1657 if (lex_match_id (lexer, "LISTWISE"))
1658 {
1659 examine.missing_pw = false;
1660 }
1661 else if (lex_match_id (lexer, "PAIRWISE"))
1662 {
1663 examine.missing_pw = true;
1664 }
1665 else if (lex_match_id (lexer, "EXCLUDE"))
1666 {
1667 examine.dep_excl = MV_ANY;
1668 }
1669 else if (lex_match_id (lexer, "INCLUDE"))
1670 {
1671 examine.dep_excl = MV_SYSTEM;
1672 }
1673 else if (lex_match_id (lexer, "REPORT"))
1674 {
1675 examine.fctr_excl = MV_NEVER;
1676 }
1677 else if (lex_match_id (lexer, "NOREPORT"))
1678 {
1679 examine.fctr_excl = MV_ANY;
1680 }
1681 else
1682 {
1683 lex_error (lexer, NULL);
1684 goto error;
1685 }
1686 }
1687 }
1688 else if (lex_match_id (lexer, "COMPARE"))
1689 {
1690 lex_match (lexer, T_EQUALS);
1691 if (lex_match_id (lexer, "VARIABLES"))
1692 {
1693 examine.boxplot_mode = BP_VARIABLES;
1694 }
1695 else if (lex_match_id (lexer, "GROUPS"))
1696 {
1697 examine.boxplot_mode = BP_GROUPS;
1698 }
1699 else
1700 {
1701 lex_error (lexer, NULL);
1702 goto error;
1703 }
1704 }
1705 else if (lex_match_id (lexer, "PLOT"))
1706 {
1707 lex_match (lexer, T_EQUALS);
1708
1709 while (lex_token (lexer) != T_ENDCMD
1710 && lex_token (lexer) != T_SLASH)
1711 {
1712 if (lex_match_id (lexer, "BOXPLOT"))
1713 {
1714 examine.plot |= PLOT_BOXPLOT;
1715 }
1716 else if (lex_match_id (lexer, "NPPLOT"))
1717 {
1718 examine.plot |= PLOT_NPPLOT;
1719 }
1720 else if (lex_match_id (lexer, "HISTOGRAM"))
1721 {
1722 examine.plot |= PLOT_HISTOGRAM;
1723 }
1724 else if (lex_match_id (lexer, "SPREADLEVEL"))
1725 {
1726 examine.plot |= PLOT_SPREADLEVEL;
1727 examine.sl_power = 0;
1728 if (lex_match (lexer, T_LPAREN) && lex_force_int (lexer))
1729 {
1730 examine.sl_power = lex_integer (lexer);
1731
1732 lex_get (lexer);
1733 if (! lex_force_match (lexer, T_RPAREN))
1734 goto error;
1735 }
1736 }
1737 else if (lex_match_id (lexer, "NONE"))
1738 {
1739 examine.plot = 0;
1740 }
1741 else if (lex_match (lexer, T_ALL))
1742 {
1743 examine.plot = ~0;
1744 }
1745 else
1746 {
1747 lex_error (lexer, NULL);
1748 goto error;
1749 }
1750 lex_match (lexer, T_COMMA);
1751 }
1752 }
1753 else if (lex_match_id (lexer, "CINTERVAL"))
1754 {
1755 if (!lex_force_num (lexer))
1756 goto error;
1757
1758 examine.conf = lex_number (lexer);
1759 lex_get (lexer);
1760 }
1761 else if (lex_match_id (lexer, "ID"))
1762 {
1763 lex_match (lexer, T_EQUALS);
1764
1765 examine.id_var = parse_variable_const (lexer, examine.dict);
1766 }
1767 else
1768 {
1769 lex_error (lexer, NULL);
1770 goto error;
1771 }
1772 }
1773
1774
1775 if (totals_seen && nototals_seen)
1776 {
1777 msg (SE, _("%s and %s are mutually exclusive"), "TOTAL", "NOTOTAL");
1778 goto error;
1779 }
1780
1781 /* If totals have been requested or if there are no factors
1782 in this analysis, then the totals need to be included. */
1783 if (!nototals_seen || examine.n_iacts == 1)
1784 {
1785 examine.iacts = &iacts_mem[0];
1786 }
1787 else
1788 {
1789 examine.n_iacts--;
1790 examine.iacts = &iacts_mem[1];
1791 interaction_destroy (iacts_mem[0]);
1792 }
1793
1794
1795 if (examine.id_var)
1796 {
1797 examine.id_idx = var_get_case_index (examine.id_var);
1798 examine.id_width = var_get_width (examine.id_var);
1799 }
1800
1801 examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* value */
1802 examine.ex_proto = caseproto_add_width (examine.ex_proto, examine.id_width); /* id */
1803 examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* weight */
1804
1805
1806 if (examine.disp_extremes > 0)
1807 {
1808 examine.calc_extremes = examine.disp_extremes;
1809 }
1810
1811 if (examine.descriptives && examine.calc_extremes == 0)
1812 {
1813 /* Descriptives always displays the max and min */
1814 examine.calc_extremes = 1;
1815 }
1816
1817 if (percentiles_seen && examine.n_percentiles == 0)
1818 {
1819 examine.n_percentiles = 7;
1820 examine.ptiles = xcalloc (examine.n_percentiles,
1821 sizeof (*examine.ptiles));
1822
1823 examine.ptiles[0] = 5;
1824 examine.ptiles[1] = 10;
1825 examine.ptiles[2] = 25;
1826 examine.ptiles[3] = 50;
1827 examine.ptiles[4] = 75;
1828 examine.ptiles[5] = 90;
1829 examine.ptiles[6] = 95;
1830 }
1831
1832 assert (examine.calc_extremes >= examine.disp_extremes);
1833 {
1834 struct casegrouper *grouper;
1835 struct casereader *group;
1836 bool ok;
1837
1838 grouper = casegrouper_create_splits (proc_open (ds), examine.dict);
1839 while (casegrouper_get_next_group (grouper, &group))
1840 run_examine (&examine, group);
1841 ok = casegrouper_destroy (grouper);
1842 ok = proc_commit (ds) && ok;
1843 }
1844
1845 caseproto_unref (examine.ex_proto);
1846
1847 for (i = 0; i < examine.n_iacts; ++i)
1848 interaction_destroy (examine.iacts[i]);
1849 free (examine.ptiles);
1850 free (examine.dep_vars);
1851 pool_destroy (examine.pool);
1852
1853 return CMD_SUCCESS;
1854
1855 error:
1856 caseproto_unref (examine.ex_proto);
1857 examine.iacts = iacts_mem;
1858 for (i = 0; i < examine.n_iacts; ++i)
1859 interaction_destroy (examine.iacts[i]);
1860 free (examine.dep_vars);
1861 free (examine.ptiles);
1862 pool_destroy (examine.pool);
1863
1864 return CMD_FAILURE;
1865 }
1866