1 /*
2   PSPP - a program for statistical analysis.
3   Copyright (C) 2012, 2013, 2016, 2019  Free Software Foundation, Inc.
4 
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation, either version 3 of the License, or
8   (at your option) any later version.
9 
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14 
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <config.h>
20 
21 #include <math.h>
22 #include <gsl/gsl_cdf.h>
23 
24 #include "libpspp/assertion.h"
25 #include "libpspp/message.h"
26 #include "libpspp/pool.h"
27 
28 
29 #include "data/dataset.h"
30 #include "data/dictionary.h"
31 #include "data/casegrouper.h"
32 #include "data/casereader.h"
33 #include "data/casewriter.h"
34 #include "data/caseproto.h"
35 #include "data/subcase.h"
36 
37 
38 #include "data/format.h"
39 
40 #include "math/interaction.h"
41 #include "math/box-whisker.h"
42 #include "math/categoricals.h"
43 #include "math/chart-geometry.h"
44 #include "math/histogram.h"
45 #include "math/moments.h"
46 #include "math/np.h"
47 #include "math/sort.h"
48 #include "math/order-stats.h"
49 #include "math/percentiles.h"
50 #include "math/shapiro-wilk.h"
51 #include "math/tukey-hinges.h"
52 #include "math/trimmed-mean.h"
53 
54 #include "output/charts/boxplot.h"
55 #include "output/charts/np-plot.h"
56 #include "output/charts/spreadlevel-plot.h"
57 #include "output/charts/plot-hist.h"
58 
59 #include "language/command.h"
60 #include "language/lexer/lexer.h"
61 #include "language/lexer/value-parser.h"
62 #include "language/lexer/variable-parser.h"
63 
64 #include "output/pivot-table.h"
65 
66 #include "gettext.h"
67 #define _(msgid) gettext (msgid)
68 #define N_(msgid) msgid
69 
70 static void
append_value_name(const struct variable * var,const union value * val,struct string * str)71 append_value_name (const struct variable *var, const union value *val, struct string *str)
72 {
73   var_append_value_name (var, val, str);
74   if (var_is_value_missing (var, val, MV_ANY))
75     ds_put_cstr (str, _(" (missing)"));
76 }
77 
78 enum bp_mode
79   {
80     BP_GROUPS,
81     BP_VARIABLES
82   };
83 
84 
85 /* Indices for the ex_proto member (below) */
86 enum
87   {
88     EX_VAL,  /* value */
89     EX_ID,   /* identity */
90     EX_WT    /* weight */
91   };
92 
93 
94 #define PLOT_HISTOGRAM      0x1
95 #define PLOT_BOXPLOT        0x2
96 #define PLOT_NPPLOT         0x4
97 #define PLOT_SPREADLEVEL    0x8
98 
99 struct examine
100 {
101   struct pool *pool;
102 
103   /* A caseproto used to contain the data subsets under examination,
104      see (enum above)   */
105   struct caseproto *ex_proto;
106 
107   size_t n_dep_vars;
108   const struct variable **dep_vars;
109 
110   size_t n_iacts;
111   struct interaction **iacts;
112 
113   enum mv_class dep_excl;
114   enum mv_class fctr_excl;
115 
116   const struct dictionary *dict;
117 
118   struct categoricals *cats;
119 
120   /* how many extremities to display */
121   int disp_extremes;
122   int calc_extremes;
123   bool descriptives;
124 
125   double conf;
126 
127   bool missing_pw;
128 
129   /* The case index of the ID value (or -1) if not applicable */
130   size_t id_idx;
131   int id_width;
132 
133   enum pc_alg pc_alg;
134   double *ptiles;
135   size_t n_percentiles;
136 
137   unsigned int plot;
138   int sl_power;
139 
140   enum bp_mode boxplot_mode;
141 
142   const struct variable *id_var;
143 
144   const struct variable *wv;
145 };
146 
147 struct extremity
148 {
149   /* The value of this extremity */
150   double val;
151 
152   /* Either the casenumber or the value of the variable specified
153      by the /ID subcommand which corresponds to this extremity */
154   union value identity;
155 };
156 
157 struct exploratory_stats
158 {
159   double missing;
160   double non_missing;
161 
162   struct moments *mom;
163 
164   /* Most operations need a sorted reader/writer */
165   struct casewriter *sorted_writer;
166   struct casereader *sorted_reader;
167 
168   struct extremity *minima;
169   struct extremity *maxima;
170 
171   /*
172      Minimum should alway equal mimima[0].val.
173      Likewise, maximum should alway equal maxima[0].val.
174      This redundancy exists as an optimisation effort.
175      Some statistics (eg histogram) require early calculation
176      of the min and max
177   */
178   double minimum;
179   double maximum;
180 
181   struct trimmed_mean *trimmed_mean;
182   struct percentile *quartiles[3];
183   struct percentile **percentiles;
184   struct shapiro_wilk *shapiro_wilk;
185 
186   struct tukey_hinges *hinges;
187 
188   /* The data for the NP Plots */
189   struct np *np;
190 
191   struct histogram *histogram;
192 
193   /* The data for the box plots */
194   struct box_whisker *box_whisker;
195 
196   /* Total weight */
197   double cc;
198 
199   /* The minimum weight */
200   double cmin;
201 };
202 
203 static void
show_boxplot_grouped(const struct examine * cmd,int iact_idx)204 show_boxplot_grouped (const struct examine *cmd, int iact_idx)
205 {
206   int v;
207 
208   const struct interaction *iact = cmd->iacts[iact_idx];
209   const size_t n_cats =  categoricals_n_count (cmd->cats, iact_idx);
210 
211   for (v = 0; v < cmd->n_dep_vars; ++v)
212     {
213       double y_min = DBL_MAX;
214       double y_max = -DBL_MAX;
215       int grp;
216       struct boxplot *boxplot;
217       struct string title;
218       ds_init_empty (&title);
219 
220       if (iact->n_vars > 0)
221         {
222           struct string istr;
223           ds_init_empty (&istr);
224           interaction_to_string (iact, &istr);
225           ds_put_format (&title, _("Boxplot of %s vs. %s"),
226                          var_to_string (cmd->dep_vars[v]),
227                          ds_cstr (&istr));
228           ds_destroy (&istr);
229         }
230       else
231         ds_put_format (&title, _("Boxplot of %s"), var_to_string (cmd->dep_vars[v]));
232 
233       for (grp = 0; grp < n_cats; ++grp)
234         {
235           const struct exploratory_stats *es =
236             categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
237 
238           if (y_min > es[v].minimum)
239             y_min = es[v].minimum;
240 
241           if (y_max < es[v].maximum)
242             y_max = es[v].maximum;
243         }
244 
245       boxplot = boxplot_create (y_min, y_max, ds_cstr (&title));
246 
247       ds_destroy (&title);
248 
249       for (grp = 0; grp < n_cats; ++grp)
250         {
251           int ivar_idx;
252           struct string label;
253 
254           const struct ccase *c =
255             categoricals_get_case_by_category_real (cmd->cats,  iact_idx, grp);
256 
257           struct exploratory_stats *es =
258             categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
259 
260           ds_init_empty (&label);
261           for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
262             {
263               struct string l;
264               const struct variable *ivar = iact->vars[ivar_idx];
265               const union value *val = case_data (c, ivar);
266               ds_init_empty (&l);
267 
268               append_value_name (ivar, val, &l);
269               ds_ltrim (&l, ss_cstr (" "));
270 
271               ds_put_substring (&label, l.ss);
272               if (ivar_idx < iact->n_vars - 1)
273                 ds_put_cstr (&label, "; ");
274 
275               ds_destroy (&l);
276             }
277 
278           boxplot_add_box (boxplot, es[v].box_whisker, ds_cstr (&label));
279           es[v].box_whisker = NULL;
280 
281           ds_destroy (&label);
282         }
283 
284       boxplot_submit (boxplot);
285     }
286 }
287 
288 static void
show_boxplot_variabled(const struct examine * cmd,int iact_idx)289 show_boxplot_variabled (const struct examine *cmd, int iact_idx)
290 {
291   int grp;
292   const struct interaction *iact = cmd->iacts[iact_idx];
293   const size_t n_cats =  categoricals_n_count (cmd->cats, iact_idx);
294 
295   for (grp = 0; grp < n_cats; ++grp)
296     {
297       struct boxplot *boxplot;
298       int v;
299       double y_min = DBL_MAX;
300       double y_max = -DBL_MAX;
301 
302       const struct ccase *c =
303 	categoricals_get_case_by_category_real (cmd->cats,  iact_idx, grp);
304 
305       struct string title;
306       ds_init_empty (&title);
307 
308       for (v = 0; v < cmd->n_dep_vars; ++v)
309         {
310           const struct exploratory_stats *es =
311             categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
312 
313           if (y_min > es[v].minimum)
314             y_min = es[v].minimum;
315 
316           if (y_max < es[v].maximum)
317             y_max = es[v].maximum;
318         }
319 
320       if (iact->n_vars == 0)
321         ds_put_format (&title, _("Boxplot"));
322       else
323         {
324           int ivar_idx;
325           struct string label;
326           ds_init_empty (&label);
327           for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
328             {
329               const struct variable *ivar = iact->vars[ivar_idx];
330               const union value *val = case_data (c, ivar);
331 
332               ds_put_cstr (&label, var_to_string (ivar));
333               ds_put_cstr (&label, " = ");
334               append_value_name (ivar, val, &label);
335               ds_put_cstr (&label, "; ");
336             }
337 
338           ds_put_format (&title, _("Boxplot of %s"),
339                          ds_cstr (&label));
340 
341           ds_destroy (&label);
342         }
343 
344       boxplot = boxplot_create (y_min, y_max, ds_cstr (&title));
345 
346       ds_destroy (&title);
347 
348       for (v = 0; v < cmd->n_dep_vars; ++v)
349         {
350           struct exploratory_stats *es =
351             categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
352 
353           boxplot_add_box (boxplot, es[v].box_whisker,
354                            var_to_string (cmd->dep_vars[v]));
355           es[v].box_whisker = NULL;
356         }
357 
358       boxplot_submit (boxplot);
359     }
360 }
361 
362 
363 static void
show_npplot(const struct examine * cmd,int iact_idx)364 show_npplot (const struct examine *cmd, int iact_idx)
365 {
366   const struct interaction *iact = cmd->iacts[iact_idx];
367   const size_t n_cats =  categoricals_n_count (cmd->cats, iact_idx);
368 
369   int v;
370 
371   for (v = 0; v < cmd->n_dep_vars; ++v)
372     {
373       int grp;
374       for (grp = 0; grp < n_cats; ++grp)
375         {
376           struct chart_item *npp, *dnpp;
377           struct casereader *reader;
378           struct np *np;
379 
380           int ivar_idx;
381           const struct ccase *c =
382             categoricals_get_case_by_category_real (cmd->cats,
383                                                     iact_idx, grp);
384 
385           const struct exploratory_stats *es =
386             categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
387 
388           struct string label;
389           ds_init_cstr (&label,
390                         var_to_string (cmd->dep_vars[v]));
391 
392           if (iact->n_vars > 0)
393             {
394               ds_put_cstr (&label, " (");
395               for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
396                 {
397                   const struct variable *ivar = iact->vars[ivar_idx];
398                   const union value *val = case_data (c, ivar);
399 
400                   ds_put_cstr (&label, var_to_string (ivar));
401                   ds_put_cstr (&label, " = ");
402                   append_value_name (ivar, val, &label);
403                   ds_put_cstr (&label, "; ");
404 
405                 }
406               ds_put_cstr (&label, ")");
407             }
408 
409           np = es[v].np;
410           reader = casewriter_make_reader (np->writer);
411           np->writer = NULL;
412 
413           npp = np_plot_create (np, reader, ds_cstr (&label));
414           dnpp = dnp_plot_create (np, reader, ds_cstr (&label));
415 
416           if (npp == NULL || dnpp == NULL)
417             {
418               msg (MW, _("Not creating NP plot because data set is empty."));
419               chart_item_unref (npp);
420               chart_item_unref (dnpp);
421             }
422           else
423             {
424               chart_item_submit (npp);
425               chart_item_submit (dnpp);
426             }
427 	  casereader_destroy (reader);
428 
429           ds_destroy (&label);
430         }
431     }
432 }
433 
434 static void
show_spreadlevel(const struct examine * cmd,int iact_idx)435 show_spreadlevel (const struct examine *cmd, int iact_idx)
436 {
437   const struct interaction *iact = cmd->iacts[iact_idx];
438   const size_t n_cats =  categoricals_n_count (cmd->cats, iact_idx);
439 
440   int v;
441 
442   /* Spreadlevel when there are no levels is not useful */
443   if (iact->n_vars == 0)
444     return;
445 
446   for (v = 0; v < cmd->n_dep_vars; ++v)
447     {
448       int grp;
449       struct chart_item *sl;
450 
451       struct string label;
452       ds_init_cstr (&label,
453 		    var_to_string (cmd->dep_vars[v]));
454 
455       if (iact->n_vars > 0)
456 	{
457 	  ds_put_cstr (&label, " (");
458 	  interaction_to_string (iact, &label);
459 	  ds_put_cstr (&label, ")");
460 	}
461 
462       sl = spreadlevel_plot_create (ds_cstr (&label), cmd->sl_power);
463 
464       for (grp = 0; grp < n_cats; ++grp)
465         {
466           const struct exploratory_stats *es =
467             categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
468 
469 	  double median = percentile_calculate (es[v].quartiles[1], cmd->pc_alg);
470 
471 	  double iqr = percentile_calculate (es[v].quartiles[2], cmd->pc_alg) -
472 	    percentile_calculate (es[v].quartiles[0], cmd->pc_alg);
473 
474 	  spreadlevel_plot_add (sl, iqr, median);
475 	}
476 
477       if (sl == NULL)
478 	msg (MW, _("Not creating spreadlevel chart for %s"), ds_cstr (&label));
479       else
480 	chart_item_submit (sl);
481 
482       ds_destroy (&label);
483     }
484 }
485 
486 
487 static void
show_histogram(const struct examine * cmd,int iact_idx)488 show_histogram (const struct examine *cmd, int iact_idx)
489 {
490   const struct interaction *iact = cmd->iacts[iact_idx];
491   const size_t n_cats =  categoricals_n_count (cmd->cats, iact_idx);
492 
493   int v;
494 
495   for (v = 0; v < cmd->n_dep_vars; ++v)
496     {
497       int grp;
498       for (grp = 0; grp < n_cats; ++grp)
499         {
500           double n, mean, var;
501           int ivar_idx;
502           const struct ccase *c =
503             categoricals_get_case_by_category_real (cmd->cats,
504                                                     iact_idx, grp);
505 
506           const struct exploratory_stats *es =
507             categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
508 
509           struct string label;
510 
511 	  if (es[v].histogram == NULL)
512 	    continue;
513 
514           ds_init_cstr (&label,
515                         var_to_string (cmd->dep_vars[v]));
516 
517           if (iact->n_vars > 0)
518             {
519               ds_put_cstr (&label, " (");
520               for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
521                 {
522                   const struct variable *ivar = iact->vars[ivar_idx];
523                   const union value *val = case_data (c, ivar);
524 
525                   ds_put_cstr (&label, var_to_string (ivar));
526                   ds_put_cstr (&label, " = ");
527                   append_value_name (ivar, val, &label);
528                   ds_put_cstr (&label, "; ");
529 
530                 }
531               ds_put_cstr (&label, ")");
532             }
533 
534 
535           moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL);
536 
537           chart_item_submit
538             (histogram_chart_create (es[v].histogram->gsl_hist,
539                                       ds_cstr (&label), n, mean,
540                                       sqrt (var), false));
541 
542 
543           ds_destroy (&label);
544         }
545     }
546 }
547 
548 static struct pivot_value *
new_value_with_missing_footnote(const struct variable * var,const union value * value,struct pivot_footnote * missing_footnote)549 new_value_with_missing_footnote (const struct variable *var,
550                                  const union value *value,
551                                  struct pivot_footnote *missing_footnote)
552 {
553   struct pivot_value *pv = pivot_value_new_var_value (var, value);
554   if (var_is_value_missing (var, value, MV_USER))
555     pivot_value_add_footnote (pv, missing_footnote);
556   return pv;
557 }
558 
559 static void
create_interaction_dimensions(struct pivot_table * table,const struct categoricals * cats,const struct interaction * iact,struct pivot_footnote * missing_footnote)560 create_interaction_dimensions (struct pivot_table *table,
561                                const struct categoricals *cats,
562                                const struct interaction *iact,
563                                struct pivot_footnote *missing_footnote)
564 {
565   for (size_t i = iact->n_vars; i-- > 0;)
566     {
567       const struct variable *var = iact->vars[i];
568       struct pivot_dimension *d = pivot_dimension_create__ (
569         table, PIVOT_AXIS_ROW, pivot_value_new_variable (var));
570       d->root->show_label = true;
571 
572       size_t n;
573       union value *values = categoricals_get_var_values (cats, var, &n);
574       for (size_t j = 0; j < n; j++)
575         pivot_category_create_leaf (
576           d->root, new_value_with_missing_footnote (var, &values[j],
577                                                     missing_footnote));
578     }
579 }
580 
581 static struct pivot_footnote *
create_missing_footnote(struct pivot_table * table)582 create_missing_footnote (struct pivot_table *table)
583 {
584   return pivot_table_create_footnote (
585     table, pivot_value_new_text (N_("User-missing value.")));
586 }
587 
588 static void
percentiles_report(const struct examine * cmd,int iact_idx)589 percentiles_report (const struct examine *cmd, int iact_idx)
590 {
591   struct pivot_table *table = pivot_table_create (N_("Percentiles"));
592   table->omit_empty = true;
593 
594   struct pivot_dimension *percentiles = pivot_dimension_create (
595     table, PIVOT_AXIS_COLUMN, N_("Percentiles"));
596   percentiles->root->show_label = true;
597   for (int i = 0; i < cmd->n_percentiles; ++i)
598     pivot_category_create_leaf (
599       percentiles->root,
600       pivot_value_new_user_text_nocopy (xasprintf ("%g", cmd->ptiles[i])));
601 
602   pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Statistics"),
603                           N_("Weighted Average"), N_("Tukey's Hinges"));
604 
605   const struct interaction *iact = cmd->iacts[iact_idx];
606   struct pivot_footnote *missing_footnote = create_missing_footnote (table);
607   create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
608 
609   struct pivot_dimension *dep_dim = pivot_dimension_create (
610     table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
611 
612   size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
613 
614   size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
615   for (size_t v = 0; v < cmd->n_dep_vars; ++v)
616     {
617       indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
618         dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
619 
620       for (size_t i = 0; i < n_cats; ++i)
621         {
622           for (size_t j = 0; j < iact->n_vars; j++)
623             {
624               int idx = categoricals_get_value_index_by_category_real (
625                 cmd->cats, iact_idx, i, j);
626               indexes[table->n_dimensions - 2 - j] = idx;
627             }
628 
629           const struct exploratory_stats *ess
630             = categoricals_get_user_data_by_category_real (
631               cmd->cats, iact_idx, i);
632           const struct exploratory_stats *es = ess + v;
633 
634           double hinges[3];
635           tukey_hinges_calculate (es->hinges, hinges);
636 
637           for (size_t pc_idx = 0; pc_idx < cmd->n_percentiles; ++pc_idx)
638             {
639               indexes[0] = pc_idx;
640 
641               indexes[1] = 0;
642               double value = percentile_calculate (es->percentiles[pc_idx],
643                                                    cmd->pc_alg);
644               pivot_table_put (table, indexes, table->n_dimensions,
645                                pivot_value_new_number (value));
646 
647               double hinge = (cmd->ptiles[pc_idx] == 25.0 ? hinges[0]
648                               : cmd->ptiles[pc_idx] == 50.0 ? hinges[1]
649                               : cmd->ptiles[pc_idx] == 75.0 ? hinges[2]
650                               : SYSMIS);
651               if (hinge != SYSMIS)
652                 {
653                   indexes[1] = 1;
654                   pivot_table_put (table, indexes, table->n_dimensions,
655                                    pivot_value_new_number (hinge));
656                 }
657             }
658         }
659 
660     }
661   free (indexes);
662 
663   pivot_table_submit (table);
664 }
665 
666 static void
normality_report(const struct examine * cmd,int iact_idx)667 normality_report (const struct examine *cmd, int iact_idx)
668 {
669   struct pivot_table *table = pivot_table_create (N_("Tests of Normality"));
670   table->omit_empty = true;
671 
672   struct pivot_dimension *test =
673     pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Shapiro-Wilk"),
674 			    N_("Statistic"),
675 			    N_("df"), PIVOT_RC_COUNT,
676 			    N_("Sig."));
677 
678   test->root->show_label = true;
679 
680   const struct interaction *iact = cmd->iacts[iact_idx];
681   struct pivot_footnote *missing_footnote = create_missing_footnote (table);
682   create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
683 
684   struct pivot_dimension *dep_dim = pivot_dimension_create (
685     table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
686 
687   size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
688 
689   size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
690   for (size_t v = 0; v < cmd->n_dep_vars; ++v)
691     {
692       indexes[table->n_dimensions - 1] =
693 	pivot_category_create_leaf (dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
694 
695       for (size_t i = 0; i < n_cats; ++i)
696         {
697 	  indexes[1] = i;
698 
699           const struct exploratory_stats *es
700             = categoricals_get_user_data_by_category_real (
701               cmd->cats, iact_idx, i);
702 
703 	  struct shapiro_wilk *sw =  es[v].shapiro_wilk;
704 
705 	  if (sw == NULL)
706 	    continue;
707 
708 	  double w = shapiro_wilk_calculate (sw);
709 
710 	  int j = 0;
711 	  indexes[0] = j;
712 
713 	  pivot_table_put (table, indexes, table->n_dimensions,
714 			   pivot_value_new_number (w));
715 
716 	  indexes[0] = ++j;
717 	  pivot_table_put (table, indexes, table->n_dimensions,
718 			   pivot_value_new_number (sw->n));
719 
720 	  indexes[0] = ++j;
721 	  pivot_table_put (table, indexes, table->n_dimensions,
722 			   pivot_value_new_number (shapiro_wilk_significance (sw->n, w)));
723 	}
724     }
725 
726   free (indexes);
727 
728   pivot_table_submit (table);
729 }
730 
731 
732 static void
descriptives_report(const struct examine * cmd,int iact_idx)733 descriptives_report (const struct examine *cmd, int iact_idx)
734 {
735   struct pivot_table *table = pivot_table_create (N_("Descriptives"));
736   table->omit_empty = true;
737 
738   pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Aspect"),
739                           N_("Statistic"), N_("Std. Error"));
740 
741   struct pivot_dimension *statistics = pivot_dimension_create (
742     table, PIVOT_AXIS_ROW, N_("Statistics"), N_("Mean"));
743   struct pivot_category *interval = pivot_category_create_group__ (
744     statistics->root,
745     pivot_value_new_text_format (N_("%g%% Confidence Interval for Mean"),
746                                  cmd->conf * 100.0));
747   pivot_category_create_leaves (interval, N_("Lower Bound"),
748                                 N_("Upper Bound"));
749   pivot_category_create_leaves (
750     statistics->root, N_("5% Trimmed Mean"), N_("Median"), N_("Variance"),
751     N_("Std. Deviation"), N_("Minimum"), N_("Maximum"), N_("Range"),
752     N_("Interquartile Range"), N_("Skewness"), N_("Kurtosis"));
753 
754   const struct interaction *iact = cmd->iacts[iact_idx];
755   struct pivot_footnote *missing_footnote = create_missing_footnote (table);
756   create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
757 
758   struct pivot_dimension *dep_dim = pivot_dimension_create (
759     table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
760 
761   size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
762 
763   size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
764   for (size_t v = 0; v < cmd->n_dep_vars; ++v)
765     {
766       indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
767         dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
768 
769       for (size_t i = 0; i < n_cats; ++i)
770         {
771           for (size_t j = 0; j < iact->n_vars; j++)
772             {
773               int idx = categoricals_get_value_index_by_category_real (
774                 cmd->cats, iact_idx, i, j);
775               indexes[table->n_dimensions - 2 - j] = idx;
776             }
777 
778           const struct exploratory_stats *ess
779             = categoricals_get_user_data_by_category_real (cmd->cats,
780                                                            iact_idx, i);
781           const struct exploratory_stats *es = ess + v;
782 
783           double m0, m1, m2, m3, m4;
784           moments_calculate (es->mom, &m0, &m1, &m2, &m3, &m4);
785           double tval = gsl_cdf_tdist_Qinv ((1.0 - cmd->conf) / 2.0, m0 - 1.0);
786 
787           struct entry
788             {
789               int stat_idx;
790               int aspect_idx;
791               double x;
792             }
793           entries[] = {
794             { 0, 0, m1 },
795             { 0, 1, calc_semean (m2, m0) },
796             { 1, 0, m1 - tval * calc_semean (m2, m0) },
797             { 2, 0, m1 + tval * calc_semean (m2, m0) },
798             { 3, 0, trimmed_mean_calculate (es->trimmed_mean) },
799             { 4, 0, percentile_calculate (es->quartiles[1], cmd->pc_alg) },
800             { 5, 0, m2 },
801             { 6, 0, sqrt (m2) },
802             { 7, 0, es->minima[0].val },
803             { 8, 0, es->maxima[0].val },
804             { 9, 0, es->maxima[0].val - es->minima[0].val },
805             { 10, 0, (percentile_calculate (es->quartiles[2], cmd->pc_alg) -
806                       percentile_calculate (es->quartiles[0], cmd->pc_alg)) },
807             { 11, 0, m3 },
808             { 11, 1, calc_seskew (m0) },
809             { 12, 0, m4 },
810             { 12, 1, calc_sekurt (m0) },
811           };
812           for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
813             {
814               const struct entry *e = &entries[j];
815               indexes[0] = e->aspect_idx;
816               indexes[1] = e->stat_idx;
817               pivot_table_put (table, indexes, table->n_dimensions,
818                                pivot_value_new_number (e->x));
819             }
820         }
821     }
822 
823   free (indexes);
824 
825   pivot_table_submit (table);
826 }
827 
828 
829 static void
extremes_report(const struct examine * cmd,int iact_idx)830 extremes_report (const struct examine *cmd, int iact_idx)
831 {
832   struct pivot_table *table = pivot_table_create (N_("Extreme Values"));
833   table->omit_empty = true;
834 
835   struct pivot_dimension *statistics = pivot_dimension_create (
836     table, PIVOT_AXIS_COLUMN, N_("Statistics"));
837   pivot_category_create_leaf (statistics->root,
838                               (cmd->id_var
839                                ? pivot_value_new_variable (cmd->id_var)
840                                : pivot_value_new_text (N_("Case Number"))));
841   pivot_category_create_leaves (statistics->root, N_("Value"));
842 
843   struct pivot_dimension *order = pivot_dimension_create (
844     table, PIVOT_AXIS_ROW, N_("Order"));
845   for (size_t i = 0; i < cmd->disp_extremes; i++)
846     pivot_category_create_leaf (order->root, pivot_value_new_integer (i + 1));
847 
848   pivot_dimension_create (table, PIVOT_AXIS_ROW,
849 			  /* TRANSLATORS: This is a noun, not an adjective.  */
850 			  N_("Extreme"),
851                           N_("Highest"), N_("Lowest"));
852 
853   const struct interaction *iact = cmd->iacts[iact_idx];
854   struct pivot_footnote *missing_footnote = create_missing_footnote (table);
855   create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
856 
857   struct pivot_dimension *dep_dim = pivot_dimension_create (
858     table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
859 
860   size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
861 
862   size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
863   for (size_t v = 0; v < cmd->n_dep_vars; ++v)
864     {
865       indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
866         dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
867 
868       for (size_t i = 0; i < n_cats; ++i)
869         {
870           for (size_t j = 0; j < iact->n_vars; j++)
871             {
872               int idx = categoricals_get_value_index_by_category_real (
873                 cmd->cats, iact_idx, i, j);
874               indexes[table->n_dimensions - 2 - j] = idx;
875             }
876 
877           const struct exploratory_stats *ess
878             = categoricals_get_user_data_by_category_real (cmd->cats,
879                                                            iact_idx, i);
880           const struct exploratory_stats *es = ess + v;
881 
882           for (int e = 0 ; e < cmd->disp_extremes; ++e)
883             {
884               indexes[1] = e;
885 
886               for (size_t j = 0; j < 2; j++)
887                 {
888                   const struct extremity *extremity
889                     = j ? &es->minima[e] : &es->maxima[e];
890                   indexes[2] = j;
891 
892                   indexes[0] = 0;
893                   pivot_table_put (
894                     table, indexes, table->n_dimensions,
895                     (cmd->id_var
896                      ? new_value_with_missing_footnote (cmd->id_var,
897                                                         &extremity->identity,
898                                                         missing_footnote)
899                      : pivot_value_new_integer (extremity->identity.f)));
900 
901                   indexes[0] = 1;
902                   union value val = { .f = extremity->val };
903                   pivot_table_put (
904                     table, indexes, table->n_dimensions,
905                     new_value_with_missing_footnote (cmd->dep_vars[v], &val,
906                                                      missing_footnote));
907                 }
908             }
909         }
910     }
911   free (indexes);
912 
913   pivot_table_submit (table);
914 }
915 
916 
917 static void
summary_report(const struct examine * cmd,int iact_idx)918 summary_report (const struct examine *cmd, int iact_idx)
919 {
920   struct pivot_table *table = pivot_table_create (
921     N_("Case Processing Summary"));
922   table->omit_empty = true;
923   pivot_table_set_weight_var (table, dict_get_weight (cmd->dict));
924 
925   pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
926                           N_("N"), PIVOT_RC_COUNT,
927                           N_("Percent"), PIVOT_RC_PERCENT);
928   struct pivot_dimension *cases = pivot_dimension_create (
929     table, PIVOT_AXIS_COLUMN, N_("Cases"), N_("Valid"), N_("Missing"),
930     N_("Total"));
931   cases->root->show_label = true;
932 
933   const struct interaction *iact = cmd->iacts[iact_idx];
934   struct pivot_footnote *missing_footnote = create_missing_footnote (table);
935   create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
936 
937   struct pivot_dimension *dep_dim = pivot_dimension_create (
938     table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
939 
940   size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
941 
942   size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
943   for (size_t v = 0; v < cmd->n_dep_vars; ++v)
944     {
945       indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
946         dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
947 
948       for (size_t i = 0; i < n_cats; ++i)
949         {
950           for (size_t j = 0; j < iact->n_vars; j++)
951             {
952               int idx = categoricals_get_value_index_by_category_real (
953                 cmd->cats, iact_idx, i, j);
954               indexes[table->n_dimensions - 2 - j] = idx;
955             }
956 
957           const struct exploratory_stats *es
958             = categoricals_get_user_data_by_category_real (
959               cmd->cats, iact_idx, i);
960 
961           double total = es[v].missing + es[v].non_missing;
962           struct entry
963             {
964               int stat_idx;
965               int case_idx;
966               double x;
967             }
968           entries[] = {
969             { 0, 0, es[v].non_missing },
970             { 1, 0, 100.0 * es[v].non_missing / total },
971             { 0, 1, es[v].missing },
972             { 1, 1, 100.0 * es[v].missing / total },
973             { 0, 2, total },
974             { 1, 2, 100.0 },
975           };
976           for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
977             {
978               const struct entry *e = &entries[j];
979               indexes[0] = e->stat_idx;
980               indexes[1] = e->case_idx;
981               pivot_table_put (table, indexes, table->n_dimensions,
982                                pivot_value_new_number (e->x));
983             }
984         }
985     }
986 
987   free (indexes);
988 
989   pivot_table_submit (table);
990 }
991 
992 /* Attempt to parse an interaction from LEXER */
993 static struct interaction *
parse_interaction(struct lexer * lexer,struct examine * ex)994 parse_interaction (struct lexer *lexer, struct examine *ex)
995 {
996   const struct variable *v = NULL;
997   struct interaction *iact = NULL;
998 
999   if (lex_match_variable (lexer, ex->dict, &v))
1000     {
1001       iact = interaction_create (v);
1002 
1003       while (lex_match (lexer, T_BY))
1004         {
1005           if (!lex_match_variable (lexer, ex->dict, &v))
1006             {
1007               interaction_destroy (iact);
1008               return NULL;
1009             }
1010           interaction_add_variable (iact, v);
1011         }
1012       lex_match (lexer, T_COMMA);
1013     }
1014 
1015   return iact;
1016 }
1017 
1018 
1019 static void *
create_n(const void * aux1,void * aux2 UNUSED)1020 create_n (const void *aux1, void *aux2 UNUSED)
1021 {
1022   int v;
1023 
1024   const struct examine *examine = aux1;
1025   struct exploratory_stats *es = pool_calloc (examine->pool, examine->n_dep_vars, sizeof (*es));
1026   struct subcase ordering;
1027   subcase_init (&ordering, 0, 0, SC_ASCEND);
1028 
1029   for (v = 0; v < examine->n_dep_vars; v++)
1030     {
1031       es[v].sorted_writer = sort_create_writer (&ordering, examine->ex_proto);
1032       es[v].sorted_reader = NULL;
1033 
1034       es[v].mom = moments_create (MOMENT_KURTOSIS);
1035       es[v].cmin = DBL_MAX;
1036 
1037       es[v].maximum = -DBL_MAX;
1038       es[v].minimum =  DBL_MAX;
1039     }
1040 
1041   subcase_destroy (&ordering);
1042   return es;
1043 }
1044 
1045 static void
update_n(const void * aux1,void * aux2 UNUSED,void * user_data,const struct ccase * c,double weight)1046 update_n (const void *aux1, void *aux2 UNUSED, void *user_data,
1047           const struct ccase *c, double weight)
1048 {
1049   int v;
1050   const struct examine *examine = aux1;
1051   struct exploratory_stats *es = user_data;
1052 
1053   bool this_case_is_missing = false;
1054   /* LISTWISE missing must be dealt with here */
1055   if (!examine->missing_pw)
1056     {
1057       for (v = 0; v < examine->n_dep_vars; v++)
1058 	{
1059 	  const struct variable *var = examine->dep_vars[v];
1060 
1061 	  if (var_is_value_missing (var, case_data (c, var), examine->dep_excl))
1062 	    {
1063 	      es[v].missing += weight;
1064 	      this_case_is_missing = true;
1065 	    }
1066 	}
1067     }
1068 
1069   if (this_case_is_missing)
1070     return;
1071 
1072   for (v = 0; v < examine->n_dep_vars; v++)
1073     {
1074       struct ccase *outcase ;
1075       const struct variable *var = examine->dep_vars[v];
1076       const double x = case_data (c, var)->f;
1077 
1078       if (var_is_value_missing (var, case_data (c, var), examine->dep_excl))
1079         {
1080           es[v].missing += weight;
1081           continue;
1082         }
1083 
1084       outcase = case_create (examine->ex_proto);
1085 
1086       if (x > es[v].maximum)
1087         es[v].maximum = x;
1088 
1089       if (x < es[v].minimum)
1090         es[v].minimum =  x;
1091 
1092       es[v].non_missing += weight;
1093 
1094       moments_pass_one (es[v].mom, x, weight);
1095 
1096       /* Save the value and the ID to the writer */
1097       assert (examine->id_idx != -1);
1098       case_data_rw_idx (outcase, EX_VAL)->f = x;
1099       value_copy (case_data_rw_idx (outcase, EX_ID),
1100                   case_data_idx (c, examine->id_idx), examine->id_width);
1101 
1102       case_data_rw_idx (outcase, EX_WT)->f = weight;
1103 
1104       es[v].cc += weight;
1105 
1106       if (es[v].cmin > weight)
1107         es[v].cmin = weight;
1108 
1109       casewriter_write (es[v].sorted_writer, outcase);
1110     }
1111 }
1112 
1113 static void
calculate_n(const void * aux1,void * aux2 UNUSED,void * user_data)1114 calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data)
1115 {
1116   int v;
1117   const struct examine *examine = aux1;
1118   struct exploratory_stats *es = user_data;
1119 
1120   for (v = 0; v < examine->n_dep_vars; v++)
1121     {
1122       int i;
1123       casenumber imin = 0;
1124       casenumber imax;
1125       struct casereader *reader;
1126       struct ccase *c;
1127 
1128       if (examine->plot & PLOT_HISTOGRAM && es[v].non_missing > 0)
1129         {
1130           /* Sturges Rule */
1131           double bin_width = fabs (es[v].minimum - es[v].maximum)
1132             / (1 + log2 (es[v].cc))
1133             ;
1134 
1135           es[v].histogram =
1136             histogram_create (bin_width, es[v].minimum, es[v].maximum);
1137         }
1138 
1139       es[v].sorted_reader = casewriter_make_reader (es[v].sorted_writer);
1140       es[v].sorted_writer = NULL;
1141 
1142       imax = casereader_get_case_cnt (es[v].sorted_reader);
1143 
1144       es[v].maxima = pool_calloc (examine->pool, examine->calc_extremes, sizeof (*es[v].maxima));
1145       es[v].minima = pool_calloc (examine->pool, examine->calc_extremes, sizeof (*es[v].minima));
1146       for (i = 0; i < examine->calc_extremes; ++i)
1147         {
1148           value_init_pool (examine->pool, &es[v].maxima[i].identity, examine->id_width) ;
1149           value_init_pool (examine->pool, &es[v].minima[i].identity, examine->id_width) ;
1150         }
1151 
1152       bool warn = true;
1153       for (reader = casereader_clone (es[v].sorted_reader);
1154            (c = casereader_read (reader)) != NULL; case_unref (c))
1155         {
1156           const double val = case_data_idx (c, EX_VAL)->f;
1157           double wt = case_data_idx (c, EX_WT)->f;
1158 	  wt = var_force_valid_weight (examine->wv, wt, &warn);
1159 
1160           moments_pass_two (es[v].mom, val, wt);
1161 
1162           if (es[v].histogram)
1163             histogram_add (es[v].histogram, val, wt);
1164 
1165           if (imin < examine->calc_extremes)
1166             {
1167               int x;
1168               for (x = imin; x < examine->calc_extremes; ++x)
1169                 {
1170                   struct extremity *min = &es[v].minima[x];
1171                   min->val = val;
1172                   value_copy (&min->identity, case_data_idx (c, EX_ID), examine->id_width);
1173                 }
1174               imin ++;
1175             }
1176 
1177           imax --;
1178           if (imax < examine->calc_extremes)
1179             {
1180               int x;
1181 
1182               for (x = imax; x < imax + 1; ++x)
1183                 {
1184                   struct extremity *max;
1185 
1186                   if (x >= examine->calc_extremes)
1187                     break;
1188 
1189                   max = &es[v].maxima[x];
1190                   max->val = val;
1191                   value_copy (&max->identity, case_data_idx (c, EX_ID), examine->id_width);
1192                 }
1193             }
1194         }
1195       casereader_destroy (reader);
1196 
1197       if (examine->calc_extremes > 0 && es[v].non_missing > 0)
1198         {
1199           assert (es[v].minima[0].val == es[v].minimum);
1200 	  assert (es[v].maxima[0].val == es[v].maximum);
1201         }
1202 
1203       {
1204 	const int n_os = 5 + examine->n_percentiles;
1205 	struct order_stats **os ;
1206 	es[v].percentiles = pool_calloc (examine->pool, examine->n_percentiles, sizeof (*es[v].percentiles));
1207 
1208 	es[v].trimmed_mean = trimmed_mean_create (es[v].cc, 0.05);
1209 	es[v].shapiro_wilk = NULL;
1210 
1211 	os = xcalloc (n_os, sizeof *os);
1212 	os[0] = &es[v].trimmed_mean->parent;
1213 
1214 	es[v].quartiles[0] = percentile_create (0.25, es[v].cc);
1215 	es[v].quartiles[1] = percentile_create (0.5,  es[v].cc);
1216 	es[v].quartiles[2] = percentile_create (0.75, es[v].cc);
1217 
1218 	os[1] = &es[v].quartiles[0]->parent;
1219 	os[2] = &es[v].quartiles[1]->parent;
1220 	os[3] = &es[v].quartiles[2]->parent;
1221 
1222 	es[v].hinges = tukey_hinges_create (es[v].cc, es[v].cmin);
1223 	os[4] = &es[v].hinges->parent;
1224 
1225 	for (i = 0; i < examine->n_percentiles; ++i)
1226 	  {
1227 	    es[v].percentiles[i] = percentile_create (examine->ptiles[i] / 100.00, es[v].cc);
1228 	    os[5 + i] = &es[v].percentiles[i]->parent;
1229 	  }
1230 
1231 	order_stats_accumulate_idx (os, n_os,
1232 				    casereader_clone (es[v].sorted_reader),
1233 				    EX_WT, EX_VAL);
1234 
1235 	free (os);
1236       }
1237 
1238       if (examine->plot & PLOT_BOXPLOT)
1239         {
1240           struct order_stats *os;
1241 
1242           es[v].box_whisker = box_whisker_create (es[v].hinges,
1243                                                   EX_ID, examine->id_var);
1244 
1245           os = &es[v].box_whisker->parent;
1246 	  order_stats_accumulate_idx (&os, 1,
1247 				      casereader_clone (es[v].sorted_reader),
1248 				      EX_WT, EX_VAL);
1249         }
1250 
1251       if (examine->plot)
1252         {
1253 	  double mean;
1254 
1255 	  moments_calculate (es[v].mom, NULL, &mean, NULL, NULL, NULL);
1256 
1257           es[v].shapiro_wilk = shapiro_wilk_create (es[v].non_missing, mean);
1258 
1259 	  if (es[v].shapiro_wilk)
1260 	    {
1261 	      struct order_stats *os = &es[v].shapiro_wilk->parent;
1262 	      order_stats_accumulate_idx (&os, 1,
1263 					  casereader_clone (es[v].sorted_reader),
1264 					  EX_WT, EX_VAL);
1265 	    }
1266         }
1267 
1268       if (examine->plot & PLOT_NPPLOT)
1269         {
1270           double n, mean, var;
1271           struct order_stats *os;
1272 
1273           moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL);
1274 
1275           es[v].np = np_create (n, mean, var);
1276 
1277           os = &es[v].np->parent;
1278 
1279           order_stats_accumulate_idx (&os, 1,
1280 				      casereader_clone (es[v].sorted_reader),
1281 				      EX_WT, EX_VAL);
1282         }
1283 
1284     }
1285 }
1286 
1287 static void
cleanup_exploratory_stats(struct examine * cmd)1288 cleanup_exploratory_stats (struct examine *cmd)
1289 {
1290   int i;
1291   for (i = 0; i < cmd->n_iacts; ++i)
1292     {
1293       int v;
1294       const size_t n_cats =  categoricals_n_count (cmd->cats, i);
1295 
1296       for (v = 0; v < cmd->n_dep_vars; ++v)
1297 	{
1298 	  int grp;
1299 	  for (grp = 0; grp < n_cats; ++grp)
1300 	    {
1301 	      int q;
1302 	      const struct exploratory_stats *es =
1303 		categoricals_get_user_data_by_category_real (cmd->cats, i, grp);
1304 
1305 	      struct order_stats *os = &es[v].hinges->parent;
1306 	      struct statistic  *stat = &os->parent;
1307 	      stat->destroy (stat);
1308 
1309 	      for (q = 0; q < 3 ; q++)
1310 		{
1311 		  os = &es[v].quartiles[q]->parent;
1312 		  stat = &os->parent;
1313 		  stat->destroy (stat);
1314 		}
1315 
1316 	      for (q = 0; q < cmd->n_percentiles ; q++)
1317 		{
1318 		  os = &es[v].percentiles[q]->parent;
1319 		  stat = &os->parent;
1320 		  stat->destroy (stat);
1321 		}
1322 
1323               if (es[v].shapiro_wilk)
1324                 {
1325                   stat = &es[v].shapiro_wilk->parent.parent;
1326                   stat->destroy (stat);
1327                 }
1328 
1329 	      os = &es[v].trimmed_mean->parent;
1330 	      stat = &os->parent;
1331 	      stat->destroy (stat);
1332 
1333 	      os = &es[v].np->parent;
1334 	      if (os)
1335 		{
1336 		  stat = &os->parent;
1337 		  stat->destroy (stat);
1338 		}
1339 
1340 	      statistic_destroy (&es[v].histogram->parent);
1341 	      moments_destroy (es[v].mom);
1342 
1343               if (es[v].box_whisker)
1344                 {
1345                   stat = &es[v].box_whisker->parent.parent;
1346                   stat->destroy (stat);
1347                 }
1348 
1349 	      casereader_destroy (es[v].sorted_reader);
1350 	    }
1351 	}
1352     }
1353 }
1354 
1355 
1356 static void
run_examine(struct examine * cmd,struct casereader * input)1357 run_examine (struct examine *cmd, struct casereader *input)
1358 {
1359   int i;
1360   struct ccase *c;
1361   struct casereader *reader;
1362 
1363   struct payload payload;
1364   payload.create = create_n;
1365   payload.update = update_n;
1366   payload.calculate = calculate_n;
1367   payload.destroy = NULL;
1368 
1369   cmd->wv = dict_get_weight (cmd->dict);
1370 
1371   cmd->cats
1372     = categoricals_create (cmd->iacts, cmd->n_iacts, cmd->wv, cmd->fctr_excl);
1373 
1374   categoricals_set_payload (cmd->cats, &payload, cmd, NULL);
1375 
1376   if (cmd->id_var == NULL)
1377     {
1378       struct ccase *c = casereader_peek (input,  0);
1379 
1380       cmd->id_idx = case_get_value_cnt (c);
1381       input = casereader_create_arithmetic_sequence (input, 1.0, 1.0);
1382 
1383       case_unref (c);
1384     }
1385 
1386   for (reader = input;
1387        (c = casereader_read (reader)) != NULL; case_unref (c))
1388     {
1389       categoricals_update (cmd->cats, c);
1390     }
1391   casereader_destroy (reader);
1392   categoricals_done (cmd->cats);
1393 
1394   for (i = 0; i < cmd->n_iacts; ++i)
1395     {
1396       summary_report (cmd, i);
1397 
1398       const size_t n_cats =  categoricals_n_count (cmd->cats, i);
1399       if (n_cats == 0)
1400 	continue;
1401 
1402       if (cmd->disp_extremes > 0)
1403         extremes_report (cmd, i);
1404 
1405       if (cmd->n_percentiles > 0)
1406         percentiles_report (cmd, i);
1407 
1408       if (cmd->plot & PLOT_BOXPLOT)
1409         {
1410           switch (cmd->boxplot_mode)
1411             {
1412             case BP_GROUPS:
1413               show_boxplot_grouped (cmd, i);
1414               break;
1415             case BP_VARIABLES:
1416               show_boxplot_variabled (cmd, i);
1417               break;
1418             default:
1419               NOT_REACHED ();
1420               break;
1421             }
1422         }
1423 
1424       if (cmd->plot & PLOT_HISTOGRAM)
1425         show_histogram (cmd, i);
1426 
1427       if (cmd->plot & PLOT_NPPLOT)
1428         show_npplot (cmd, i);
1429 
1430       if (cmd->plot & PLOT_SPREADLEVEL)
1431         show_spreadlevel (cmd, i);
1432 
1433       if (cmd->descriptives)
1434         descriptives_report (cmd, i);
1435 
1436       if (cmd->plot)
1437 	normality_report (cmd, i);
1438     }
1439 
1440   cleanup_exploratory_stats (cmd);
1441   categoricals_destroy (cmd->cats);
1442 }
1443 
1444 
1445 int
cmd_examine(struct lexer * lexer,struct dataset * ds)1446 cmd_examine (struct lexer *lexer, struct dataset *ds)
1447 {
1448   int i;
1449   bool nototals_seen = false;
1450   bool totals_seen = false;
1451 
1452   struct interaction **iacts_mem = NULL;
1453   struct examine examine;
1454   bool percentiles_seen = false;
1455 
1456   examine.missing_pw = false;
1457   examine.disp_extremes = 0;
1458   examine.calc_extremes = 0;
1459   examine.descriptives = false;
1460   examine.conf = 0.95;
1461   examine.pc_alg = PC_HAVERAGE;
1462   examine.ptiles = NULL;
1463   examine.n_percentiles = 0;
1464   examine.id_idx = -1;
1465   examine.id_width = 0;
1466   examine.id_var = NULL;
1467   examine.boxplot_mode = BP_GROUPS;
1468 
1469   examine.ex_proto = caseproto_create ();
1470 
1471   examine.pool = pool_create ();
1472 
1473   /* Allocate space for the first interaction.
1474      This is interaction is an empty one (for the totals).
1475      If no totals are requested, we will simply ignore this
1476      interaction.
1477   */
1478   examine.n_iacts = 1;
1479   examine.iacts = iacts_mem = pool_zalloc (examine.pool, sizeof (struct interaction *));
1480   examine.iacts[0] = interaction_create (NULL);
1481 
1482   examine.dep_excl = MV_ANY;
1483   examine.fctr_excl = MV_ANY;
1484   examine.plot = 0;
1485   examine.sl_power = 0;
1486   examine.dep_vars = NULL;
1487   examine.n_dep_vars = 0;
1488   examine.dict = dataset_dict (ds);
1489 
1490   /* Accept an optional, completely pointless "/VARIABLES=" */
1491   lex_match (lexer, T_SLASH);
1492   if (lex_match_id  (lexer, "VARIABLES"))
1493     {
1494       if (! lex_force_match (lexer, T_EQUALS))
1495         goto error;
1496     }
1497 
1498   if (!parse_variables_const (lexer, examine.dict,
1499 			      &examine.dep_vars, &examine.n_dep_vars,
1500 			      PV_NO_DUPLICATE | PV_NUMERIC))
1501     goto error;
1502 
1503   if (lex_match (lexer, T_BY))
1504     {
1505       struct interaction *iact = NULL;
1506       do
1507         {
1508           iact = parse_interaction (lexer, &examine);
1509           if (iact)
1510             {
1511               examine.n_iacts++;
1512               iacts_mem =
1513                 pool_nrealloc (examine.pool, iacts_mem,
1514 			       examine.n_iacts,
1515 			       sizeof (*iacts_mem));
1516 
1517               iacts_mem[examine.n_iacts - 1] = iact;
1518             }
1519         }
1520       while (iact);
1521     }
1522 
1523 
1524   while (lex_token (lexer) != T_ENDCMD)
1525     {
1526       lex_match (lexer, T_SLASH);
1527 
1528       if (lex_match_id (lexer, "STATISTICS"))
1529 	{
1530 	  lex_match (lexer, T_EQUALS);
1531 
1532 	  while (lex_token (lexer) != T_ENDCMD
1533 		 && lex_token (lexer) != T_SLASH)
1534 	    {
1535               if (lex_match_id (lexer, "DESCRIPTIVES"))
1536                 {
1537                   examine.descriptives = true;
1538                 }
1539               else if (lex_match_id (lexer, "EXTREME"))
1540                 {
1541                   int extr = 5;
1542                   if (lex_match (lexer, T_LPAREN))
1543                     {
1544                       if (!lex_force_int (lexer))
1545                         goto error;
1546                       extr = lex_integer (lexer);
1547 
1548                       if (extr < 0)
1549                         {
1550                           msg (MW, _("%s may not be negative. Using default value (%g)."), "EXTREME", 5.0);
1551                           extr = 5;
1552                         }
1553 
1554                       lex_get (lexer);
1555                       if (! lex_force_match (lexer, T_RPAREN))
1556                         goto error;
1557                     }
1558                   examine.disp_extremes  = extr;
1559                 }
1560               else if (lex_match_id (lexer, "NONE"))
1561                 {
1562                 }
1563               else if (lex_match (lexer, T_ALL))
1564                 {
1565                   if (examine.disp_extremes == 0)
1566                     examine.disp_extremes = 5;
1567                 }
1568               else
1569                 {
1570                   lex_error (lexer, NULL);
1571                   goto error;
1572                 }
1573             }
1574         }
1575       else if (lex_match_id (lexer, "PERCENTILES"))
1576         {
1577           percentiles_seen = true;
1578           if (lex_match (lexer, T_LPAREN))
1579             {
1580               while (lex_is_number (lexer))
1581                 {
1582                   double p = lex_number (lexer);
1583 
1584                   if (p <= 0 || p >= 100.0)
1585                     {
1586                       lex_error (lexer,
1587                                  _("Percentiles must lie in the range (0, 100)"));
1588                       goto error;
1589                     }
1590 
1591                   examine.n_percentiles++;
1592                   examine.ptiles =
1593                     xrealloc (examine.ptiles,
1594                               sizeof (*examine.ptiles) *
1595                               examine.n_percentiles);
1596 
1597                   examine.ptiles[examine.n_percentiles - 1] = p;
1598 
1599                   lex_get (lexer);
1600                   lex_match (lexer, T_COMMA);
1601                 }
1602               if (!lex_force_match (lexer, T_RPAREN))
1603                 goto error;
1604             }
1605 
1606 	  lex_match (lexer, T_EQUALS);
1607 
1608 	  while (lex_token (lexer) != T_ENDCMD
1609 		 && lex_token (lexer) != T_SLASH)
1610 	    {
1611               if (lex_match_id (lexer, "HAVERAGE"))
1612                 {
1613                   examine.pc_alg = PC_HAVERAGE;
1614                 }
1615               else if (lex_match_id (lexer, "WAVERAGE"))
1616                 {
1617                   examine.pc_alg = PC_WAVERAGE;
1618                 }
1619               else if (lex_match_id (lexer, "ROUND"))
1620                 {
1621                   examine.pc_alg = PC_ROUND;
1622                 }
1623               else if (lex_match_id (lexer, "EMPIRICAL"))
1624                 {
1625                   examine.pc_alg = PC_EMPIRICAL;
1626                 }
1627               else if (lex_match_id (lexer, "AEMPIRICAL"))
1628                 {
1629                   examine.pc_alg = PC_AEMPIRICAL;
1630                 }
1631               else if (lex_match_id (lexer, "NONE"))
1632                 {
1633                   examine.pc_alg = PC_NONE;
1634                 }
1635               else
1636                 {
1637                   lex_error (lexer, NULL);
1638                   goto error;
1639                 }
1640             }
1641         }
1642       else if (lex_match_id (lexer, "TOTAL"))
1643         {
1644           totals_seen = true;
1645         }
1646       else if (lex_match_id (lexer, "NOTOTAL"))
1647         {
1648           nototals_seen = true;
1649         }
1650       else if (lex_match_id (lexer, "MISSING"))
1651         {
1652 	  lex_match (lexer, T_EQUALS);
1653 
1654 	  while (lex_token (lexer) != T_ENDCMD
1655 		 && lex_token (lexer) != T_SLASH)
1656 	    {
1657               if (lex_match_id (lexer, "LISTWISE"))
1658                 {
1659                   examine.missing_pw = false;
1660                 }
1661               else if (lex_match_id (lexer, "PAIRWISE"))
1662                 {
1663                   examine.missing_pw = true;
1664                 }
1665               else if (lex_match_id (lexer, "EXCLUDE"))
1666                 {
1667                   examine.dep_excl = MV_ANY;
1668                 }
1669               else if (lex_match_id (lexer, "INCLUDE"))
1670                 {
1671                   examine.dep_excl = MV_SYSTEM;
1672                 }
1673               else if (lex_match_id (lexer, "REPORT"))
1674                 {
1675                   examine.fctr_excl = MV_NEVER;
1676                 }
1677               else if (lex_match_id (lexer, "NOREPORT"))
1678                 {
1679                   examine.fctr_excl = MV_ANY;
1680                 }
1681               else
1682                 {
1683                   lex_error (lexer, NULL);
1684                   goto error;
1685                 }
1686             }
1687         }
1688       else if (lex_match_id (lexer, "COMPARE"))
1689         {
1690 	  lex_match (lexer, T_EQUALS);
1691           if (lex_match_id (lexer, "VARIABLES"))
1692             {
1693               examine.boxplot_mode = BP_VARIABLES;
1694             }
1695           else if (lex_match_id (lexer, "GROUPS"))
1696             {
1697               examine.boxplot_mode = BP_GROUPS;
1698             }
1699           else
1700             {
1701               lex_error (lexer, NULL);
1702               goto error;
1703             }
1704         }
1705       else if (lex_match_id (lexer, "PLOT"))
1706         {
1707 	  lex_match (lexer, T_EQUALS);
1708 
1709 	  while (lex_token (lexer) != T_ENDCMD
1710 		 && lex_token (lexer) != T_SLASH)
1711 	    {
1712               if (lex_match_id (lexer, "BOXPLOT"))
1713                 {
1714                   examine.plot |= PLOT_BOXPLOT;
1715                 }
1716               else if (lex_match_id (lexer, "NPPLOT"))
1717                 {
1718                   examine.plot |= PLOT_NPPLOT;
1719                 }
1720               else if (lex_match_id (lexer, "HISTOGRAM"))
1721                 {
1722                   examine.plot |= PLOT_HISTOGRAM;
1723                 }
1724               else if (lex_match_id (lexer, "SPREADLEVEL"))
1725                 {
1726                   examine.plot |= PLOT_SPREADLEVEL;
1727 		  examine.sl_power = 0;
1728 		  if (lex_match (lexer, T_LPAREN) && lex_force_int (lexer))
1729 		    {
1730                       examine.sl_power = lex_integer (lexer);
1731 
1732                       lex_get (lexer);
1733                       if (! lex_force_match (lexer, T_RPAREN))
1734                         goto error;
1735 		    }
1736                 }
1737               else if (lex_match_id (lexer, "NONE"))
1738                 {
1739                   examine.plot = 0;
1740                 }
1741               else if (lex_match (lexer, T_ALL))
1742                 {
1743                   examine.plot = ~0;
1744                 }
1745               else
1746                 {
1747                   lex_error (lexer, NULL);
1748                   goto error;
1749                 }
1750               lex_match (lexer, T_COMMA);
1751             }
1752         }
1753       else if (lex_match_id (lexer, "CINTERVAL"))
1754         {
1755           if (!lex_force_num (lexer))
1756             goto error;
1757 
1758           examine.conf = lex_number (lexer);
1759           lex_get (lexer);
1760         }
1761       else if (lex_match_id (lexer, "ID"))
1762         {
1763           lex_match (lexer, T_EQUALS);
1764 
1765           examine.id_var = parse_variable_const (lexer, examine.dict);
1766         }
1767       else
1768         {
1769           lex_error (lexer, NULL);
1770           goto error;
1771         }
1772     }
1773 
1774 
1775   if (totals_seen && nototals_seen)
1776     {
1777       msg (SE, _("%s and %s are mutually exclusive"), "TOTAL", "NOTOTAL");
1778       goto error;
1779     }
1780 
1781   /* If totals have been requested or if there are no factors
1782      in this analysis, then the totals need to be included. */
1783   if (!nototals_seen || examine.n_iacts == 1)
1784     {
1785       examine.iacts = &iacts_mem[0];
1786     }
1787   else
1788     {
1789       examine.n_iacts--;
1790       examine.iacts = &iacts_mem[1];
1791       interaction_destroy (iacts_mem[0]);
1792     }
1793 
1794 
1795   if (examine.id_var)
1796     {
1797       examine.id_idx = var_get_case_index (examine.id_var);
1798       examine.id_width = var_get_width (examine.id_var);
1799     }
1800 
1801   examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* value */
1802   examine.ex_proto = caseproto_add_width (examine.ex_proto, examine.id_width);   /* id */
1803   examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* weight */
1804 
1805 
1806   if (examine.disp_extremes > 0)
1807     {
1808       examine.calc_extremes = examine.disp_extremes;
1809     }
1810 
1811   if (examine.descriptives && examine.calc_extremes == 0)
1812     {
1813       /* Descriptives always displays the max and min */
1814       examine.calc_extremes = 1;
1815     }
1816 
1817   if (percentiles_seen && examine.n_percentiles == 0)
1818     {
1819       examine.n_percentiles = 7;
1820       examine.ptiles = xcalloc (examine.n_percentiles,
1821                                 sizeof (*examine.ptiles));
1822 
1823       examine.ptiles[0] = 5;
1824       examine.ptiles[1] = 10;
1825       examine.ptiles[2] = 25;
1826       examine.ptiles[3] = 50;
1827       examine.ptiles[4] = 75;
1828       examine.ptiles[5] = 90;
1829       examine.ptiles[6] = 95;
1830     }
1831 
1832   assert (examine.calc_extremes >= examine.disp_extremes);
1833   {
1834     struct casegrouper *grouper;
1835     struct casereader *group;
1836     bool ok;
1837 
1838     grouper = casegrouper_create_splits (proc_open (ds), examine.dict);
1839     while (casegrouper_get_next_group (grouper, &group))
1840       run_examine (&examine, group);
1841     ok = casegrouper_destroy (grouper);
1842     ok = proc_commit (ds) && ok;
1843   }
1844 
1845   caseproto_unref (examine.ex_proto);
1846 
1847   for (i = 0; i < examine.n_iacts; ++i)
1848     interaction_destroy (examine.iacts[i]);
1849   free (examine.ptiles);
1850   free (examine.dep_vars);
1851   pool_destroy (examine.pool);
1852 
1853   return CMD_SUCCESS;
1854 
1855  error:
1856   caseproto_unref (examine.ex_proto);
1857   examine.iacts = iacts_mem;
1858   for (i = 0; i < examine.n_iacts; ++i)
1859     interaction_destroy (examine.iacts[i]);
1860   free (examine.dep_vars);
1861   free (examine.ptiles);
1862   pool_destroy (examine.pool);
1863 
1864   return CMD_FAILURE;
1865 }
1866