1 /* PSPP - a program for statistical analysis.
2    Copyright (C) 2009, 2010, 2011, 2013, 2015, 2016 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
16 
17 #include <config.h>
18 
19 #include <math.h>
20 
21 #include "data/casegrouper.h"
22 #include "data/casereader.h"
23 #include "data/dataset.h"
24 #include "data/dictionary.h"
25 #include "data/format.h"
26 #include "data/missing-values.h"
27 #include "language/command.h"
28 #include "language/lexer/lexer.h"
29 #include "language/lexer/variable-parser.h"
30 #include "libpspp/message.h"
31 #include "libpspp/misc.h"
32 #include "libpspp/str.h"
33 #include "math/moments.h"
34 #include "output/pivot-table.h"
35 #include "output/text-item.h"
36 
37 #include "gettext.h"
38 #define _(msgid) gettext (msgid)
39 #define N_(msgid) msgid
40 
41 struct cronbach
42 {
43   const struct variable **items;
44   size_t n_items;
45   double alpha;
46   double sum_of_variances;
47   double variance_of_sums;
48   int totals_idx;          /* Casereader index into the totals */
49 
50   struct moments1 **m ;    /* Moments of the items */
51   struct moments1 *total ; /* Moments of the totals */
52 };
53 
54 #if 0
55 static void
56 dump_cronbach (const struct cronbach *s)
57 {
58   int i;
59   printf ("N items %d\n", s->n_items);
60   for (i = 0 ; i < s->n_items; ++i)
61     {
62       printf ("%s\n", var_get_name (s->items[i]));
63     }
64 
65   printf ("Totals idx %d\n", s->totals_idx);
66 
67   printf ("scale variance %g\n", s->variance_of_sums);
68   printf ("alpha %g\n", s->alpha);
69   putchar ('\n');
70 }
71 #endif
72 
73 enum model
74   {
75     MODEL_ALPHA,
76     MODEL_SPLIT
77   };
78 
79 
80 enum summary_opts
81   {
82     SUMMARY_TOTAL = 0x0001,
83   };
84 
85 
86 struct reliability
87 {
88   const struct variable **variables;
89   size_t n_variables;
90   enum mv_class exclude;
91 
92   struct cronbach *sc;
93   int n_sc;
94 
95   int total_start;
96 
97   struct string scale_name;
98 
99   enum model model;
100   int split_point;
101 
102 
103   enum summary_opts summary;
104 
105   const struct variable *wv;
106 };
107 
108 
109 static bool run_reliability (struct dataset *ds, const struct reliability *reliability);
110 
111 static void
reliability_destroy(struct reliability * rel)112 reliability_destroy (struct reliability *rel)
113 {
114   int j;
115   ds_destroy (&rel->scale_name);
116   if (rel->sc)
117     for (j = 0; j < rel->n_sc ; ++j)
118       {
119 	int x;
120 	free (rel->sc[j].items);
121         moments1_destroy (rel->sc[j].total);
122         if (rel->sc[j].m)
123           for (x = 0; x < rel->sc[j].n_items; ++x)
124             free (rel->sc[j].m[x]);
125 	free (rel->sc[j].m);
126       }
127 
128   free (rel->sc);
129   free (rel->variables);
130 }
131 
132 int
cmd_reliability(struct lexer * lexer,struct dataset * ds)133 cmd_reliability (struct lexer *lexer, struct dataset *ds)
134 {
135   const struct dictionary *dict = dataset_dict (ds);
136 
137   struct reliability reliability;
138   reliability.n_variables = 0;
139   reliability.variables = NULL;
140   reliability.model = MODEL_ALPHA;
141   reliability.exclude = MV_ANY;
142   reliability.summary = 0;
143   reliability.n_sc = 0;
144   reliability.sc = NULL;
145   reliability.wv = dict_get_weight (dict);
146   reliability.total_start = 0;
147   ds_init_empty (&reliability.scale_name);
148 
149 
150   lex_match (lexer, T_SLASH);
151 
152   if (!lex_force_match_id (lexer, "VARIABLES"))
153     {
154       goto error;
155     }
156 
157   lex_match (lexer, T_EQUALS);
158 
159   if (!parse_variables_const (lexer, dict, &reliability.variables, &reliability.n_variables,
160 			      PV_NO_DUPLICATE | PV_NUMERIC))
161     goto error;
162 
163   if (reliability.n_variables < 2)
164     msg (MW, _("Reliability on a single variable is not useful."));
165 
166 
167   {
168     int i;
169     struct cronbach *c;
170     /* Create a default Scale */
171 
172     reliability.n_sc = 1;
173     reliability.sc = xzalloc (sizeof (struct cronbach) * reliability.n_sc);
174 
175     ds_assign_cstr (&reliability.scale_name, "ANY");
176 
177     c = &reliability.sc[0];
178     c->n_items = reliability.n_variables;
179     c->items = xzalloc (sizeof (struct variable*) * c->n_items);
180 
181     for (i = 0 ; i < c->n_items ; ++i)
182       c->items[i] = reliability.variables[i];
183   }
184 
185 
186 
187   while (lex_token (lexer) != T_ENDCMD)
188     {
189       lex_match (lexer, T_SLASH);
190 
191       if (lex_match_id (lexer, "SCALE"))
192 	{
193 	  struct const_var_set *vs;
194 	  if (! lex_force_match (lexer, T_LPAREN))
195 	    goto error;
196 
197 	  if (! lex_force_string (lexer))
198 	    goto error;
199 
200 	  ds_assign_substring (&reliability.scale_name, lex_tokss (lexer));
201 
202 	  lex_get (lexer);
203 
204 	  if (! lex_force_match (lexer, T_RPAREN))
205 	    goto error;
206 
207           lex_match (lexer, T_EQUALS);
208 
209 	  vs = const_var_set_create_from_array (reliability.variables, reliability.n_variables);
210 
211 	  free (reliability.sc->items);
212 	  if (!parse_const_var_set_vars (lexer, vs, &reliability.sc->items, &reliability.sc->n_items, 0))
213 	    {
214 	      const_var_set_destroy (vs);
215 	      goto error;
216 	    }
217 
218 	  const_var_set_destroy (vs);
219 	}
220       else if (lex_match_id (lexer, "MODEL"))
221 	{
222           lex_match (lexer, T_EQUALS);
223 	  if (lex_match_id (lexer, "ALPHA"))
224 	    {
225 	      reliability.model = MODEL_ALPHA;
226 	    }
227 	  else if (lex_match_id (lexer, "SPLIT"))
228 	    {
229 	      reliability.model = MODEL_SPLIT;
230 	      reliability.split_point = -1;
231 
232 	      if (lex_match (lexer, T_LPAREN)
233 		   && lex_force_num (lexer))
234 		{
235 		  reliability.split_point = lex_number (lexer);
236 		  lex_get (lexer);
237 		  if (! lex_force_match (lexer, T_RPAREN))
238 		    goto error;
239 		}
240 	    }
241 	  else
242 	    goto error;
243 	}
244       else if (lex_match_id (lexer, "SUMMARY"))
245         {
246           lex_match (lexer, T_EQUALS);
247 	  if (lex_match_id (lexer, "TOTAL"))
248 	    {
249 	      reliability.summary |= SUMMARY_TOTAL;
250 	    }
251 	  else if (lex_match (lexer, T_ALL))
252 	    {
253 	      reliability.summary = 0xFFFF;
254 	    }
255 	  else
256 	    goto error;
257 	}
258       else if (lex_match_id (lexer, "MISSING"))
259         {
260           lex_match (lexer, T_EQUALS);
261           while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
262             {
263 	      if (lex_match_id (lexer, "INCLUDE"))
264 		{
265 		  reliability.exclude = MV_SYSTEM;
266 		}
267 	      else if (lex_match_id (lexer, "EXCLUDE"))
268 		{
269 		  reliability.exclude = MV_ANY;
270 		}
271 	      else
272 		{
273                   lex_error (lexer, NULL);
274 		  goto error;
275 		}
276 	    }
277 	}
278       else if (lex_match_id (lexer, "STATISTICS"))
279         {
280           lex_match (lexer, T_EQUALS);
281           msg (SW, _("The STATISTICS subcommand is not yet implemented.  "
282                      "No statistics will be produced."));
283           while (lex_match (lexer, T_ID))
284             continue;
285         }
286       else
287 	{
288 	  lex_error (lexer, NULL);
289 	  goto error;
290 	}
291     }
292 
293   if (reliability.model == MODEL_SPLIT)
294     {
295       int i;
296       const struct cronbach *s;
297 
298       if (reliability.split_point >= reliability.n_variables)
299         {
300           msg (ME, _("The split point must be less than the number of variables"));
301           goto error;
302         }
303 
304       reliability.n_sc += 2 ;
305       reliability.sc = xrealloc (reliability.sc, sizeof (struct cronbach) * reliability.n_sc);
306 
307       s = &reliability.sc[0];
308 
309       reliability.sc[1].n_items =
310 	(reliability.split_point == -1) ? s->n_items / 2 : reliability.split_point;
311 
312       reliability.sc[2].n_items = s->n_items - reliability.sc[1].n_items;
313       reliability.sc[1].items = xzalloc (sizeof (struct variable *)
314 				 * reliability.sc[1].n_items);
315 
316       reliability.sc[2].items = xzalloc (sizeof (struct variable *) *
317 				 reliability.sc[2].n_items);
318 
319       for  (i = 0; i < reliability.sc[1].n_items ; ++i)
320 	reliability.sc[1].items[i] = s->items[i];
321 
322       while (i < s->n_items)
323 	{
324 	  reliability.sc[2].items[i - reliability.sc[1].n_items] = s->items[i];
325 	  i++;
326 	}
327     }
328 
329   if (reliability.summary & SUMMARY_TOTAL)
330     {
331       int i;
332       const int base_sc = reliability.n_sc;
333 
334       reliability.total_start = base_sc;
335 
336       reliability.n_sc +=  reliability.sc[0].n_items ;
337       reliability.sc = xrealloc (reliability.sc, sizeof (struct cronbach) * reliability.n_sc);
338 
339 
340       for (i = 0 ; i < reliability.sc[0].n_items; ++i)
341 	{
342 	  int v_src;
343 	  int v_dest = 0;
344 	  struct cronbach *s = &reliability.sc[i + base_sc];
345 
346 	  s->n_items = reliability.sc[0].n_items - 1;
347 	  s->items = xzalloc (sizeof (struct variable *) * s->n_items);
348 	  for (v_src = 0 ; v_src < reliability.sc[0].n_items ; ++v_src)
349 	    {
350 	      if (v_src != i)
351 		s->items[v_dest++] = reliability.sc[0].items[v_src];
352 	    }
353 	}
354     }
355 
356 
357   if (! run_reliability (ds, &reliability))
358     goto error;
359 
360   reliability_destroy (&reliability);
361   return CMD_SUCCESS;
362 
363  error:
364   reliability_destroy (&reliability);
365   return CMD_FAILURE;
366 }
367 
368 
369 static void
370 do_reliability (struct casereader *group, struct dataset *ds,
371 		const struct reliability *rel);
372 
373 
374 static void reliability_summary_total (const struct reliability *rel);
375 
376 static void reliability_statistics (const struct reliability *rel);
377 
378 
379 static bool
run_reliability(struct dataset * ds,const struct reliability * reliability)380 run_reliability (struct dataset *ds, const struct reliability *reliability)
381 {
382   struct dictionary *dict = dataset_dict (ds);
383   bool ok;
384   struct casereader *group;
385 
386   struct casegrouper *grouper = casegrouper_create_splits (proc_open (ds), dict);
387   int si;
388 
389   for (si = 0 ; si < reliability->n_sc; ++si)
390     {
391       struct cronbach *s = &reliability->sc[si];
392       int i;
393 
394       s->m = xzalloc (sizeof *s->m * s->n_items);
395       s->total = moments1_create (MOMENT_VARIANCE);
396 
397       for (i = 0 ; i < s->n_items ; ++i)
398 	s->m[i] = moments1_create (MOMENT_VARIANCE);
399     }
400 
401 
402   while (casegrouper_get_next_group (grouper, &group))
403     {
404       do_reliability (group, ds, reliability);
405 
406       reliability_statistics (reliability);
407 
408       if (reliability->summary & SUMMARY_TOTAL)
409 	reliability_summary_total (reliability);
410     }
411 
412   ok = casegrouper_destroy (grouper);
413   ok = proc_commit (ds) && ok;
414 
415   return ok;
416 }
417 
418 
419 
420 
421 
422 /* Return the sum of all the item variables in S */
423 static  double
append_sum(const struct ccase * c,casenumber n UNUSED,void * aux)424 append_sum (const struct ccase *c, casenumber n UNUSED, void *aux)
425 {
426   double sum = 0;
427   const struct cronbach *s = aux;
428 
429   int v;
430   for (v = 0 ; v < s->n_items; ++v)
431     {
432       sum += case_data (c, s->items[v])->f;
433     }
434 
435   return sum;
436 };
437 
438 static void
439 case_processing_summary (casenumber n_valid, casenumber n_missing,
440 			 const struct dictionary *dict);
441 
442 
443 static double
alpha(int k,double sum_of_variances,double variance_of_sums)444 alpha (int k, double sum_of_variances, double variance_of_sums)
445 {
446   return k / (k - 1.0) * (1 - sum_of_variances / variance_of_sums);
447 }
448 
449 static void
do_reliability(struct casereader * input,struct dataset * ds,const struct reliability * rel)450 do_reliability (struct casereader *input, struct dataset *ds,
451 		const struct reliability *rel)
452 {
453   int i;
454   int si;
455   struct ccase *c;
456   casenumber n_missing ;
457   casenumber n_valid = 0;
458 
459 
460   for (si = 0 ; si < rel->n_sc; ++si)
461     {
462       struct cronbach *s = &rel->sc[si];
463 
464       moments1_clear (s->total);
465 
466       for (i = 0 ; i < s->n_items ; ++i)
467         moments1_clear (s->m[i]);
468     }
469 
470   input = casereader_create_filter_missing (input,
471 					    rel->variables,
472 					    rel->n_variables,
473 					    rel->exclude,
474 					    &n_missing,
475 					    NULL);
476 
477   for (si = 0 ; si < rel->n_sc; ++si)
478     {
479       struct cronbach *s = &rel->sc[si];
480 
481 
482       s->totals_idx = caseproto_get_n_widths (casereader_get_proto (input));
483       input =
484 	casereader_create_append_numeric (input, append_sum,
485 					  s, NULL);
486     }
487 
488   for (; (c = casereader_read (input)) != NULL; case_unref (c))
489     {
490       double weight = 1.0;
491       n_valid ++;
492 
493       for (si = 0; si < rel->n_sc; ++si)
494 	{
495 	  struct cronbach *s = &rel->sc[si];
496 
497 	  for (i = 0 ; i < s->n_items ; ++i)
498 	    moments1_add (s->m[i], case_data (c, s->items[i])->f, weight);
499 
500 	  moments1_add (s->total, case_data_idx (c, s->totals_idx)->f, weight);
501 	}
502     }
503   casereader_destroy (input);
504 
505   for (si = 0; si < rel->n_sc; ++si)
506     {
507       struct cronbach *s = &rel->sc[si];
508 
509       s->sum_of_variances = 0;
510       for (i = 0 ; i < s->n_items ; ++i)
511 	{
512 	  double weight, mean, variance;
513 	  moments1_calculate (s->m[i], &weight, &mean, &variance, NULL, NULL);
514 
515 	  s->sum_of_variances += variance;
516 	}
517 
518       moments1_calculate (s->total, NULL, NULL, &s->variance_of_sums,
519 			  NULL, NULL);
520 
521       s->alpha =
522 	alpha (s->n_items, s->sum_of_variances, s->variance_of_sums);
523     }
524 
525   text_item_submit (text_item_create_format (TEXT_ITEM_TITLE, _("Scale: %s"),
526                                              ds_cstr (&rel->scale_name)));
527 
528   case_processing_summary (n_valid, n_missing, dataset_dict (ds));
529 }
530 
531 
532 
533 
534 
535 static void
case_processing_summary(casenumber n_valid,casenumber n_missing,const struct dictionary * dict)536 case_processing_summary (casenumber n_valid, casenumber n_missing,
537 			 const struct dictionary *dict)
538 {
539   struct pivot_table *table = pivot_table_create (
540     N_("Case Processing Summary"));
541   pivot_table_set_weight_var (table, dict_get_weight (dict));
542 
543   pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
544                           N_("N"), PIVOT_RC_COUNT,
545                           N_("Percent"), PIVOT_RC_PERCENT);
546 
547   struct pivot_dimension *cases = pivot_dimension_create (
548     table, PIVOT_AXIS_ROW, N_("Cases"), N_("Valid"), N_("Excluded"),
549     N_("Total"));
550   cases->root->show_label = true;
551 
552   casenumber total = n_missing + n_valid;
553 
554   struct entry
555     {
556       int stat_idx;
557       int case_idx;
558       double x;
559     }
560   entries[] = {
561     { 0, 0, n_valid },
562     { 0, 1, n_missing },
563     { 0, 2, total },
564     { 1, 0, 100.0 * n_valid / total },
565     { 1, 1, 100.0 * n_missing / total },
566     { 1, 2, 100.0 }
567   };
568 
569   for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
570     {
571       const struct entry *e = &entries[i];
572       pivot_table_put2 (table, e->stat_idx, e->case_idx,
573                         pivot_value_new_number (e->x));
574     }
575 
576   pivot_table_submit (table);
577 }
578 
579 static void
reliability_summary_total(const struct reliability * rel)580 reliability_summary_total (const struct reliability *rel)
581 {
582   struct pivot_table *table = pivot_table_create (N_("Item-Total Statistics"));
583 
584   pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
585                           N_("Scale Mean if Item Deleted"),
586                           N_("Scale Variance if Item Deleted"),
587                           N_("Corrected Item-Total Correlation"),
588                           N_("Cronbach's Alpha if Item Deleted"));
589 
590   struct pivot_dimension *variables = pivot_dimension_create (
591     table, PIVOT_AXIS_ROW, N_("Variables"));
592 
593   for (size_t i = 0 ; i < rel->sc[0].n_items; ++i)
594     {
595       const struct cronbach *s = &rel->sc[rel->total_start + i];
596 
597       int var_idx = pivot_category_create_leaf (
598         variables->root, pivot_value_new_variable (rel->sc[0].items[i]));
599 
600       double mean;
601       moments1_calculate (s->total, NULL, &mean, NULL, NULL, NULL);
602 
603       double var;
604       moments1_calculate (rel->sc[0].m[i], NULL, NULL, &var, NULL, NULL);
605       double cov
606         = (rel->sc[0].variance_of_sums + var - s->variance_of_sums) / 2.0;
607 
608       double entries[] = {
609         mean,
610         s->variance_of_sums,
611         (cov - var) / sqrt (var * s->variance_of_sums),
612         s->alpha,
613       };
614       for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
615         pivot_table_put2 (table, i, var_idx,
616                           pivot_value_new_number (entries[i]));
617     }
618 
619   pivot_table_submit (table);
620 }
621 
622 
623 static void
reliability_statistics(const struct reliability * rel)624 reliability_statistics (const struct reliability *rel)
625 {
626   struct pivot_table *table = pivot_table_create (
627     N_("Reliability Statistics"));
628   pivot_table_set_weight_var (table, rel->wv);
629 
630   if (rel->model == MODEL_ALPHA)
631     {
632       pivot_dimension_create (table, PIVOT_AXIS_COLUMN,
633                               N_("Statistics"),
634                               N_("Cronbach's Alpha"), PIVOT_RC_OTHER,
635                               N_("N of Items"), PIVOT_RC_COUNT);
636 
637       const struct cronbach *s = &rel->sc[0];
638       pivot_table_put1 (table, 0, pivot_value_new_number (s->alpha));
639       pivot_table_put1 (table, 1, pivot_value_new_number (s->n_items));
640     }
641   else
642     {
643       struct pivot_dimension *statistics = pivot_dimension_create (
644         table, PIVOT_AXIS_ROW, N_("Statistics"));
645       struct pivot_category *alpha = pivot_category_create_group (
646         statistics->root, N_("Cronbach's Alpha"));
647       pivot_category_create_group (alpha, N_("Part 1"),
648                                    N_("Value"), PIVOT_RC_OTHER,
649                                    N_("N of Items"), PIVOT_RC_COUNT);
650       pivot_category_create_group (alpha, N_("Part 2"),
651                                    N_("Value"), PIVOT_RC_OTHER,
652                                    N_("N of Items"), PIVOT_RC_COUNT);
653       pivot_category_create_leaves (alpha,
654                                     N_("Total N of Items"), PIVOT_RC_COUNT);
655       pivot_category_create_leaves (statistics->root,
656                                     N_("Correlation Between Forms"),
657                                     PIVOT_RC_OTHER);
658       pivot_category_create_group (statistics->root,
659                                    N_("Spearman-Brown Coefficient"),
660                                    N_("Equal Length"), PIVOT_RC_OTHER,
661                                    N_("Unequal Length"), PIVOT_RC_OTHER);
662       pivot_category_create_leaves (statistics->root,
663                                     N_("Guttman Split-Half Coefficient"),
664                                     PIVOT_RC_OTHER);
665 
666       /* R is the correlation between the two parts */
667       double r0 = rel->sc[0].variance_of_sums -
668         rel->sc[1].variance_of_sums -
669         rel->sc[2].variance_of_sums ;
670       double r1 = (r0 / sqrt (rel->sc[1].variance_of_sums)
671                    / sqrt (rel->sc[2].variance_of_sums)
672                    / 2.0);
673 
674       /* Guttman Split Half Coefficient */
675       double g = 2 * r0 / rel->sc[0].variance_of_sums;
676 
677       double tmp = (1.0 - r1*r1) * rel->sc[1].n_items * rel->sc[2].n_items /
678         pow2 (rel->sc[0].n_items);
679 
680       double entries[] = {
681         rel->sc[1].alpha,
682         rel->sc[1].n_items,
683         rel->sc[2].alpha,
684         rel->sc[2].n_items,
685         rel->sc[1].n_items + rel->sc[2].n_items,
686         r1,
687         2 * r1 / (1.0 + r1),
688         (sqrt (pow4 (r1) + 4 * pow2 (r1) * tmp) - pow2 (r1)) / (2 * tmp),
689         g,
690       };
691       for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
692         pivot_table_put1 (table, i, pivot_value_new_number (entries[i]));
693     }
694 
695   pivot_table_submit (table);
696 }
697