1 /* PSPP - a program for statistical analysis.
2    Copyright (C) 1997-9, 2000, 2006, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
16 
17 #include <config.h>
18 
19 #include "language/stats/aggregate.h"
20 
21 #include <stdlib.h>
22 
23 #include "data/any-writer.h"
24 #include "data/case.h"
25 #include "data/casegrouper.h"
26 #include "data/casereader.h"
27 #include "data/casewriter.h"
28 #include "data/dataset.h"
29 #include "data/dictionary.h"
30 #include "data/file-handle-def.h"
31 #include "data/format.h"
32 #include "data/settings.h"
33 #include "data/subcase.h"
34 #include "data/sys-file-writer.h"
35 #include "data/variable.h"
36 #include "language/command.h"
37 #include "language/data-io/file-handle.h"
38 #include "language/lexer/lexer.h"
39 #include "language/lexer/variable-parser.h"
40 #include "language/stats/sort-criteria.h"
41 #include "libpspp/assertion.h"
42 #include "libpspp/i18n.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/pool.h"
46 #include "libpspp/str.h"
47 #include "math/moments.h"
48 #include "math/percentiles.h"
49 #include "math/sort.h"
50 #include "math/statistic.h"
51 
52 #include "gl/c-strcase.h"
53 #include "gl/minmax.h"
54 #include "gl/xalloc.h"
55 
56 #include "gettext.h"
57 #define _(msgid) gettext (msgid)
58 #define N_(msgid) msgid
59 
/* A literal argument supplied to an AGGREGATE function, such as the
   threshold of PGT or one of the bounds of PIN.  Which member is in
   use follows the type of the function's source variable. */
union agr_argument
{
  double f;     /* Used when the source variable is numeric. */
  char *c;      /* Used when the source variable is a short or long string. */
};
66 
/* Specifies how to make an aggregate variable.  One of these is
   allocated per target variable by parse_aggregate_functions(); they
   are chained through NEXT, starting at agr_proc's agr_vars member. */
struct agr_var
  {
    struct agr_var *next;		/* Next in list. */

    /* Collected during parsing. */
    const struct variable *src;	/* Source variable, or NULL if the function
                                   was given no source variables. */
    struct variable *dest;	/* Target variable. */
    int function;		/* Index into agr_func_tab, ORed with FSTRING
                                   when the source variable is a string. */
    enum mv_class exclude;      /* Classes of missing values to exclude. */
    union agr_argument arg[2];	/* Arguments. */

    /* Accumulated during AGGREGATE execution. */
    double dbl[3];              /* Running numeric state: dbl[0] holds the
                                   sum/extreme/count, dbl[1] the weight
                                   total used as a denominator. */
    int int1, int2;             /* int1 is an unweighted count or a
                                   "value seen" flag; int2 is only reset. */
    char *string;               /* Buffer for string MAX/MIN/FIRST/LAST. */
    bool saw_missing;           /* A missing source value was seen in the
                                   current group. */
    struct moments1 *moments;   /* Moments accumulator, used by SD. */
    double cc;                  /* Sum of case weights, used by MEDIAN. */

    /* Used by MEDIAN only. */
    struct variable *subject;   /* Internal variable holding the value. */
    struct variable *weight;    /* Internal variable holding the weight. */
    struct casewriter *writer;  /* Sorting writer collecting the group. */
  };
91 
92 
/* Attributes of aggregation functions.

   struct agr_func is declared outside this file.  Judging from how the
   members are used here, the columns are presumably: the function name
   matched case-insensitively against the syntax, a translatable
   description, whether source variables are required (AGR_SV_*), the
   number of value arguments, the result type selector (VAL_STRING
   means the target is cloned from the source variable; otherwise a
   numeric target is created), and the default output format.

   The table is terminated by an entry whose name is NULL. */
const struct agr_func agr_func_tab[] =
  {
    {"SUM",     N_("Sum of values"),                         AGR_SV_YES, 0, -1,          {FMT_F, 8, 2}},
    {"MEAN",	N_("Mean average"),                          AGR_SV_YES, 0, -1,          {FMT_F, 8, 2}},
    {"MEDIAN",	N_("Median average"),                        AGR_SV_YES, 0, -1,          {FMT_F, 8, 2}},
    {"SD",      N_("Standard deviation"),                    AGR_SV_YES, 0, -1,          {FMT_F, 8, 2}},
    {"MAX",     N_("Maximum value"),                         AGR_SV_YES, 0, VAL_STRING,  {-1, -1, -1}},
    {"MIN",     N_("Minimum value"),                         AGR_SV_YES, 0, VAL_STRING,  {-1, -1, -1}},
    {"PGT",     N_("Percentage greater than"),               AGR_SV_YES, 1, VAL_NUMERIC, {FMT_F, 5, 1}},
    {"PLT",     N_("Percentage less than"),                  AGR_SV_YES, 1, VAL_NUMERIC, {FMT_F, 5, 1}},
    {"PIN",     N_("Percentage included in range"),          AGR_SV_YES, 2, VAL_NUMERIC, {FMT_F, 5, 1}},
    {"POUT",    N_("Percentage excluded from range"),        AGR_SV_YES, 2, VAL_NUMERIC, {FMT_F, 5, 1}},
    {"FGT",     N_("Fraction greater than"),                 AGR_SV_YES, 1, VAL_NUMERIC, {FMT_F, 5, 3}},
    {"FLT",     N_("Fraction less than"),                    AGR_SV_YES, 1, VAL_NUMERIC, {FMT_F, 5, 3}},
    {"FIN",     N_("Fraction included in range"),            AGR_SV_YES, 2, VAL_NUMERIC, {FMT_F, 5, 3}},
    {"FOUT",    N_("Fraction excluded from range"),          AGR_SV_YES, 2, VAL_NUMERIC, {FMT_F, 5, 3}},
    {"N",       N_("Number of cases"),                       AGR_SV_NO,  0, VAL_NUMERIC, {FMT_F, 7, 0}},
    {"NU",      N_("Number of cases (unweighted)"),          AGR_SV_OPT, 0, VAL_NUMERIC, {FMT_F, 7, 0}},
    {"NMISS",   N_("Number of missing values"),              AGR_SV_YES, 0, VAL_NUMERIC, {FMT_F, 7, 0}},
    {"NUMISS",  N_("Number of missing values (unweighted)"), AGR_SV_YES, 0, VAL_NUMERIC, {FMT_F, 7, 0}},
    {"FIRST",   N_("First non-missing value"),               AGR_SV_YES, 0, VAL_STRING,  {-1, -1, -1}},
    {"LAST",    N_("Last non-missing value"),                AGR_SV_YES, 0, VAL_STRING,  {-1, -1, -1}},
    {NULL,      NULL,                                        AGR_SV_NO,  0, -1,          {-1, -1, -1}},
  };
118 
/* How missing source values influence an aggregate result. */
enum missing_treatment
{
  /* Default: each missing value is simply left out of the
     computation. */
  ITEMWISE = 0,

  /* MISSING=COLUMNWISE: one missing value in a group makes the
     aggregated value for that group missing (counting functions
     excepted). */
  COLUMNWISE = 1
};
125 
/* An entire AGGREGATE procedure: everything gathered while parsing the
   command plus the state shared by the execution helpers.  Released
   with agr_destroy(). */
struct agr_proc
  {
    /* Break variables. */
    struct subcase sort;                /* Sort criteria (break variables). */
    const struct variable **break_vars;       /* Break variables. */
    size_t break_var_cnt;               /* Number of break variables. */

    enum missing_treatment missing;     /* How to treat missing values. */
    struct agr_var *agr_vars;           /* First aggregate variable. */
    struct dictionary *dict;            /* Aggregate dictionary; set to NULL
                                           once handed over to the dataset. */
    const struct dictionary *src_dict;  /* Dict of the source */
    int case_cnt;                       /* Counts aggregated cases. */

    bool add_variables;                 /* True iff the aggregated variables should
					   be appended to the existing dictionary */
  };
143 
/* Prototypes. */

/* Per-group execution helpers: reset state, fold in one case, emit the
   aggregated result. */
static void initialize_aggregate_info (struct agr_proc *);

static void accumulate_aggregate_info (struct agr_proc *,
                                       const struct ccase *);
/* Parsing and cleanup. */
static bool parse_aggregate_functions (struct lexer *, const struct dictionary *,
				       struct agr_proc *);
static void agr_destroy (struct agr_proc *);
static void dump_aggregate_info (const struct agr_proc *agr,
                                 struct casewriter *output,
				 const struct ccase *break_case);
155 
156 /* Parsing. */
157 
158 /* Parses and executes the AGGREGATE procedure. */
159 int
cmd_aggregate(struct lexer * lexer,struct dataset * ds)160 cmd_aggregate (struct lexer *lexer, struct dataset *ds)
161 {
162   struct dictionary *dict = dataset_dict (ds);
163   struct agr_proc agr;
164   struct file_handle *out_file = NULL;
165   struct casereader *input = NULL, *group;
166   struct casegrouper *grouper;
167   struct casewriter *output = NULL;
168 
169   bool copy_documents = false;
170   bool presorted = false;
171   bool saw_direction;
172   bool ok;
173 
174   memset(&agr, 0 , sizeof (agr));
175   agr.missing = ITEMWISE;
176   agr.src_dict = dict;
177   subcase_init_empty (&agr.sort);
178 
179   /* OUTFILE subcommand must be first. */
180   lex_match (lexer, T_SLASH);
181   if (!lex_force_match_id (lexer, "OUTFILE"))
182     goto error;
183   lex_match (lexer, T_EQUALS);
184   if (!lex_match (lexer, T_ASTERISK))
185     {
186       out_file = fh_parse (lexer, FH_REF_FILE, dataset_session (ds));
187       if (out_file == NULL)
188         goto error;
189     }
190 
191   if (out_file == NULL && lex_match_id (lexer, "MODE"))
192     {
193       lex_match (lexer, T_EQUALS);
194       if (lex_match_id (lexer, "ADDVARIABLES"))
195 	{
196 	  agr.add_variables = true;
197 
198 	  /* presorted is assumed in ADDVARIABLES mode */
199 	  presorted = true;
200 	}
201       else if (lex_match_id (lexer, "REPLACE"))
202 	{
203 	  agr.add_variables = false;
204 	}
205       else
206 	goto error;
207     }
208 
209   if (agr.add_variables)
210     agr.dict = dict_clone (dict);
211   else
212     agr.dict = dict_create (dict_get_encoding (dict));
213 
214   dict_set_label (agr.dict, dict_get_label (dict));
215   dict_set_documents (agr.dict, dict_get_documents (dict));
216 
217   /* Read most of the subcommands. */
218   for (;;)
219     {
220       lex_match (lexer, T_SLASH);
221 
222       if (lex_match_id (lexer, "MISSING"))
223 	{
224 	  lex_match (lexer, T_EQUALS);
225 	  if (!lex_match_id (lexer, "COLUMNWISE"))
226 	    {
227 	      lex_error_expecting (lexer, "COLUMNWISE");
228               goto error;
229 	    }
230 	  agr.missing = COLUMNWISE;
231 	}
232       else if (lex_match_id (lexer, "DOCUMENT"))
233         copy_documents = true;
234       else if (lex_match_id (lexer, "PRESORTED"))
235         presorted = true;
236       else if (lex_force_match_id (lexer, "BREAK"))
237 	{
238           int i;
239 
240 	  lex_match (lexer, T_EQUALS);
241           if (!parse_sort_criteria (lexer, dict, &agr.sort, &agr.break_vars,
242                                     &saw_direction))
243             goto error;
244           agr.break_var_cnt = subcase_get_n_fields (&agr.sort);
245 
246 	  if  (! agr.add_variables)
247 	    for (i = 0; i < agr.break_var_cnt; i++)
248 	      dict_clone_var_assert (agr.dict, agr.break_vars[i]);
249 
250           /* BREAK must follow the options. */
251           break;
252 	}
253       else
254         goto error;
255 
256     }
257   if (presorted && saw_direction)
258     msg (SW, _("When PRESORTED is specified, specifying sorting directions "
259                "with (A) or (D) has no effect.  Output data will be sorted "
260                "the same way as the input data."));
261 
262   /* Read in the aggregate functions. */
263   lex_match (lexer, T_SLASH);
264   if (!parse_aggregate_functions (lexer, dict, &agr))
265     goto error;
266 
267   /* Delete documents. */
268   if (!copy_documents)
269     dict_clear_documents (agr.dict);
270 
271   /* Cancel SPLIT FILE. */
272   dict_set_split_vars (agr.dict, NULL, 0);
273 
274   /* Initialize. */
275   agr.case_cnt = 0;
276 
277   if (out_file == NULL)
278     {
279       /* The active dataset will be replaced by the aggregated data,
280          so TEMPORARY is moot. */
281       proc_cancel_temporary_transformations (ds);
282       proc_discard_output (ds);
283       output = autopaging_writer_create (dict_get_proto (agr.dict));
284     }
285   else
286     {
287       output = any_writer_open (out_file, agr.dict);
288       if (output == NULL)
289         goto error;
290     }
291 
292   input = proc_open (ds);
293   if (!subcase_is_empty (&agr.sort) && !presorted)
294     {
295       input = sort_execute (input, &agr.sort);
296       subcase_clear (&agr.sort);
297     }
298 
299   for (grouper = casegrouper_create_vars (input, agr.break_vars,
300                                           agr.break_var_cnt);
301        casegrouper_get_next_group (grouper, &group);
302        casereader_destroy (group))
303     {
304       struct casereader *placeholder = NULL;
305       struct ccase *c = casereader_peek (group, 0);
306 
307       if (c == NULL)
308         {
309           casereader_destroy (group);
310           continue;
311         }
312 
313       initialize_aggregate_info (&agr);
314 
315       if (agr.add_variables)
316 	placeholder = casereader_clone (group);
317 
318       {
319 	struct ccase *cg;
320 	for (; (cg = casereader_read (group)) != NULL; case_unref (cg))
321 	  accumulate_aggregate_info (&agr, cg);
322       }
323 
324 
325       if  (agr.add_variables)
326 	{
327 	  struct ccase *cg;
328 	  for (; (cg = casereader_read (placeholder)) != NULL; case_unref (cg))
329 	    dump_aggregate_info (&agr, output, cg);
330 
331 	  casereader_destroy (placeholder);
332 	}
333       else
334 	{
335 	  dump_aggregate_info (&agr, output, c);
336 	}
337       case_unref (c);
338     }
339   if (!casegrouper_destroy (grouper))
340     goto error;
341 
342   if (!proc_commit (ds))
343     {
344       input = NULL;
345       goto error;
346     }
347   input = NULL;
348 
349   if (out_file == NULL)
350     {
351       struct casereader *next_input = casewriter_make_reader (output);
352       if (next_input == NULL)
353         goto error;
354 
355       dataset_set_dict (ds, agr.dict);
356       dataset_set_source (ds, next_input);
357       agr.dict = NULL;
358     }
359   else
360     {
361       ok = casewriter_destroy (output);
362       output = NULL;
363       if (!ok)
364         goto error;
365     }
366 
367   agr_destroy (&agr);
368   fh_unref (out_file);
369   return CMD_SUCCESS;
370 
371 error:
372   if (input != NULL)
373     proc_commit (ds);
374   casewriter_destroy (output);
375   agr_destroy (&agr);
376   fh_unref (out_file);
377   return CMD_CASCADING_FAILURE;
378 }
379 
380 /* Parse all the aggregate functions. */
381 static bool
parse_aggregate_functions(struct lexer * lexer,const struct dictionary * dict,struct agr_proc * agr)382 parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict,
383 			   struct agr_proc *agr)
384 {
385   struct agr_var *tail; /* Tail of linked list starting at agr->vars. */
386 
387   /* Parse everything. */
388   tail = NULL;
389   for (;;)
390     {
391       char **dest;
392       char **dest_label;
393       size_t n_dest;
394       struct string function_name;
395 
396       enum mv_class exclude;
397       const struct agr_func *function;
398       int func_index;
399 
400       union agr_argument arg[2];
401 
402       const struct variable **src;
403       size_t n_src;
404 
405       size_t i;
406 
407       dest = NULL;
408       dest_label = NULL;
409       n_dest = 0;
410       src = NULL;
411       function = NULL;
412       n_src = 0;
413       arg[0].c = NULL;
414       arg[1].c = NULL;
415       ds_init_empty (&function_name);
416 
417       /* Parse the list of target variables. */
418       while (!lex_match (lexer, T_EQUALS))
419 	{
420 	  size_t n_dest_prev = n_dest;
421 
422 	  if (!parse_DATA_LIST_vars (lexer, dict, &dest, &n_dest,
423                                      (PV_APPEND | PV_SINGLE | PV_NO_SCRATCH
424                                       | PV_NO_DUPLICATE)))
425 	    goto error;
426 
427 	  /* Assign empty labels. */
428 	  {
429 	    int j;
430 
431 	    dest_label = xnrealloc (dest_label, n_dest, sizeof *dest_label);
432 	    for (j = n_dest_prev; j < n_dest; j++)
433 	      dest_label[j] = NULL;
434 	  }
435 
436 
437 
438 	  if (lex_is_string (lexer))
439 	    {
440 	      dest_label[n_dest - 1] = xstrdup (lex_tokcstr (lexer));
441 	      lex_get (lexer);
442 	    }
443 	}
444 
445       /* Get the name of the aggregation function. */
446       if (lex_token (lexer) != T_ID)
447 	{
448 	  lex_error (lexer, _("expecting aggregation function"));
449 	  goto error;
450 	}
451 
452       ds_assign_substring (&function_name, lex_tokss (lexer));
453       exclude = ds_chomp_byte (&function_name, '.') ? MV_SYSTEM : MV_ANY;
454 
455       for (function = agr_func_tab; function->name; function++)
456 	if (!c_strcasecmp (function->name, ds_cstr (&function_name)))
457 	  break;
458       if (NULL == function->name)
459 	{
460 	  msg (SE, _("Unknown aggregation function %s."),
461 	       ds_cstr (&function_name));
462 	  goto error;
463 	}
464       ds_destroy (&function_name);
465       func_index = function - agr_func_tab;
466       lex_get (lexer);
467 
468       /* Check for leading lparen. */
469       if (!lex_match (lexer, T_LPAREN))
470 	{
471 	  if (function->src_vars == AGR_SV_YES)
472 	    {
473 	      goto error;
474 	    }
475 	}
476       else
477         {
478 	  /* Parse list of source variables. */
479 	  {
480 	    int pv_opts = PV_NO_SCRATCH;
481 
482 	    if (func_index == SUM || func_index == MEAN || func_index == SD)
483 	      pv_opts |= PV_NUMERIC;
484 	    else if (function->n_args)
485 	      pv_opts |= PV_SAME_TYPE;
486 
487 	    if (!parse_variables_const (lexer, dict, &src, &n_src, pv_opts))
488 	      goto error;
489 	  }
490 
491 	  /* Parse function arguments, for those functions that
492 	     require arguments. */
493 	  if (function->n_args != 0)
494 	    for (i = 0; i < function->n_args; i++)
495 	      {
496 		int type;
497 
498 		lex_match (lexer, T_COMMA);
499 		if (lex_is_string (lexer))
500 		  {
501 		    arg[i].c = recode_string (dict_get_encoding (agr->dict),
502                                               "UTF-8", lex_tokcstr (lexer),
503                                               -1);
504 		    type = VAL_STRING;
505 		  }
506 		else if (lex_is_number (lexer))
507 		  {
508 		    arg[i].f = lex_tokval (lexer);
509 		    type = VAL_NUMERIC;
510 		  }
511                 else
512                   {
513 		    msg (SE, _("Missing argument %zu to %s."),
514                          i + 1, function->name);
515 		    goto error;
516 		  }
517 
518 		lex_get (lexer);
519 
520 		if (type != var_get_type (src[0]))
521 		  {
522 		    msg (SE, _("Arguments to %s must be of same type as "
523 			       "source variables."),
524 			 function->name);
525 		    goto error;
526 		  }
527 	      }
528 
529 	  /* Trailing rparen. */
530 	  if (!lex_force_match (lexer, T_RPAREN))
531             goto error;
532 
533 	  /* Now check that the number of source variables match
534 	     the number of target variables.  If we check earlier
535 	     than this, the user can get very misleading error
536 	     message, i.e. `AGGREGATE x=SUM(y t).' will get this
537 	     error message when a proper message would be more
538 	     like `unknown variable t'. */
539 	  if (n_src != n_dest)
540 	    {
541 	      msg (SE, _("Number of source variables (%zu) does not match "
542 			 "number of target variables (%zu)."),
543 		    n_src, n_dest);
544 	      goto error;
545 	    }
546 
547           if ((func_index == PIN || func_index == POUT
548               || func_index == FIN || func_index == FOUT)
549               && (var_is_numeric (src[0])
550                   ? arg[0].f > arg[1].f
551                   : str_compare_rpad (arg[0].c, arg[1].c) > 0))
552             {
553               union agr_argument t = arg[0];
554               arg[0] = arg[1];
555               arg[1] = t;
556 
557               msg (SW, _("The value arguments passed to the %s function "
558                          "are out-of-order.  They will be treated as if "
559                          "they had been specified in the correct order."),
560                    function->name);
561             }
562 	}
563 
564       /* Finally add these to the linked list of aggregation
565          variables. */
566       for (i = 0; i < n_dest; i++)
567 	{
568 	  struct agr_var *v = xzalloc (sizeof *v);
569 
570 	  /* Add variable to chain. */
571 	  if (agr->agr_vars != NULL)
572 	    tail->next = v;
573 	  else
574 	    agr->agr_vars = v;
575           tail = v;
576 	  tail->next = NULL;
577           v->moments = NULL;
578 
579 	  /* Create the target variable in the aggregate
580              dictionary. */
581 	  {
582 	    struct variable *destvar;
583 
584 	    v->function = func_index;
585 
586 	    if (src)
587 	      {
588 		v->src = src[i];
589 
590 		if (var_is_alpha (src[i]))
591 		  {
592 		    v->function |= FSTRING;
593 		    v->string = xmalloc (var_get_width (src[i]));
594 		  }
595 
596 		if (function->alpha_type == VAL_STRING)
597 		  destvar = dict_clone_var_as (agr->dict, v->src, dest[i]);
598 		else
599                   {
600                     assert (var_is_numeric (v->src)
601                             || function->alpha_type == VAL_NUMERIC);
602                     destvar = dict_create_var (agr->dict, dest[i], 0);
603                     if (destvar != NULL)
604                       {
605                         struct fmt_spec f;
606                         if ((func_index == N || func_index == NMISS)
607                             && dict_get_weight (dict) != NULL)
608                           f = fmt_for_output (FMT_F, 8, 2);
609                         else
610                           f = function->format;
611                         var_set_both_formats (destvar, &f);
612                       }
613                   }
614 	      } else {
615                 struct fmt_spec f;
616 		v->src = NULL;
617 		destvar = dict_create_var (agr->dict, dest[i], 0);
618 		if (destvar != NULL)
619 		  {
620 		    if ((func_index == N || func_index == NMISS)
621 			&& dict_get_weight (dict) != NULL)
622 		      f = fmt_for_output (FMT_F, 8, 2);
623 		    else
624 		      f = function->format;
625 		    var_set_both_formats (destvar, &f);
626 		  }
627 	    }
628 
629 	    if (!destvar)
630 	      {
631 		msg (SE, _("Variable name %s is not unique within the "
632 			   "aggregate file dictionary, which contains "
633 			   "the aggregate variables and the break "
634 			   "variables."),
635 		     dest[i]);
636 		goto error;
637 	      }
638 
639 	    free (dest[i]);
640 	    if (dest_label[i])
641               var_set_label (destvar, dest_label[i]);
642 
643 	    v->dest = destvar;
644 	  }
645 
646 	  v->exclude = exclude;
647 
648 	  if (v->src != NULL)
649 	    {
650 	      int j;
651 
652 	      if (var_is_numeric (v->src))
653 		for (j = 0; j < function->n_args; j++)
654 		  v->arg[j].f = arg[j].f;
655 	      else
656 		for (j = 0; j < function->n_args; j++)
657 		  v->arg[j].c = xstrdup (arg[j].c);
658 	    }
659 	}
660 
661       if (src != NULL && var_is_alpha (src[0]))
662 	for (i = 0; i < function->n_args; i++)
663 	  {
664 	    free (arg[i].c);
665 	    arg[i].c = NULL;
666 	  }
667 
668       free (src);
669       free (dest);
670       free (dest_label);
671 
672       if (!lex_match (lexer, T_SLASH))
673 	{
674 	  if (lex_token (lexer) == T_ENDCMD)
675 	    return true;
676 
677 	  lex_error (lexer, "expecting end of command");
678 	  return false;
679 	}
680       continue;
681 
682     error:
683       ds_destroy (&function_name);
684       for (i = 0; i < n_dest; i++)
685 	{
686 	  free (dest[i]);
687 	  free (dest_label[i]);
688 	}
689       free (dest);
690       free (dest_label);
691       free (arg[0].c);
692       free (arg[1].c);
693       if (src && n_src && var_is_alpha (src[0]))
694 	for (i = 0; i < function->n_args; i++)
695 	  {
696 	    free (arg[i].c);
697 	    arg[i].c = NULL;
698 	  }
699       free (src);
700 
701       return false;
702     }
703 }
704 
705 /* Destroys AGR. */
706 static void
agr_destroy(struct agr_proc * agr)707 agr_destroy (struct agr_proc *agr)
708 {
709   struct agr_var *iter, *next;
710 
711   subcase_destroy (&agr->sort);
712   free (agr->break_vars);
713   for (iter = agr->agr_vars; iter; iter = next)
714     {
715       next = iter->next;
716 
717       if (iter->function & FSTRING)
718 	{
719 	  size_t n_args;
720 	  size_t i;
721 
722 	  n_args = agr_func_tab[iter->function & FUNC].n_args;
723 	  for (i = 0; i < n_args; i++)
724 	    free (iter->arg[i].c);
725 	  free (iter->string);
726 	}
727       else if (iter->function == SD)
728         moments1_destroy (iter->moments);
729 
730       dict_destroy_internal_var (iter->subject);
731       dict_destroy_internal_var (iter->weight);
732 
733       free (iter);
734     }
735   if (agr->dict != NULL)
736     dict_unref (agr->dict);
737 }
738 
739 /* Execution. */
740 
741 /* Accumulates aggregation data from the case INPUT. */
742 static void
accumulate_aggregate_info(struct agr_proc * agr,const struct ccase * input)743 accumulate_aggregate_info (struct agr_proc *agr, const struct ccase *input)
744 {
745   struct agr_var *iter;
746   double weight;
747   bool bad_warn = true;
748 
749   weight = dict_get_case_weight (agr->src_dict, input, &bad_warn);
750 
751   for (iter = agr->agr_vars; iter; iter = iter->next)
752     if (iter->src)
753       {
754 	const union value *v = case_data (input, iter->src);
755         int src_width = var_get_width (iter->src);
756 
757         if (var_is_value_missing (iter->src, v, iter->exclude))
758 	  {
759 	    switch (iter->function)
760 	      {
761 	      case NMISS:
762 	      case NMISS | FSTRING:
763 		iter->dbl[0] += weight;
764                 break;
765 	      case NUMISS:
766 	      case NUMISS | FSTRING:
767 		iter->int1++;
768 		break;
769 	      }
770 	    iter->saw_missing = true;
771 	    continue;
772 	  }
773 
774 	/* This is horrible.  There are too many possibilities. */
775 	switch (iter->function)
776 	  {
777 	  case SUM:
778 	    iter->dbl[0] += v->f * weight;
779             iter->int1 = 1;
780 	    break;
781 	  case MEAN:
782             iter->dbl[0] += v->f * weight;
783             iter->dbl[1] += weight;
784             break;
785 	  case MEDIAN:
786 	    {
787 	      double wv ;
788 	      struct ccase *cout;
789 
790               cout = case_create (casewriter_get_proto (iter->writer));
791 
792 	      case_data_rw (cout, iter->subject)->f
793                 = case_data (input, iter->src)->f;
794 
795 	      wv = dict_get_case_weight (agr->src_dict, input, NULL);
796 
797 	      case_data_rw (cout, iter->weight)->f = wv;
798 
799 	      iter->cc += wv;
800 
801 	      casewriter_write (iter->writer, cout);
802 	    }
803 	    break;
804 	  case SD:
805             moments1_add (iter->moments, v->f, weight);
806             break;
807 	  case MAX:
808 	    iter->dbl[0] = MAX (iter->dbl[0], v->f);
809 	    iter->int1 = 1;
810 	    break;
811 	  case MAX | FSTRING:
812             /* Need to do some kind of Unicode collation thingy here */
813 	    if (memcmp (iter->string, v->s, src_width) < 0)
814 	      memcpy (iter->string, v->s, src_width);
815 	    iter->int1 = 1;
816 	    break;
817 	  case MIN:
818 	    iter->dbl[0] = MIN (iter->dbl[0], v->f);
819 	    iter->int1 = 1;
820 	    break;
821 	  case MIN | FSTRING:
822 	    if (memcmp (iter->string, v->s, src_width) > 0)
823 	      memcpy (iter->string, v->s, src_width);
824 	    iter->int1 = 1;
825 	    break;
826 	  case FGT:
827 	  case PGT:
828             if (v->f > iter->arg[0].f)
829               iter->dbl[0] += weight;
830             iter->dbl[1] += weight;
831             break;
832 	  case FGT | FSTRING:
833 	  case PGT | FSTRING:
834             if (memcmp (iter->arg[0].c, v->s, src_width) < 0)
835               iter->dbl[0] += weight;
836             iter->dbl[1] += weight;
837             break;
838 	  case FLT:
839 	  case PLT:
840             if (v->f < iter->arg[0].f)
841               iter->dbl[0] += weight;
842             iter->dbl[1] += weight;
843             break;
844 	  case FLT | FSTRING:
845 	  case PLT | FSTRING:
846             if (memcmp (iter->arg[0].c, v->s, src_width) > 0)
847               iter->dbl[0] += weight;
848             iter->dbl[1] += weight;
849             break;
850 	  case FIN:
851 	  case PIN:
852             if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
853               iter->dbl[0] += weight;
854             iter->dbl[1] += weight;
855             break;
856 	  case FIN | FSTRING:
857 	  case PIN | FSTRING:
858             if (memcmp (iter->arg[0].c, v->s, src_width) <= 0
859                 && memcmp (iter->arg[1].c, v->s, src_width) >= 0)
860               iter->dbl[0] += weight;
861             iter->dbl[1] += weight;
862             break;
863 	  case FOUT:
864 	  case POUT:
865             if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
866               iter->dbl[0] += weight;
867             iter->dbl[1] += weight;
868             break;
869 	  case FOUT | FSTRING:
870 	  case POUT | FSTRING:
871             if (memcmp (iter->arg[0].c, v->s, src_width) > 0
872                 || memcmp (iter->arg[1].c, v->s, src_width) < 0)
873               iter->dbl[0] += weight;
874             iter->dbl[1] += weight;
875             break;
876 	  case N:
877 	  case N | FSTRING:
878 	    iter->dbl[0] += weight;
879 	    break;
880 	  case NU:
881 	  case NU | FSTRING:
882 	    iter->int1++;
883 	    break;
884 	  case FIRST:
885 	    if (iter->int1 == 0)
886 	      {
887 		iter->dbl[0] = v->f;
888 		iter->int1 = 1;
889 	      }
890 	    break;
891 	  case FIRST | FSTRING:
892 	    if (iter->int1 == 0)
893 	      {
894 		memcpy (iter->string, v->s, src_width);
895 		iter->int1 = 1;
896 	      }
897 	    break;
898 	  case LAST:
899 	    iter->dbl[0] = v->f;
900 	    iter->int1 = 1;
901 	    break;
902 	  case LAST | FSTRING:
903 	    memcpy (iter->string, v->s, src_width);
904 	    iter->int1 = 1;
905 	    break;
906           case NMISS:
907           case NMISS | FSTRING:
908           case NUMISS:
909           case NUMISS | FSTRING:
910             /* Our value is not missing or it would have been
911                caught earlier.  Nothing to do. */
912             break;
913 	  default:
914 	    NOT_REACHED ();
915 	  }
916       } else {
917       switch (iter->function)
918 	{
919 	case N:
920 	  iter->dbl[0] += weight;
921 	  break;
922 	case NU:
923 	  iter->int1++;
924 	  break;
925 	default:
926 	  NOT_REACHED ();
927 	}
928     }
929 }
930 
931 /* Writes an aggregated record to OUTPUT. */
932 static void
dump_aggregate_info(const struct agr_proc * agr,struct casewriter * output,const struct ccase * break_case)933 dump_aggregate_info (const struct agr_proc *agr, struct casewriter *output, const struct ccase *break_case)
934 {
935   struct ccase *c = case_create (dict_get_proto (agr->dict));
936 
937   if (agr->add_variables)
938     {
939       case_copy (c, 0, break_case, 0, dict_get_var_cnt (agr->src_dict));
940     }
941   else
942     {
943       int value_idx = 0;
944       int i;
945 
946       for (i = 0; i < agr->break_var_cnt; i++)
947 	{
948 	  const struct variable *v = agr->break_vars[i];
949 	  value_copy (case_data_rw_idx (c, value_idx),
950 		      case_data (break_case, v),
951 		      var_get_width (v));
952 	  value_idx++;
953 	}
954     }
955 
956   {
957     struct agr_var *i;
958 
959     for (i = agr->agr_vars; i; i = i->next)
960       {
961 	union value *v = case_data_rw (c, i->dest);
962         int width = var_get_width (i->dest);
963 
964 	if (agr->missing == COLUMNWISE && i->saw_missing
965 	    && (i->function & FUNC) != N && (i->function & FUNC) != NU
966 	    && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
967 	  {
968             value_set_missing (v, width);
969 	    casewriter_destroy (i->writer);
970 	    continue;
971 	  }
972 
973 	switch (i->function)
974 	  {
975 	  case SUM:
976 	    v->f = i->int1 ? i->dbl[0] : SYSMIS;
977 	    break;
978 	  case MEAN:
979 	    v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
980 	    break;
981 	  case MEDIAN:
982 	    {
983 	      if (i->writer)
984 		{
985 		  struct percentile *median = percentile_create (0.5, i->cc);
986 		  struct order_stats *os = &median->parent;
987 		  struct casereader *sorted_reader = casewriter_make_reader (i->writer);
988 		  i->writer = NULL;
989 
990 		  order_stats_accumulate (&os, 1,
991 					  sorted_reader,
992 					  i->weight,
993 					  i->subject,
994 					  i->exclude);
995 		  i->dbl[0] = percentile_calculate (median, PC_HAVERAGE);
996 		  statistic_destroy (&median->parent.parent);
997 		}
998 	      v->f = i->dbl[0];
999 	    }
1000 	    break;
1001 	  case SD:
1002             {
1003               double variance;
1004 
1005               /* FIXME: we should use two passes. */
1006               moments1_calculate (i->moments, NULL, NULL, &variance,
1007                                  NULL, NULL);
1008               if (variance != SYSMIS)
1009                 v->f = sqrt (variance);
1010               else
1011                 v->f = SYSMIS;
1012             }
1013 	    break;
1014 	  case MAX:
1015 	  case MIN:
1016 	    v->f = i->int1 ? i->dbl[0] : SYSMIS;
1017 	    break;
1018 	  case MAX | FSTRING:
1019 	  case MIN | FSTRING:
1020 	    if (i->int1)
1021 	      memcpy (v->s, i->string, width);
1022 	    else
1023               value_set_missing (v, width);
1024 	    break;
1025 	  case FGT:
1026 	  case FGT | FSTRING:
1027 	  case FLT:
1028 	  case FLT | FSTRING:
1029 	  case FIN:
1030 	  case FIN | FSTRING:
1031 	  case FOUT:
1032 	  case FOUT | FSTRING:
1033 	    v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
1034 	    break;
1035 	  case PGT:
1036 	  case PGT | FSTRING:
1037 	  case PLT:
1038 	  case PLT | FSTRING:
1039 	  case PIN:
1040 	  case PIN | FSTRING:
1041 	  case POUT:
1042 	  case POUT | FSTRING:
1043 	    v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
1044 	    break;
1045 	  case N:
1046 	  case N | FSTRING:
1047 	      v->f = i->dbl[0];
1048             break;
1049 	  case NU:
1050 	  case NU | FSTRING:
1051 	    v->f = i->int1;
1052 	    break;
1053 	  case FIRST:
1054 	  case LAST:
1055 	    v->f = i->int1 ? i->dbl[0] : SYSMIS;
1056 	    break;
1057 	  case FIRST | FSTRING:
1058 	  case LAST | FSTRING:
1059 	    if (i->int1)
1060 	      memcpy (v->s, i->string, width);
1061 	    else
1062               value_set_missing (v, width);
1063 	    break;
1064 	  case NMISS:
1065 	  case NMISS | FSTRING:
1066 	    v->f = i->dbl[0];
1067 	    break;
1068 	  case NUMISS:
1069 	  case NUMISS | FSTRING:
1070 	    v->f = i->int1;
1071 	    break;
1072 	  default:
1073 	    NOT_REACHED ();
1074 	  }
1075       }
1076   }
1077 
1078   casewriter_write (output, c);
1079 }
1080 
1081 /* Resets the state for all the aggregate functions. */
1082 static void
initialize_aggregate_info(struct agr_proc * agr)1083 initialize_aggregate_info (struct agr_proc *agr)
1084 {
1085   struct agr_var *iter;
1086 
1087   for (iter = agr->agr_vars; iter; iter = iter->next)
1088     {
1089       iter->saw_missing = false;
1090       iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1091       iter->int1 = iter->int2 = 0;
1092       switch (iter->function)
1093 	{
1094 	case MIN:
1095 	  iter->dbl[0] = DBL_MAX;
1096 	  break;
1097 	case MIN | FSTRING:
1098 	  memset (iter->string, 255, var_get_width (iter->src));
1099 	  break;
1100 	case MAX:
1101 	  iter->dbl[0] = -DBL_MAX;
1102 	  break;
1103 	case MAX | FSTRING:
1104 	  memset (iter->string, 0, var_get_width (iter->src));
1105 	  break;
1106 	case MEDIAN:
1107 	  {
1108             struct caseproto *proto;
1109             struct subcase ordering;
1110 
1111             proto = caseproto_create ();
1112             proto = caseproto_add_width (proto, 0);
1113             proto = caseproto_add_width (proto, 0);
1114 
1115 	    if (! iter->subject)
1116 	      iter->subject = dict_create_internal_var (0, 0);
1117 
1118 	    if (! iter->weight)
1119 	      iter->weight = dict_create_internal_var (1, 0);
1120 
1121             subcase_init_var (&ordering, iter->subject, SC_ASCEND);
1122 	    iter->writer = sort_create_writer (&ordering, proto);
1123             subcase_destroy (&ordering);
1124             caseproto_unref (proto);
1125 
1126 	    iter->cc = 0;
1127 	  }
1128 	  break;
1129         case SD:
1130           if (iter->moments == NULL)
1131             iter->moments = moments1_create (MOMENT_VARIANCE);
1132           else
1133             moments1_clear (iter->moments);
1134           break;
1135         default:
1136           break;
1137 	}
1138     }
1139 }
1140