1 /* PSPP - a program for statistical analysis.
2    Copyright (C) 2017 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
16 
17 #include <config.h>
18 
19 #include "data/case.h"
20 #include "data/casereader.h"
21 #include "data/casewriter.h"
22 #include "data/dataset.h"
23 #include "data/dictionary.h"
24 #include "data/format.h"
25 #include "data/transformations.h"
26 #include "data/variable.h"
27 #include "language/command.h"
28 #include "language/data-io/data-parser.h"
29 #include "language/data-io/data-reader.h"
30 #include "language/data-io/file-handle.h"
31 #include "language/data-io/inpt-pgm.h"
32 #include "language/data-io/placement-parser.h"
33 #include "language/lexer/lexer.h"
34 #include "language/lexer/variable-parser.h"
35 #include "libpspp/i18n.h"
36 #include "libpspp/message.h"
37 #include "libpspp/misc.h"
38 
39 #include "gl/xsize.h"
40 #include "gl/xalloc.h"
41 
42 #include "gettext.h"
43 #define _(msgid) gettext (msgid)
44 
45 /* DATA LIST transformation data. */
46 struct data_list_trns
47   {
48     struct data_parser *parser; /* Parser. */
49     struct dfm_reader *reader;  /* Data file reader. */
50     struct variable *end;	/* Variable specified on END subcommand. */
51   };
52 
53 static trns_free_func data_list_trns_free;
54 static trns_proc_func data_list_trns_proc;
55 
56 enum diagonal
57   {
58     DIAGONAL,
59     NO_DIAGONAL
60   };
61 
62 enum triangle
63   {
64     LOWER,
65     UPPER,
66     FULL
67   };
68 
69 static const int ROWTYPE_WIDTH = 8;
70 
71 struct matrix_format
72 {
73   enum triangle triangle;
74   enum diagonal diagonal;
75   const struct variable *rowtype;
76   const struct variable *varname;
77   int n_continuous_vars;
78   struct variable **split_vars;
79   size_t n_split_vars;
80   long n;
81 };
82 
83 /*
84 valid rowtype_ values:
85   CORR,
86   COV,
87   MAT,
88 
89 
90   MSE,
91   DFE,
92   MEAN,
93   STDDEV (or SD),
94   N_VECTOR (or N),
95   N_SCALAR,
96   N_MATRIX,
97   COUNT,
98   PROX.
99 */
100 
101 /* Sets the value of OUTCASE which corresponds to VNAME
102    to the value STR.  VNAME must be of type string.
103  */
104 static void
set_varname_column(struct ccase * outcase,const struct variable * vname,const char * str)105 set_varname_column (struct ccase *outcase, const struct variable *vname,
106      const char *str)
107 {
108   int len = var_get_width (vname);
109   uint8_t *s = case_str_rw (outcase, vname);
110 
111   strncpy (CHAR_CAST (char *, s), str, len);
112 }
113 
114 static void
blank_varname_column(struct ccase * outcase,const struct variable * vname)115 blank_varname_column (struct ccase *outcase, const struct variable *vname)
116 {
117   int len = var_get_width (vname);
118   uint8_t *s = case_str_rw (outcase, vname);
119 
120   memset (s, ' ', len);
121 }
122 
123 static struct casereader *
preprocess(struct casereader * casereader0,const struct dictionary * dict,void * aux)124 preprocess (struct casereader *casereader0, const struct dictionary *dict, void *aux)
125 {
126   struct matrix_format *mformat = aux;
127   const struct caseproto *proto = casereader_get_proto (casereader0);
128   struct casewriter *writer = autopaging_writer_create (proto);
129   struct ccase *prev_case = NULL;
130   double **matrices = NULL;
131   size_t n_splits = 0;
132 
133   const size_t sizeof_matrix =
134     sizeof (double) * mformat->n_continuous_vars * mformat->n_continuous_vars;
135 
136 
137   /* Make an initial pass to populate our temporary matrix */
138   struct casereader *pass0 = casereader_clone (casereader0);
139   struct ccase *c;
140   union value *prev_values = XCALLOC (mformat->n_split_vars,  union value);
141   int row = (mformat->triangle == LOWER && mformat->diagonal == NO_DIAGONAL) ? 1 : 0;
142   bool first_case = true;
143   for (; (c = casereader_read (pass0)) != NULL; case_unref (c))
144     {
145       int s;
146       bool match = false;
147       if (!first_case)
148 	{
149 	  match = true;
150 	  for (s = 0; s < mformat->n_split_vars; ++s)
151 	    {
152 	      const struct variable *svar = mformat->split_vars[s];
153 	      const union value *sv = case_data (c, svar);
154 	      if (! value_equal (prev_values + s, sv, var_get_width (svar)))
155 		{
156 		  match = false;
157 		  break;
158 		}
159 	    }
160 	}
161       first_case = false;
162 
163       if (matrices == NULL || ! match)
164 	{
165 	  row = (mformat->triangle == LOWER && mformat->diagonal == NO_DIAGONAL) ?
166 	    1 : 0;
167 
168 	  n_splits++;
169 	  matrices = xrealloc (matrices, sizeof (double*)  * n_splits);
170 	  matrices[n_splits - 1] = xmalloc (sizeof_matrix);
171 	}
172 
173       for (s = 0; s < mformat->n_split_vars; ++s)
174 	{
175 	  const struct variable *svar = mformat->split_vars[s];
176 	  const union value *sv = case_data (c, svar);
177 	  value_clone (prev_values + s, sv, var_get_width (svar));
178 	}
179 
180       int c_offset = (mformat->triangle == UPPER) ? row : 0;
181       if (mformat->triangle == UPPER && mformat->diagonal == NO_DIAGONAL)
182 	c_offset++;
183       const union value *v = case_data (c, mformat->rowtype);
184       const char *val = CHAR_CAST (const char *, v->s);
185       if (0 == strncasecmp (val, "corr    ", ROWTYPE_WIDTH) ||
186 	  0 == strncasecmp (val, "cov     ", ROWTYPE_WIDTH))
187 	{
188 	  if (row >= mformat->n_continuous_vars)
189 	    {
190 	      msg (SE,
191 		   _("There are %d variable declared but the data has at least %d matrix rows."),
192 		   mformat->n_continuous_vars, row + 1);
193 	      case_unref (c);
194 	      casereader_destroy (pass0);
195 	      free (prev_values);
196 	      goto error;
197 	    }
198 	  int col;
199 	  for (col = c_offset; col < mformat->n_continuous_vars; ++col)
200 	    {
201 	      const struct variable *var =
202 		dict_get_var (dict,
203 			      1 + col - c_offset +
204 			      var_get_dict_index (mformat->varname));
205 
206 	      double e = case_data (c, var)->f;
207 	      if (e == SYSMIS)
208 	      	continue;
209 
210 	      /* Fill in the lower triangle */
211 	      (matrices[n_splits-1])[col + mformat->n_continuous_vars * row] = e;
212 
213 	      if (mformat->triangle != FULL)
214 		/* Fill in the upper triangle */
215 		(matrices[n_splits-1]) [row + mformat->n_continuous_vars * col] = e;
216 	    }
217 	  row++;
218 	}
219     }
220   casereader_destroy (pass0);
221   free (prev_values);
222 
223   if (!matrices)
224     goto error;
225 
226   /* Now make a second pass to fill in the other triangle from our
227      temporary matrix */
228   const int idx = var_get_dict_index (mformat->varname);
229   row = 0;
230 
231   if (mformat->n >= 0)
232     {
233       int col;
234       struct ccase *outcase = case_create (proto);
235       union value *v = case_data_rw (outcase, mformat->rowtype);
236       memcpy (v->s, "N       ", ROWTYPE_WIDTH);
237       blank_varname_column (outcase, mformat->varname);
238       for (col = 0; col < mformat->n_continuous_vars; ++col)
239 	{
240 	  union value *dest_val =
241 	    case_data_rw_idx (outcase,
242 			      1 + col + var_get_dict_index (mformat->varname));
243 	  dest_val->f = mformat->n;
244 	}
245       casewriter_write (writer, outcase);
246     }
247 
248   n_splits = 0;
249   prev_values = xcalloc (mformat->n_split_vars, sizeof *prev_values);
250   first_case = true;
251   for (; (c = casereader_read (casereader0)) != NULL; prev_case = c)
252     {
253       int s;
254       bool match = false;
255       if (!first_case)
256 	{
257 	  match = true;
258 	  for (s = 0; s < mformat->n_split_vars; ++s)
259 	    {
260 	      const struct variable *svar = mformat->split_vars[s];
261 	      const union value *sv = case_data (c, svar);
262 	      if (! value_equal (prev_values + s, sv, var_get_width (svar)))
263 		{
264 		  match = false;
265 		  break;
266 		}
267 	    }
268 	}
269       first_case = false;
270       if (! match)
271 	{
272 	  n_splits++;
273 	  row = 0;
274 	}
275 
276       for (s = 0; s < mformat->n_split_vars; ++s)
277 	{
278 	  const struct variable *svar = mformat->split_vars[s];
279 	  const union value *sv = case_data (c, svar);
280 	  value_clone (prev_values + s, sv, var_get_width (svar));
281 	}
282 
283       case_unref (prev_case);
284       const union value *v = case_data (c, mformat->rowtype);
285       const char *val = CHAR_CAST (const char *, v->s);
286       if (mformat->n >= 0)
287 	{
288 	  if (0 == strncasecmp (val, "n       ", ROWTYPE_WIDTH) ||
289 	      0 == strncasecmp (val, "n_vector", ROWTYPE_WIDTH))
290 	    {
291 	      msg (SW,
292 		   _("The N subcommand was specified, but a N record was also found in the data.  The N record will be ignored."));
293 	      continue;
294 	    }
295 	}
296 
297       struct ccase *outcase = case_create (proto);
298       case_copy (outcase, 0, c, 0, caseproto_get_n_widths (proto));
299 
300       if (0 == strncasecmp (val, "corr    ", ROWTYPE_WIDTH) ||
301 	  0 == strncasecmp (val, "cov     ", ROWTYPE_WIDTH))
302 	{
303 	  int col;
304 	  const struct variable *var = dict_get_var (dict, idx + 1 + row);
305 	  set_varname_column (outcase, mformat->varname, var_get_name (var));
306 	  value_copy (case_data_rw (outcase, mformat->rowtype), v, ROWTYPE_WIDTH);
307 
308 	  for (col = 0; col < mformat->n_continuous_vars; ++col)
309 	    {
310 	      union value *dest_val =
311 		case_data_rw_idx (outcase,
312 				  1 + col + var_get_dict_index (mformat->varname));
313 	      dest_val->f = (matrices[n_splits - 1])[col + mformat->n_continuous_vars * row];
314 	      if (col == row && mformat->diagonal == NO_DIAGONAL)
315 		dest_val->f = 1.0;
316 	    }
317 	  row++;
318 	}
319       else
320 	{
321 	  blank_varname_column (outcase, mformat->varname);
322 	}
323 
324       /* Special case for SD and N_VECTOR: Rewrite as STDDEV and N respectively */
325       if (0 == strncasecmp (val, "sd      ", ROWTYPE_WIDTH))
326 	{
327 	  value_copy_buf_rpad (case_data_rw (outcase, mformat->rowtype), ROWTYPE_WIDTH,
328 			       (uint8_t *) "STDDEV", 6, ' ');
329 	}
330       else if (0 == strncasecmp (val, "n_vector", ROWTYPE_WIDTH))
331 	{
332 	  value_copy_buf_rpad (case_data_rw (outcase, mformat->rowtype), ROWTYPE_WIDTH,
333 			       (uint8_t *) "N", 1, ' ');
334 	}
335 
336       casewriter_write (writer, outcase);
337     }
338 
339   /* If NODIAGONAL is specified, then a final case must be written */
340   if (mformat->diagonal == NO_DIAGONAL)
341     {
342       int col;
343       struct ccase *outcase = case_create (proto);
344 
345       if (prev_case)
346 	case_copy (outcase, 0, prev_case, 0, caseproto_get_n_widths (proto));
347 
348       const struct variable *var = dict_get_var (dict, idx + 1 + row);
349       set_varname_column (outcase, mformat->varname, var_get_name (var));
350 
351       for (col = 0; col < mformat->n_continuous_vars; ++col)
352 	{
353 	  union value *dest_val =
354 	    case_data_rw_idx (outcase, 1 + col +
355 			      var_get_dict_index (mformat->varname));
356 	  dest_val->f = (matrices[n_splits - 1]) [col + mformat->n_continuous_vars * row];
357 	  if (col == row && mformat->diagonal == NO_DIAGONAL)
358 	    dest_val->f = 1.0;
359 	}
360 
361       casewriter_write (writer, outcase);
362     }
363   free (prev_values);
364 
365   if (prev_case)
366     case_unref (prev_case);
367 
368   int i;
369   for (i = 0 ; i < n_splits; ++i)
370     free (matrices[i]);
371   free (matrices);
372   struct casereader *reader1 = casewriter_make_reader (writer);
373   casereader_destroy (casereader0);
374   return reader1;
375 
376 
377 error:
378   if (prev_case)
379     case_unref (prev_case);
380 
381   if (matrices)
382     for (i = 0 ; i < n_splits; ++i)
383       free (matrices[i]);
384   free (matrices);
385   casereader_destroy (casereader0);
386   casewriter_destroy (writer);
387   return NULL;
388 }
389 
390 int
cmd_matrix(struct lexer * lexer,struct dataset * ds)391 cmd_matrix (struct lexer *lexer, struct dataset *ds)
392 {
393   struct dictionary *dict;
394   struct data_parser *parser;
395   struct dfm_reader *reader;
396   struct file_handle *fh = NULL;
397   char *encoding = NULL;
398   struct matrix_format mformat;
399   int i;
400   size_t n_names;
401   char **names = NULL;
402 
403   mformat.triangle = LOWER;
404   mformat.diagonal = DIAGONAL;
405   mformat.n_split_vars = 0;
406   mformat.split_vars = NULL;
407   mformat.n = -1;
408 
409   dict = (in_input_program ()
410           ? dataset_dict (ds)
411           : dict_create (get_default_encoding ()));
412   parser = data_parser_create (dict);
413   reader = NULL;
414 
415   data_parser_set_type (parser, DP_DELIMITED);
416   data_parser_set_warn_missing_fields (parser, false);
417   data_parser_set_span (parser, false);
418 
419   mformat.rowtype = dict_create_var (dict, "ROWTYPE_", ROWTYPE_WIDTH);
420 
421   mformat.n_continuous_vars = 0;
422   mformat.n_split_vars = 0;
423 
424   if (! lex_force_match_id (lexer, "VARIABLES"))
425     goto error;
426 
427   lex_match (lexer, T_EQUALS);
428 
429   if (! parse_mixed_vars (lexer, dict, &names, &n_names, PV_NO_DUPLICATE))
430     {
431       int i;
432       for (i = 0; i < n_names; ++i)
433 	free (names[i]);
434       free (names);
435       goto error;
436     }
437 
438   int longest_name = 0;
439   for (i = 0; i < n_names; ++i)
440     {
441       maximize_int (&longest_name, strlen (names[i]));
442     }
443 
444   mformat.varname = dict_create_var (dict, "VARNAME_",
445 				     8 * DIV_RND_UP (longest_name, 8));
446 
447   for (i = 0; i < n_names; ++i)
448     {
449       if (0 == strcasecmp (names[i], "ROWTYPE_"))
450 	{
451 	  const struct fmt_spec fmt = fmt_for_input (FMT_A, 8, 0);
452 	  data_parser_add_delimited_field (parser,
453 					   &fmt,
454 					   var_get_case_index (mformat.rowtype),
455 					   "ROWTYPE_");
456 	}
457       else
458 	{
459 	  const struct fmt_spec fmt = fmt_for_input (FMT_F, 10, 4);
460 	  struct variable *v = dict_create_var (dict, names[i], 0);
461 	  var_set_both_formats (v, &fmt);
462 	  data_parser_add_delimited_field (parser,
463 					   &fmt,
464 					   var_get_case_index (mformat.varname) +
465 					   ++mformat.n_continuous_vars,
466 					   names[i]);
467 	}
468     }
469   for (i = 0; i < n_names; ++i)
470     free (names[i]);
471   free (names);
472 
473   while (lex_token (lexer) != T_ENDCMD)
474     {
475       if (! lex_force_match (lexer, T_SLASH))
476 	goto error;
477 
478       if (lex_match_id (lexer, "N"))
479 	{
480 	  lex_match (lexer, T_EQUALS);
481 
482 	  if (! lex_force_int (lexer))
483 	    goto error;
484 
485 	  mformat.n = lex_integer (lexer);
486 	  if (mformat.n < 0)
487 	    {
488 	      msg (SE, _("%s must not be negative."), "N");
489 	      goto error;
490 	    }
491 	  lex_get (lexer);
492 	}
493       else if (lex_match_id (lexer, "FORMAT"))
494 	{
495 	  lex_match (lexer, T_EQUALS);
496 
497 	  while (lex_token (lexer) != T_SLASH && (lex_token (lexer) != T_ENDCMD))
498 	    {
499 	      if (lex_match_id (lexer, "LIST"))
500 		{
501 		  data_parser_set_span (parser, false);
502 		}
503 	      else if (lex_match_id (lexer, "FREE"))
504 		{
505 		  data_parser_set_span (parser, true);
506 		}
507 	      else if (lex_match_id (lexer, "UPPER"))
508 		{
509 		  mformat.triangle = UPPER;
510 		}
511 	      else if (lex_match_id (lexer, "LOWER"))
512 		{
513 		  mformat.triangle = LOWER;
514 		}
515 	      else if (lex_match_id (lexer, "FULL"))
516 		{
517 		  mformat.triangle = FULL;
518 		}
519 	      else if (lex_match_id (lexer, "DIAGONAL"))
520 		{
521 		  mformat.diagonal = DIAGONAL;
522 		}
523 	      else if (lex_match_id (lexer, "NODIAGONAL"))
524 		{
525 		  mformat.diagonal = NO_DIAGONAL;
526 		}
527 	      else
528 		{
529 		  lex_error (lexer, NULL);
530 		  goto error;
531 		}
532 	    }
533 	}
534       else if (lex_match_id (lexer, "FILE"))
535 	{
536 	  lex_match (lexer, T_EQUALS);
537           fh_unref (fh);
538 	  fh = fh_parse (lexer, FH_REF_FILE | FH_REF_INLINE, NULL);
539 	  if (fh == NULL)
540 	    goto error;
541 	}
542       else if (lex_match_id (lexer, "SPLIT"))
543 	{
544 	  lex_match (lexer, T_EQUALS);
545 	  if (! parse_variables (lexer, dict, &mformat.split_vars, &mformat.n_split_vars, 0))
546 	    {
547 	      free (mformat.split_vars);
548 	      goto error;
549 	    }
550 	  int i;
551 	  for (i = 0; i < mformat.n_split_vars; ++i)
552 	    {
553 	      const struct fmt_spec fmt = fmt_for_input (FMT_F, 4, 0);
554 	      var_set_both_formats (mformat.split_vars[i], &fmt);
555 	    }
556 	  dict_reorder_vars (dict, mformat.split_vars, mformat.n_split_vars);
557 	  mformat.n_continuous_vars -= mformat.n_split_vars;
558 	}
559       else
560 	{
561 	  lex_error (lexer, NULL);
562 	  goto error;
563 	}
564     }
565 
566   if (mformat.diagonal == NO_DIAGONAL && mformat.triangle == FULL)
567     {
568       msg (SE, _("FORMAT = FULL and FORMAT = NODIAGONAL are mutually exclusive."));
569       goto error;
570     }
571 
572   if (fh == NULL)
573     fh = fh_inline_file ();
574   fh_set_default_handle (fh);
575 
576   if (!data_parser_any_fields (parser))
577     {
578       msg (SE, _("At least one variable must be specified."));
579       goto error;
580     }
581 
582   if (lex_end_of_command (lexer) != CMD_SUCCESS)
583     goto error;
584 
585   reader = dfm_open_reader (fh, lexer, encoding);
586   if (reader == NULL)
587     goto error;
588 
589   if (in_input_program ())
590     {
591       struct data_list_trns *trns = xmalloc (sizeof *trns);
592       trns->parser = parser;
593       trns->reader = reader;
594       trns->end = NULL;
595       add_transformation (ds, data_list_trns_proc, data_list_trns_free, trns);
596     }
597   else
598     {
599       data_parser_make_active_file (parser, ds, reader, dict, preprocess,
600 				    &mformat);
601     }
602 
603   fh_unref (fh);
604   free (encoding);
605   free (mformat.split_vars);
606 
607   return CMD_DATA_LIST;
608 
609  error:
610   data_parser_destroy (parser);
611   if (!in_input_program ())
612     dict_unref (dict);
613   fh_unref (fh);
614   free (encoding);
615   free (mformat.split_vars);
616   return CMD_CASCADING_FAILURE;
617 }
618 
619 
620 /* Input procedure. */
621 
622 /* Destroys DATA LIST transformation TRNS.
623    Returns true if successful, false if an I/O error occurred. */
624 static bool
data_list_trns_free(void * trns_)625 data_list_trns_free (void *trns_)
626 {
627   struct data_list_trns *trns = trns_;
628   data_parser_destroy (trns->parser);
629   dfm_close_reader (trns->reader);
630   free (trns);
631   return true;
632 }
633 
634 /* Handle DATA LIST transformation TRNS, parsing data into *C. */
635 static int
data_list_trns_proc(void * trns_,struct ccase ** c,casenumber case_num UNUSED)636 data_list_trns_proc (void *trns_, struct ccase **c, casenumber case_num UNUSED)
637 {
638   struct data_list_trns *trns = trns_;
639   int retval;
640 
641   *c = case_unshare (*c);
642   if (data_parser_parse (trns->parser, trns->reader, *c))
643     retval = TRNS_CONTINUE;
644   else if (dfm_reader_error (trns->reader) || dfm_eof (trns->reader) > 1)
645     {
646       /* An I/O error, or encountering end of file for a second
647          time, should be escalated into a more serious error. */
648       retval = TRNS_ERROR;
649     }
650   else
651     retval = TRNS_END_FILE;
652 
653   /* If there was an END subcommand handle it. */
654   if (trns->end != NULL)
655     {
656       double *end = &case_data_rw (*c, trns->end)->f;
657       if (retval == TRNS_END_FILE)
658         {
659           *end = 1.0;
660           retval = TRNS_CONTINUE;
661         }
662       else
663         *end = 0.0;
664     }
665 
666   return retval;
667 }
668