1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2017 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include "data/case.h"
20 #include "data/casereader.h"
21 #include "data/casewriter.h"
22 #include "data/dataset.h"
23 #include "data/dictionary.h"
24 #include "data/format.h"
25 #include "data/transformations.h"
26 #include "data/variable.h"
27 #include "language/command.h"
28 #include "language/data-io/data-parser.h"
29 #include "language/data-io/data-reader.h"
30 #include "language/data-io/file-handle.h"
31 #include "language/data-io/inpt-pgm.h"
32 #include "language/data-io/placement-parser.h"
33 #include "language/lexer/lexer.h"
34 #include "language/lexer/variable-parser.h"
35 #include "libpspp/i18n.h"
36 #include "libpspp/message.h"
37 #include "libpspp/misc.h"
38
39 #include "gl/xsize.h"
40 #include "gl/xalloc.h"
41
42 #include "gettext.h"
43 #define _(msgid) gettext (msgid)
44
45 /* DATA LIST transformation data. */
46 struct data_list_trns
47 {
48 struct data_parser *parser; /* Parser. */
49 struct dfm_reader *reader; /* Data file reader. */
50 struct variable *end; /* Variable specified on END subcommand. */
51 };
52
53 static trns_free_func data_list_trns_free;
54 static trns_proc_func data_list_trns_proc;
55
56 enum diagonal
57 {
58 DIAGONAL,
59 NO_DIAGONAL
60 };
61
62 enum triangle
63 {
64 LOWER,
65 UPPER,
66 FULL
67 };
68
69 static const int ROWTYPE_WIDTH = 8;
70
71 struct matrix_format
72 {
73 enum triangle triangle;
74 enum diagonal diagonal;
75 const struct variable *rowtype;
76 const struct variable *varname;
77 int n_continuous_vars;
78 struct variable **split_vars;
79 size_t n_split_vars;
80 long n;
81 };
82
83 /*
84 valid rowtype_ values:
85 CORR,
86 COV,
87 MAT,
88
89
90 MSE,
91 DFE,
92 MEAN,
93 STDDEV (or SD),
94 N_VECTOR (or N),
95 N_SCALAR,
96 N_MATRIX,
97 COUNT,
98 PROX.
99 */
100
101 /* Sets the value of OUTCASE which corresponds to VNAME
102 to the value STR. VNAME must be of type string.
103 */
104 static void
set_varname_column(struct ccase * outcase,const struct variable * vname,const char * str)105 set_varname_column (struct ccase *outcase, const struct variable *vname,
106 const char *str)
107 {
108 int len = var_get_width (vname);
109 uint8_t *s = case_str_rw (outcase, vname);
110
111 strncpy (CHAR_CAST (char *, s), str, len);
112 }
113
114 static void
blank_varname_column(struct ccase * outcase,const struct variable * vname)115 blank_varname_column (struct ccase *outcase, const struct variable *vname)
116 {
117 int len = var_get_width (vname);
118 uint8_t *s = case_str_rw (outcase, vname);
119
120 memset (s, ' ', len);
121 }
122
123 static struct casereader *
preprocess(struct casereader * casereader0,const struct dictionary * dict,void * aux)124 preprocess (struct casereader *casereader0, const struct dictionary *dict, void *aux)
125 {
126 struct matrix_format *mformat = aux;
127 const struct caseproto *proto = casereader_get_proto (casereader0);
128 struct casewriter *writer = autopaging_writer_create (proto);
129 struct ccase *prev_case = NULL;
130 double **matrices = NULL;
131 size_t n_splits = 0;
132
133 const size_t sizeof_matrix =
134 sizeof (double) * mformat->n_continuous_vars * mformat->n_continuous_vars;
135
136
137 /* Make an initial pass to populate our temporary matrix */
138 struct casereader *pass0 = casereader_clone (casereader0);
139 struct ccase *c;
140 union value *prev_values = XCALLOC (mformat->n_split_vars, union value);
141 int row = (mformat->triangle == LOWER && mformat->diagonal == NO_DIAGONAL) ? 1 : 0;
142 bool first_case = true;
143 for (; (c = casereader_read (pass0)) != NULL; case_unref (c))
144 {
145 int s;
146 bool match = false;
147 if (!first_case)
148 {
149 match = true;
150 for (s = 0; s < mformat->n_split_vars; ++s)
151 {
152 const struct variable *svar = mformat->split_vars[s];
153 const union value *sv = case_data (c, svar);
154 if (! value_equal (prev_values + s, sv, var_get_width (svar)))
155 {
156 match = false;
157 break;
158 }
159 }
160 }
161 first_case = false;
162
163 if (matrices == NULL || ! match)
164 {
165 row = (mformat->triangle == LOWER && mformat->diagonal == NO_DIAGONAL) ?
166 1 : 0;
167
168 n_splits++;
169 matrices = xrealloc (matrices, sizeof (double*) * n_splits);
170 matrices[n_splits - 1] = xmalloc (sizeof_matrix);
171 }
172
173 for (s = 0; s < mformat->n_split_vars; ++s)
174 {
175 const struct variable *svar = mformat->split_vars[s];
176 const union value *sv = case_data (c, svar);
177 value_clone (prev_values + s, sv, var_get_width (svar));
178 }
179
180 int c_offset = (mformat->triangle == UPPER) ? row : 0;
181 if (mformat->triangle == UPPER && mformat->diagonal == NO_DIAGONAL)
182 c_offset++;
183 const union value *v = case_data (c, mformat->rowtype);
184 const char *val = CHAR_CAST (const char *, v->s);
185 if (0 == strncasecmp (val, "corr ", ROWTYPE_WIDTH) ||
186 0 == strncasecmp (val, "cov ", ROWTYPE_WIDTH))
187 {
188 if (row >= mformat->n_continuous_vars)
189 {
190 msg (SE,
191 _("There are %d variable declared but the data has at least %d matrix rows."),
192 mformat->n_continuous_vars, row + 1);
193 case_unref (c);
194 casereader_destroy (pass0);
195 free (prev_values);
196 goto error;
197 }
198 int col;
199 for (col = c_offset; col < mformat->n_continuous_vars; ++col)
200 {
201 const struct variable *var =
202 dict_get_var (dict,
203 1 + col - c_offset +
204 var_get_dict_index (mformat->varname));
205
206 double e = case_data (c, var)->f;
207 if (e == SYSMIS)
208 continue;
209
210 /* Fill in the lower triangle */
211 (matrices[n_splits-1])[col + mformat->n_continuous_vars * row] = e;
212
213 if (mformat->triangle != FULL)
214 /* Fill in the upper triangle */
215 (matrices[n_splits-1]) [row + mformat->n_continuous_vars * col] = e;
216 }
217 row++;
218 }
219 }
220 casereader_destroy (pass0);
221 free (prev_values);
222
223 if (!matrices)
224 goto error;
225
226 /* Now make a second pass to fill in the other triangle from our
227 temporary matrix */
228 const int idx = var_get_dict_index (mformat->varname);
229 row = 0;
230
231 if (mformat->n >= 0)
232 {
233 int col;
234 struct ccase *outcase = case_create (proto);
235 union value *v = case_data_rw (outcase, mformat->rowtype);
236 memcpy (v->s, "N ", ROWTYPE_WIDTH);
237 blank_varname_column (outcase, mformat->varname);
238 for (col = 0; col < mformat->n_continuous_vars; ++col)
239 {
240 union value *dest_val =
241 case_data_rw_idx (outcase,
242 1 + col + var_get_dict_index (mformat->varname));
243 dest_val->f = mformat->n;
244 }
245 casewriter_write (writer, outcase);
246 }
247
248 n_splits = 0;
249 prev_values = xcalloc (mformat->n_split_vars, sizeof *prev_values);
250 first_case = true;
251 for (; (c = casereader_read (casereader0)) != NULL; prev_case = c)
252 {
253 int s;
254 bool match = false;
255 if (!first_case)
256 {
257 match = true;
258 for (s = 0; s < mformat->n_split_vars; ++s)
259 {
260 const struct variable *svar = mformat->split_vars[s];
261 const union value *sv = case_data (c, svar);
262 if (! value_equal (prev_values + s, sv, var_get_width (svar)))
263 {
264 match = false;
265 break;
266 }
267 }
268 }
269 first_case = false;
270 if (! match)
271 {
272 n_splits++;
273 row = 0;
274 }
275
276 for (s = 0; s < mformat->n_split_vars; ++s)
277 {
278 const struct variable *svar = mformat->split_vars[s];
279 const union value *sv = case_data (c, svar);
280 value_clone (prev_values + s, sv, var_get_width (svar));
281 }
282
283 case_unref (prev_case);
284 const union value *v = case_data (c, mformat->rowtype);
285 const char *val = CHAR_CAST (const char *, v->s);
286 if (mformat->n >= 0)
287 {
288 if (0 == strncasecmp (val, "n ", ROWTYPE_WIDTH) ||
289 0 == strncasecmp (val, "n_vector", ROWTYPE_WIDTH))
290 {
291 msg (SW,
292 _("The N subcommand was specified, but a N record was also found in the data. The N record will be ignored."));
293 continue;
294 }
295 }
296
297 struct ccase *outcase = case_create (proto);
298 case_copy (outcase, 0, c, 0, caseproto_get_n_widths (proto));
299
300 if (0 == strncasecmp (val, "corr ", ROWTYPE_WIDTH) ||
301 0 == strncasecmp (val, "cov ", ROWTYPE_WIDTH))
302 {
303 int col;
304 const struct variable *var = dict_get_var (dict, idx + 1 + row);
305 set_varname_column (outcase, mformat->varname, var_get_name (var));
306 value_copy (case_data_rw (outcase, mformat->rowtype), v, ROWTYPE_WIDTH);
307
308 for (col = 0; col < mformat->n_continuous_vars; ++col)
309 {
310 union value *dest_val =
311 case_data_rw_idx (outcase,
312 1 + col + var_get_dict_index (mformat->varname));
313 dest_val->f = (matrices[n_splits - 1])[col + mformat->n_continuous_vars * row];
314 if (col == row && mformat->diagonal == NO_DIAGONAL)
315 dest_val->f = 1.0;
316 }
317 row++;
318 }
319 else
320 {
321 blank_varname_column (outcase, mformat->varname);
322 }
323
324 /* Special case for SD and N_VECTOR: Rewrite as STDDEV and N respectively */
325 if (0 == strncasecmp (val, "sd ", ROWTYPE_WIDTH))
326 {
327 value_copy_buf_rpad (case_data_rw (outcase, mformat->rowtype), ROWTYPE_WIDTH,
328 (uint8_t *) "STDDEV", 6, ' ');
329 }
330 else if (0 == strncasecmp (val, "n_vector", ROWTYPE_WIDTH))
331 {
332 value_copy_buf_rpad (case_data_rw (outcase, mformat->rowtype), ROWTYPE_WIDTH,
333 (uint8_t *) "N", 1, ' ');
334 }
335
336 casewriter_write (writer, outcase);
337 }
338
339 /* If NODIAGONAL is specified, then a final case must be written */
340 if (mformat->diagonal == NO_DIAGONAL)
341 {
342 int col;
343 struct ccase *outcase = case_create (proto);
344
345 if (prev_case)
346 case_copy (outcase, 0, prev_case, 0, caseproto_get_n_widths (proto));
347
348 const struct variable *var = dict_get_var (dict, idx + 1 + row);
349 set_varname_column (outcase, mformat->varname, var_get_name (var));
350
351 for (col = 0; col < mformat->n_continuous_vars; ++col)
352 {
353 union value *dest_val =
354 case_data_rw_idx (outcase, 1 + col +
355 var_get_dict_index (mformat->varname));
356 dest_val->f = (matrices[n_splits - 1]) [col + mformat->n_continuous_vars * row];
357 if (col == row && mformat->diagonal == NO_DIAGONAL)
358 dest_val->f = 1.0;
359 }
360
361 casewriter_write (writer, outcase);
362 }
363 free (prev_values);
364
365 if (prev_case)
366 case_unref (prev_case);
367
368 int i;
369 for (i = 0 ; i < n_splits; ++i)
370 free (matrices[i]);
371 free (matrices);
372 struct casereader *reader1 = casewriter_make_reader (writer);
373 casereader_destroy (casereader0);
374 return reader1;
375
376
377 error:
378 if (prev_case)
379 case_unref (prev_case);
380
381 if (matrices)
382 for (i = 0 ; i < n_splits; ++i)
383 free (matrices[i]);
384 free (matrices);
385 casereader_destroy (casereader0);
386 casewriter_destroy (writer);
387 return NULL;
388 }
389
390 int
cmd_matrix(struct lexer * lexer,struct dataset * ds)391 cmd_matrix (struct lexer *lexer, struct dataset *ds)
392 {
393 struct dictionary *dict;
394 struct data_parser *parser;
395 struct dfm_reader *reader;
396 struct file_handle *fh = NULL;
397 char *encoding = NULL;
398 struct matrix_format mformat;
399 int i;
400 size_t n_names;
401 char **names = NULL;
402
403 mformat.triangle = LOWER;
404 mformat.diagonal = DIAGONAL;
405 mformat.n_split_vars = 0;
406 mformat.split_vars = NULL;
407 mformat.n = -1;
408
409 dict = (in_input_program ()
410 ? dataset_dict (ds)
411 : dict_create (get_default_encoding ()));
412 parser = data_parser_create (dict);
413 reader = NULL;
414
415 data_parser_set_type (parser, DP_DELIMITED);
416 data_parser_set_warn_missing_fields (parser, false);
417 data_parser_set_span (parser, false);
418
419 mformat.rowtype = dict_create_var (dict, "ROWTYPE_", ROWTYPE_WIDTH);
420
421 mformat.n_continuous_vars = 0;
422 mformat.n_split_vars = 0;
423
424 if (! lex_force_match_id (lexer, "VARIABLES"))
425 goto error;
426
427 lex_match (lexer, T_EQUALS);
428
429 if (! parse_mixed_vars (lexer, dict, &names, &n_names, PV_NO_DUPLICATE))
430 {
431 int i;
432 for (i = 0; i < n_names; ++i)
433 free (names[i]);
434 free (names);
435 goto error;
436 }
437
438 int longest_name = 0;
439 for (i = 0; i < n_names; ++i)
440 {
441 maximize_int (&longest_name, strlen (names[i]));
442 }
443
444 mformat.varname = dict_create_var (dict, "VARNAME_",
445 8 * DIV_RND_UP (longest_name, 8));
446
447 for (i = 0; i < n_names; ++i)
448 {
449 if (0 == strcasecmp (names[i], "ROWTYPE_"))
450 {
451 const struct fmt_spec fmt = fmt_for_input (FMT_A, 8, 0);
452 data_parser_add_delimited_field (parser,
453 &fmt,
454 var_get_case_index (mformat.rowtype),
455 "ROWTYPE_");
456 }
457 else
458 {
459 const struct fmt_spec fmt = fmt_for_input (FMT_F, 10, 4);
460 struct variable *v = dict_create_var (dict, names[i], 0);
461 var_set_both_formats (v, &fmt);
462 data_parser_add_delimited_field (parser,
463 &fmt,
464 var_get_case_index (mformat.varname) +
465 ++mformat.n_continuous_vars,
466 names[i]);
467 }
468 }
469 for (i = 0; i < n_names; ++i)
470 free (names[i]);
471 free (names);
472
473 while (lex_token (lexer) != T_ENDCMD)
474 {
475 if (! lex_force_match (lexer, T_SLASH))
476 goto error;
477
478 if (lex_match_id (lexer, "N"))
479 {
480 lex_match (lexer, T_EQUALS);
481
482 if (! lex_force_int (lexer))
483 goto error;
484
485 mformat.n = lex_integer (lexer);
486 if (mformat.n < 0)
487 {
488 msg (SE, _("%s must not be negative."), "N");
489 goto error;
490 }
491 lex_get (lexer);
492 }
493 else if (lex_match_id (lexer, "FORMAT"))
494 {
495 lex_match (lexer, T_EQUALS);
496
497 while (lex_token (lexer) != T_SLASH && (lex_token (lexer) != T_ENDCMD))
498 {
499 if (lex_match_id (lexer, "LIST"))
500 {
501 data_parser_set_span (parser, false);
502 }
503 else if (lex_match_id (lexer, "FREE"))
504 {
505 data_parser_set_span (parser, true);
506 }
507 else if (lex_match_id (lexer, "UPPER"))
508 {
509 mformat.triangle = UPPER;
510 }
511 else if (lex_match_id (lexer, "LOWER"))
512 {
513 mformat.triangle = LOWER;
514 }
515 else if (lex_match_id (lexer, "FULL"))
516 {
517 mformat.triangle = FULL;
518 }
519 else if (lex_match_id (lexer, "DIAGONAL"))
520 {
521 mformat.diagonal = DIAGONAL;
522 }
523 else if (lex_match_id (lexer, "NODIAGONAL"))
524 {
525 mformat.diagonal = NO_DIAGONAL;
526 }
527 else
528 {
529 lex_error (lexer, NULL);
530 goto error;
531 }
532 }
533 }
534 else if (lex_match_id (lexer, "FILE"))
535 {
536 lex_match (lexer, T_EQUALS);
537 fh_unref (fh);
538 fh = fh_parse (lexer, FH_REF_FILE | FH_REF_INLINE, NULL);
539 if (fh == NULL)
540 goto error;
541 }
542 else if (lex_match_id (lexer, "SPLIT"))
543 {
544 lex_match (lexer, T_EQUALS);
545 if (! parse_variables (lexer, dict, &mformat.split_vars, &mformat.n_split_vars, 0))
546 {
547 free (mformat.split_vars);
548 goto error;
549 }
550 int i;
551 for (i = 0; i < mformat.n_split_vars; ++i)
552 {
553 const struct fmt_spec fmt = fmt_for_input (FMT_F, 4, 0);
554 var_set_both_formats (mformat.split_vars[i], &fmt);
555 }
556 dict_reorder_vars (dict, mformat.split_vars, mformat.n_split_vars);
557 mformat.n_continuous_vars -= mformat.n_split_vars;
558 }
559 else
560 {
561 lex_error (lexer, NULL);
562 goto error;
563 }
564 }
565
566 if (mformat.diagonal == NO_DIAGONAL && mformat.triangle == FULL)
567 {
568 msg (SE, _("FORMAT = FULL and FORMAT = NODIAGONAL are mutually exclusive."));
569 goto error;
570 }
571
572 if (fh == NULL)
573 fh = fh_inline_file ();
574 fh_set_default_handle (fh);
575
576 if (!data_parser_any_fields (parser))
577 {
578 msg (SE, _("At least one variable must be specified."));
579 goto error;
580 }
581
582 if (lex_end_of_command (lexer) != CMD_SUCCESS)
583 goto error;
584
585 reader = dfm_open_reader (fh, lexer, encoding);
586 if (reader == NULL)
587 goto error;
588
589 if (in_input_program ())
590 {
591 struct data_list_trns *trns = xmalloc (sizeof *trns);
592 trns->parser = parser;
593 trns->reader = reader;
594 trns->end = NULL;
595 add_transformation (ds, data_list_trns_proc, data_list_trns_free, trns);
596 }
597 else
598 {
599 data_parser_make_active_file (parser, ds, reader, dict, preprocess,
600 &mformat);
601 }
602
603 fh_unref (fh);
604 free (encoding);
605 free (mformat.split_vars);
606
607 return CMD_DATA_LIST;
608
609 error:
610 data_parser_destroy (parser);
611 if (!in_input_program ())
612 dict_unref (dict);
613 fh_unref (fh);
614 free (encoding);
615 free (mformat.split_vars);
616 return CMD_CASCADING_FAILURE;
617 }
618
619
620 /* Input procedure. */
621
622 /* Destroys DATA LIST transformation TRNS.
623 Returns true if successful, false if an I/O error occurred. */
624 static bool
data_list_trns_free(void * trns_)625 data_list_trns_free (void *trns_)
626 {
627 struct data_list_trns *trns = trns_;
628 data_parser_destroy (trns->parser);
629 dfm_close_reader (trns->reader);
630 free (trns);
631 return true;
632 }
633
634 /* Handle DATA LIST transformation TRNS, parsing data into *C. */
635 static int
data_list_trns_proc(void * trns_,struct ccase ** c,casenumber case_num UNUSED)636 data_list_trns_proc (void *trns_, struct ccase **c, casenumber case_num UNUSED)
637 {
638 struct data_list_trns *trns = trns_;
639 int retval;
640
641 *c = case_unshare (*c);
642 if (data_parser_parse (trns->parser, trns->reader, *c))
643 retval = TRNS_CONTINUE;
644 else if (dfm_reader_error (trns->reader) || dfm_eof (trns->reader) > 1)
645 {
646 /* An I/O error, or encountering end of file for a second
647 time, should be escalated into a more serious error. */
648 retval = TRNS_ERROR;
649 }
650 else
651 retval = TRNS_END_FILE;
652
653 /* If there was an END subcommand handle it. */
654 if (trns->end != NULL)
655 {
656 double *end = &case_data_rw (*c, trns->end)->f;
657 if (retval == TRNS_END_FILE)
658 {
659 *end = 1.0;
660 retval = TRNS_CONTINUE;
661 }
662 else
663 *end = 0.0;
664 }
665
666 return retval;
667 }
668