1 /* GNU Datamash - perform simple calculation on input data
2
3 Copyright (C) 2013-2020 Assaf Gordon <assafgordon@gmail.com>
4
5 This file is part of GNU Datamash.
6
7 GNU Datamash is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 GNU Datamash is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Datamash. If not, see <https://www.gnu.org/licenses/>.
19 */
20
21 /* Written by Assaf Gordon */
22 #include <config.h>
23 #include <assert.h>
24 #include <ctype.h>
25 #include <locale.h>
26 #include <math.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <stdint.h>
30 #include <inttypes.h>
31 #include <string.h>
32 #include <stdbool.h>
33 #include <time.h>
34 #include <libgen.h> /* for dirname & POSIX version of basename */
35
36 #include "die.h"
37 #include "minmax.h"
38 #include "linebuffer.h"
39 #include "system.h"
40 #include "md5.h"
41 #include "sha1.h"
42 #include "sha256.h"
43 #include "sha512.h"
44 #include "base64.h"
45 #include "xalloc.h"
46 #include "hash-pjw-bare.h"
47
48 #include "utils.h"
49 #include "text-options.h"
50 #include "text-lines.h"
51 #include "column-headers.h"
52 #include "op-defs.h"
53 #include "field-ops.h"
54
55 struct operation_data operations[] =
56 {
57 /* OP_COUNT */
58 {STRING_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
59 /* OP_SUM */
60 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
61 /* OP_MIN */
62 {NUMERIC_SCALAR, AUTO_SET_FIRST, NUMERIC_RESULT},
63 /* OP_MAX */
64 {NUMERIC_SCALAR, AUTO_SET_FIRST, NUMERIC_RESULT},
65 /* OP_ABSMIN */
66 {NUMERIC_SCALAR, AUTO_SET_FIRST, NUMERIC_RESULT},
67 /* OP_ABSMAX */
68 {NUMERIC_SCALAR, AUTO_SET_FIRST, NUMERIC_RESULT},
69 /* OP_RANGE */
70 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
71 /* OP_FIRST */
72 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
73 /* OP_LAST */
74 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
75 /* OP_RAND */
76 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
77 /* OP_MEAN */
78 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
79 /* OP_GEOMEAN */
80 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
81 /* OP_HARMMEAN */
82 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
83 /* OP_MEDIAN */
84 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
85 /* OP_QUARTILE_1 */
86 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
87 /* OP_QUARTILE_3 */
88 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
89 /* OP_IQR */
90 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
91 /* OP_PERCENTILE */
92 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
93 /* OP_PSTDEV */
94 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
95 /* OP_SSTDEV */
96 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
97 /* OP_PVARIANCE */
98 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
99 /* OP_SVARIANCE */
100 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
101 /* OP_MAD */
102 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
103 /* OP_MADRAW */
104 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
105 /* OP_S_SKEWNESS */
106 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
107 /* OP_P_SKEWNESS */
108 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
109 /* OP_S_EXCESS_KURTOSIS */
110 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
111 /* OP_P_EXCESS_KURTOSIS */
112 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
113 /* OP_JARQUE_BETA */
114 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
115 /* OP_DP_OMNIBUS */
116 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
117 /* OP_MODE */
118 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
119 /* OP_ANTIMODE */
120 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
121 /* OP_UNIQUE */
122 {STRING_VECTOR, IGNORE_FIRST, STRING_RESULT},
123 /* OP_COLLAPSE */
124 {STRING_VECTOR, IGNORE_FIRST, STRING_RESULT},
125 /* OP_COUNT_UNIQUE */
126 {STRING_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
127 /* OP_BASE64 */
128 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
129 /* OP_DEBASE64 */
130 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
131 /* OP_MD5 */
132 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
133 /* OP_SHA1 */
134 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
135 /* OP_SHA224 */
136 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
137 /* OP_SHA256 */
138 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
139 /* OP_SHA384 */
140 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
141 /* OP_SHA512 */
142 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
143 /* OP_P_COVARIANCE */
144 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
145 /* OP_S_COVARIANCE */
146 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
147 /* OP_P_PEARSON_COR */
148 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
149 /* OP_S_PEARSON_COR */
150 {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
151 /* OP_BIN_BUCKETS */
152 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
153 /* OP_STRBIN */
154 {STRING_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
155 /* OP_FLOOR */
156 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
157 /* OP_CEIL */
158 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
159 /* OP_ROUND */
160 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
161 /* OP_TRUNCATE */
162 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
163 /* OP_FRACTION */
164 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
165 /* OP_TRIMMED_MEAN */
166 {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
167 /* OP_DIRNAME */
168 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
169 /* OP_BASENAME */
170 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
171 /* OP_EXTNAME */
172 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
173 /* OP_BARENAME */
174 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
175 /* OP_GETNUM */
176 {STRING_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
177 /* OP_CUT */
178 {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
179 {0, 0, NUMERIC_RESULT}
180 };
181
182 //struct fieldop* field_ops = NULL;
183
/* Growth step used when enlarging a fieldop's buffers: the numeric values
   vector grows by this many elements, and the string buffer by at least
   this many bytes. */
enum
{
  VALUES_BATCH_INCREMENT = 1024
};
185
186 /* Add a numeric value to the values vector, allocating memory as needed */
187 static void
field_op_add_value(struct fieldop * op,long double val)188 field_op_add_value (struct fieldop *op, long double val)
189 {
190 if (op->num_values >= op->alloc_values)
191 {
192 op->alloc_values += VALUES_BATCH_INCREMENT;
193 op->values = xnrealloc (op->values, op->alloc_values,
194 sizeof (long double));
195 }
196 op->values[op->num_values] = val;
197 op->num_values++;
198 }
199
200 static void
field_op_reserve_out_buf(struct fieldop * op,const size_t minsize)201 field_op_reserve_out_buf (struct fieldop *op, const size_t minsize)
202 {
203 if (op->out_buf_alloc < minsize)
204 {
205 op->out_buf = xrealloc (op->out_buf, minsize);
206 op->out_buf_alloc = minsize;
207 }
208 }
209
210 /* stores the hexadecimal representation of 'buffer' in op->out_buf */
211 static void
field_op_to_hex(struct fieldop * op,const char * buffer,const size_t inlen)212 field_op_to_hex ( struct fieldop* op, const char *buffer, const size_t inlen )
213 {
214 static const char hex_digits[] =
215 {
216 '0', '1', '2', '3', '4', '5', '6', '7',
217 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
218 };
219 size_t len = inlen*2+1;
220 const char* inp = buffer;
221 field_op_reserve_out_buf (op, len);
222 char* ptr = op->out_buf;
223 for (size_t i = 0 ; i < inlen; ++i)
224 {
225 *ptr = hex_digits[ ((*inp)>>4) & 0xf ] ;
226 ++ptr;
227 *ptr = hex_digits[ (*inp) & 0xf ] ;
228 ++ptr;
229 ++inp;
230 }
231 *ptr = 0 ;
232 }
233
234 /* Add a string to the strings vector, allocating memory as needed */
235 void
field_op_add_string(struct fieldop * op,const char * str,size_t slen)236 field_op_add_string (struct fieldop *op, const char* str, size_t slen)
237 {
238 if (op->str_buf_used + slen+1 >= op->str_buf_alloc)
239 {
240 op->str_buf_alloc += MAX (VALUES_BATCH_INCREMENT,slen+1);
241 op->str_buf = xrealloc (op->str_buf, op->str_buf_alloc);
242 }
243
244 /* Copy the string to the buffer */
245 memcpy (op->str_buf + op->str_buf_used, str, slen);
246 *(op->str_buf + op->str_buf_used + slen ) = 0;
247 op->str_buf_used += slen + 1 ;
248 }
249
250 /* Replace the current string in the string buffer.
251 This function assumes only one string is stored in the buffer. */
252 void
field_op_replace_string(struct fieldop * op,const char * str,size_t slen)253 field_op_replace_string (struct fieldop *op, const char* str, size_t slen)
254 {
255 if (slen+1 >= op->str_buf_alloc)
256 {
257 op->str_buf_alloc += MAX (VALUES_BATCH_INCREMENT,slen+1);
258 op->str_buf = xrealloc (op->str_buf, op->str_buf_alloc);
259 }
260
261 /* Copy the string to the buffer */
262 memcpy (op->str_buf, str, slen);
263 *(op->str_buf + slen ) = 0;
264 op->str_buf_used = slen + 1 ;
265 }
266
267 /* Returns an array of string-pointers (char*),
268 each pointing to a string in the string buffer
269 (added by field_op_add_string () ).
270
271 The returned pointer must be free'd.
272
273 The returned pointer will have 'op->count+1' elements,
274 pointing to 'op->count' strings + one last NULL.
275 */
276 static const char **
field_op_get_string_ptrs(struct fieldop * op,bool sort,bool sort_case_sensitive)277 field_op_get_string_ptrs ( struct fieldop *op, bool sort,
278 bool sort_case_sensitive )
279 {
280 const char **ptrs = xnmalloc (op->count+1, sizeof (char*));
281 char *p = op->str_buf;
282 const char* pend = op->str_buf + op->str_buf_used;
283 size_t idx=0;
284 while (p < pend)
285 {
286 ptrs[idx++] = p;
287 while ( p<pend && *p != '\0' )
288 ++p;
289 ++p;
290 }
291 ptrs[idx] = 0;
292
293 if (sort)
294 {
295 /* Sort the string pointers */
296 qsort ( ptrs, op->count, sizeof (char*), sort_case_sensitive
297 ?cmpstringp
298 :cmpstringp_nocase);
299 }
300 return ptrs;
301 }
302
303 /* Sort the numeric values vector in a fieldop structure */
304 static void
field_op_sort_values(struct fieldop * op)305 field_op_sort_values (struct fieldop *op)
306 {
307 qsortfl (op->values, op->num_values);
308 }
309
310 void
field_op_init(struct fieldop * op,enum field_operation oper,bool by_name,size_t num,const char * name)311 field_op_init (struct fieldop* /*out*/ op,
312 enum field_operation oper,
313 bool by_name, size_t num, const char* name)
314 {
315 assert (op != NULL); /* LCOV_EXCL_LINE */
316 memset (op, 0, sizeof *op);
317
318 op->op = oper;
319 op->acc_type = operations[oper].acc_type;
320 op->res_type = operations[oper].res_type;
321 op->numeric = (op->acc_type == NUMERIC_SCALAR
322 || op->acc_type == NUMERIC_VECTOR);
323 op->auto_first = operations[oper].auto_first;
324 op->slave = false;
325 op->slave_op = NULL;
326
327 op->field = num;
328 op->field_by_name = by_name;
329 op->field_name = (by_name)?xstrdup (name):NULL;
330 op->first = true;
331 if (op->res_type == STRING_RESULT)
332 {
333 op->out_buf_alloc = 1024;
334 op->out_buf = xmalloc (op->out_buf_alloc);
335 }
336 }
337
338 /* Ensure this (master) fieldop has the same number of values as
339 as it's slave fieldop. */
340 static void
verify_slave_num_values(const struct fieldop * op)341 verify_slave_num_values (const struct fieldop *op)
342 {
343 assert (op && !op->slave && op->slave_op); /* LCOV_EXCL_LINE */
344
345 if (op->num_values != op->slave_op->num_values)
346 die (EXIT_FAILURE, 0, _("input error for operation %s: \
347 fields %"PRIuMAX",%"PRIuMAX" have different number of items"),
348 quote (get_field_operation_name (op->op)),
349 (uintmax_t)op->slave_op->field,
350 (uintmax_t)op->field);
351 }
352
353 /* Add a value (from input) to the current field operation. */
354 enum FIELD_OP_COLLECT_RESULT
field_op_collect(struct fieldop * op,const char * str,size_t slen)355 field_op_collect (struct fieldop *op,
356 const char* str, size_t slen)
357 {
358 char *endptr=NULL;
359 long double num_value = 0;
360 #ifdef HAVE_BROKEN_STRTOLD
361 char tmpbuf[512];
362 #endif
363 enum FIELD_OP_COLLECT_RESULT rc = FLOCR_OK;
364
365 assert (str != NULL); /* LCOV_EXCL_LINE */
366
367 if (remove_na_values && is_na (str,slen))
368 return FLOCR_OK_SKIPPED;
369
370 if (op->numeric)
371 {
372 errno = 0;
373 #ifdef HAVE_BROKEN_STRTOLD
374 /* On Cygwin, strtold doesn't stop at a tab character,
375 and returns invalid value.
376 Make a copy of the input buffer and NULL-terminate it */
377 if (slen >= sizeof (tmpbuf))
378 die (EXIT_FAILURE, 0,
379 "internal error: input field too long (%zu)", slen);
380 memcpy (tmpbuf,str,slen);
381 tmpbuf[slen]=0;
382 num_value = strtold (tmpbuf, &endptr);
383 if (errno==ERANGE || endptr==tmpbuf || endptr!=(tmpbuf+slen))
384 return FLOCR_INVALID_NUMBER;
385 #else
386 if (slen == 0)
387 return FLOCR_INVALID_NUMBER;
388 num_value = strtold (str, &endptr);
389 if (errno==ERANGE || endptr==str || endptr!=(str+slen))
390 return FLOCR_INVALID_NUMBER;
391 #endif
392 }
393
394 op->count++;
395
396 if (op->first && op->auto_first && op->numeric)
397 op->value = num_value;
398
399 switch (op->op) /* LCOV_EXCL_BR_LINE */
400 {
401 case OP_SUM:
402 case OP_MEAN:
403 op->value += num_value;
404 break;
405
406 case OP_GEOMEAN:
407 op->value += logl (num_value);
408 break;
409
410 case OP_HARMMEAN:
411 op->value += 1.0 / num_value;
412 break;
413
414 case OP_COUNT:
415 op->value++;
416 break;
417
418 case OP_MIN:
419 if (num_value < op->value)
420 {
421 op->value = num_value;
422 rc = FLOCR_OK_KEEP_LINE;
423 }
424 break;
425
426 case OP_MAX:
427 if (num_value > op->value)
428 {
429 op->value = num_value;
430 rc = FLOCR_OK_KEEP_LINE;
431 }
432 break;
433
434 case OP_ABSMIN:
435 if (fabsl (num_value) < fabsl (op->value))
436 {
437 op->value = num_value;
438 rc = FLOCR_OK_KEEP_LINE;
439 }
440 break;
441
442 case OP_ABSMAX:
443 if (fabsl (num_value) > fabsl (op->value))
444 {
445 op->value = num_value;
446 rc = FLOCR_OK_KEEP_LINE;
447 }
448 break;
449
450 case OP_RANGE:
451 /* Upon the first value, we store it twice
452 (once for min, once for max).
453 For subsequence values, we update the min/max entries directly. */
454 if (op->first)
455 {
456 field_op_add_value (op, num_value);
457 field_op_add_value (op, num_value);
458 }
459 else
460 {
461 if (num_value < op->values[0])
462 op->values[0] = num_value;
463 if (num_value > op->values[1])
464 op->values[1] = num_value;
465 }
466 break;
467
468 case OP_FIRST:
469 if (op->first)
470 {
471 field_op_replace_string (op, str, slen);
472 rc = FLOCR_OK_KEEP_LINE;
473 }
474 break;
475
476 case OP_LAST:
477 /* Replace the 'current' string with the latest one */
478 field_op_replace_string (op, str, slen);
479 rc = FLOCR_OK_KEEP_LINE;
480 break;
481
482 case OP_DEBASE64:
483 /* Base64 decoding is a special case: we decode during collection,
484 and report any errors back to the caller. */
485 {
486 /* safe to assume decoded base64 is never larger than encoded base64 */
487 size_t decoded_size = slen;
488 field_op_reserve_out_buf (op, decoded_size);
489 if (!base64_decode ( str, slen, op->out_buf, &decoded_size ))
490 return FLOCR_INVALID_BASE64;
491 op->out_buf[decoded_size]=0;
492 }
493 break;
494
495 case OP_BASE64:
496 case OP_MD5:
497 case OP_SHA1:
498 case OP_SHA224:
499 case OP_SHA256:
500 case OP_SHA384:
501 case OP_SHA512:
502 case OP_DIRNAME:
503 case OP_BASENAME:
504 case OP_EXTNAME:
505 case OP_BARENAME:
506 /* Replace the 'current' string with the latest one */
507 field_op_replace_string (op, str, slen);
508 break;
509
510 case OP_RAND:
511 {
512 /* Reservoir sampling,
513 With a simpler case were "k=1" */
514 unsigned long i = random ()%op->count;
515 if (op->first || i==0)
516 {
517 field_op_replace_string (op, str, slen);
518 rc = FLOCR_OK_KEEP_LINE;
519 }
520 }
521 break;
522
523 case OP_MEDIAN:
524 case OP_QUARTILE_1:
525 case OP_QUARTILE_3:
526 case OP_IQR:
527 case OP_PERCENTILE:
528 case OP_PSTDEV:
529 case OP_SSTDEV:
530 case OP_PVARIANCE:
531 case OP_SVARIANCE:
532 case OP_MAD:
533 case OP_MADRAW:
534 case OP_S_SKEWNESS:
535 case OP_P_SKEWNESS:
536 case OP_S_EXCESS_KURTOSIS:
537 case OP_P_EXCESS_KURTOSIS:
538 case OP_JARQUE_BERA:
539 case OP_DP_OMNIBUS:
540 case OP_MODE:
541 case OP_ANTIMODE:
542 case OP_P_COVARIANCE:
543 case OP_S_COVARIANCE:
544 case OP_P_PEARSON_COR:
545 case OP_S_PEARSON_COR:
546 case OP_TRIMMED_MEAN:
547 field_op_add_value (op, num_value);
548 break;
549
550 case OP_UNIQUE:
551 case OP_COLLAPSE:
552 case OP_COUNT_UNIQUE:
553 field_op_add_string (op, str, slen);
554 break;
555
556 case OP_BIN_BUCKETS:
557 {
558 const long double val = num_value / op->params.bin_bucket_size;
559 modfl (val, & op->value);
560 /* signbit will take care of negative-zero as well. */
561 if (signbit (op->value))
562 --op->value;
563 op->value *= op->params.bin_bucket_size;
564 }
565 break;
566
567 case OP_STRBIN:
568 op->value = hash_pjw_bare (str,slen) % (op->params.strbin_bucket_size);
569 break;
570
571 case OP_FLOOR:
572 op->value = pos_zero (floorl (num_value));
573 break;
574
575 case OP_CEIL:
576 op->value = pos_zero (ceill (num_value));
577 break;
578
579 case OP_ROUND:
580 op->value = pos_zero (roundl (num_value));
581 break;
582
583 case OP_TRUNCATE:
584 modfl (num_value, &op->value);
585 op->value = pos_zero (op->value);
586 break;
587
588 case OP_FRACTION:
589 {
590 long double dummy;
591 op->value = pos_zero (modfl (num_value, &dummy));
592 };
593 break;
594
595 case OP_GETNUM:
596 op->value = extract_number (str, slen, op->params.get_num_type);
597 break;
598
599 case OP_CUT:
600 field_op_replace_string (op, str, slen);
601 break;
602
603 case OP_INVALID: /* LCOV_EXCL_LINE */
604 default: /* LCOV_EXCL_LINE */
605 /* Should never happen */
606 internal_error ("bad op"); /* LCOV_EXCL_LINE */
607 }
608
609 op->first = false;
610
611 return rc;
612 }
613
614 /* creates a list of unique strings from op->str_buf .
615 results are stored in op->out_buf. */
616 static void
unique_value(struct fieldop * op,bool case_sensitive)617 unique_value ( struct fieldop *op, bool case_sensitive )
618 {
619 const char *last_str;
620 char *pos;
621
622 const char **ptrs = field_op_get_string_ptrs (op, true, case_sensitive);
623
624 /* Uniquify them */
625 field_op_reserve_out_buf (op, op->str_buf_used);
626 pos = op->out_buf ;
627
628 /* Copy the first string */
629 last_str = ptrs[0];
630 strcpy (pos, ptrs[0]);
631 pos += strlen (ptrs[0]);
632
633 /* Copy the following strings, if they are different from the previous one */
634 for (size_t i = 1; i < op->count; ++i)
635 {
636 const char *newstr = ptrs[i];
637
638 if ((case_sensitive && (!STREQ (newstr, last_str)))
639 || (!case_sensitive && (strcasecmp (newstr, last_str)!=0)))
640 {
641 *pos++ = collapse_separator ;
642 strcpy (pos, newstr);
643 pos += strlen (newstr);
644 }
645 last_str = newstr;
646 }
647
648 free (ptrs);
649 }
650
/* Returns the number of unique string values collected in the given field
   operation.  CASE_SENSITIVE selects both the sort order and the
   comparison used to detect duplicates.
   Returns 0 when no strings were collected. */
size_t
count_unique_values ( struct fieldop *op, bool case_sensitive )
{
  size_t count = 0;

  /* NULL-terminated array of sorted string pointers */
  const char **ptrs = field_op_get_string_ptrs (op, true, case_sensitive);

  /* With no collected strings, ptrs[0] is already the terminating NULL
     and there is no ptrs[1] to look at. */
  if (ptrs[0] != NULL)
    {
      /* The first string is always unique */
      count = 1;

      /* After sorting, equal strings are adjacent: count every string
         that differs from the one before it */
      for (const char **cur = ptrs + 1; *cur != NULL; ++cur)
        {
          const char *prev = cur[-1];

          if ((case_sensitive && (!STREQ (*cur, prev)))
              || (!case_sensitive && (strcasecmp (*cur, prev)!=0)))
            ++count;
        }
    }

  free (ptrs);

  return count;
}
681
682 /* Returns a nul-terimated string, composed of all the values
683 of the input strings. The return string must be free'd. */
684 void
collapse_value(struct fieldop * op)685 collapse_value ( struct fieldop *op )
686 {
687 /* Copy the string buffer as-is */
688 field_op_reserve_out_buf (op, op->str_buf_used);
689 char *buf = op->out_buf;
690 memcpy (buf, op->str_buf, op->str_buf_used);
691
692 /* convert every NUL to comma, except for the last one */
693 for (size_t i=0; i < op->str_buf_used-1 ; i++)
694 if (buf[i] == 0)
695 buf[i] = collapse_separator ;
696 }
697
698 /* stores in op->out_buf the result of the field operation
699 when there are no input values.
700 'no values' can happen with '--narm' and input of all N/As.
701 The printed results are consistent as much as possible with R */
702 static void
field_op_summarize_empty(struct fieldop * op)703 field_op_summarize_empty (struct fieldop *op)
704 {
705 long double numeric_result = 0 ;
706
707 switch (op->op) /* LCOV_EXCL_BR_LINE */
708 {
709 case OP_MEAN:
710 case OP_GEOMEAN:
711 case OP_HARMMEAN:
712 case OP_S_SKEWNESS:
713 case OP_P_SKEWNESS:
714 case OP_S_EXCESS_KURTOSIS:
715 case OP_P_EXCESS_KURTOSIS:
716 case OP_JARQUE_BERA:
717 case OP_DP_OMNIBUS:
718 case OP_MEDIAN:
719 case OP_QUARTILE_1:
720 case OP_QUARTILE_3:
721 case OP_IQR:
722 case OP_PERCENTILE:
723 case OP_MAD:
724 case OP_MADRAW:
725 case OP_PSTDEV:
726 case OP_SSTDEV:
727 case OP_PVARIANCE:
728 case OP_SVARIANCE:
729 case OP_MODE:
730 case OP_ANTIMODE:
731 case OP_P_COVARIANCE:
732 case OP_S_COVARIANCE:
733 case OP_P_PEARSON_COR:
734 case OP_S_PEARSON_COR:
735 case OP_BIN_BUCKETS:
736 case OP_STRBIN:
737 case OP_FLOOR:
738 case OP_CEIL:
739 case OP_ROUND:
740 case OP_TRUNCATE:
741 case OP_FRACTION:
742 case OP_RANGE:
743 case OP_TRIMMED_MEAN:
744 case OP_GETNUM:
745 numeric_result = nanl ("");
746 break;
747
748 case OP_SUM:
749 case OP_COUNT:
750 case OP_COUNT_UNIQUE:
751 numeric_result = 0;
752 break;
753
754 case OP_MIN:
755 case OP_ABSMIN:
756 numeric_result = -HUGE_VALL;
757 break;
758
759 case OP_MAX:
760 case OP_ABSMAX:
761 numeric_result = HUGE_VALL;
762 break;
763
764 case OP_FIRST:
765 case OP_LAST:
766 case OP_RAND:
767 case OP_CUT:
768 field_op_reserve_out_buf (op, 4);
769 strcpy (op->out_buf, "N/A");
770 break;
771
772 case OP_UNIQUE:
773 case OP_COLLAPSE:
774 case OP_BASE64:
775 case OP_DEBASE64:
776 case OP_MD5:
777 case OP_SHA1:
778 case OP_SHA224:
779 case OP_SHA256:
780 case OP_SHA384:
781 case OP_SHA512:
782 case OP_DIRNAME:
783 case OP_BASENAME:
784 case OP_EXTNAME:
785 case OP_BARENAME:
786 field_op_reserve_out_buf (op, 1);
787 strcpy (op->out_buf, "");
788 break;
789
790 case OP_INVALID: /* LCOV_EXCL_LINE */
791 default: /* LCOV_EXCL_LINE */
792 /* Should never happen */
793 internal_error ("bad op"); /* LCOV_EXCL_LINE */
794 }
795
796 if (op->res_type==NUMERIC_RESULT)
797 {
798 field_op_reserve_out_buf (op, numeric_output_bufsize);
799 snprintf (op->out_buf, op->out_buf_alloc,
800 numeric_output_format, numeric_result);
801 }
802 }
803
804 /* Prints to stdout the result of the field operation,
805 based on collected values */
806 void
field_op_summarize(struct fieldop * op)807 field_op_summarize (struct fieldop *op)
808 {
809 long double numeric_result = 0 ;
810 char tmpbuf[64]; /* 64 bytes - enough to hold sha512 */
811
812 /* In case of no values, each operation returns a specific result.
813 'no values' can happen with '--narm' and input of all N/As. */
814 if (op->count==0)
815 {
816 field_op_summarize_empty (op);
817 return ;
818 }
819
820 switch (op->op) /* LCOV_EXCL_BR_LINE */
821 {
822 case OP_MEAN:
823 numeric_result = op->value / op->count;
824 break;
825
826 case OP_GEOMEAN:
827 numeric_result = expl (op->value / op->count);
828 break;
829
830 case OP_HARMMEAN:
831 numeric_result = op->count / op->value;
832 break;
833
834 case OP_SUM:
835 case OP_COUNT:
836 case OP_MIN:
837 case OP_MAX:
838 case OP_ABSMIN:
839 case OP_ABSMAX:
840 case OP_BIN_BUCKETS:
841 case OP_STRBIN:
842 case OP_FLOOR:
843 case OP_CEIL:
844 case OP_ROUND:
845 case OP_TRUNCATE:
846 case OP_FRACTION:
847 case OP_GETNUM:
848 /* no summarization for these operations, just print the value */
849 numeric_result = op->value;
850 break;
851
852 case OP_FIRST:
853 case OP_LAST:
854 case OP_RAND:
855 case OP_CUT:
856 /* Only one string is returned in the buffer, return it */
857 field_op_reserve_out_buf (op, op->str_buf_used);
858 memcpy (op->out_buf, op->str_buf, op->str_buf_used);
859 break;
860
861 case OP_RANGE:
862 numeric_result = op->values[1] - op->values[0];
863 break;
864
865 case OP_MEDIAN:
866 field_op_sort_values (op);
867 numeric_result = median_value ( op->values, op->num_values );
868 break;
869
870 case OP_QUARTILE_1:
871 field_op_sort_values (op);
872 numeric_result = quartile1_value ( op->values, op->num_values );
873 break;
874
875 case OP_QUARTILE_3:
876 field_op_sort_values (op);
877 numeric_result = quartile3_value ( op->values, op->num_values );
878 break;
879
880 case OP_IQR:
881 field_op_sort_values (op);
882 numeric_result = quartile3_value ( op->values, op->num_values )
883 - quartile1_value ( op->values, op->num_values );
884 break;
885
886 case OP_PERCENTILE:
887 field_op_sort_values (op);
888 numeric_result = percentile_value ( op->values, op->num_values,
889 op->params.percentile / 100.0 );
890 break;
891
892 case OP_TRIMMED_MEAN:
893 field_op_sort_values (op);
894 numeric_result = trimmed_mean_value ( op->values, op->num_values,
895 op->params.trimmed_mean);
896 break;
897
898 case OP_PSTDEV:
899 numeric_result = stdev_value ( op->values, op->num_values, DF_POPULATION);
900 break;
901
902 case OP_SSTDEV:
903 numeric_result = stdev_value ( op->values, op->num_values, DF_SAMPLE);
904 break;
905
906 case OP_PVARIANCE:
907 numeric_result = variance_value ( op->values, op->num_values,
908 DF_POPULATION);
909 break;
910
911 case OP_SVARIANCE:
912 numeric_result = variance_value ( op->values, op->num_values,
913 DF_SAMPLE);
914 break;
915
916 case OP_MAD:
917 field_op_sort_values (op);
918 numeric_result = mad_value ( op->values, op->num_values, 1.4826 );
919 break;
920
921 case OP_MADRAW:
922 field_op_sort_values (op);
923 numeric_result = mad_value ( op->values, op->num_values, 1.0 );
924 break;
925
926 case OP_S_SKEWNESS:
927 numeric_result = skewness_value ( op->values, op->num_values,
928 DF_SAMPLE );
929 break;
930
931 case OP_P_SKEWNESS:
932 numeric_result = skewness_value ( op->values, op->num_values,
933 DF_POPULATION );
934 break;
935
936 case OP_S_EXCESS_KURTOSIS:
937 numeric_result = excess_kurtosis_value ( op->values, op->num_values,
938 DF_SAMPLE );
939 break;
940
941 case OP_P_EXCESS_KURTOSIS:
942 numeric_result = excess_kurtosis_value ( op->values, op->num_values,
943 DF_POPULATION );
944 break;
945
946 case OP_JARQUE_BERA:
947 numeric_result = jarque_bera_pvalue ( op->values, op->num_values );
948 break;
949
950 case OP_DP_OMNIBUS:
951 numeric_result = dagostino_pearson_omnibus_pvalue ( op->values,
952 op->num_values );
953 break;
954
955 case OP_P_COVARIANCE:
956 case OP_S_COVARIANCE:
957 assert (!op->slave); /* LCOV_EXCL_LINE */
958 assert (op->slave_op); /* LCOV_EXCL_LINE */
959 verify_slave_num_values (op);
960 numeric_result = covariance_value (op->values, op->slave_op->values,
961 op->num_values,
962 (op->op==OP_P_COVARIANCE)?
963 DF_POPULATION:DF_SAMPLE );
964 break;
965
966 case OP_P_PEARSON_COR:
967 case OP_S_PEARSON_COR:
968 assert (!op->slave); /* LCOV_EXCL_LINE */
969 assert (op->slave_op); /* LCOV_EXCL_LINE */
970 verify_slave_num_values (op);
971 numeric_result = pearson_corr_value (op->values, op->slave_op->values,
972 op->num_values,
973 (op->op==OP_P_PEARSON_COR)?
974 DF_POPULATION:DF_SAMPLE);
975 break;
976
977 case OP_MODE:
978 case OP_ANTIMODE:
979 field_op_sort_values (op);
980 numeric_result = mode_value ( op->values, op->num_values,
981 (op->op==OP_MODE)?MODE:ANTIMODE);
982 break;
983
984 case OP_UNIQUE:
985 unique_value (op, case_sensitive);
986 break;
987
988 case OP_COLLAPSE:
989 collapse_value (op);
990 break;
991
992 case OP_COUNT_UNIQUE:
993 numeric_result = count_unique_values (op,case_sensitive);
994 break;
995
996 case OP_BASE64:
997 field_op_reserve_out_buf (op, BASE64_LENGTH (op->str_buf_used-1)+1 ) ;
998 base64_encode ( op->str_buf, op->str_buf_used-1,
999 op->out_buf, BASE64_LENGTH (op->str_buf_used-1)+1 );
1000 break;
1001
1002 case OP_DEBASE64:
1003 /* Decoding base64 is a special case: decoding (and error checking) was
1004 done in field_op_collect. op->out_buf already contains the decoded
1005 value. */
1006 break;
1007
1008 case OP_MD5:
1009 md5_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1010 field_op_to_hex (op, tmpbuf, 16);
1011 break;
1012
1013 case OP_SHA1:
1014 sha1_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1015 field_op_to_hex (op, tmpbuf, 20);
1016 break;
1017
1018 case OP_SHA224:
1019 sha224_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1020 field_op_to_hex (op, tmpbuf, 28);
1021 break;
1022
1023 case OP_SHA256:
1024 sha256_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1025 field_op_to_hex (op, tmpbuf, 32);
1026 break;
1027
1028 case OP_SHA384:
1029 sha384_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1030 field_op_to_hex (op, tmpbuf, 48);
1031 break;
1032
1033 case OP_SHA512:
1034 sha512_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1035 field_op_to_hex (op, tmpbuf, 64);
1036 break;
1037
1038 case OP_DIRNAME:
1039 {
1040 op->str_buf[op->str_buf_used] = 0;
1041 char *t = dirname (op->str_buf);
1042 field_op_reserve_out_buf (op, op->str_buf_used);
1043 strcpy (op->out_buf,t);
1044 }
1045 break;
1046
1047 case OP_BASENAME:
1048 case OP_EXTNAME:
1049 case OP_BARENAME:
1050 {
1051 if (op->str_buf_used==1)
1052 {
1053 /* Empty string, containing only NUL */
1054 field_op_reserve_out_buf (op, 1);
1055 op->out_buf[0] = '\0';
1056 break;
1057 }
1058
1059 op->str_buf[op->str_buf_used] = 0;
1060 char *t = basename (op->str_buf);
1061 field_op_reserve_out_buf (op, op->str_buf_used);
1062
1063 if (op->op == OP_BASENAME)
1064 {
1065 /* Just copy the extracted base name */
1066 strcpy (op->out_buf,t);
1067 }
1068 else
1069 {
1070 /* Guess the file extension */
1071 size_t tl = strlen (t);
1072 size_t l = guess_file_extension (t, tl);
1073
1074 if (op->op == OP_EXTNAME)
1075 {
1076 /* Store the extension */
1077 if (l>0)
1078 {
1079 memcpy (op->out_buf, t+(tl-l+1), l-1);
1080 op->out_buf[l-1] = '\0';
1081 }
1082 else
1083 {
1084 op->out_buf[0] = '\0';
1085 }
1086 }
1087 else
1088 {
1089 /* Store the basename without the extension */
1090 memcpy (op->out_buf, t, tl-l);
1091 op->out_buf[tl-l] = '\0';
1092 }
1093 }
1094 }
1095 break;
1096
1097 case OP_INVALID: /* LCOV_EXCL_LINE */
1098 default: /* LCOV_EXCL_LINE */
1099 /* Should never happen */
1100 internal_error ("bad op"); /* LCOV_EXCL_LINE */
1101 }
1102
1103 if (op->res_type==NUMERIC_RESULT)
1104 {
1105 field_op_reserve_out_buf (op, numeric_output_bufsize);
1106 snprintf (op->out_buf, op->out_buf_alloc,
1107 numeric_output_format, numeric_result);
1108 }
1109 }
1110
1111 /* reset operation values for next group */
1112 void
field_op_reset(struct fieldop * op)1113 field_op_reset (struct fieldop *op)
1114 {
1115 op->first = true;
1116 op->count = 0 ;
1117 op->value = 0;
1118 op->num_values = 0 ;
1119 op->str_buf_used = 0;
1120 op->out_buf_used = 0;
1121 /* note: op->str_buf and op->str_alloc are not free'd, and reused */
1122 }
1123
1124 void
field_op_free(struct fieldop * op)1125 field_op_free (struct fieldop* op)
1126 {
1127 free (op->values);
1128 op->num_values = 0 ;
1129 op->alloc_values = 0;
1130
1131 free (op->str_buf);
1132 op->str_buf = NULL;
1133 op->str_buf_alloc = 0;
1134 op->str_buf_used = 0;
1135
1136 free (op->out_buf);
1137 op->out_buf = NULL;
1138 op->out_buf_alloc = 0;
1139 op->out_buf_used = 0;
1140
1141 free (op->field_name);
1142 op->field_name = NULL;
1143 }
1144
/* Mix the three values A, B and C into a single value.
   This is Robert Jenkins' 96 bit Mix Function, from:
   http://burtleburtle.net/bob/hash/doobs.html
   All arithmetic is unsigned, so it wraps around instead of overflowing. */
static unsigned long
mix (unsigned long a, unsigned long b, unsigned long c)
{
  a -= b;  a -= c;  a ^= (c >> 13);
  b -= c;  b -= a;  b ^= (a << 8);
  c -= a;  c -= b;  c ^= (b >> 13);

  a -= b;  a -= c;  a ^= (c >> 12);
  b -= c;  b -= a;  b ^= (a << 16);
  c -= a;  c -= b;  c ^= (b >> 5);

  a -= b;  a -= c;  a ^= (c >> 3);
  b -= c;  b -= a;  b ^= (a << 10);
  c -= a;  c -= b;  c ^= (b >> 15);

  return c;
}
1162
/* Seed the random () number generator with a value mixed from the
   processor time used so far, the current time and the process id. */
void
init_random (void)
{
  const unsigned long cpu_ticks = clock ();
  const unsigned long now = time (NULL);
  const unsigned long pid = getpid ();

  srandom (mix (cpu_ticks, now, pid));
}
1169
1170 const char*
field_op_collect_result_name(const enum FIELD_OP_COLLECT_RESULT flocr)1171 field_op_collect_result_name (const enum FIELD_OP_COLLECT_RESULT flocr)
1172 {
1173 switch (flocr) /* LCOV_EXCL_BR_LINE */
1174 {
1175 case FLOCR_INVALID_NUMBER:
1176 return _("invalid numeric value");
1177 case FLOCR_INVALID_BASE64:
1178 return _("invalid base64 value");
1179 case FLOCR_OK: /* LCOV_EXCL_LINE */
1180 case FLOCR_OK_KEEP_LINE: /* LCOV_EXCL_LINE */
1181 case FLOCR_OK_SKIPPED: /* LCOV_EXCL_LINE */
1182 default:
1183 internal_error ("op_collect_result_name"); /* LCOV_EXCL_LINE */
1184 return ""; /* LCOV_EXCL_LINE */
1185 }
1186 }
1187
1188 void
field_op_print_empty_value(enum field_operation mode)1189 field_op_print_empty_value (enum field_operation mode)
1190 {
1191 struct fieldop op;
1192 memset (&op, 0, sizeof op);
1193 op.op = mode;
1194 op.res_type = NUMERIC_RESULT;
1195 field_op_summarize_empty (&op);
1196 fputs (op.out_buf, stdout);
1197 }
1198