1 /* GNU Datamash - perform simple calculation on input data
2 
3    Copyright (C) 2013-2020 Assaf Gordon <assafgordon@gmail.com>
4 
5    This file is part of GNU Datamash.
6 
7    GNU Datamash is free software: you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation, either version 3 of the License, or
10    (at your option) any later version.
11 
12    GNU Datamash is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GNU Datamash.  If not, see <https://www.gnu.org/licenses/>.
19 */
20 
21 /* Written by Assaf Gordon */
22 #include <config.h>
23 #include <assert.h>
24 #include <ctype.h>
25 #include <locale.h>
26 #include <math.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <stdint.h>
30 #include <inttypes.h>
31 #include <string.h>
32 #include <stdbool.h>
33 #include <time.h>
34 #include <libgen.h> /* for dirname & POSIX version of basename */
35 
36 #include "die.h"
37 #include "minmax.h"
38 #include "linebuffer.h"
39 #include "system.h"
40 #include "md5.h"
41 #include "sha1.h"
42 #include "sha256.h"
43 #include "sha512.h"
44 #include "base64.h"
45 #include "xalloc.h"
46 #include "hash-pjw-bare.h"
47 
48 #include "utils.h"
49 #include "text-options.h"
50 #include "text-lines.h"
51 #include "column-headers.h"
52 #include "op-defs.h"
53 #include "field-ops.h"
54 
/* Static properties of every field operation.

   field_op_init () looks an entry up as operations[oper], so the entries
   must appear in the same order as the values of 'enum field_operation'
   (declared elsewhere - presumably op-defs.h; keep both in sync).

   Each entry holds, in order:
     - the accumulation type (copied to fieldop.acc_type),
     - the "first value" policy (copied to fieldop.auto_first),
     - the result type (copied to fieldop.res_type).  */
struct operation_data operations[] =
{
  /* OP_COUNT */
  {STRING_SCALAR,  IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_SUM */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_MIN */
  {NUMERIC_SCALAR, AUTO_SET_FIRST, NUMERIC_RESULT},
  /* OP_MAX */
  {NUMERIC_SCALAR, AUTO_SET_FIRST, NUMERIC_RESULT},
  /* OP_ABSMIN */
  {NUMERIC_SCALAR, AUTO_SET_FIRST, NUMERIC_RESULT},
  /* OP_ABSMAX */
  {NUMERIC_SCALAR, AUTO_SET_FIRST, NUMERIC_RESULT},
  /* OP_RANGE */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_FIRST */
  {STRING_SCALAR,  IGNORE_FIRST, STRING_RESULT},
  /* OP_LAST */
  {STRING_SCALAR,  IGNORE_FIRST, STRING_RESULT},
  /* OP_RAND */
  {STRING_SCALAR,  IGNORE_FIRST, STRING_RESULT},
  /* OP_MEAN */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_GEOMEAN */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_HARMMEAN */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_MEDIAN */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_QUARTILE_1 */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_QUARTILE_3 */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_IQR */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_PERCENTILE */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_PSTDEV */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_SSTDEV */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_PVARIANCE */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_SVARIANCE */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_MAD */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_MADRAW */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_S_SKEWNESS */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_P_SKEWNESS */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_S_EXCESS_KURTOSIS */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_P_EXCESS_KURTOSIS */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_JARQUE_BERA */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_DP_OMNIBUS */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_MODE */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_ANTIMODE */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_UNIQUE */
  {STRING_VECTOR,  IGNORE_FIRST, STRING_RESULT},
  /* OP_COLLAPSE */
  {STRING_VECTOR,  IGNORE_FIRST, STRING_RESULT},
  /* OP_COUNT_UNIQUE */
  {STRING_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_BASE64 */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_DEBASE64 */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_MD5 */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_SHA1 */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_SHA224 */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_SHA256 */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_SHA384 */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_SHA512 */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_P_COVARIANCE */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_S_COVARIANCE */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_P_PEARSON_COR */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_S_PEARSON_COR */
  {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_BIN_BUCKETS */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_STRBIN */
  {STRING_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_FLOOR */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_CEIL */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_ROUND */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_TRUNCATE */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_FRACTION */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_TRIMMED_MEAN */
  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_DIRNAME */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_BASENAME */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_EXTNAME */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_BARENAME */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* OP_GETNUM */
  {STRING_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
  /* OP_CUT */
  {STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
  /* Trailing entry (presumably OP_INVALID / end-of-table marker - confirm
     against the enum declaration). */
  {0, 0, NUMERIC_RESULT}
};
181 
182 //struct fieldop* field_ops = NULL;
183 
/* Growth step for the per-operation buffers: the number of elements added
   to op->values each time it fills up, and the minimum number of bytes
   added to op->str_buf whenever it has to grow. */
enum { VALUES_BATCH_INCREMENT = 1024 };
185 
186 /* Add a numeric value to the values vector, allocating memory as needed */
187 static void
field_op_add_value(struct fieldop * op,long double val)188 field_op_add_value (struct fieldop *op, long double val)
189 {
190   if (op->num_values >= op->alloc_values)
191     {
192       op->alloc_values += VALUES_BATCH_INCREMENT;
193       op->values = xnrealloc (op->values, op->alloc_values,
194                               sizeof (long double));
195     }
196   op->values[op->num_values] = val;
197   op->num_values++;
198 }
199 
200 static void
field_op_reserve_out_buf(struct fieldop * op,const size_t minsize)201 field_op_reserve_out_buf (struct fieldop *op, const size_t minsize)
202 {
203   if (op->out_buf_alloc < minsize)
204     {
205       op->out_buf = xrealloc (op->out_buf, minsize);
206       op->out_buf_alloc = minsize;
207     }
208 }
209 
210 /* stores the hexadecimal representation of 'buffer' in op->out_buf */
211 static void
field_op_to_hex(struct fieldop * op,const char * buffer,const size_t inlen)212 field_op_to_hex ( struct fieldop* op, const char *buffer, const size_t inlen )
213 {
214   static const char hex_digits[] =
215   {
216     '0', '1', '2', '3', '4', '5', '6', '7',
217     '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
218   };
219   size_t len = inlen*2+1;
220   const char* inp = buffer;
221   field_op_reserve_out_buf (op, len);
222   char* ptr = op->out_buf;
223   for (size_t i = 0 ; i < inlen; ++i)
224    {
225      *ptr = hex_digits[ ((*inp)>>4) & 0xf ] ;
226      ++ptr;
227      *ptr = hex_digits[  (*inp)     & 0xf ] ;
228      ++ptr;
229      ++inp;
230    }
231   *ptr = 0 ;
232 }
233 
234 /* Add a string to the strings vector, allocating memory as needed */
235 void
field_op_add_string(struct fieldop * op,const char * str,size_t slen)236 field_op_add_string (struct fieldop *op, const char* str, size_t slen)
237 {
238   if (op->str_buf_used + slen+1 >= op->str_buf_alloc)
239     {
240       op->str_buf_alloc += MAX (VALUES_BATCH_INCREMENT,slen+1);
241       op->str_buf = xrealloc (op->str_buf, op->str_buf_alloc);
242     }
243 
244   /* Copy the string to the buffer */
245   memcpy (op->str_buf + op->str_buf_used, str, slen);
246   *(op->str_buf + op->str_buf_used + slen ) = 0;
247   op->str_buf_used += slen + 1 ;
248 }
249 
250 /* Replace the current string in the string buffer.
251    This function assumes only one string is stored in the buffer. */
252 void
field_op_replace_string(struct fieldop * op,const char * str,size_t slen)253 field_op_replace_string (struct fieldop *op, const char* str, size_t slen)
254 {
255   if (slen+1 >= op->str_buf_alloc)
256     {
257       op->str_buf_alloc += MAX (VALUES_BATCH_INCREMENT,slen+1);
258       op->str_buf = xrealloc (op->str_buf, op->str_buf_alloc);
259     }
260 
261   /* Copy the string to the buffer */
262   memcpy (op->str_buf, str, slen);
263   *(op->str_buf + slen ) = 0;
264   op->str_buf_used = slen + 1 ;
265 }
266 
267 /* Returns an array of string-pointers (char*),
268    each pointing to a string in the string buffer
269    (added by field_op_add_string () ).
270 
271    The returned pointer must be free'd.
272 
273    The returned pointer will have 'op->count+1' elements,
274    pointing to 'op->count' strings + one last NULL.
275 */
276 static const char **
field_op_get_string_ptrs(struct fieldop * op,bool sort,bool sort_case_sensitive)277 field_op_get_string_ptrs ( struct fieldop *op, bool sort,
278                            bool sort_case_sensitive )
279 {
280   const char **ptrs = xnmalloc (op->count+1, sizeof (char*));
281   char *p = op->str_buf;
282   const char* pend = op->str_buf + op->str_buf_used;
283   size_t idx=0;
284   while (p < pend)
285     {
286       ptrs[idx++] = p;
287       while ( p<pend && *p != '\0' )
288         ++p;
289       ++p;
290     }
291   ptrs[idx] = 0;
292 
293   if (sort)
294     {
295       /* Sort the string pointers */
296       qsort ( ptrs, op->count, sizeof (char*), sort_case_sensitive
297                                             ?cmpstringp
298                                             :cmpstringp_nocase);
299     }
300   return ptrs;
301 }
302 
/* Sort the numeric values vector in a fieldop structure.
   Hands op->values and its element count (op->num_values) to qsortfl (),
   which is defined elsewhere (presumably an in-place ascending sort of
   long doubles - confirm in utils). */
static void
field_op_sort_values (struct fieldop *op)
{
  qsortfl (op->values, op->num_values);
}
309 
310 void
field_op_init(struct fieldop * op,enum field_operation oper,bool by_name,size_t num,const char * name)311 field_op_init (struct fieldop* /*out*/ op,
312                enum field_operation oper,
313                bool by_name, size_t num, const char* name)
314 {
315   assert (op != NULL); /* LCOV_EXCL_LINE */
316   memset (op, 0, sizeof *op);
317 
318   op->op = oper;
319   op->acc_type = operations[oper].acc_type;
320   op->res_type = operations[oper].res_type;
321   op->numeric = (op->acc_type == NUMERIC_SCALAR
322                  || op->acc_type == NUMERIC_VECTOR);
323   op->auto_first = operations[oper].auto_first;
324   op->slave = false;
325   op->slave_op = NULL;
326 
327   op->field = num;
328   op->field_by_name = by_name;
329   op->field_name = (by_name)?xstrdup (name):NULL;
330   op->first = true;
331   if (op->res_type == STRING_RESULT)
332     {
333       op->out_buf_alloc = 1024;
334       op->out_buf = xmalloc (op->out_buf_alloc);
335     }
336 }
337 
338 /* Ensure this (master) fieldop has the same number of values as
339    as it's slave fieldop. */
340 static void
verify_slave_num_values(const struct fieldop * op)341 verify_slave_num_values (const struct fieldop *op)
342 {
343   assert (op && !op->slave && op->slave_op);     /* LCOV_EXCL_LINE */
344 
345   if (op->num_values != op->slave_op->num_values)
346     die (EXIT_FAILURE, 0, _("input error for operation %s: \
347 fields %"PRIuMAX",%"PRIuMAX" have different number of items"),
348                             quote (get_field_operation_name (op->op)),
349                             (uintmax_t)op->slave_op->field,
350                             (uintmax_t)op->field);
351 }
352 
/* Add a value (from input) to the current field operation.

   STR points to the text of the field and SLEN is its length in bytes.

   Returns:
     FLOCR_OK_SKIPPED      'remove_na_values' is set and is_na () accepted
                           the field; nothing was recorded.
     FLOCR_INVALID_NUMBER  the operation is numeric and the field is empty,
                           out of range, or not consumed in full by
                           strtold (); nothing was recorded.
     FLOCR_INVALID_BASE64  OP_DEBASE64 only: base64_decode () rejected the
                           field.
     FLOCR_OK_KEEP_LINE    the value was recorded and replaced the
                           operation's current pick (new min/max/absmin/
                           absmax, first, last, or the random sample).
                           Presumably this tells the caller to remember the
                           current input line - confirm against the caller.
     FLOCR_OK              the value was recorded otherwise.  */
enum FIELD_OP_COLLECT_RESULT
field_op_collect (struct fieldop *op,
                  const char* str, size_t slen)
{
  char *endptr=NULL;
  long double num_value = 0;
#ifdef HAVE_BROKEN_STRTOLD
  char tmpbuf[512];
#endif
  enum FIELD_OP_COLLECT_RESULT rc = FLOCR_OK;

  assert (str != NULL); /* LCOV_EXCL_LINE */

  if (remove_na_values && is_na (str,slen))
    return FLOCR_OK_SKIPPED;

  /* Numeric operations: convert the field first, and reject it unless the
     conversion consumed exactly SLEN bytes. */
  if (op->numeric)
    {
      errno = 0;
#ifdef HAVE_BROKEN_STRTOLD
      /* On Cygwin, strtold doesn't stop at a tab character,
         and returns invalid value.
         Make a copy of the input buffer and NUL-terminate it */
      if (slen >= sizeof (tmpbuf))
        die (EXIT_FAILURE, 0,
                "internal error: input field too long (%zu)", slen);
      memcpy (tmpbuf,str,slen);
      tmpbuf[slen]=0;
      num_value = strtold (tmpbuf, &endptr);
      if (errno==ERANGE || endptr==tmpbuf || endptr!=(tmpbuf+slen))
        return FLOCR_INVALID_NUMBER;
#else
      if (slen == 0)
        return FLOCR_INVALID_NUMBER;
      /* STR is parsed in place; the 'endptr' test below rejects both
         trailing garbage inside the field and a conversion that ran
         past the end of the field. */
      num_value = strtold (str, &endptr);
      if (errno==ERANGE || endptr==str || endptr!=(str+slen))
        return FLOCR_INVALID_NUMBER;
#endif
    }

  /* From here on the value counts as collected. */
  op->count++;

  /* AUTO_SET_FIRST operations start from the first value seen
     instead of from zero. */
  if (op->first && op->auto_first && op->numeric)
      op->value = num_value;

  switch (op->op)                                /* LCOV_EXCL_BR_LINE */
    {
    case OP_SUM:
    case OP_MEAN:
      /* Running sum; the mean is derived from it at summary time. */
      op->value += num_value;
      break;

    case OP_GEOMEAN:
      /* Accumulate the sum of logarithms. */
      op->value += logl (num_value);
      break;

    case OP_HARMMEAN:
      /* Accumulate the sum of reciprocals. */
      op->value += 1.0 / num_value;
      break;

    case OP_COUNT:
      op->value++;
      break;

    case OP_MIN:
      if (num_value < op->value)
        {
          op->value = num_value;
          rc = FLOCR_OK_KEEP_LINE;
        }
      break;

    case OP_MAX:
      if (num_value > op->value)
        {
          op->value = num_value;
          rc = FLOCR_OK_KEEP_LINE;
        }
      break;

    case OP_ABSMIN:
      /* Compare magnitudes, but keep the signed value. */
      if (fabsl (num_value) < fabsl (op->value))
        {
          op->value = num_value;
          rc = FLOCR_OK_KEEP_LINE;
        }
      break;

    case OP_ABSMAX:
      /* Compare magnitudes, but keep the signed value. */
      if (fabsl (num_value) > fabsl (op->value))
        {
          op->value = num_value;
          rc = FLOCR_OK_KEEP_LINE;
        }
      break;

    case OP_RANGE:
      /* Upon the first value, we store it twice
         (once for min, once for max).
         For subsequent values, we update the min/max entries directly:
         values[0] holds the minimum, values[1] the maximum. */
      if (op->first)
        {
          field_op_add_value (op, num_value);
          field_op_add_value (op, num_value);
        }
      else
        {
          if (num_value < op->values[0])
            op->values[0] = num_value;
          if (num_value > op->values[1])
            op->values[1] = num_value;
        }
      break;

    case OP_FIRST:
      /* Only the very first value is stored; later ones are ignored. */
      if (op->first)
        {
          field_op_replace_string (op, str, slen);
          rc = FLOCR_OK_KEEP_LINE;
        }
      break;

    case OP_LAST:
      /* Replace the 'current' string with the latest one */
      field_op_replace_string (op, str, slen);
      rc = FLOCR_OK_KEEP_LINE;
      break;

    case OP_DEBASE64:
      /* Base64 decoding is a special case: we decode during collection,
         and report any errors back to the caller.
         NOTE(review): on failure op->count has already been incremented
         above and op->first is left unchanged. */
      {
        /* safe to assume decoded base64 is never larger than encoded base64 */
        size_t decoded_size = slen;
        field_op_reserve_out_buf (op, decoded_size);
        if (!base64_decode ( str, slen, op->out_buf, &decoded_size ))
          return FLOCR_INVALID_BASE64;
        /* base64_decode () updated decoded_size; terminate the result. */
        op->out_buf[decoded_size]=0;
      }
      break;

    case OP_BASE64:
    case OP_MD5:
    case OP_SHA1:
    case OP_SHA224:
    case OP_SHA256:
    case OP_SHA384:
    case OP_SHA512:
    case OP_DIRNAME:
    case OP_BASENAME:
    case OP_EXTNAME:
    case OP_BARENAME:
      /* Replace the 'current' string with the latest one;
         the actual transformation is not done in this function. */
      field_op_replace_string (op, str, slen);
      break;

    case OP_RAND:
      {
        /* Reservoir sampling,
           with the simpler case where "k=1": op->count was already
           incremented, so 'i' is zero with probability 1/count and the
           stored value is then replaced by the current one. */
        unsigned long i = random ()%op->count;
        if (op->first || i==0)
          {
            field_op_replace_string (op, str, slen);
            rc = FLOCR_OK_KEEP_LINE;
          }
      }
      break;

    case OP_MEDIAN:
    case OP_QUARTILE_1:
    case OP_QUARTILE_3:
    case OP_IQR:
    case OP_PERCENTILE:
    case OP_PSTDEV:
    case OP_SSTDEV:
    case OP_PVARIANCE:
    case OP_SVARIANCE:
    case OP_MAD:
    case OP_MADRAW:
    case OP_S_SKEWNESS:
    case OP_P_SKEWNESS:
    case OP_S_EXCESS_KURTOSIS:
    case OP_P_EXCESS_KURTOSIS:
    case OP_JARQUE_BERA:
    case OP_DP_OMNIBUS:
    case OP_MODE:
    case OP_ANTIMODE:
    case OP_P_COVARIANCE:
    case OP_S_COVARIANCE:
    case OP_P_PEARSON_COR:
    case OP_S_PEARSON_COR:
    case OP_TRIMMED_MEAN:
      /* These need every value at summary time: just store it. */
      field_op_add_value (op, num_value);
      break;

    case OP_UNIQUE:
    case OP_COLLAPSE:
    case OP_COUNT_UNIQUE:
      /* These need every string at summary time: just store it. */
      field_op_add_string (op, str, slen);
      break;

    case OP_BIN_BUCKETS:
      {
        /* Keep the integral part of value/bucket_size, step one bucket
           down for negative results, then scale back by the bucket size. */
        const long double val = num_value / op->params.bin_bucket_size;
        modfl (val, & op->value);
        /* signbit will take care of negative-zero as well. */
        if (signbit (op->value))
          --op->value;
        op->value *= op->params.bin_bucket_size;
      }
      break;

    case OP_STRBIN:
      /* Bucket number = hash of the field text modulo the bucket count. */
      op->value = hash_pjw_bare (str,slen) % (op->params.strbin_bucket_size);
      break;

    case OP_FLOOR:
      op->value = pos_zero (floorl (num_value));
      break;

    case OP_CEIL:
      op->value = pos_zero (ceill (num_value));
      break;

    case OP_ROUND:
      op->value = pos_zero (roundl (num_value));
      break;

    case OP_TRUNCATE:
      /* modfl () stores the integral part directly in op->value. */
      modfl (num_value, &op->value);
      op->value = pos_zero (op->value);
      break;

    case OP_FRACTION:
      {
        /* modfl () returns the fractional part; the integral part
           is discarded. */
        long double dummy;
        op->value = pos_zero (modfl (num_value, &dummy));
      };
      break;

    case OP_GETNUM:
      op->value = extract_number (str, slen, op->params.get_num_type);
      break;

    case OP_CUT:
      field_op_replace_string (op, str, slen);
      break;

    case OP_INVALID:                 /* LCOV_EXCL_LINE */
    default:                         /* LCOV_EXCL_LINE */
      /* Should never happen */
      internal_error ("bad op");     /* LCOV_EXCL_LINE */
    }

  /* At least one value has now been collected for this operation. */
  op->first = false;

  return rc;
}
613 
614 /* creates a list of unique strings from op->str_buf .
615    results are stored in op->out_buf. */
616 static void
unique_value(struct fieldop * op,bool case_sensitive)617 unique_value ( struct fieldop *op, bool case_sensitive )
618 {
619   const char *last_str;
620   char *pos;
621 
622   const char **ptrs = field_op_get_string_ptrs (op, true, case_sensitive);
623 
624   /* Uniquify them */
625   field_op_reserve_out_buf (op, op->str_buf_used);
626   pos = op->out_buf ;
627 
628   /* Copy the first string */
629   last_str = ptrs[0];
630   strcpy (pos, ptrs[0]);
631   pos += strlen (ptrs[0]);
632 
633   /* Copy the following strings, if they are different from the previous one */
634   for (size_t i = 1; i < op->count; ++i)
635     {
636       const char *newstr = ptrs[i];
637 
638       if ((case_sensitive && (!STREQ (newstr, last_str)))
639           || (!case_sensitive && (strcasecmp (newstr, last_str)!=0)))
640         {
641           *pos++ = collapse_separator ;
642           strcpy (pos, newstr);
643           pos += strlen (newstr);
644         }
645       last_str = newstr;
646     }
647 
648   free (ptrs);
649 }
650 
/* Returns the number of distinct string values stored in the given field
   operation.  The strings are sorted first, so the result is one (for the
   first string) plus the number of positions where a string differs from
   its predecessor.  CASE_SENSITIVE selects both the sort order and the
   equality test. */
size_t
count_unique_values ( struct fieldop *op, bool case_sensitive )
{
  const char **sorted = field_op_get_string_ptrs (op, true, case_sensitive);
  size_t distinct = 1 ;

  /* Walk the NULL-terminated pointer array, comparing neighbours. */
  for (size_t i = 1; sorted[i] != 0; ++i)
    {
      const char *prev = sorted[i-1];
      const char *cur = sorted[i];
      bool differs;

      if (case_sensitive)
        differs = !STREQ (cur, prev);
      else
        differs = (strcasecmp (cur, prev) != 0);

      if (differs)
        ++distinct;
    }

  free (sorted);

  return distinct;
}
681 
682 /* Returns a nul-terimated string, composed of all the values
683    of the input strings. The return string must be free'd. */
684 void
collapse_value(struct fieldop * op)685 collapse_value ( struct fieldop *op )
686 {
687   /* Copy the string buffer as-is */
688   field_op_reserve_out_buf (op, op->str_buf_used);
689   char *buf = op->out_buf;
690   memcpy (buf, op->str_buf, op->str_buf_used);
691 
692   /* convert every NUL to comma, except for the last one */
693   for (size_t i=0; i < op->str_buf_used-1 ; i++)
694       if (buf[i] == 0)
695         buf[i] = collapse_separator ;
696 }
697 
698 /* stores in op->out_buf the result of the field operation
699    when there are no input values.
700    'no values' can happen with '--narm' and input of all N/As.
701    The printed results are consistent as much as possible with R */
702 static void
field_op_summarize_empty(struct fieldop * op)703 field_op_summarize_empty (struct fieldop *op)
704 {
705   long double numeric_result = 0 ;
706 
707   switch (op->op)                                /* LCOV_EXCL_BR_LINE */
708     {
709     case OP_MEAN:
710     case OP_GEOMEAN:
711     case OP_HARMMEAN:
712     case OP_S_SKEWNESS:
713     case OP_P_SKEWNESS:
714     case OP_S_EXCESS_KURTOSIS:
715     case OP_P_EXCESS_KURTOSIS:
716     case OP_JARQUE_BERA:
717     case OP_DP_OMNIBUS:
718     case OP_MEDIAN:
719     case OP_QUARTILE_1:
720     case OP_QUARTILE_3:
721     case OP_IQR:
722     case OP_PERCENTILE:
723     case OP_MAD:
724     case OP_MADRAW:
725     case OP_PSTDEV:
726     case OP_SSTDEV:
727     case OP_PVARIANCE:
728     case OP_SVARIANCE:
729     case OP_MODE:
730     case OP_ANTIMODE:
731     case OP_P_COVARIANCE:
732     case OP_S_COVARIANCE:
733     case OP_P_PEARSON_COR:
734     case OP_S_PEARSON_COR:
735     case OP_BIN_BUCKETS:
736     case OP_STRBIN:
737     case OP_FLOOR:
738     case OP_CEIL:
739     case OP_ROUND:
740     case OP_TRUNCATE:
741     case OP_FRACTION:
742     case OP_RANGE:
743     case OP_TRIMMED_MEAN:
744     case OP_GETNUM:
745       numeric_result = nanl ("");
746       break;
747 
748     case OP_SUM:
749     case OP_COUNT:
750     case OP_COUNT_UNIQUE:
751       numeric_result = 0;
752       break;
753 
754     case OP_MIN:
755     case OP_ABSMIN:
756       numeric_result = -HUGE_VALL;
757       break;
758 
759     case OP_MAX:
760     case OP_ABSMAX:
761       numeric_result = HUGE_VALL;
762       break;
763 
764     case OP_FIRST:
765     case OP_LAST:
766     case OP_RAND:
767     case OP_CUT:
768       field_op_reserve_out_buf (op, 4);
769       strcpy (op->out_buf, "N/A");
770       break;
771 
772     case OP_UNIQUE:
773     case OP_COLLAPSE:
774     case OP_BASE64:
775     case OP_DEBASE64:
776     case OP_MD5:
777     case OP_SHA1:
778     case OP_SHA224:
779     case OP_SHA256:
780     case OP_SHA384:
781     case OP_SHA512:
782     case OP_DIRNAME:
783     case OP_BASENAME:
784     case OP_EXTNAME:
785     case OP_BARENAME:
786       field_op_reserve_out_buf (op, 1);
787       strcpy (op->out_buf, "");
788       break;
789 
790     case OP_INVALID:                 /* LCOV_EXCL_LINE */
791     default:                         /* LCOV_EXCL_LINE */
792       /* Should never happen */
793       internal_error ("bad op");     /* LCOV_EXCL_LINE */
794     }
795 
796   if (op->res_type==NUMERIC_RESULT)
797     {
798       field_op_reserve_out_buf (op, numeric_output_bufsize);
799       snprintf (op->out_buf, op->out_buf_alloc,
800                 numeric_output_format, numeric_result);
801     }
802 }
803 
804 /* Prints to stdout the result of the field operation,
805    based on collected values */
806 void
field_op_summarize(struct fieldop * op)807 field_op_summarize (struct fieldop *op)
808 {
809   long double numeric_result = 0 ;
810   char tmpbuf[64]; /* 64 bytes - enough to hold sha512 */
811 
812   /* In case of no values, each operation returns a specific result.
813      'no values' can happen with '--narm' and input of all N/As. */
814   if (op->count==0)
815     {
816       field_op_summarize_empty (op);
817       return ;
818     }
819 
820   switch (op->op)                                /* LCOV_EXCL_BR_LINE */
821     {
822     case OP_MEAN:
823       numeric_result = op->value / op->count;
824       break;
825 
826     case OP_GEOMEAN:
827       numeric_result = expl (op->value / op->count);
828       break;
829 
830     case OP_HARMMEAN:
831       numeric_result = op->count / op->value;
832       break;
833 
834     case OP_SUM:
835     case OP_COUNT:
836     case OP_MIN:
837     case OP_MAX:
838     case OP_ABSMIN:
839     case OP_ABSMAX:
840     case OP_BIN_BUCKETS:
841     case OP_STRBIN:
842     case OP_FLOOR:
843     case OP_CEIL:
844     case OP_ROUND:
845     case OP_TRUNCATE:
846     case OP_FRACTION:
847     case OP_GETNUM:
848       /* no summarization for these operations, just print the value */
849       numeric_result = op->value;
850       break;
851 
852     case OP_FIRST:
853     case OP_LAST:
854     case OP_RAND:
855     case OP_CUT:
856       /* Only one string is returned in the buffer, return it */
857       field_op_reserve_out_buf (op, op->str_buf_used);
858       memcpy (op->out_buf, op->str_buf, op->str_buf_used);
859       break;
860 
861     case OP_RANGE:
862       numeric_result = op->values[1] - op->values[0];
863       break;
864 
865     case OP_MEDIAN:
866       field_op_sort_values (op);
867       numeric_result = median_value ( op->values, op->num_values );
868       break;
869 
870     case OP_QUARTILE_1:
871       field_op_sort_values (op);
872       numeric_result = quartile1_value ( op->values, op->num_values );
873       break;
874 
875     case OP_QUARTILE_3:
876       field_op_sort_values (op);
877       numeric_result = quartile3_value ( op->values, op->num_values );
878       break;
879 
880     case OP_IQR:
881       field_op_sort_values (op);
882       numeric_result = quartile3_value ( op->values, op->num_values )
883                        - quartile1_value ( op->values, op->num_values );
884       break;
885 
886     case OP_PERCENTILE:
887       field_op_sort_values (op);
888       numeric_result = percentile_value ( op->values, op->num_values,
889                                           op->params.percentile / 100.0 );
890       break;
891 
892     case OP_TRIMMED_MEAN:
893       field_op_sort_values (op);
894       numeric_result = trimmed_mean_value ( op->values, op->num_values,
895 					    op->params.trimmed_mean);
896       break;
897 
898     case OP_PSTDEV:
899       numeric_result = stdev_value ( op->values, op->num_values, DF_POPULATION);
900       break;
901 
902     case OP_SSTDEV:
903       numeric_result = stdev_value ( op->values, op->num_values, DF_SAMPLE);
904       break;
905 
906     case OP_PVARIANCE:
907       numeric_result = variance_value ( op->values, op->num_values,
908                                         DF_POPULATION);
909       break;
910 
911     case OP_SVARIANCE:
912       numeric_result = variance_value ( op->values, op->num_values,
913                                         DF_SAMPLE);
914       break;
915 
916     case OP_MAD:
917       field_op_sort_values (op);
918       numeric_result = mad_value ( op->values, op->num_values, 1.4826 );
919       break;
920 
921     case OP_MADRAW:
922       field_op_sort_values (op);
923       numeric_result = mad_value ( op->values, op->num_values, 1.0 );
924       break;
925 
926     case OP_S_SKEWNESS:
927       numeric_result = skewness_value ( op->values, op->num_values,
928                                         DF_SAMPLE );
929       break;
930 
931     case OP_P_SKEWNESS:
932       numeric_result = skewness_value ( op->values, op->num_values,
933                                         DF_POPULATION );
934       break;
935 
936     case OP_S_EXCESS_KURTOSIS:
937       numeric_result = excess_kurtosis_value ( op->values, op->num_values,
938                                                DF_SAMPLE );
939       break;
940 
941     case OP_P_EXCESS_KURTOSIS:
942       numeric_result = excess_kurtosis_value ( op->values, op->num_values,
943                                                DF_POPULATION );
944       break;
945 
946     case OP_JARQUE_BERA:
947       numeric_result = jarque_bera_pvalue ( op->values, op->num_values );
948       break;
949 
950     case OP_DP_OMNIBUS:
951       numeric_result = dagostino_pearson_omnibus_pvalue ( op->values,
952                                                           op->num_values );
953       break;
954 
955     case OP_P_COVARIANCE:
956     case OP_S_COVARIANCE:
957       assert (!op->slave);                       /* LCOV_EXCL_LINE */
958       assert (op->slave_op);                     /* LCOV_EXCL_LINE */
959       verify_slave_num_values (op);
960       numeric_result = covariance_value (op->values, op->slave_op->values,
961                                          op->num_values,
962                                          (op->op==OP_P_COVARIANCE)?
963                                                 DF_POPULATION:DF_SAMPLE );
964       break;
965 
966     case OP_P_PEARSON_COR:
967     case OP_S_PEARSON_COR:
968       assert (!op->slave);                       /* LCOV_EXCL_LINE */
969       assert (op->slave_op);                     /* LCOV_EXCL_LINE */
970       verify_slave_num_values (op);
971       numeric_result = pearson_corr_value (op->values, op->slave_op->values,
972                                            op->num_values,
973                                            (op->op==OP_P_PEARSON_COR)?
974                                                 DF_POPULATION:DF_SAMPLE);
975       break;
976 
977     case OP_MODE:
978     case OP_ANTIMODE:
979       field_op_sort_values (op);
980       numeric_result = mode_value ( op->values, op->num_values,
981                                     (op->op==OP_MODE)?MODE:ANTIMODE);
982       break;
983 
984     case OP_UNIQUE:
985       unique_value (op, case_sensitive);
986       break;
987 
988     case OP_COLLAPSE:
989       collapse_value (op);
990       break;
991 
992     case OP_COUNT_UNIQUE:
993       numeric_result = count_unique_values (op,case_sensitive);
994       break;
995 
996     case OP_BASE64:
997       field_op_reserve_out_buf (op, BASE64_LENGTH (op->str_buf_used-1)+1 ) ;
998       base64_encode ( op->str_buf, op->str_buf_used-1,
999       op->out_buf, BASE64_LENGTH (op->str_buf_used-1)+1 );
1000       break;
1001 
1002     case OP_DEBASE64:
1003       /* Decoding base64 is a special case: decoding (and error checking) was
1004          done in field_op_collect.  op->out_buf already contains the decoded
1005          value. */
1006       break;
1007 
1008     case OP_MD5:
1009       md5_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1010       field_op_to_hex (op, tmpbuf, 16);
1011       break;
1012 
1013     case OP_SHA1:
1014       sha1_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1015       field_op_to_hex (op, tmpbuf, 20);
1016       break;
1017 
1018     case OP_SHA224:
1019       sha224_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1020       field_op_to_hex (op, tmpbuf, 28);
1021       break;
1022 
1023     case OP_SHA256:
1024       sha256_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1025       field_op_to_hex (op, tmpbuf, 32);
1026       break;
1027 
1028     case OP_SHA384:
1029       sha384_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1030       field_op_to_hex (op, tmpbuf, 48);
1031       break;
1032 
1033     case OP_SHA512:
1034       sha512_buffer (op->str_buf, op->str_buf_used-1, tmpbuf);
1035       field_op_to_hex (op, tmpbuf, 64);
1036       break;
1037 
1038     case OP_DIRNAME:
1039       {
1040         op->str_buf[op->str_buf_used] = 0;
1041         char *t = dirname (op->str_buf);
1042         field_op_reserve_out_buf (op, op->str_buf_used);
1043         strcpy (op->out_buf,t);
1044       }
1045       break;
1046 
1047     case OP_BASENAME:
1048     case OP_EXTNAME:
1049     case OP_BARENAME:
1050       {
1051         if (op->str_buf_used==1)
1052           {
1053             /* Empty string, containing only NUL */
1054             field_op_reserve_out_buf (op, 1);
1055             op->out_buf[0] = '\0';
1056             break;
1057           }
1058 
1059         op->str_buf[op->str_buf_used] = 0;
1060         char *t = basename (op->str_buf);
1061         field_op_reserve_out_buf (op, op->str_buf_used);
1062 
1063         if (op->op == OP_BASENAME)
1064           {
1065             /* Just copy the extracted base name */
1066             strcpy (op->out_buf,t);
1067           }
1068         else
1069           {
1070             /* Guess the file extension */
1071             size_t tl = strlen (t);
1072             size_t l = guess_file_extension (t, tl);
1073 
1074             if (op->op == OP_EXTNAME)
1075               {
1076                 /* Store the extension */
1077                 if (l>0)
1078                   {
1079                     memcpy (op->out_buf, t+(tl-l+1), l-1);
1080                     op->out_buf[l-1] = '\0';
1081                   }
1082                 else
1083                   {
1084                     op->out_buf[0] = '\0';
1085                   }
1086               }
1087             else
1088               {
1089                 /* Store the basename without the extension */
1090                 memcpy (op->out_buf, t, tl-l);
1091                 op->out_buf[tl-l] = '\0';
1092               }
1093           }
1094       }
1095       break;
1096 
1097     case OP_INVALID:                 /* LCOV_EXCL_LINE */
1098     default:                         /* LCOV_EXCL_LINE */
1099       /* Should never happen */
1100       internal_error ("bad op");     /* LCOV_EXCL_LINE */
1101     }
1102 
1103   if (op->res_type==NUMERIC_RESULT)
1104     {
1105       field_op_reserve_out_buf (op, numeric_output_bufsize);
1106       snprintf (op->out_buf, op->out_buf_alloc,
1107                 numeric_output_format, numeric_result);
1108     }
1109 }
1110 
1111 /* reset operation values for next group */
1112 void
field_op_reset(struct fieldop * op)1113 field_op_reset (struct fieldop *op)
1114 {
1115   op->first = true;
1116   op->count = 0 ;
1117   op->value = 0;
1118   op->num_values = 0 ;
1119   op->str_buf_used = 0;
1120   op->out_buf_used = 0;
1121   /* note: op->str_buf and op->str_alloc are not free'd, and reused */
1122 }
1123 
1124 void
field_op_free(struct fieldop * op)1125 field_op_free (struct fieldop* op)
1126 {
1127   free (op->values);
1128   op->num_values = 0 ;
1129   op->alloc_values = 0;
1130 
1131   free (op->str_buf);
1132   op->str_buf = NULL;
1133   op->str_buf_alloc = 0;
1134   op->str_buf_used = 0;
1135 
1136   free (op->out_buf);
1137   op->out_buf = NULL;
1138   op->out_buf_alloc = 0;
1139   op->out_buf_used = 0;
1140 
1141   free (op->field_name);
1142   op->field_name = NULL;
1143 }
1144 
/* Blend three words into a single one, following
   Robert Jenkins' 96 bit Mix Function:
   http://burtleburtle.net/bob/hash/doobs.html

   Each row below updates one word from the other two (two subtractions
   followed by a shift-and-xor); the three-row pattern is applied three
   times with different shift amounts.  All operands are unsigned, so
   the subtractions wrap around instead of overflowing.
   Returns the final value of C. */
static unsigned long
mix (unsigned long a, unsigned long b, unsigned long c)
{
  a -= b;  a -= c;  a ^= c >> 13;
  b -= c;  b -= a;  b ^= a << 8;
  c -= a;  c -= b;  c ^= b >> 13;

  a -= b;  a -= c;  a ^= c >> 12;
  b -= c;  b -= a;  b ^= a << 16;
  c -= a;  c -= b;  c ^= b >> 5;

  a -= b;  a -= c;  a ^= c >> 3;
  b -= c;  b -= a;  b ^= a << 10;
  c -= a;  c -= b;  c ^= b >> 15;

  return c;
}
1162 
/* Seed the random () generator.

   The seed is produced by mix () from three values that differ between
   runs: the processor time reported by clock (), the current calendar
   time and the ID of this process. */
void
init_random (void)
{
  const unsigned long cpu_ticks = clock ();
  const unsigned long wall_time = time (NULL);
  const unsigned long process_id = getpid ();

  srandom (mix (cpu_ticks, wall_time, process_id));
}
1169 
1170 const char*
field_op_collect_result_name(const enum FIELD_OP_COLLECT_RESULT flocr)1171 field_op_collect_result_name (const enum FIELD_OP_COLLECT_RESULT flocr)
1172 {
1173   switch (flocr)                                 /* LCOV_EXCL_BR_LINE */
1174    {
1175    case FLOCR_INVALID_NUMBER:
1176      return _("invalid numeric value");
1177    case FLOCR_INVALID_BASE64:
1178      return _("invalid base64 value");
1179    case FLOCR_OK:                                /* LCOV_EXCL_LINE */
1180    case FLOCR_OK_KEEP_LINE:                      /* LCOV_EXCL_LINE */
1181    case FLOCR_OK_SKIPPED:                        /* LCOV_EXCL_LINE */
1182    default:
1183      internal_error ("op_collect_result_name");  /* LCOV_EXCL_LINE */
1184      return "";                                  /* LCOV_EXCL_LINE */
1185    }
1186 }
1187 
1188 void
field_op_print_empty_value(enum field_operation mode)1189 field_op_print_empty_value (enum field_operation mode)
1190 {
1191   struct fieldop op;
1192   memset (&op, 0, sizeof op);
1193   op.op = mode;
1194   op.res_type = NUMERIC_RESULT;
1195   field_op_summarize_empty (&op);
1196   fputs (op.out_buf, stdout);
1197 }
1198