1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 2003-2011 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <glenn.s.fowler@gmail.com> *
18 * *
19 ***********************************************************************/
20 #pragma prototyped
21
22 /*
23 * sort uniq summary discipline
24 */
25
26 static const char usage[] =
27 "[-1lp0s5P?\n@(#)$Id: sum (AT&T Research) 2011-08-23 $\n]"
28 USAGE_LICENSE
29 "[+PLUGIN?sum - sort uniq summary discipline]"
30 "[+DESCRIPTION?The \bsum\b \bsort\b(1) discipline applies "
31 "summarization operations to selected fields in records that compare "
32 "equal. The discipline sets the \bsort\b \b--unique\b option. Summary "
33 "fields in non-unique records are modified according to the operations "
34 "specified in the \bop\b discipline option.]"
35 "[l:library?Load the \bdss\b(1) type library \alibrary\a. Types are used "
36 "by the \bop\b option. The \bnum_t\b library is loaded by default. \vdss "
37 "--plugin=man\v lists the information on all \bdss\b libraries and \vdss "
38 "--plugin=man\v \aname\a lists the information for the \aname\a "
39 "library.]:[library]"
40 "[o:op?A field summary operation. \aarg\a is a \bdss\b(1) type name for "
41 "all but the \bset\b \aop\a, either from the \bnum_t\b library or from a "
42 "library loaded by the \blibrary\b option. \atype\a may also contain one "
43 "or more \b:\b separated attributes. \akey\a is a \bsort\b(1) \b-k\b "
44 "style field specification. \aop\a\b:\b\aarg\a may be specified multiple "
45 "times; \aop\a and \aarg\a are inherited across \akey\a values from left "
46 "to right. The default type is native character set \binteger\b; some "
47 "operations may ignore the type. Spaces may be used in place of the "
48 "\b:\b. \aop\a may be one of:]:[op::[arg...]]::key[...]]]"
49 "{"
50 "[+max (M)?maximum value]"
51 "[+min (m)?minimum value]"
52 "[+average (a)?average value]"
53 "[+sum (s)?sum]"
54 "[+count (c)?multiply subsequent field values and increment the "
55 "total count by this value]"
56 "[+set (s)?set all field field bytes to the first character of "
57 "\aarg\a, which may be a C-style escape sequence]"
58 "}"
59 "[r:regress?Massage output for regression testing.]"
60 "[d:debug?List the field operations on the standard error.]"
61 "[+EXAMPLES]"
62 "{"
63 "[+sort -k.2.1 -lsum,op=sum::integer::.6.2?Sorts on the 1 byte "
64 "fixed width field starting at byte position 2 (counting from 1) "
65 "and computes the sum of the integers in the 2 byte fixed width "
66 "field starting at byte position 6.]"
67 "[+dlls --base dss | grep '_t$'?Lists the \bdss\b(1) type "
68 "library names.]"
69 "[+dss --plugin=man num_t?Lists the \bdss\b(1) \bnum_t\b type "
70 "library description in the \b--man\b style.]"
71 "}"
72 "[+SEE ALSO?\bdss\b(1), \bsort\b(1)]"
73 "\n\n--library=sum[,option[=value]...]\n\n"
74 ;
75
76 #include <ast.h>
77 #include <ctype.h>
78 #include <ccode.h>
79 #include <dss.h>
80 #include <error.h>
81 #include <recsort.h>
82 #include <recfmt.h>
83 #include <vmalloc.h>
84
85 struct Library_s; typedef struct Library_s Library_t;
86 struct Summary_s; typedef struct Summary_s Summary_t;
87
88 struct Library_s
89 {
90 Library_t* next;
91 Cxtype_t* types;
92 };
93
/*
 * One end of a summary field. rs_disc() stores both members zero-based
 * (the user-supplied numbers count from 1). For an end position derived
 * from a field size, index is beg.index + size, i.e. one past the last
 * byte of the field.
 */
typedef struct Position_s
{
	short	field;		/* zero-based field number		*/
	short	index;		/* byte offset within that field	*/
} Position_t;
99
100 struct Summary_s
101 {
102 Summary_t* next;
103 Cxtype_t* type;
104 unsigned char* map;
105 unsigned char* pam;
106 Cxformat_t format;
107 Position_t beg;
108 Position_t end;
109 int op;
110 int set;
111 int fixed;
112 int width;
113 Sflong_t count;
114 Sfdouble_t value;
115 };
116
/* Growable byte buffer; assure() extends buf and records the new size. */
typedef struct Buffer_s
{
	unsigned char	*buf;		/* storage, 0 until first grown	*/
	size_t		siz;		/* bytes currently allocated	*/
} Buffer_t;
122
123 typedef struct State_s
124 {
125 Rsdisc_t disc;
126 Dss_t* dss;
127 Summary_t* sum;
128 Sflong_t records;
129 Recfmt_t fmt;
130 unsigned char* tab;
131 unsigned char delim[256];
132 int alt;
133 int regress;
134 Buffer_t tmp;
135 Buffer_t buf[2];
136 } State_t;
137
/*
 * Ensure Buffer_t *b holds at least z bytes. If it has to grow and
 * assure() fails, this returns -1 from the *enclosing* function, so it
 * may only be used inside functions that return int.
 */
#define ASSURE(s,b,z) \
	do \
	{ \
		if (((b)->siz < (z)) && assure(s, b, z)) \
			return -1; \
	} while (0)
139
140 static int
assure(State_t * state,Buffer_t * b,size_t z)141 assure(State_t* state, Buffer_t* b, size_t z)
142 {
143 if (b->siz < z)
144 {
145 b->siz = roundof(z, 32);
146 if (!(b->buf = vmnewof(state->dss->vm, b->buf, unsigned char, b->siz, 0)))
147 {
148 error(ERROR_SYSTEM|3, "out of space extending to %I*u", sizeof(b->siz), b->siz);
149 return -1;
150 }
151 }
152 return 0;
153 }
154
155 static int
record(register State_t * state,register Rsobj_t * r,int op)156 record(register State_t* state, register Rsobj_t* r, int op)
157 {
158 Cx_t* cx = state->dss->cx;
159 register Summary_t* sum;
160 register unsigned char* s;
161 register unsigned char* e;
162 register unsigned char* a;
163 register unsigned char* z;
164 register unsigned char* del;
165 register const unsigned char* map;
166 unsigned char* x;
167 unsigned char* tab;
168 Buffer_t* ext;
169 int beg;
170 int end;
171 int t;
172 int c;
173 size_t count;
174 size_t w;
175 size_t y;
176 ssize_t n;
177 Cxoperand_t v;
178
179 state->records++;
180 s = r->data;
181 e = s + r->datalen - (RECTYPE(state->fmt) == REC_delimited);
182 beg = end = 0;
183 count = 1;
184 tab = state->tab;
185 t = *tab++;
186 if (!*tab)
187 tab = 0;
188 del = state->delim;
189 for (sum = state->sum; sum; sum = sum->next)
190 {
191 while (beg < sum->beg.field)
192 {
193 tab1:
194 while (s < e)
195 if (del[*s++])
196 {
197 if (tab)
198 {
199 for (c = 0; (s + c) < e; c++)
200 if (!tab[c])
201 {
202 s += c;
203 break;
204 }
205 else if (tab[c] != s[c])
206 goto tab1;
207 }
208 else if (t == ' ')
209 while (s < e && del[*s])
210 s++;
211 break;
212 }
213 end = ++beg;
214 }
215 if (sum->beg.index < (e - s))
216 {
217 a = s + sum->beg.index;
218 while (end < sum->end.field)
219 {
220 tab2:
221 while (s < e)
222 if (del[*s++])
223 {
224 if (tab)
225 {
226 for (c = 0; (s + c) < e; c++)
227 if (!tab[c])
228 {
229 s += c;
230 break;
231 }
232 else if (tab[c] != s[c])
233 goto tab2;
234 }
235 else if (t == ' ')
236 while (s < e && del[*s])
237 s++;
238 break;
239 }
240 end++;
241 }
242 if (!sum->end.index)
243 {
244 tab3:
245 while (s < e)
246 if (del[*s++])
247 {
248 if (tab)
249 {
250 for (c = 0; (s + c) < e; c++)
251 if (!tab[c])
252 break;
253 else if (tab[c] != s[c])
254 goto tab3;
255 }
256 else if (t == ' ')
257 while (s < e && del[*s])
258 s++;
259 s--;
260 break;
261 }
262 z = s;
263 }
264 else if (sum->end.index <= (e - s))
265 z = s + sum->end.index;
266 else
267 z = a;
268 }
269 else
270 a = z = s;
271 w = z - a;
272 if (!sum->width)
273 sum->format.width = RECTYPE(state->fmt) == REC_fixed ? w : (!(sum->format.flags & CX_FLOAT) || sum->end.index || w >= 8) ? 0 : 8;
274 if (map = sum->map)
275 {
276 ASSURE(state, &state->tmp, w + 2);
277 for (x = state->tmp.buf; a < z; *a++ = map[*x++]);
278 map = sum->pam;
279 x = state->tmp.buf;
280 a -= w;
281 }
282 else
283 x = a;
284 if (sum->op == 'v' || (*sum->type->internalf)(cx, sum->type, NiL, &sum->format, &v, (char*)x, w, cx->rm, cx->disc) < 0)
285 v.value.number = 0;
286 else if (state->regress && (sum->format.flags & CX_FLOAT))
287 {
288 n = v.value.number * 1000.0;
289 n /= 10;
290 v.value.number = n;
291 }
292 if (op < 0)
293 {
294 sum->value = v.value.number;
295 sum->count = 1;
296 }
297 else
298 {
299 if (count != 1)
300 v.value.number *= count;
301 switch (sum->op)
302 {
303 case 'a':
304 sum->value += v.value.number;
305 sum->count += count;
306 break;
307 case 'c':
308 count = v.value.number;
309 continue;
310 case 'M':
311 if (sum->value < v.value.number)
312 sum->value = v.value.number;
313 break;
314 case 'm':
315 if (sum->value > v.value.number)
316 sum->value = v.value.number;
317 break;
318 case 's':
319 sum->value += v.value.number;
320 break;
321 }
322 if (op > 0)
323 {
324 v.value.number = sum->value;
325 switch (sum->op)
326 {
327 case 'a':
328 v.value.number /= sum->count;
329 break;
330 case 'v':
331 while (a < z)
332 *a++ = sum->set;
333 continue;
334 }
335 n = (RECTYPE(state->fmt) == REC_fixed || w < 7) ? 7 : w;
336 for (;;)
337 {
338 y = n + 1;
339 ASSURE(state, &state->tmp, y);
340 if ((n = (*sum->type->externalf)(cx, sum->type, NiL, &sum->format, &v.value, (char*)state->tmp.buf, y, cx->disc)) < 0)
341 {
342 error(2, "%s value %I*g conversion error", sum->type->name, sizeof(v.value.number), v.value.number);
343 return -1;
344 }
345 if (n < y)
346 break;
347 }
348 if (n > w)
349 {
350 if (sum->end.index || RECTYPE(state->fmt) == REC_fixed)
351 {
352 error(2, "%s value %I*g width exceeds %d", sum->type->name, sizeof(v.value.number), v.value.number, w);
353 return -1;
354 }
355 ext = &state->buf[state->alt = !state->alt];
356 ASSURE(state, ext, r->datalen + (n - w));
357 memcpy(ext->buf, r->data, a - r->data);
358 memcpy(ext->buf + (a - r->data) + n, a + w, r->datalen - (w + (a - r->data)));
359 s = ext->buf + (s - r->data);
360 a = ext->buf + (a - r->data);
361 z = ext->buf + (z - r->data) + (n - w);
362 r->data = ext->buf;
363 r->datalen += n - w;
364 e = s + r->datalen - (RECTYPE(state->fmt) == REC_delimited);
365 }
366 if (map)
367 {
368 if (n < w)
369 {
370 c = (sum->type->format.flags & CX_BINARY) ? 0 : map[' '];
371 while (n++ < w)
372 *a++ = c;
373 }
374 for (x = state->tmp.buf; a < z; *a++ = map[*x++]);
375 }
376 else
377 {
378 if (n < w)
379 {
380 c = (sum->type->format.flags & CX_BINARY) ? 0 : ' ';
381 while (n++ < w)
382 *a++ = c;
383 }
384 for (x = state->tmp.buf; a < z; *a++ = *x++);
385 }
386 }
387 }
388 }
389 return 0;
390 }
391
392 static int
summary(Rs_t * rs,int op,Void_t * data,Void_t * arg,Rsdisc_t * disc)393 summary(Rs_t* rs, int op, Void_t* data, Void_t* arg, Rsdisc_t* disc)
394 {
395 State_t* state = (State_t*)disc;
396 register Rsobj_t* r;
397 register Rsobj_t* q;
398
399 switch (op)
400 {
401 case RS_POP:
402 dssclose(state->dss);
403 break;
404 case RS_SUMMARY:
405 r = (Rsobj_t*)data;
406 for (op = -1, q = r->equal; q; op = 0, q = q->right)
407 if (record(state, q, op))
408 return -1;
409 if (record(state, r, 1))
410 return -1;
411 break;
412 default:
413 return -1;
414 }
415 return 0;
416 }
417
418 Rsdisc_t*
rs_disc(Rskey_t * key,const char * options)419 rs_disc(Rskey_t* key, const char* options)
420 {
421 register Summary_t* sum;
422 char* s;
423 char* t;
424 char* b;
425 char* loc;
426 State_t* state;
427 Cxtype_t* type;
428 Dss_t* dss;
429 Position_t* pos;
430 Summary_t* cur;
431 Summary_t* def;
432 Summary_t* prv;
433 int tok;
434 int n;
435 int debug;
436 char chr;
437
438 static Dssdisc_t disc;
439
440 dssinit(&disc, errorf);
441 if (!(dss = dssopen(0, 0, &disc, dssmeth("dss", &disc))))
442 return 0;
443 if (!(state = vmnewof(dss->vm, 0, State_t, 1, 0)))
444 error(ERROR_SYSTEM|3, "out of space");
445 state->dss = dss;
446 if (!dssload("num_t", dss->disc))
447 goto drop;
448 debug = 0;
449 if (options)
450 {
451 for (;;)
452 {
453 switch (optstr(options, usage))
454 {
455 case 0:
456 break;
457 case 'd':
458 debug = 1;
459 continue;
460 case 'l':
461 if (!dssload(opt_info.arg, dss->disc))
462 goto drop;
463 continue;
464 case 'o':
465 def = 0;
466 s = opt_info.arg;
467 for (;;)
468 {
469 while (*s == ':' || isspace(*s))
470 s++;
471 if (!*s)
472 break;
473 if (!(sum = vmnewof(dss->vm, 0, Summary_t, 1, 0)))
474 error(ERROR_SYSTEM|3, "out of space");
475 sum->beg.field = -1;
476 if (def)
477 {
478 sum->type = def->type;
479 sum->format = def->format;
480 sum->op = def->op;
481 sum->set = def->set;
482 }
483 else
484 sum->format.code = key->code;
485 def = sum;
486 b = s;
487 tok = 0;
488 /*UNDENT...*/
489 for (;;)
490 {
491 if (*s == '.' || isdigit(*s))
492 {
493 pos = 0;
494 while (*s == '.' || isdigit(*s))
495 {
496 if (!pos)
497 {
498 pos = &sum->beg;
499 loc = "begin";
500 }
501 else if (pos == &sum->beg)
502 {
503 pos = &sum->end;
504 loc = "end";
505 }
506 else
507 {
508 error(2, "%s: invalid summary field position", s);
509 goto drop;
510 }
511 if (*s == '.')
512 n = 1;
513 else
514 for (n = 0; *s >= '0' && *s <= '9'; n = n * 10 + (*s++ - '0'));
515 if ((pos->field = n - 1) < 0)
516 {
517 error(2, "%d: invalid summary field %s position", n, loc);
518 goto drop;
519 }
520 switch (*s)
521 {
522 case '.':
523 for (n = 0; *++s >= '0' && *s <= '9'; n = n * 10 + (*s - '0'));
524 if ((pos->index = n - 1) < 0)
525 {
526 error(2, "%d: invalid summary field %s offset", n, loc);
527 goto drop;
528 }
529 if (*s == '.')
530 {
531 n = 0;
532 if (pos == &sum->beg)
533 for (n = 0; *++s >= '0' && *s <= '9'; n = n * 10 + (*s - '0'));
534 if (n <= 0)
535 {
536 error(2, "%d: invalid summary field %s size", n, loc);
537 goto drop;
538 }
539 sum->end.field = sum->beg.field;
540 sum->end.index = sum->beg.index + n;
541 }
542 break;
543 case 'C':
544 s++;
545 switch (*s++)
546 {
547 case 'a':
548 n = CC_ASCII;
549 break;
550 case 'e':
551 n = CC_EBCDIC_E;
552 break;
553 case 'i':
554 n = CC_EBCDIC_I;
555 break;
556 case 'o':
557 n = CC_EBCDIC_O;
558 break;
559 case 'n':
560 n = CC_NATIVE;
561 break;
562 default:
563 error(2, "%s: invalid code set", s - 1);
564 goto drop;
565 }
566 switch (*s++)
567 {
568 case 'a':
569 n = CCOP(n, CC_ASCII);
570 break;
571 case 'e':
572 n = CCOP(n, CC_EBCDIC_E);
573 break;
574 case 'i':
575 n = CCOP(n, CC_EBCDIC_I);
576 break;
577 case 'o':
578 n = CCOP(n, CC_EBCDIC_O);
579 break;
580 case 'n':
581 n = CCOP(n, CC_NATIVE);
582 break;
583 default:
584 s--;
585 break;
586 }
587 if (n && n != CC_NATIVE && CCIN(n) != CCOUT(n))
588 sum->format.code = n;
589 break;
590 default:
591 if (isalpha(*s))
592 {
593 error(2, "%s: invalid summary field attribute", s);
594 goto drop;
595 }
596 break;
597 }
598 }
599 break;
600 }
601 switch (tok)
602 {
603 case 0:
604 switch (sum->op = *s++)
605 {
606 case 'a':
607 case 'c':
608 break;
609 case 'M':
610 if (*s == 'I')
611 sum->op = 'm';
612 break;
613 case 'm':
614 if (*s == 'a')
615 sum->op = 'M';
616 break;
617 case 's':
618 if (*s != 'e')
619 break;
620 sum->op = 'v';
621 /*FALLTHROUGH*/
622 case 'v':
623 t = s - 1;
624 while (isalnum(*s))
625 s++;
626 if (*s != ':' || !*++s)
627 {
628 error(2, "%s: summary field character value expected", t);
629 goto drop;
630 }
631 sum->set = chresc(s, &s);
632 break;
633 default:
634 error(2, "%s: invalid summary field operation", s - 1);
635 goto drop;
636 }
637 while (isalnum(*s))
638 s++;
639 tok++;
640 break;
641 case 1:
642 if (type = cxattr(dss->cx, s, &t, &sum->format, dss->cx->disc))
643 {
644 s = t;
645 sum->type = type;
646 sum->width = sum->format.width;
647 tok++;
648 break;
649 }
650 /*FALLTHROUGH*/
651 default:
652 error(2, "%s: invalid summary field specification", s);
653 goto drop;
654 }
655 while (*s == ':' || isspace(*s))
656 s++;
657 if (!*s)
658 break;
659 }
660 /*...INDENT*/
661 if (sum->beg.field < 0)
662 {
663 error(2, "%s: field position expected", b);
664 goto drop;
665 }
666 if (!sum->type)
667 sum->type = cxattr(dss->cx, "integer", NiL, &sum->format, dss->cx->disc);
668 for (prv = 0, cur = state->sum; cur; cur = (prv = cur)->next)
669 if (sum->beg.field < cur->beg.field || sum->beg.field == cur->beg.field && sum->end.field < cur->end.field)
670 break;
671 if (prv)
672 prv->next = sum;
673 else
674 state->sum = sum;
675 sum->next = cur;
676 }
677 continue;
678 case 'r':
679 state->regress = 1;
680 continue;
681 case '?':
682 error(ERROR_USAGE|4, "%s", opt_info.arg);
683 goto drop;
684 case ':':
685 error(2, "%s", opt_info.arg);
686 goto drop;
687 }
688 break;
689 }
690 }
691 key->type &= ~RS_DATA;
692 key->type |= RS_UNIQ;
693 state->fmt = key->disc->data;
694 if (!*key->tab || *key->tab == ' ')
695 {
696 state->tab = (unsigned char*)" ";
697 for (n = 0; n < elementsof(state->delim); n++)
698 if (isspace(n))
699 state->delim[n] = 1;
700 }
701 else
702 state->delim[*(state->tab = key->tab)] = 1;
703 state->disc.eventf = summary;
704 state->disc.events = RS_SUMMARY|RS_POP;
705 for (sum = state->sum; sum; sum = sum->next)
706 if (sum->format.code)
707 {
708 if (!CCCONVERT(sum->format.code))
709 {
710 if (sum->format.code == CC_NATIVE || (sum->type->format.flags & CX_BINARY))
711 sum->format.code = 0;
712 else
713 sum->format.code = CCOP(sum->format.code, CC_NATIVE);
714 }
715 if (sum->format.code)
716 {
717 sum->map = ccmap(CCIN(sum->format.code), CCOUT(sum->format.code));
718 sum->pam = ccmap(CCOUT(sum->format.code), CCIN(sum->format.code));
719 }
720 }
721 if (debug || key->verbose)
722 for (n = 1, sum = state->sum; sum; n++, sum = sum->next)
723 {
724 sfprintf(sfstderr, "op %d ", n);
725 if (sum->beg.field == sum->end.field)
726 sfprintf(sfstderr, ".%d.%d", sum->beg.index + 1, sum->end.index - sum->beg.index);
727 else
728 sfprintf(sfstderr, "%d.%d,%d.%d", sum->beg.field + 1, sum->beg.index + 1, sum->end.field + 1, sum->end.index);
729 sfprintf(sfstderr, " %c", sum->op);
730 if (sum->format.code)
731 sfprintf(sfstderr, " %d=>%d ", CCIN(sum->format.code), CCOUT(sum->format.code));
732 else
733 sfprintf(sfstderr, " ");
734 if (sum->op == 'v')
735 {
736 chr = sum->set;
737 sfprintf(sfstderr, "'%s'", fmtquote(&chr, NiL, "'", 1, 0));
738 }
739 else
740 sfprintf(sfstderr, "%s", sum->type->name);
741 sfprintf(sfstderr, "\n");
742 }
743 return &state->disc;
744 drop:
745 dssclose(dss);
746 return 0;
747 }
748
/*
 * NOTE(review): SORTLIB is defined outside this file; presumably it
 * emits the plugin boilerplate that lets sort(1) load this discipline
 * as "sum" -- confirm against its definition.
 */
SORTLIB(sum)
750