1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 2003-2011 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                 Eclipse Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *          http://www.eclipse.org/org/documents/epl-v10.html           *
11 *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *               Glenn Fowler <glenn.s.fowler@gmail.com>                *
18 *                                                                      *
19 ***********************************************************************/
20 #pragma prototyped
21 
22 /*
23  * sort uniq summary discipline
24  */
25 
26 static const char usage[] =
27 "[-1lp0s5P?\n@(#)$Id: sum (AT&T Research) 2011-08-23 $\n]"
28 USAGE_LICENSE
29 "[+PLUGIN?sum - sort uniq summary discipline]"
30 "[+DESCRIPTION?The \bsum\b \bsort\b(1) discipline applies "
31     "summarization operations to selected fields in records that compare "
32     "equal. The discipline sets the \bsort\b \b--unique\b option. Summary "
33     "fields in non-unique records are modified according to the operations "
34     "specified in the \bop\b discipline option.]"
35 "[l:library?Load the \bdss\b(1) type library \alibrary\a. Types are used "
36     "by the \bop\b option. The \bnum_t\b library is loaded by default. \vdss "
37     "--plugin=man\v lists the information on all \bdss\b libraries and \vdss "
38     "--plugin=man\v \aname\a lists the information for the \aname\a "
39     "library.]:[library]"
40 "[o:op?A field summary operation. \aarg\a is a \bdss\b(1) type name for "
41     "all but the \bset\b \aop\a, either from the \bnum_t\b library or from a "
42     "library loaded by the \blibrary\b option. \atype\a may also contain one "
43     "or more \b:\b separated attributes. \akey\a is a \bsort\b(1) \b-k\b "
44     "style field specification. \aop\a\b:\b\aarg\a may be specified multiple "
45     "times; \aop\a and \aarg\a are inherited across \akey\a values from left "
46     "to right. The default type is native character set \binteger\b; some "
47     "operations may ignore the type. Spaces may be used in place of the "
48     "\b:\b. \aop\a may be one of:]:[op::[arg...]]::key[...]]]"
49     "{"
50         "[+max (M)?maximum value]"
51         "[+min (m)?minimum value]"
52         "[+average (a)?average value]"
53         "[+sum (s)?sum]"
54         "[+count (c)?multiply subsequent field values and increment the "
55             "total count by this value]"
56         "[+set (s)?set all field field bytes to the first character of "
57             "\aarg\a, which may be a C-style escape sequence]"
58     "}"
59 "[r:regress?Massage output for regression testing.]"
60 "[d:debug?List the field operations on the standard error.]"
61 "[+EXAMPLES]"
62     "{"
63         "[+sort -k.2.1 -lsum,op=sum::integer::.6.2?Sorts on the 1 byte "
64             "fixed width field starting at byte position 2 (counting from 1) "
65             "and computes the sum of the integers in the 2 byte fixed width "
66             "field starting at byte position 6.]"
67         "[+dlls --base dss | grep '_t$'?Lists the \bdss\b(1) type "
68             "library names.]"
69         "[+dss --plugin=man num_t?Lists the \bdss\b(1) \bnum_t\b type "
70             "library description in the \b--man\b style.]"
71     "}"
72 "[+SEE ALSO?\bdss\b(1), \bsort\b(1)]"
73 "\n\n--library=sum[,option[=value]...]\n\n"
74 ;
75 
76 #include <ast.h>
77 #include <ctype.h>
78 #include <ccode.h>
79 #include <dss.h>
80 #include <error.h>
81 #include <recsort.h>
82 #include <recfmt.h>
83 #include <vmalloc.h>
84 
/* forward declarations so the self-referential list links below can use the typedef names */
struct Library_s; typedef struct Library_s Library_t;
struct Summary_s; typedef struct Summary_s Summary_t;

/*
 * singly linked list node holding a Cxtype_t list
 * NOTE(review): Library_t is not referenced anywhere else in the visible
 * part of this file -- confirm whether it is still needed
 */

struct Library_s
{
	Library_t*	next;		/* next list node		*/
	Cxtype_t*	types;		/* types attached to this node	*/
};
93 
/*
 * one end of a summary field within a record
 *
 * rs_disc() stores both members 0-based (the user supplied 1-based value
 * minus 1); for an end position built from the .offset.size form the
 * index is the begin index plus the size, and record() treats an end
 * index of 0 as "up to the next delimiter"
 */

struct Position_s
{
	short		field;		/* delimited field number	*/
	short		index;		/* byte offset within the field	*/
};

typedef struct Position_s Position_t;
99 
/*
 * one summary field operation
 *
 * rs_disc() creates one per key in the op option and links them into
 * State_t.sum ordered by begin field, then end field; record() walks
 * that list for every record
 */

struct Summary_s
{
	Summary_t*	next;		/* next operation in field order		*/
	Cxtype_t*	type;		/* dss type: internalf parses, externalf formats */
	unsigned char*	map;		/* ccmap(CCIN,CCOUT) of format.code, 0 if none	*/
	unsigned char*	pam;		/* the reverse map, ccmap(CCOUT,CCIN)		*/
	Cxformat_t	format;		/* format handed to internalf/externalf		*/
	Position_t	beg;		/* field begin position				*/
	Position_t	end;		/* field end position				*/
	int		op;		/* operation: 'a' 'c' 'M' 'm' 's' or 'v' (set)	*/
	int		set;		/* fill character for the 'v' operation		*/
	int		fixed;		/* not referenced in the visible code		*/
	int		width;		/* format.width from the type attributes; 0 makes record() derive it */
	Sflong_t	count;		/* number of values accumulated (average)	*/
	Sfdouble_t	value;		/* running result				*/
};
116 
/*
 * growable byte buffer; assure() extends buf and records the new
 * capacity in siz
 */

struct Buffer_s
{
	unsigned char*	buf;		/* buffer base, 0 until first use */
	size_t		siz;		/* allocated size of buf in bytes */
};

typedef struct Buffer_s Buffer_t;
122 
/*
 * discipline state, allocated by rs_disc() from the dss vm region
 *
 * disc must remain the first member: rs_disc() returns &state->disc and
 * summary() casts the Rsdisc_t* it receives back to State_t*
 */

typedef struct State_s
{
	Rsdisc_t	disc;		/* sort discipline -- must be first		*/
	Dss_t*		dss;		/* dss handle; owns the vm region and cx	*/
	Summary_t*	sum;		/* summary operations in field order		*/
	Sflong_t	records;	/* number of record() calls			*/
	Recfmt_t	fmt;		/* record format copied from key->disc->data	*/
	unsigned char*	tab;		/* field delimiter string			*/
	unsigned char	delim[256];	/* delim[c]!=0 if byte c starts a delimiter	*/
	int		alt;		/* index of the buf[] used by the last widening	*/
	int		regress;	/* regress option: truncate float values	*/
	Buffer_t	tmp;		/* scratch for code set and value conversion	*/
	Buffer_t	buf[2];		/* alternating buffers for widened records	*/
} State_t;
137 
/*
 * make sure Buffer_t* b holds at least z bytes
 *
 * NOTE: this hides a `return -1' from the function that uses the macro
 * when assure() fails; b and z may be evaluated more than once
 */

#define ASSURE(s,b,z) \
	do \
	{ \
		if ((b)->siz < (z)) \
		{ \
			if (assure(s, b, z)) \
				return -1; \
		} \
	} while (0)
139 
140 static int
assure(State_t * state,Buffer_t * b,size_t z)141 assure(State_t* state, Buffer_t* b, size_t z)
142 {
143 	if (b->siz < z)
144 	{
145 		b->siz = roundof(z, 32);
146 		if (!(b->buf = vmnewof(state->dss->vm, b->buf, unsigned char, b->siz, 0)))
147 		{
148 			error(ERROR_SYSTEM|3, "out of space extending to %I*u", sizeof(b->siz), b->siz);
149 			return -1;
150 		}
151 	}
152 	return 0;
153 }
154 
155 static int
record(register State_t * state,register Rsobj_t * r,int op)156 record(register State_t* state, register Rsobj_t* r, int op)
157 {
158 	Cx_t*				cx = state->dss->cx;
159 	register Summary_t*		sum;
160 	register unsigned char*		s;
161 	register unsigned char*		e;
162 	register unsigned char*		a;
163 	register unsigned char*		z;
164 	register unsigned char*		del;
165 	register const unsigned char*	map;
166 	unsigned char*			x;
167 	unsigned char*			tab;
168 	Buffer_t*			ext;
169 	int				beg;
170 	int				end;
171 	int				t;
172 	int				c;
173 	size_t				count;
174 	size_t				w;
175 	size_t				y;
176 	ssize_t				n;
177 	Cxoperand_t			v;
178 
179 	state->records++;
180 	s = r->data;
181 	e = s + r->datalen - (RECTYPE(state->fmt) == REC_delimited);
182 	beg = end = 0;
183 	count = 1;
184 	tab = state->tab;
185 	t = *tab++;
186 	if (!*tab)
187 		tab = 0;
188 	del = state->delim;
189 	for (sum = state->sum; sum; sum = sum->next)
190 	{
191 		while (beg < sum->beg.field)
192 		{
193 		tab1:
194 			while (s < e)
195 				if (del[*s++])
196 				{
197 					if (tab)
198 					{
199 						for (c = 0; (s + c) < e; c++)
200 							if (!tab[c])
201 							{
202 								s += c;
203 								break;
204 							}
205 							else if (tab[c] != s[c])
206 								goto tab1;
207 					}
208 					else if (t == ' ')
209 						while (s < e && del[*s])
210 							s++;
211 					break;
212 				}
213 			end = ++beg;
214 		}
215 		if (sum->beg.index < (e - s))
216 		{
217 			a = s + sum->beg.index;
218 			while (end < sum->end.field)
219 			{
220 			tab2:
221 				while (s < e)
222 					if (del[*s++])
223 					{
224 						if (tab)
225 						{
226 							for (c = 0; (s + c) < e; c++)
227 								if (!tab[c])
228 								{
229 									s += c;
230 									break;
231 								}
232 								else if (tab[c] != s[c])
233 									goto tab2;
234 						}
235 						else if (t == ' ')
236 							while (s < e && del[*s])
237 								s++;
238 						break;
239 					}
240 				end++;
241 			}
242 			if (!sum->end.index)
243 			{
244 			tab3:
245 				while (s < e)
246 					if (del[*s++])
247 					{
248 						if (tab)
249 						{
250 							for (c = 0; (s + c) < e; c++)
251 								if (!tab[c])
252 									break;
253 								else if (tab[c] != s[c])
254 									goto tab3;
255 						}
256 						else if (t == ' ')
257 							while (s < e && del[*s])
258 								s++;
259 						s--;
260 						break;
261 					}
262 				z = s;
263 			}
264 			else if (sum->end.index <= (e - s))
265 				z = s + sum->end.index;
266 			else
267 				z = a;
268 		}
269 		else
270 			a = z = s;
271 		w = z - a;
272 		if (!sum->width)
273 			sum->format.width = RECTYPE(state->fmt) == REC_fixed ? w : (!(sum->format.flags & CX_FLOAT) || sum->end.index || w >= 8) ? 0 : 8;
274 		if (map = sum->map)
275 		{
276 			ASSURE(state, &state->tmp, w + 2);
277 			for (x = state->tmp.buf; a < z; *a++ = map[*x++]);
278 			map = sum->pam;
279 			x = state->tmp.buf;
280 			a -= w;
281 		}
282 		else
283 			x = a;
284 		if (sum->op == 'v' || (*sum->type->internalf)(cx, sum->type, NiL, &sum->format, &v, (char*)x, w, cx->rm, cx->disc) < 0)
285 			v.value.number = 0;
286 		else if (state->regress && (sum->format.flags & CX_FLOAT))
287 		{
288 			n = v.value.number * 1000.0;
289 			n /= 10;
290 			v.value.number = n;
291 		}
292 		if (op < 0)
293 		{
294 			sum->value = v.value.number;
295 			sum->count = 1;
296 		}
297 		else
298 		{
299 			if (count != 1)
300 				v.value.number *= count;
301 			switch (sum->op)
302 			{
303 			case 'a':
304 				sum->value += v.value.number;
305 				sum->count += count;
306 				break;
307 			case 'c':
308 				count = v.value.number;
309 				continue;
310 			case 'M':
311 				if (sum->value < v.value.number)
312 					sum->value = v.value.number;
313 				break;
314 			case 'm':
315 				if (sum->value > v.value.number)
316 					sum->value = v.value.number;
317 				break;
318 			case 's':
319 				sum->value += v.value.number;
320 				break;
321 			}
322 			if (op > 0)
323 			{
324 				v.value.number = sum->value;
325 				switch (sum->op)
326 				{
327 				case 'a':
328 					v.value.number /= sum->count;
329 					break;
330 				case 'v':
331 					while (a < z)
332 						*a++ = sum->set;
333 					continue;
334 				}
335 				n = (RECTYPE(state->fmt) == REC_fixed || w < 7) ? 7 : w;
336 				for (;;)
337 				{
338 					y = n + 1;
339 					ASSURE(state, &state->tmp, y);
340 					if ((n = (*sum->type->externalf)(cx, sum->type, NiL, &sum->format, &v.value, (char*)state->tmp.buf, y, cx->disc)) < 0)
341 					{
342 						error(2, "%s value %I*g conversion error", sum->type->name, sizeof(v.value.number), v.value.number);
343 						return -1;
344 					}
345 					if (n < y)
346 						break;
347 				}
348 				if (n > w)
349 				{
350 					if (sum->end.index || RECTYPE(state->fmt) == REC_fixed)
351 					{
352 						error(2, "%s value %I*g width exceeds %d", sum->type->name, sizeof(v.value.number), v.value.number, w);
353 						return -1;
354 					}
355 					ext = &state->buf[state->alt = !state->alt];
356 					ASSURE(state, ext, r->datalen + (n - w));
357 					memcpy(ext->buf, r->data, a - r->data);
358 					memcpy(ext->buf + (a - r->data) + n, a + w, r->datalen - (w + (a - r->data)));
359 					s = ext->buf + (s - r->data);
360 					a = ext->buf + (a - r->data);
361 					z = ext->buf + (z - r->data) + (n - w);
362 					r->data = ext->buf;
363 					r->datalen += n - w;
364 					e = s + r->datalen - (RECTYPE(state->fmt) == REC_delimited);
365 				}
366 				if (map)
367 				{
368 					if (n < w)
369 					{
370 						c = (sum->type->format.flags & CX_BINARY) ? 0 : map[' '];
371 						while (n++ < w)
372 							*a++ = c;
373 					}
374 					for (x = state->tmp.buf; a < z; *a++ = map[*x++]);
375 				}
376 				else
377 				{
378 					if (n < w)
379 					{
380 						c = (sum->type->format.flags & CX_BINARY) ? 0 : ' ';
381 						while (n++ < w)
382 							*a++ = c;
383 					}
384 					for (x = state->tmp.buf; a < z; *a++ = *x++);
385 				}
386 			}
387 		}
388 	}
389 	return 0;
390 }
391 
392 static int
summary(Rs_t * rs,int op,Void_t * data,Void_t * arg,Rsdisc_t * disc)393 summary(Rs_t* rs, int op, Void_t* data, Void_t* arg, Rsdisc_t* disc)
394 {
395 	State_t*		state = (State_t*)disc;
396 	register Rsobj_t*	r;
397 	register Rsobj_t*	q;
398 
399 	switch (op)
400 	{
401 	case RS_POP:
402 		dssclose(state->dss);
403 		break;
404 	case RS_SUMMARY:
405 		r = (Rsobj_t*)data;
406 		for (op = -1, q = r->equal; q; op = 0, q = q->right)
407 			if (record(state, q, op))
408 				return -1;
409 		if (record(state, r, 1))
410 			return -1;
411 		break;
412 	default:
413 		return -1;
414 	}
415 	return 0;
416 }
417 
/*
 * sort plugin entry point: parse the sum discipline options and return
 * the discipline that summarizes records that compare equal
 *
 * key		sort key context: key->code seeds the default field code
 *		set, key->tab and key->disc->data supply the delimiter and
 *		record format, key->verbose enables the operation listing,
 *		and key->type has RS_DATA cleared and RS_UNIQ set
 * options	option string scanned by optstr() against usage[]; may be 0
 *
 * returns the Rsdisc_t at the start of a new State_t, or 0 (after
 * closing the dss handle) if dssopen(), dssload() or the option parse
 * fails; the state and the summary list are allocated from dss->vm
 */

Rsdisc_t*
rs_disc(Rskey_t* key, const char* options)
{
	register Summary_t*	sum;
	char*			s;
	char*			t;
	char*			b;
	char*			loc;
	State_t*		state;
	Cxtype_t*		type;
	Dss_t*			dss;
	Position_t*		pos;
	Summary_t*		cur;
	Summary_t*		def;
	Summary_t*		prv;
	int			tok;
	int			n;
	int			debug;
	char			chr;

	static Dssdisc_t	disc;

	dssinit(&disc, errorf);
	if (!(dss = dssopen(0, 0, &disc, dssmeth("dss", &disc))))
		return 0;
	if (!(state = vmnewof(dss->vm, 0, State_t, 1, 0)))
		error(ERROR_SYSTEM|3, "out of space");
	state->dss = dss;

	/* the num_t type library is always available to the op option */

	if (!dssload("num_t", dss->disc))
		goto drop;
	debug = 0;
	if (options)
	{
		for (;;)
		{
			switch (optstr(options, usage))
			{
			case 0:
				break;
			case 'd':
				debug = 1;
				continue;
			case 'l':
				if (!dssload(opt_info.arg, dss->disc))
					goto drop;
				continue;
			case 'o':
				/*
				 * each pass of this loop parses one
				 * [op[:type]]:key spec into a new Summary_t;
				 * def is the previous spec of this option
				 * argument, from which op, type, format and
				 * set are inherited
				 */

				def = 0;
				s = opt_info.arg;
				for (;;)
				{
					while (*s == ':' || isspace(*s))
						s++;
					if (!*s)
						break;
					if (!(sum = vmnewof(dss->vm, 0, Summary_t, 1, 0)))
						error(ERROR_SYSTEM|3, "out of space");

					/* -1 marks "no position seen yet" for the check after the token loop */

					sum->beg.field = -1;
					if (def)
					{
						sum->type = def->type;
						sum->format = def->format;
						sum->op = def->op;
						sum->set = def->set;
					}
					else
						sum->format.code = key->code;
					def = sum;
					b = s;
					tok = 0;
					/*UNDENT...*/

	/*
	 * token loop: tok 0 expects the operation, tok 1 the type; a token
	 * starting with '.' or a digit is the field position and ends the spec
	 */

	for (;;)
	{
		if (*s == '.' || isdigit(*s))
		{
			/* the first position is the begin, the second the end, a third is an error */

			pos = 0;
			while (*s == '.' || isdigit(*s))
			{
				if (!pos)
				{
					pos = &sum->beg;
					loc = "begin";
				}
				else if (pos == &sum->beg)
				{
					pos = &sum->end;
					loc = "end";
				}
				else
				{
					error(2, "%s: invalid summary field position", s);
					goto drop;
				}

				/* a missing field number means field 1; fields are stored 0-based */

				if (*s == '.')
					n = 1;
				else
					for (n = 0; *s >= '0' && *s <= '9'; n = n * 10 + (*s++ - '0'));
				if ((pos->field = n - 1) < 0)
				{
					error(2, "%d: invalid summary field %s position", n, loc);
					goto drop;
				}
				switch (*s)
				{
				case '.':
					/* .offset is 1-based in the spec and stored 0-based */

					for (n = 0; *++s >= '0' && *s <= '9'; n = n * 10 + (*s - '0'));
					if ((pos->index = n - 1) < 0)
					{
						error(2, "%d: invalid summary field %s offset", n, loc);
						goto drop;
					}
					if (*s == '.')
					{
						/* .size is only valid on the begin position and defines the end */

						n = 0;
						if (pos == &sum->beg)
							for (n = 0; *++s >= '0' && *s <= '9'; n = n * 10 + (*s - '0'));
						if (n <= 0)
						{
							error(2, "%d: invalid summary field %s size", n, loc);
							goto drop;
						}
						sum->end.field = sum->beg.field;
						sum->end.index = sum->beg.index + n;
					}
					break;
				case 'C':
					/*
					 * code set attribute: C followed by one or two
					 * code set letters; two letters are combined
					 * with CCOP()
					 */

					s++;
					switch (*s++)
					{
					case 'a':
						n = CC_ASCII;
						break;
					case 'e':
						n = CC_EBCDIC_E;
						break;
					case 'i':
						n = CC_EBCDIC_I;
						break;
					case 'o':
						n = CC_EBCDIC_O;
						break;
					case 'n':
						n = CC_NATIVE;
						break;
					default:
						error(2, "%s: invalid code set", s - 1);
						goto drop;
					}
					switch (*s++)
					{
					case 'a':
						n = CCOP(n, CC_ASCII);
						break;
					case 'e':
						n = CCOP(n, CC_EBCDIC_E);
						break;
					case 'i':
						n = CCOP(n, CC_EBCDIC_I);
						break;
					case 'o':
						n = CCOP(n, CC_EBCDIC_O);
						break;
					case 'n':
						n = CCOP(n, CC_NATIVE);
						break;
					default:
						/* no second letter: put the character back */

						s--;
						break;
					}
					if (n && n != CC_NATIVE && CCIN(n) != CCOUT(n))
						sum->format.code = n;
					break;
				default:
					if (isalpha(*s))
					{
						error(2, "%s: invalid summary field attribute", s);
						goto drop;
					}
					break;
				}
			}
			break;
		}
		switch (tok)
		{
		case 0:
			/*
			 * the first letter selects the operation; the second
			 * letter disambiguates MIn/max and set from sum
			 */

			switch (sum->op = *s++)
			{
			case 'a':
			case 'c':
				break;
			case 'M':
				if (*s == 'I')
					sum->op = 'm';
				break;
			case 'm':
				if (*s == 'a')
					sum->op = 'M';
				break;
			case 's':
				if (*s != 'e')
					break;
				sum->op = 'v';
				/*FALLTHROUGH*/
			case 'v':
				/* set requires :char, where char may be an escape sequence */

				t = s - 1;
				while (isalnum(*s))
					s++;
				if (*s != ':' || !*++s)
				{
					error(2, "%s: summary field character value expected", t);
					goto drop;
				}
				sum->set = chresc(s, &s);
				break;
			default:
				error(2, "%s: invalid summary field operation", s - 1);
				goto drop;
			}

			/* skip the rest of the operation name */

			while (isalnum(*s))
				s++;
			tok++;
			break;
		case 1:
			if (type = cxattr(dss->cx, s, &t, &sum->format, dss->cx->disc))
			{
				s = t;
				sum->type = type;

				/* remember the width set by the type attributes; 0 lets record() choose */

				sum->width = sum->format.width;
				tok++;
				break;
			}
			/*FALLTHROUGH*/
		default:
			error(2, "%s: invalid summary field specification", s);
			goto drop;
		}
		while (*s == ':' || isspace(*s))
			s++;
		if (!*s)
			break;
	}
					/*...INDENT*/
					if (sum->beg.field < 0)
					{
						error(2, "%s: field position expected", b);
						goto drop;
					}
					if (!sum->type)
						sum->type = cxattr(dss->cx, "integer", NiL, &sum->format, dss->cx->disc);

					/* insert in begin field, then end field order; record() scans left to right */

					for (prv = 0, cur = state->sum; cur; cur = (prv = cur)->next)
						if (sum->beg.field < cur->beg.field || sum->beg.field == cur->beg.field && sum->end.field < cur->end.field)
							break;
					if (prv)
						prv->next = sum;
					else
						state->sum = sum;
					sum->next = cur;
				}
				continue;
			case 'r':
				state->regress = 1;
				continue;
			case '?':
				error(ERROR_USAGE|4, "%s", opt_info.arg);
				goto drop;
			case ':':
				error(2, "%s", opt_info.arg);
				goto drop;
			}
			break;
		}
	}
	key->type &= ~RS_DATA;
	key->type |= RS_UNIQ;
	state->fmt = key->disc->data;

	/* no tab or a space tab: every isspace() byte delimits; otherwise only the first tab byte is marked */

	if (!*key->tab || *key->tab == ' ')
	{
		state->tab = (unsigned char*)" ";
		for (n = 0; n < elementsof(state->delim); n++)
			if (isspace(n))
				state->delim[n] = 1;
	}
	else
		state->delim[*(state->tab = key->tab)] = 1;
	state->disc.eventf = summary;
	state->disc.events = RS_SUMMARY|RS_POP;

	/* set up the code set translation tables for fields that need them */

	for (sum = state->sum; sum; sum = sum->next)
		if (sum->format.code)
		{
			if (!CCCONVERT(sum->format.code))
			{
				/* a single code set: no mapping for native or binary fields, else map to native */

				if (sum->format.code == CC_NATIVE || (sum->type->format.flags & CX_BINARY))
					sum->format.code = 0;
				else
					sum->format.code = CCOP(sum->format.code, CC_NATIVE);
			}
			if (sum->format.code)
			{
				sum->map = ccmap(CCIN(sum->format.code), CCOUT(sum->format.code));
				sum->pam = ccmap(CCOUT(sum->format.code), CCIN(sum->format.code));
			}
		}

	/* list the operations on the standard error, one per line */

	if (debug || key->verbose)
		for (n = 1, sum = state->sum; sum; n++, sum = sum->next)
		{
			sfprintf(sfstderr, "op %d ", n);
			if (sum->beg.field == sum->end.field)
				sfprintf(sfstderr, ".%d.%d", sum->beg.index + 1, sum->end.index - sum->beg.index);
			else
				sfprintf(sfstderr, "%d.%d,%d.%d", sum->beg.field + 1, sum->beg.index + 1, sum->end.field + 1, sum->end.index);
			sfprintf(sfstderr, " %c", sum->op);
			if (sum->format.code)
				sfprintf(sfstderr, " %d=>%d ", CCIN(sum->format.code), CCOUT(sum->format.code));
			else
				sfprintf(sfstderr, "      ");
			if (sum->op == 'v')
			{
				chr = sum->set;
				sfprintf(sfstderr, "'%s'", fmtquote(&chr, NiL, "'", 1, 0));
			}
			else
				sfprintf(sfstderr, "%s", sum->type->name);
			sfprintf(sfstderr, "\n");
		}
	return &state->disc;
 drop:
	/* state and the summary list were allocated from dss->vm */

	dssclose(dss);
	return 0;
}
748 
/*
 * NOTE(review): SORTLIB is not defined in this file -- presumably it comes
 * from <recsort.h> and emits the plugin boilerplate for the "sum"
 * discipline; confirm against that header
 */

SORTLIB(sum)
750