1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1998-2011 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                 Eclipse Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *          http://www.eclipse.org/org/documents/epl-v10.html           *
11 *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                                                                      *
19 ***********************************************************************/
20 #pragma prototyped
21 
22 /*
23  * induce fixed length record groups from data
24  */
25 
26 static const char usage[] =
27 "[-?\n@(#)$Id: rectify (AT&T Research) 1999-03-22 $\n]"
28 USAGE_LICENSE
29 "[+NAME?rectify - induce fixed length record groups from data]"
30 "[+DESCRIPTION?\brectify\b induces fixed length record groups from input data"
31 "	by sampling and comparing character frequencies. The standard input is"
32 "	read if \a-\a or no files are specified.]"
33 
34 "[c:context?List \acontext\a records at the beginning and end of"
35 "	record groups larger that 3*\acontext\a.]#[context]"
36 "[d:description?Specify a structured dump description file. Each line of"
37 "	this file describes the size and content of a contiguous portion"
38 "	of the input file. The description is applied separately to each"
39 "	input file. Comments and optional labels in the following"
40 "	descriptions are listed with the \b--verbose\b option. Supported"
41 "	descriptions are:]:[file]{"
42 "		[+c comment?comment]"
43 "		[+d size [label]]?\asize\a bytes of data with optional label]"
44 "		[+i size [label]]?ignore \asize\a bytes of data]"
45 "		[+r size count [label]]?\acount\a records of length \asize\a]"
46 "		[+t count?Match \acount\a records against the \bT\b record"
47 "			table. \acount\a=0 continues until no record type"
48 "			match is found.]"
49 "		[+z size [label]]?a string with length determined by a"
50 "			\asize\a byte binary integer]"
51 "		[+T idlen id size unit [offset]]?Defines a sized record"
52 "			table entry.]{"
53 "			[+idlen?type identifier length, must be"
54 "				<= 4 bytes]"
55 "			[+id?type identifier, starting at record offset 0]"
56 "			[+size?default record size]"
57 "			[+unit?if > 0 then the record is variable length and"
58 "				the size is the byte at \aoffset\a]"
59 "			[+offset?if \aunit\a > 0 then this byte multiplied by"
60 "				\aunit\a is the size of variable length data"
61 "				appended to the record]"
62 "		}"
63 "}"
64 "[f:format?Byte output \bprintf\b(3) format.]:[format:=02x]"
65 "[g!:group?Group output in 4's.]"
66 "[m:min?Minimum record length to consider.]#[min:=8]"
67 "[n:count?List the top \acount\a candidate record lengths.]#[count:=16]"
68 "[o:offset?Start description listing at \aoffset\a.]#[offset:=0]"
69 "[r:run?List runs at least as long as \arun\a.]#[run]"
70 "[v:verbose?Dump description labels with data.]"
71 
72 "\n"
73 "\n[ file ... ]\n"
74 "\n"
75 "[+SEE ALSO?\bpin\b(1), \bpop\b(1)]"
76 ;
77 
78 #include <ast.h>
79 #include <error.h>
80 #include <tok.h>
81 
/*
 * statistics for one byte gap (candidate record length); rectify()
 * keeps one item per gap in state.mod[]
 */
typedef struct Item_s
{
	/* the gap this item stands for; survives the sort by count */
	unsigned long index;

	/* data offset at which this gap was most recently seen */
	unsigned long offset;

	/* data offset at which the current run of this gap began */
	unsigned long start;

	/* number of times this gap was seen */
	unsigned long count;

	/* length of the current run, 0 when no run is in progress */
	unsigned long run;
} Item_t;
90 
/*
 * one sized record table entry, defined by a T description line
 * (T idlen id size unit [offset])
 */
typedef struct
{
	/* number of leading record bytes that form the identifier */
	int len;

	/* identifier value compared against those leading bytes */
	unsigned long id;

	/* default record size in bytes */
	int size;

	/* if non-zero the record is variable length: multiplier for the length byte */
	int unit;

	/* record offset of the length byte, only read when unit is non-zero */
	int offset;
} Type_t;
99 
100 typedef struct
101 {
102 	Sfoff_t		offset;
103 	unsigned long	count;
104 } Loop_t;
105 
106 static struct
107 {
108 	Type_t		type[4 * 1024];
109 	Item_t		mod[4 * 1024];
110 	unsigned long	hit[UCHAR_MAX + 1];
111 	Sfoff_t		offset;
112 	char*		format1;
113 	char*		format4;
114 	unsigned long	context;
115 	unsigned long	count;
116 	unsigned long	min;
117 	unsigned long	run;
118 	int		group;
119 	int		types;
120 	int		typelen;
121 	int		typelast;
122 } state;
123 
124 /*
125  * order items by count hi to lo
126  */
127 
128 static int
bycount(const void * va,const void * vb)129 bycount(const void* va, const void* vb)
130 {
131 	register Item_t*	a = (Item_t*)va;
132 	register Item_t*	b = (Item_t*)vb;
133 
134 	if (a->count < b->count)
135 		return 1;
136 	if (a->count > b->count)
137 		return -1;
138 	if (a < b)
139 		return 1;
140 	if (a > b)
141 		return -1;
142 	return 0;
143 }
144 
145 /*
146  * rectify fp open for read on file
147  */
148 
149 static void
rectify(register Sfio_t * fp,char * file,int verbose)150 rectify(register Sfio_t* fp, char* file, int verbose)
151 {
152 	register unsigned char*	s;
153 	register Item_t*	p;
154 	register unsigned long*	q;
155 	register unsigned long	offset;
156 	register unsigned long	i;
157 	unsigned long		n;
158 	unsigned long		cur;
159 	unsigned long		dif;
160 	unsigned long		max;
161 
162 	memset(state.hit, 0, sizeof(state.hit));
163 	memset(state.mod, 0, sizeof(state.mod));
164 	for (i = 0; i < elementsof(state.mod); i++)
165 		state.mod[i].index = i;
166 	max = 0;
167 	offset = 0;
168 	while (s = sfreserve(fp, SF_UNBOUND, 0))
169 	{
170 		n = sfvalue(fp);
171 		for (i = 0; i < n; i++)
172 		{
173 			cur = offset + i;
174 			q = state.hit + s[i];
175 			dif = cur - *q;
176 			*q = cur;
177 			if (dif < elementsof(state.mod))
178 			{
179 				p = state.mod + dif;
180 				if (dif > max)
181 					max = dif;
182 				p->count++;
183 				if ((cur - p->offset) <= dif)
184 				{
185 					if (!p->run++)
186 						p->start = cur;
187 				}
188 				else if (p->run)
189 				{
190 					if (state.run && p->run >= state.run && p->index >= state.min)
191 						sfprintf(sfstdout, "run %7lu %7lu %7lu\n", p->index, p->run, p->start);
192 					p->run = 0;
193 				}
194 				p->offset = cur;
195 			}
196 		}
197 		offset += n;
198 	}
199 	qsort(state.mod, elementsof(state.mod), sizeof(state.mod[0]), bycount);
200 	n = 0;
201 	for (i = 0; i < elementsof(state.mod) && n < state.count; i++)
202 		if (state.mod[i].index >= state.min)
203 		{
204 			n++;
205 			sfprintf(sfstdout, "rec %7lu %7lu %7lu\n", state.mod[i].index, state.mod[i].count, state.mod[i].offset);
206 		}
207 }
208 
209 /*
210  * dump size n buffer b to op in 4 hex byte chunks
211  */
212 
213 static void
dump(Sfio_t * op,register unsigned char * b,size_t n)214 dump(Sfio_t* op, register unsigned char* b, size_t n)
215 {
216 	register unsigned char*	e = b + n / 4 * 4;
217 	register unsigned char*	x;
218 
219 	x = state.group ? (b + n) : b;
220 	while (b < e)
221 	{
222 		sfprintf(op, state.format4, b[0], b[1], b[2], b[3]);
223 		if ((b += 4) < x)
224 			sfputc(op, ' ');
225 	}
226 	while (b < x)
227 		sfprintf(op, state.format1, *b++);
228 	sfputc(op, '\n');
229 }
230 
/*
 * return a number from b and advance b
 *
 * Leading blanks (space and tab) are skipped, then a number is
 * converted by strtoul() with base 0 (decimal, 0 octal, 0x hex). On
 * return *b points to the first character after the number and any
 * blanks that follow it. A missing number is diagnosed with a level 3
 * error().
 */

static unsigned long
number(char** b)
{
	register char*	s;
	char*		e;
	unsigned long	r;

	for (s = *b; *s == ' ' || *s == '\t'; s++);

	/*
	 * strtoul() skips all isspace() characters, newline included, so
	 * without this check a missing argument at the end of a line
	 * would be satisfied by a number at the start of the next line
	 */

	r = 0;
	e = s;
	if (*s != '\n' && *s != '\r' && *s != '\f' && *s != '\v')
		r = strtoul(s, &e, 0);
	if (e == s)
		error(3, "numeric argument expected");
	for (s = e; *s == ' ' || *s == '\t'; s++);
	*b = s;
	return r;
}
249 
250 /*
251  * dump fp according to dp
252  */
253 
254 static void
describe(register Sfio_t * dp,char * desc,register Sfio_t * fp,char * file,int verbose)255 describe(register Sfio_t* dp, char* desc, register Sfio_t* fp, char* file, int verbose)
256 {
257 	register unsigned char*	p;
258 	unsigned char*		e;
259 	long			size;
260 	long			count;
261 	unsigned long		context;
262 	Sfoff_t			offset;
263 	Sfoff_t			skip;
264 	int			nest;
265 	int			op;
266 	char*			s;
267 	char*			t;
268 	Loop_t			loop[64];
269 	unsigned long		id[5];
270 
271 	error_info.file = desc;
272 	error_info.line = 0;
273 	offset = 0;
274 	nest = -1;
275 	while (s = sfgetr(dp, '\n', 0))
276 	{
277 		error_info.line++;
278 		for (t = s + sfvalue(dp) - 1; *s == ' ' || *s == '\t'; s++);
279 		for (op = *s; *s != ' ' && *s != '\t' && *s != '\n'; s++);
280 		for (; *s == ' ' || *s == '\t'; s++);
281 		switch (op)
282 		{
283 		case '#':
284 		case '\n':
285 			break;
286 		case '{':
287 			if (++nest >= elementsof(loop))
288 				error(3, "%c: nesting too deep -- %d max", op, elementsof(loop));
289 			count = number(&s);
290 			loop[nest].offset = sfseek(dp, (Sfoff_t)0, SEEK_CUR);
291 			loop[nest].count = count;
292 			if (verbose && offset >= state.offset)
293 				sfprintf(sfstdout, "=== %I*d === loop %d %lu %I*d === %-.*s\n", sizeof(offset), offset, nest, loop[nest].count, sizeof(loop[nest].offset), loop[nest].offset, t - s, s);
294 			break;
295 		case '}':
296 			if (nest < 0)
297 				error(3, "%c: no matching {", op); /*balance}*/
298 			if (loop[nest].count-- <= 1)
299 				nest--;
300 			else if (sfseek(dp, loop[nest].offset, SEEK_SET) < 0)
301 				error(ERROR_SYSTEM|3, "loop seek error to %I*d", sizeof(loop[nest].offset), loop[nest].offset);
302 			else if (verbose && offset >= state.offset)
303 				sfprintf(sfstdout, "=== %I*d === loop %d %lu %I*d === %-.*s\n", sizeof(offset), offset, nest, loop[nest].count, sizeof(loop[nest].offset), loop[nest].offset, t - s, s);
304 			break;
305 		case 'c':
306 			if (verbose && offset >= state.offset)
307 				sfprintf(sfstdout, "=== %I*d === %-.*s\n", sizeof(offset), offset, t - s, s);
308 			break;
309 		case 'd':
310 			size = number(&s);
311 			if (offset >= state.offset)
312 			{
313 				if (verbose)
314 					sfprintf(sfstdout, "=== %I*d === %ld === %-.*s\n", sizeof(offset), offset, size, t - s, s);
315 				if (!(p = sfreserve(fp, size, 0)))
316 					error(ERROR_SYSTEM|3, "%s: cannot read %ld bytes at %I*d", file, size, sizeof(offset), offset);
317 				dump(sfstdout, p, size);
318 			}
319 			else if (sfseek(fp, (Sfoff_t)size, SEEK_CUR) < 0)
320 				error(ERROR_SYSTEM|3, "%s: cannot seek %ld bytes at %I*d", file, size, sizeof(offset), offset);
321 			offset += size;
322 			break;
323 		case 'i':
324 			size = number(&s);
325 			if (verbose && offset >= state.offset)
326 				sfprintf(sfstdout, "=== %I*d === %ld === %-.*s\n", sizeof(offset), offset, size, t - s, s);
327 			if (sfseek(fp, (Sfoff_t)size, SEEK_CUR) < 0)
328 				error(ERROR_SYSTEM|3, "%s: cannot seek %ld bytes at %I*d", file, size, sizeof(offset), offset);
329 			offset += size;
330 			break;
331 		case 'r':
332 			size = number(&s);
333 			count = number(&s);
334 			if (offset < state.offset)
335 			{
336 				skip = count * size;
337 				if ((offset + skip) > state.offset)
338 				{
339 					skip = (state.offset - offset) / size;
340 					count -= skip;
341 					skip *= size;
342 					if (sfseek(fp, skip, SEEK_CUR) < 0)
343 						error(ERROR_SYSTEM|3, "%s: cannot seek %I*d bytes at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
344 					offset += skip;
345 				}
346 			}
347 			if (offset >= state.offset)
348 			{
349 				if (verbose)
350 					sfprintf(sfstdout, "=== %I*d === %ld * %ld === %-.*s\n", sizeof(offset), offset, size, count, t - s, s);
351 				if (state.context && count > (3 * state.context))
352 				{
353 					skip = (count - 2 * state.context) * size;
354 					count = state.context;
355 					while (count-- > 0)
356 					{
357 						if (!(p = sfreserve(fp, size, 0)))
358 							error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
359 						offset += size;
360 						dump(sfstdout, p, size);
361 					}
362 					sfprintf(sfstdout, " . . .\n");
363 					if (sfseek(fp, skip, SEEK_CUR) < 0)
364 						error(ERROR_SYSTEM|3, "%s: cannot seek %I*d bytes at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
365 					offset += skip;
366 					count = state.context;
367 				}
368 				while (count-- > 0)
369 				{
370 					if (!(p = sfreserve(fp, size, 0)))
371 						error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
372 					offset += size;
373 					dump(sfstdout, p, size);
374 				}
375 			}
376 			else
377 			{
378 				skip = count * size;
379 				if (sfseek(fp, skip, SEEK_CUR) < 0)
380 					error(ERROR_SYSTEM|3, "%s: cannot seek %I*d bytes at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
381 				offset += skip;
382 			}
383 			break;
384 		case 't':
385 			if (!state.typelen)
386 				error(3, "no sized record types defined");
387 			context = 0;
388 			count = number(&s);
389 			do
390 			{
391 				if (!(p = sfreserve(fp, state.typelen, SF_LOCKR)))
392 					break;
393 				switch (state.typelen)
394 				{
395 				case 4: id[4] = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
396 				case 3: id[3] = (p[0] << 16) | (p[1] << 8) | p[2];
397 				case 2: id[2] = (p[0] << 8) | p[1];
398 				case 1: id[1] = p[0];
399 				}
400 				sfread(fp, p, 0);
401 				if (state.type[state.typelast].id != id[state.type[state.typelast].len])
402 				{
403 					for (state.typelast = 0; state.typelast < state.types && state.type[state.typelast].id != id[state.type[state.typelast].len]; state.typelast++);
404 					if (state.typelast >= state.types)
405 					{
406 						if (verbose)
407 							sfprintf(sfstdout, "=== %I*d === %0*x === type not found\n", sizeof(offset), offset, 2 * state.typelen, id[state.typelen]);
408 						break;
409 					}
410 					if (verbose && offset >= state.offset)
411 						sfprintf(sfstdout, "=== %I*d === %0*x === type\n", sizeof(offset), offset, 2 * state.type[state.typelast].len, id[state.type[state.typelast].len]);
412 					context = 0;
413 				}
414 				size = state.type[state.typelast].size;
415 				if (!(p = sfreserve(fp, size, state.type[state.typelast].unit ? SF_LOCKR : 0)))
416 					error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
417 				if (state.type[state.typelast].unit)
418 				{
419 					size += p[state.type[state.typelast].offset] * state.type[state.typelast].unit;
420 					sfread(fp, p, 0);
421 					if (!(p = sfreserve(fp, size, 0)))
422 						error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
423 				}
424 				if (offset >= state.offset)
425 				{
426 					if (!state.context)
427 						dump(sfstdout, p, size);
428 					else if (context++ < state.context)
429 						dump(sfstdout, p, size);
430 					else if (context == state.context + 1)
431 						sfprintf(sfstdout, " . . .\n");
432 				}
433 				offset += size;
434 			} while (!count || --count);
435 			break;
436 		case 'z':
437 			size = number(&s);
438 			if (!(p = sfreserve(fp, size, 0)))
439 				error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
440 			count = 0;
441 			e = p + size;
442 			while (p < e)
443 				count = (count << 8) | *p++;
444 			if (offset >= state.offset)
445 			{
446 				if (verbose)
447 					sfprintf(sfstdout, "=== %I*d === %ld === %-.*s\n", sizeof(offset), offset, size, t - s, s);
448 				offset += size;
449 				if (!(p = sfreserve(fp, count, 0)))
450 					error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", count, sizeof(offset), offset);
451 				sfprintf(sfstdout, "\"%s\"\n", fmtnesq((char*)p, "\"", count));
452 			}
453 			else
454 			{
455 				offset += 2;
456 				if (sfseek(fp, (Sfoff_t)count, SEEK_CUR) < 0)
457 					error(ERROR_SYSTEM|3, "%s: cannot seek %ld bytes at %I*d", file, count, sizeof(offset), offset);
458 			}
459 			offset += count;
460 			break;
461 		case 'T':
462 			if (state.types >= elementsof(state.type))
463 				error(3, "too many types -- %d max", elementsof(state.type));
464 			if ((state.type[state.types].len = number(&s)) > state.typelen)
465 				state.typelen = state.type[state.types].len;
466 			if (state.type[state.types].len >= elementsof(id))
467 				error(3, "type id length must be <= %d", elementsof(id) - 1);
468 			state.type[state.types].id = number(&s);
469 			state.type[state.types].size = number(&s);
470 			if (state.type[state.types].unit = number(&s))
471 				state.type[state.types].offset = number(&s);
472 			state.types++;
473 			break;
474 		default:
475 			error(2, "%c: unknown description op", op);
476 			break;
477 		}
478 	}
479 	if (verbose && offset >= state.offset)
480 		sfprintf(sfstdout, "=== %I*d === EOF\n", sizeof(offset), offset);
481 	error_info.file = 0;
482 	error_info.line = 0;
483 	if (skip = sfseek(fp, (Sfoff_t)0, SEEK_END) - offset)
484 		error(1, "%s: %I*d bytes ignored at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
485 }
486 
487 int
main(int argc,char ** argv)488 main(int argc, char** argv)
489 {
490 	register char*	file;
491 	int		n;
492 	Sfio_t*		fp;
493 	Sfio_t*		dp;
494 
495 	char*		desc = 0;
496 	char*		format = "02x";
497 	int		verbose = 0;
498 
499 	error_info.id = "rectify";
500 	state.count = 16;
501 	state.group = 1;
502 	state.min = 8;
503 	state.run = 0;
504 	for (;;)
505 	{
506 		switch (optget(argv, usage))
507 		{
508 		case 'c':
509 			state.context = opt_info.num;
510 			continue;
511 		case 'd':
512 			if (desc)
513 				error(2, "%s: only one description file allowed", opt_info.arg);
514 			else
515 				desc = opt_info.arg;
516 			continue;
517 		case 'f':
518 			format = opt_info.arg;
519 			continue;
520 		case 'g':
521 			state.group = opt_info.num;
522 			continue;
523 		case 'm':
524 			state.min = opt_info.num;
525 			continue;
526 		case 'n':
527 			state.count = opt_info.num;
528 			continue;
529 		case 'o':
530 			state.offset = opt_info.num;
531 			continue;
532 		case 'r':
533 			state.run = opt_info.num;
534 			continue;
535 		case 'v':
536 			verbose = opt_info.num;
537 			continue;
538 		case '?':
539 			error(ERROR_USAGE|4, "%s", opt_info.arg);
540 			continue;
541 		case ':':
542 			error(2, "%s", opt_info.arg);
543 			continue;
544 		}
545 		break;
546 	}
547 	argv += opt_info.index;
548 	if (error_info.errors)
549 		error(ERROR_USAGE|4, "%s", optusage(NiL));
550 	n = (strlen(format) + 1) * 4 + 1;
551 	if (!(state.format4 = newof(0, char, n, 0)))
552 		error(ERROR_SYSTEM|3, "out of space [format]");
553 	sfsprintf(state.format4, n, "%%%s%%%s%%%s%%%s", format, format, format, format);
554 	state.format1 = state.format4 + 3 * (strlen(format) + 1);
555 	if (desc && !(dp = sfopen(NiL, desc, "r")))
556 		error(ERROR_SYSTEM|3, "%s: cannot open description file", desc);
557 	if (file = *argv)
558 		argv++;
559 	do
560 	{
561 		if (!file || streq(file, "-"))
562 			fp = sfstdin;
563 		else if (!(fp = sfopen(NiL, file, "r")))
564 			error(ERROR_SYSTEM|3, "%s: cannot read", file);
565 		if (desc)
566 			describe(dp, desc, fp, file, verbose);
567 		else
568 			rectify(fp, file, verbose);
569 	} while (file = *argv++);
570 	return error_info.errors != 0;;
571 }
572