1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1998-2011 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * *
19 ***********************************************************************/
20 #pragma prototyped
21
22 /*
23 * induce fixed length record groups from data
24 */
25
26 static const char usage[] =
27 "[-?\n@(#)$Id: rectify (AT&T Research) 1999-03-22 $\n]"
28 USAGE_LICENSE
29 "[+NAME?rectify - induce fixed length record groups from data]"
30 "[+DESCRIPTION?\brectify\b induces fixed length record groups from input data"
31 " by sampling and comparing character frequencies. The standard input is"
32 " read if \a-\a or no files are specified.]"
33
34 "[c:context?List \acontext\a records at the beginning and end of"
35 " record groups larger that 3*\acontext\a.]#[context]"
36 "[d:description?Specify a structured dump description file. Each line of"
37 " this file describes the size and content of a contiguous portion"
38 " of the input file. The description is applied separately to each"
39 " input file. Comments and optional labels in the following"
40 " descriptions are listed with the \b--verbose\b option. Supported"
41 " descriptions are:]:[file]{"
42 " [+c comment?comment]"
43 " [+d size [label]]?\asize\a bytes of data with optional label]"
44 " [+i size [label]]?ignore \asize\a bytes of data]"
45 " [+r size count [label]]?\acount\a records of length \asize\a]"
46 " [+t count?Match \acount\a records against the \bT\b record"
47 " table. \acount\a=0 continues until no record type"
48 " match is found.]"
49 " [+z size [label]]?a string with length determined by a"
50 " \asize\a byte binary integer]"
51 " [+T idlen id size unit [offset]]?Defines a sized record"
52 " table entry.]{"
53 " [+idlen?type identifier length, must be"
54 " <= 4 bytes]"
55 " [+id?type identifier, starting at record offset 0]"
56 " [+size?default record size]"
57 " [+unit?if > 0 then the record is variable length and"
58 " the size is the byte at \aoffset\a]"
59 " [+offset?if \aunit\a > 0 then this byte multiplied by"
60 " \aunit\a is the size of variable length data"
61 " appended to the record]"
62 " }"
63 "}"
64 "[f:format?Byte output \bprintf\b(3) format.]:[format:=02x]"
65 "[g!:group?Group output in 4's.]"
66 "[m:min?Minimum record length to consider.]#[min:=8]"
67 "[n:count?List the top \acount\a candidate record lengths.]#[count:=16]"
68 "[o:offset?Start description listing at \aoffset\a.]#[offset:=0]"
69 "[r:run?List runs at least as long as \arun\a.]#[run]"
70 "[v:verbose?Dump description labels with data.]"
71
72 "\n"
73 "\n[ file ... ]\n"
74 "\n"
75 "[+SEE ALSO?\bpin\b(1), \bpop\b(1)]"
76 ;
77
78 #include <ast.h>
79 #include <error.h>
80 #include <tok.h>
81
/*
 * per-distance tally maintained by rectify(): one entry for each
 * distance (candidate record length) below elementsof(state.mod)
 */
typedef struct Item_s
{
	unsigned long	index;		/* the distance this entry tallies; stored so it survives the qsort */
	unsigned long	offset;		/* input offset of the most recent hit at this distance */
	unsigned long	start;		/* input offset at which the current run began */
	unsigned long	count;		/* total hits at this distance */
	unsigned long	run;		/* hits in the current run, 0 if no run is in progress */
} Item_t;
90
/*
 * sized record table entry defined by a 'T' description line
 */
typedef struct
{
	int		len;		/* identifier length in bytes */
	unsigned long	id;		/* identifier compared against the leading record bytes */
	int		size;		/* default record size */
	int		unit;		/* if non-zero, multiplier for the length byte at offset */
	int		offset;		/* record offset of the length byte; only read from the line when unit is non-zero */
} Type_t;
99
/*
 * one active '{' ... '}' loop in a description file
 */
typedef struct
{
	Sfoff_t		offset;		/* description file position recorded right after the '{' line was read */
	unsigned long	count;		/* iterations remaining, decremented at each '}' */
} Loop_t;
105
/*
 * program wide state: option values set by main() plus the tables
 * shared by rectify(), dump() and describe()
 */
static struct
{
	Type_t		type[4 * 1024];		/* 'T' sized record table */
	Item_t		mod[4 * 1024];		/* per-distance tallies, indexed by distance until sorted */
	unsigned long	hit[UCHAR_MAX + 1];	/* input offset of the most recent occurrence of each byte value */
	Sfoff_t		offset;			/* --offset: description listing starts here */
	char*		format1;		/* format for one byte; points into format4 */
	char*		format4;		/* format for four bytes */
	unsigned long	context;		/* --context */
	unsigned long	count;			/* --count */
	unsigned long	min;			/* --min */
	unsigned long	run;			/* --run */
	int		group;			/* --group */
	int		types;			/* number of entries used in type[] */
	int		typelen;		/* longest identifier length in type[] */
	int		typelast;		/* type[] index of the most recent match */
} state;
123
124 /*
125 * order items by count hi to lo
126 */
127
128 static int
bycount(const void * va,const void * vb)129 bycount(const void* va, const void* vb)
130 {
131 register Item_t* a = (Item_t*)va;
132 register Item_t* b = (Item_t*)vb;
133
134 if (a->count < b->count)
135 return 1;
136 if (a->count > b->count)
137 return -1;
138 if (a < b)
139 return 1;
140 if (a > b)
141 return -1;
142 return 0;
143 }
144
145 /*
146 * rectify fp open for read on file
147 */
148
149 static void
rectify(register Sfio_t * fp,char * file,int verbose)150 rectify(register Sfio_t* fp, char* file, int verbose)
151 {
152 register unsigned char* s;
153 register Item_t* p;
154 register unsigned long* q;
155 register unsigned long offset;
156 register unsigned long i;
157 unsigned long n;
158 unsigned long cur;
159 unsigned long dif;
160 unsigned long max;
161
162 memset(state.hit, 0, sizeof(state.hit));
163 memset(state.mod, 0, sizeof(state.mod));
164 for (i = 0; i < elementsof(state.mod); i++)
165 state.mod[i].index = i;
166 max = 0;
167 offset = 0;
168 while (s = sfreserve(fp, SF_UNBOUND, 0))
169 {
170 n = sfvalue(fp);
171 for (i = 0; i < n; i++)
172 {
173 cur = offset + i;
174 q = state.hit + s[i];
175 dif = cur - *q;
176 *q = cur;
177 if (dif < elementsof(state.mod))
178 {
179 p = state.mod + dif;
180 if (dif > max)
181 max = dif;
182 p->count++;
183 if ((cur - p->offset) <= dif)
184 {
185 if (!p->run++)
186 p->start = cur;
187 }
188 else if (p->run)
189 {
190 if (state.run && p->run >= state.run && p->index >= state.min)
191 sfprintf(sfstdout, "run %7lu %7lu %7lu\n", p->index, p->run, p->start);
192 p->run = 0;
193 }
194 p->offset = cur;
195 }
196 }
197 offset += n;
198 }
199 qsort(state.mod, elementsof(state.mod), sizeof(state.mod[0]), bycount);
200 n = 0;
201 for (i = 0; i < elementsof(state.mod) && n < state.count; i++)
202 if (state.mod[i].index >= state.min)
203 {
204 n++;
205 sfprintf(sfstdout, "rec %7lu %7lu %7lu\n", state.mod[i].index, state.mod[i].count, state.mod[i].offset);
206 }
207 }
208
209 /*
210 * dump size n buffer b to op in 4 hex byte chunks
211 */
212
213 static void
dump(Sfio_t * op,register unsigned char * b,size_t n)214 dump(Sfio_t* op, register unsigned char* b, size_t n)
215 {
216 register unsigned char* e = b + n / 4 * 4;
217 register unsigned char* x;
218
219 x = state.group ? (b + n) : b;
220 while (b < e)
221 {
222 sfprintf(op, state.format4, b[0], b[1], b[2], b[3]);
223 if ((b += 4) < x)
224 sfputc(op, ' ');
225 }
226 while (b < x)
227 sfprintf(op, state.format1, *b++);
228 sfputc(op, '\n');
229 }
230
/*
 * return a number from b and advance b
 *
 * blanks and tabs before and after the number are skipped; the number
 * is converted by strtoul() with base 0; a level 3 error is raised
 * when no digits are converted
 */

static unsigned long
number(char** b)
{
	char*		start;
	char*		end;
	unsigned long	value;

	start = *b;
	while (*start == ' ' || *start == '\t')
		start++;
	value = strtoul(start, &end, 0);
	if (end == start)
		error(3, "numeric argument expected");
	while (*end == ' ' || *end == '\t')
		end++;
	*b = end;
	return value;
}
249
250 /*
251 * dump fp according to dp
252 */
253
254 static void
describe(register Sfio_t * dp,char * desc,register Sfio_t * fp,char * file,int verbose)255 describe(register Sfio_t* dp, char* desc, register Sfio_t* fp, char* file, int verbose)
256 {
257 register unsigned char* p;
258 unsigned char* e;
259 long size;
260 long count;
261 unsigned long context;
262 Sfoff_t offset;
263 Sfoff_t skip;
264 int nest;
265 int op;
266 char* s;
267 char* t;
268 Loop_t loop[64];
269 unsigned long id[5];
270
271 error_info.file = desc;
272 error_info.line = 0;
273 offset = 0;
274 nest = -1;
275 while (s = sfgetr(dp, '\n', 0))
276 {
277 error_info.line++;
278 for (t = s + sfvalue(dp) - 1; *s == ' ' || *s == '\t'; s++);
279 for (op = *s; *s != ' ' && *s != '\t' && *s != '\n'; s++);
280 for (; *s == ' ' || *s == '\t'; s++);
281 switch (op)
282 {
283 case '#':
284 case '\n':
285 break;
286 case '{':
287 if (++nest >= elementsof(loop))
288 error(3, "%c: nesting too deep -- %d max", op, elementsof(loop));
289 count = number(&s);
290 loop[nest].offset = sfseek(dp, (Sfoff_t)0, SEEK_CUR);
291 loop[nest].count = count;
292 if (verbose && offset >= state.offset)
293 sfprintf(sfstdout, "=== %I*d === loop %d %lu %I*d === %-.*s\n", sizeof(offset), offset, nest, loop[nest].count, sizeof(loop[nest].offset), loop[nest].offset, t - s, s);
294 break;
295 case '}':
296 if (nest < 0)
297 error(3, "%c: no matching {", op); /*balance}*/
298 if (loop[nest].count-- <= 1)
299 nest--;
300 else if (sfseek(dp, loop[nest].offset, SEEK_SET) < 0)
301 error(ERROR_SYSTEM|3, "loop seek error to %I*d", sizeof(loop[nest].offset), loop[nest].offset);
302 else if (verbose && offset >= state.offset)
303 sfprintf(sfstdout, "=== %I*d === loop %d %lu %I*d === %-.*s\n", sizeof(offset), offset, nest, loop[nest].count, sizeof(loop[nest].offset), loop[nest].offset, t - s, s);
304 break;
305 case 'c':
306 if (verbose && offset >= state.offset)
307 sfprintf(sfstdout, "=== %I*d === %-.*s\n", sizeof(offset), offset, t - s, s);
308 break;
309 case 'd':
310 size = number(&s);
311 if (offset >= state.offset)
312 {
313 if (verbose)
314 sfprintf(sfstdout, "=== %I*d === %ld === %-.*s\n", sizeof(offset), offset, size, t - s, s);
315 if (!(p = sfreserve(fp, size, 0)))
316 error(ERROR_SYSTEM|3, "%s: cannot read %ld bytes at %I*d", file, size, sizeof(offset), offset);
317 dump(sfstdout, p, size);
318 }
319 else if (sfseek(fp, (Sfoff_t)size, SEEK_CUR) < 0)
320 error(ERROR_SYSTEM|3, "%s: cannot seek %ld bytes at %I*d", file, size, sizeof(offset), offset);
321 offset += size;
322 break;
323 case 'i':
324 size = number(&s);
325 if (verbose && offset >= state.offset)
326 sfprintf(sfstdout, "=== %I*d === %ld === %-.*s\n", sizeof(offset), offset, size, t - s, s);
327 if (sfseek(fp, (Sfoff_t)size, SEEK_CUR) < 0)
328 error(ERROR_SYSTEM|3, "%s: cannot seek %ld bytes at %I*d", file, size, sizeof(offset), offset);
329 offset += size;
330 break;
331 case 'r':
332 size = number(&s);
333 count = number(&s);
334 if (offset < state.offset)
335 {
336 skip = count * size;
337 if ((offset + skip) > state.offset)
338 {
339 skip = (state.offset - offset) / size;
340 count -= skip;
341 skip *= size;
342 if (sfseek(fp, skip, SEEK_CUR) < 0)
343 error(ERROR_SYSTEM|3, "%s: cannot seek %I*d bytes at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
344 offset += skip;
345 }
346 }
347 if (offset >= state.offset)
348 {
349 if (verbose)
350 sfprintf(sfstdout, "=== %I*d === %ld * %ld === %-.*s\n", sizeof(offset), offset, size, count, t - s, s);
351 if (state.context && count > (3 * state.context))
352 {
353 skip = (count - 2 * state.context) * size;
354 count = state.context;
355 while (count-- > 0)
356 {
357 if (!(p = sfreserve(fp, size, 0)))
358 error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
359 offset += size;
360 dump(sfstdout, p, size);
361 }
362 sfprintf(sfstdout, " . . .\n");
363 if (sfseek(fp, skip, SEEK_CUR) < 0)
364 error(ERROR_SYSTEM|3, "%s: cannot seek %I*d bytes at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
365 offset += skip;
366 count = state.context;
367 }
368 while (count-- > 0)
369 {
370 if (!(p = sfreserve(fp, size, 0)))
371 error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
372 offset += size;
373 dump(sfstdout, p, size);
374 }
375 }
376 else
377 {
378 skip = count * size;
379 if (sfseek(fp, skip, SEEK_CUR) < 0)
380 error(ERROR_SYSTEM|3, "%s: cannot seek %I*d bytes at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
381 offset += skip;
382 }
383 break;
384 case 't':
385 if (!state.typelen)
386 error(3, "no sized record types defined");
387 context = 0;
388 count = number(&s);
389 do
390 {
391 if (!(p = sfreserve(fp, state.typelen, SF_LOCKR)))
392 break;
393 switch (state.typelen)
394 {
395 case 4: id[4] = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
396 case 3: id[3] = (p[0] << 16) | (p[1] << 8) | p[2];
397 case 2: id[2] = (p[0] << 8) | p[1];
398 case 1: id[1] = p[0];
399 }
400 sfread(fp, p, 0);
401 if (state.type[state.typelast].id != id[state.type[state.typelast].len])
402 {
403 for (state.typelast = 0; state.typelast < state.types && state.type[state.typelast].id != id[state.type[state.typelast].len]; state.typelast++);
404 if (state.typelast >= state.types)
405 {
406 if (verbose)
407 sfprintf(sfstdout, "=== %I*d === %0*x === type not found\n", sizeof(offset), offset, 2 * state.typelen, id[state.typelen]);
408 break;
409 }
410 if (verbose && offset >= state.offset)
411 sfprintf(sfstdout, "=== %I*d === %0*x === type\n", sizeof(offset), offset, 2 * state.type[state.typelast].len, id[state.type[state.typelast].len]);
412 context = 0;
413 }
414 size = state.type[state.typelast].size;
415 if (!(p = sfreserve(fp, size, state.type[state.typelast].unit ? SF_LOCKR : 0)))
416 error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
417 if (state.type[state.typelast].unit)
418 {
419 size += p[state.type[state.typelast].offset] * state.type[state.typelast].unit;
420 sfread(fp, p, 0);
421 if (!(p = sfreserve(fp, size, 0)))
422 error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
423 }
424 if (offset >= state.offset)
425 {
426 if (!state.context)
427 dump(sfstdout, p, size);
428 else if (context++ < state.context)
429 dump(sfstdout, p, size);
430 else if (context == state.context + 1)
431 sfprintf(sfstdout, " . . .\n");
432 }
433 offset += size;
434 } while (!count || --count);
435 break;
436 case 'z':
437 size = number(&s);
438 if (!(p = sfreserve(fp, size, 0)))
439 error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
440 count = 0;
441 e = p + size;
442 while (p < e)
443 count = (count << 8) | *p++;
444 if (offset >= state.offset)
445 {
446 if (verbose)
447 sfprintf(sfstdout, "=== %I*d === %ld === %-.*s\n", sizeof(offset), offset, size, t - s, s);
448 offset += size;
449 if (!(p = sfreserve(fp, count, 0)))
450 error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", count, sizeof(offset), offset);
451 sfprintf(sfstdout, "\"%s\"\n", fmtnesq((char*)p, "\"", count));
452 }
453 else
454 {
455 offset += 2;
456 if (sfseek(fp, (Sfoff_t)count, SEEK_CUR) < 0)
457 error(ERROR_SYSTEM|3, "%s: cannot seek %ld bytes at %I*d", file, count, sizeof(offset), offset);
458 }
459 offset += count;
460 break;
461 case 'T':
462 if (state.types >= elementsof(state.type))
463 error(3, "too many types -- %d max", elementsof(state.type));
464 if ((state.type[state.types].len = number(&s)) > state.typelen)
465 state.typelen = state.type[state.types].len;
466 if (state.type[state.types].len >= elementsof(id))
467 error(3, "type id length must be <= %d", elementsof(id) - 1);
468 state.type[state.types].id = number(&s);
469 state.type[state.types].size = number(&s);
470 if (state.type[state.types].unit = number(&s))
471 state.type[state.types].offset = number(&s);
472 state.types++;
473 break;
474 default:
475 error(2, "%c: unknown description op", op);
476 break;
477 }
478 }
479 if (verbose && offset >= state.offset)
480 sfprintf(sfstdout, "=== %I*d === EOF\n", sizeof(offset), offset);
481 error_info.file = 0;
482 error_info.line = 0;
483 if (skip = sfseek(fp, (Sfoff_t)0, SEEK_END) - offset)
484 error(1, "%s: %I*d bytes ignored at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
485 }
486
487 int
main(int argc,char ** argv)488 main(int argc, char** argv)
489 {
490 register char* file;
491 int n;
492 Sfio_t* fp;
493 Sfio_t* dp;
494
495 char* desc = 0;
496 char* format = "02x";
497 int verbose = 0;
498
499 error_info.id = "rectify";
500 state.count = 16;
501 state.group = 1;
502 state.min = 8;
503 state.run = 0;
504 for (;;)
505 {
506 switch (optget(argv, usage))
507 {
508 case 'c':
509 state.context = opt_info.num;
510 continue;
511 case 'd':
512 if (desc)
513 error(2, "%s: only one description file allowed", opt_info.arg);
514 else
515 desc = opt_info.arg;
516 continue;
517 case 'f':
518 format = opt_info.arg;
519 continue;
520 case 'g':
521 state.group = opt_info.num;
522 continue;
523 case 'm':
524 state.min = opt_info.num;
525 continue;
526 case 'n':
527 state.count = opt_info.num;
528 continue;
529 case 'o':
530 state.offset = opt_info.num;
531 continue;
532 case 'r':
533 state.run = opt_info.num;
534 continue;
535 case 'v':
536 verbose = opt_info.num;
537 continue;
538 case '?':
539 error(ERROR_USAGE|4, "%s", opt_info.arg);
540 continue;
541 case ':':
542 error(2, "%s", opt_info.arg);
543 continue;
544 }
545 break;
546 }
547 argv += opt_info.index;
548 if (error_info.errors)
549 error(ERROR_USAGE|4, "%s", optusage(NiL));
550 n = (strlen(format) + 1) * 4 + 1;
551 if (!(state.format4 = newof(0, char, n, 0)))
552 error(ERROR_SYSTEM|3, "out of space [format]");
553 sfsprintf(state.format4, n, "%%%s%%%s%%%s%%%s", format, format, format, format);
554 state.format1 = state.format4 + 3 * (strlen(format) + 1);
555 if (desc && !(dp = sfopen(NiL, desc, "r")))
556 error(ERROR_SYSTEM|3, "%s: cannot open description file", desc);
557 if (file = *argv)
558 argv++;
559 do
560 {
561 if (!file || streq(file, "-"))
562 fp = sfstdin;
563 else if (!(fp = sfopen(NiL, file, "r")))
564 error(ERROR_SYSTEM|3, "%s: cannot read", file);
565 if (desc)
566 describe(dp, desc, fp, file, verbose);
567 else
568 rectify(fp, file, verbose);
569 } while (file = *argv++);
570 return error_info.errors != 0;;
571 }
572