1 /* sumlines.c - total the numbers appearing in various input lines. */
2 /* B. D. McKay. May 19, 2003. */
3
4 #ifndef GMP
5 #define GMP 1 /* Non-zero if gmp multi-precise integers are allowed.
6 In this case you need the GNU multi-precision library,
7 available with -lgmp if it is installed. */
8 #endif
9
10 #define USAGE \
11 "sumlines [-w] [-v] [-d] [-n] [-f fmtfile]... file file file ..."
12
13 #define HELPTEXT \
14 " Sum lines matching specified formats.\n\
15 \n\
16 Any number of input files can be given. \"-\" means stdin.\n\
17 If there are no files given, just stdin is assumed.\n\
18 File names can contain wildcards, in which case all matching files\n\
19 are used in numerically sorted order.\n\
20 \n\
21 Formats are read from four sources in this order:\n\
22 (1) Any files mentioned with -f on the command line (any number).\n\
23 (2) The file named in the environment variable SUMLINES.FMT (if any)\n\
24 (3) The file sumlines.fmt in the current directory (if it exists)\n\
25 (4) The file sumlines.fmt in the home directory (if it exists)\n\
26 All these are read if they exist and the results concatenated.\n\
27 Formats exactly matching earlier formats (except perhaps for flags)\n\
28 are not used.\n\
29 \n\
30 Each format occupies exactly two lines. The first line gives a\n\
31 list of flags (DEFAULT FINAL ERROR UNIQUE COUNT CONTINUE NUMERIC\n\
32 SILENT ENDFILE P=# separated by spaces, commas or |s).\n\
33 The second line gives the format itself.\n\
34 \n\
35 Example. This totals the summary lines of autoson runs:\n\
36 DEFAULT # comment \n\
37 cpu=%fu,%fs,%fx pf=%d\n\
38 There can also be blank lines and lines with only comments, but\n\
39 not between the flags line and the format itself.\n\
40 \n\
41 -d don't read sumlines.fmt or ~/sumlines.fmt or $SUMLINES.FMT \n\
42 -w suppresses warning messages about no matching lines or no\n\
43 matching final lines.\n\
44 -n don't write the number of matching lines for each format.\n\
45 -v produces a list of all the formats.\n"
46
47 #define DEFAULT 0 /* No special flags */
48 #define FINAL 1 /* At least one of these must be in each input file */
49 #define ERROR 2 /* Must be none of these */
50 #define UNIQUE 4 /* The %s and %c parts must be unique over all inputs */
51 #define COUNT 8 /* The output only states how many lines matched */
52 #define CONTINUE 16 /* Try to match later formats too */
53 #define NUMERIC 32 /* Use numerical comparison (see numstrcmp() below) */
54 #define SILENT 64 /* Don't report, just check */
55 #define ENDFILE 128 /* Usually appears at end of output */
56
57 /* The formats are tried against each input line one at a time, and the
58 first one that matches is accepted. The entire line must match.
59 If the CONTINUE flag is present, the input line is also matched
60 against further formats.
61
62 Except in the case of formats with the COUNT flag, each format that
63 matches any lines produces output giving the total value of each of the
64 integers %d or real numbers %f in the lines which match. If there are
65 any %s or %c controls in the format, the output is given separately for
66 each value of the matching strings which appear in the input lines.
67 %m is like %d but allows arbitrarily large integers.
68 %x is like %d but the maximum rather than the sum is accumulated.
69 %p is like %d but the value is accumulated modulo some base.
70
71 In the case of the COUNT flag, the program only reports the number of
72 input lines which matched the format.
73
74 If a format has the UNIQUE flag, no two input lines may match with the
75 same values of the %s and %c controls present. Otherwise a warning
76 message is written for each duplicate match.
77
78 The sequence P=# where # is an integer value defined the base for the
79 %p directive. There can be no spaces in the sequence "P=#". The
80 default base is 2.
81
82 %d - matches an integer (small enough for 64 bits)
83 %x - same as %d but accumulates maximum rather than the sum
84 %p - same as %d but accumulates the value modulo a base
85 %m - matches a integer of unbounded size (if GMP!=0)
86 %f - matches a real number of the form ddddd.ddd or -ddddd.ddd
87 %v - same as %f but reports the average rather than the sum
88 %sx - matches a string, where 'x' is any character.
89 If 'x' is not a space, match zero or more characters from the
90 current position up but not including the first 'x'.
91 If 'x' is a space, match one or more characters from the current
92 position up to and including the first non-space character
93 which is followed by a space.
94 %c - matches a non-white character
95 %% - matches the character '%'
96 % - (with a space following the '%') matches zero or more spaces or
97 tabs, as many as appear in the input. In the output, this
98 sequence appears as one space.
99 % - (appearing exactly at the end of the format) matches zero or
100 more spaces at the end of the line. In the output, nothing.
101 %*d, %*m, %*x, %*p, %*f, %*sx, %*c - these are similar to the versions
102 without the '*' except that the value is ignored (not used for
103 summing, and not used to divide the output). In the output,
104 this field appears as a single character '*'.
105 %# - matches an unsigned integer. For each format containing this
106 control, a report is made of any breaks or duplicates in the
107 sequence of matching numbers. (So this is useful for checking
108 a sequence of case numbers.) At most one %# may appear in each
109 format.
110
111 At least one FINAL format must match in each file or a warning is given
112 (unless -w is used, in which case no warning is given).
113
114 A format marked ENDFILE will cause sumlines to act as if it started
115 reading from a new input file. This can have some effects on the
116 order of output lines.
117 */
118
119 #define HAS(i,flgs) ((format[i].flags&(flgs)) != 0)
120
121 #include <stdio.h>
122 #include <ctype.h>
123 #include <string.h>
124 #include <pwd.h>
125 #include <stdlib.h>
126 #include <glob.h>
127
128 #if GMP
129 #include <gmp.h>
130 #endif
131
132 #if defined(__alpha)
133 typedef long integer;
134 #define DOUT "%ld" /* Formats used to output %d/%x/%p,%f,%v quantities */
135 #define FOUT "%.2f"
136 #define VOUT "%.4f"
137 #elif defined(__sun) || defined(__GNUC__) || (__STDC_VERSION__ > 199900L)
138 typedef long long integer;
139 #define DOUT "%lld"
140 #define FOUT "%.2f"
141 #define VOUT "%.4f"
142 #else
143 typedef long long integer;
144 #define DOUT "%Ld"
145 #define FOUT "%.2f"
146 #define VOUT "%.4f"
147 #endif
148
149 static integer maxint; /* set by find_maxint */
150
151 #define INCR(x,inc) \
152 {if ((x) > 0 && maxint-(x) < (inc) || (x) < 0 && (maxint)+(x) < -(inc)) \
153 {fprintf(stderr,">E overflow with %d or %p format\n"); exit(1);} \
154 x += (inc);} /* x += inc with safety check */
155
156 typedef int boolean;
157 #define FALSE 0
158 #define TRUE 1
159
160 typedef union
161 {
162 double f;
163 integer d;
164 #if GMP
165 mpz_t *m;
166 #endif
167 } number;
168
169 #define D 0 /* Code for "integer" */
170 #define F 1 /* Code for "real" */
171 #define M 2 /* Code for "multiprecision integer" */
172 #define X 3 /* Code for "integer, take maximum" */
173 #define V 4 /* Code for "real, take average" */
174 #define P 5 /* Code for "integer, modulo some base" */
175
176 #define MAXLINELEN 100000 /* Maximum input line size
177 (longer lines are broken in bits) */
178 #define MAXVALUES 16 /* Maximum total number of
179 %d,%x,%p,%m,%v or %f items in a format */
180
181 #define MAXFORMATS 1000
182
183 static struct fmt_st
184 {
185 integer pmod;
186 int flags;
187 char *fmt;
188 } format[MAXFORMATS];
189
190 typedef struct countrec
191 {
192 struct countrec *left,*right,*parent;
193 char *fmt;
194 unsigned long count;
195 number total[MAXVALUES];
196 } countnode;
197
198 static countnode *count_root[MAXFORMATS];
199 static unsigned long matching_lines[MAXFORMATS];
200 static integer total_position[MAXFORMATS];
201 static integer lastseq[MAXFORMATS];
202
203 #if GMP
204 static mpz_t mp_value[MAXVALUES];
205 #endif
206
207 #define A 0
208 #define L 1
209 #define R 2
210 #define LL 3
211 #define LR 4
212 #define RL 5
213 #define RR 6
214
215 #ifndef GLOB_BRACE /* Allow {} processing -- Linux extension */
216 #define GLOB_BRACE 0
217 #endif
218
219 #ifndef GLOB_TILDE /* Allow ~ processing -- Linux extension */
220 #define GLOB_TILDE 0
221 #endif
222
223 #ifndef GLOB_NOMATCH
224 #define GLOB_NOMATCH 0 /* Some versions don't a special return for this */
225 #endif
226
227 #define GLOB_FLAGS (GLOB_ERR|GLOB_NOSORT|GLOB_BRACE|GLOB_TILDE)
228
229 #define HELP if (argc > 1 && (strcmp(argv[1],"-help")==0 \
230 || strcmp(argv[1],"--help")==0)) \
231 { printf("\nUsage: %s\n\n%s",USAGE,HELPTEXT); return 0;}
232
233 /****************************************************************************/
234
235 static int
numstrcmp(char * s1,char * s2)236 numstrcmp(char *s1, char *s2)
237 /* Same behaviour as strcmp(), except that when an unsigned integer is
238 found in each string, the numerical values are compared instead of
239 the ascii values. Overflow is impossible. Leading spaces before
240 numbers are considered part of the numbers. A number in one string
241 is considered less than a non-number in the other string. */
242 {
243 int c1,c2;
244 char *a1,*a2;
245
246 while (1)
247 {
248 for (a1 = s1; *a1 == ' '; ++a1) {}
249 if (isdigit(*a1))
250 {
251 for (s1 = a1+1; isdigit(*s1); ++s1) {}
252 }
253 else
254 {
255 a1 = s1;
256 ++s1;
257 }
258
259 for (a2 = s2; *a2 == ' '; ++a2) {}
260 if (isdigit(*a2))
261 {
262 for (s2 = a2+1; isdigit(*s2); ++s2) {}
263 }
264 else
265 {
266 a2 = s2;
267 ++s2;
268 }
269
270 if (!isdigit(*a1))
271 {
272 if (!isdigit(*a2))
273 {
274 if (*a1 < *a2) return -1;
275 if (*a1 > *a2) return 1;
276 if (*a1 == '\0') return 0;
277 }
278 else
279 return 1;
280 }
281 else
282 {
283 if (!isdigit(*a2))
284 return -1;
285 else
286 {
287 for (; *a1 == '0'; ++a1) {}
288 for (; *a2 == '0'; ++a2) {}
289
290 if (s1-a1 < s2-a2) return -1;
291 if (s1-a1 > s2-a2) return 1;
292 for (; a1 < s1 && *a1 == *a2; ++a1, ++a2) {}
293 if (a1 < s1)
294 {
295 if (*a1 < *a2) return -1;
296 else return 1;
297 }
298 }
299 }
300 }
301 }
302
303 /****************************************************************************/
304
305 static void
writeline(char * outf,number * val,unsigned long count)306 writeline(char *outf, number *val, unsigned long count)
307 /* Write an output line with the given format and values */
308 {
309 int n;
310
311 n = 0;
312
313 for (; *outf != '\0'; ++outf)
314 {
315 if (*outf == '%')
316 {
317 ++outf;
318 if (*outf == '%' || *outf == '#')
319 putchar(*outf);
320 else if (*outf == 'd' || *outf == 'x' || *outf == 'p')
321 printf(DOUT,val[n++].d);
322 else if (*outf == 'f')
323 printf(FOUT,val[n++].f);
324 else if (*outf == 'v')
325 printf(VOUT,val[n++].f/count);
326 #if GMP
327 else if (*outf == 'm')
328 mpz_out_str(NULL,10,*(val[n++].m));
329 #endif
330 else
331 {
332 fprintf(stderr,">E unknown output format %%%c\n",*outf);
333 exit(2);
334 }
335 }
336 else
337 putchar(*outf);
338 }
339 }
340
341 /*********************************************************************/
342
343 static void
print_counts(countnode * root,boolean printcounts)344 print_counts(countnode *root, boolean printcounts)
345 /* Use a non-recursive inorder traversal to print the tree */
346 {
347 int code;
348 countnode *p;
349
350 p = root;
351 code = A;
352
353 while (p)
354 {
355 switch (code) /* deliberate flow-ons */
356 {
357 case A:
358 if (p->left)
359 {
360 p = p->left;
361 break;
362 }
363 case L:
364 if (printcounts) printf("%5lu: ",p->count);
365 writeline(p->fmt,p->total,p->count);
366 if (p->right)
367 {
368 p = p->right;
369 code = A;
370 break;
371 }
372 case R:
373 if (p->parent && p->parent->left == p) code = L;
374 else code = R;
375 p = p->parent;
376 break;
377 }
378 }
379 }
380
381 /*********************************************************************/
382
383 static void
print_common(countnode * root)384 print_common(countnode *root)
385 /* Print the common ends of the formats in the tree */
386 {
387 int code;
388 countnode *p;
389 char *s0,*s1,*t0,*t1;
390 int i,comm0,comm1,minlen,maxlen;
391
392 if (root == NULL) return;
393
394 p = root;
395 code = A;
396
397 s0 = s1 = p->fmt;
398 while (*s1 != '\0') ++s1;
399 comm0 = comm1 = minlen = maxlen = s1-s0;
400
401 while (p)
402 {
403 switch (code) /* deliberate flow-ons */
404 {
405 case A:
406 if (p->left)
407 {
408 p = p->left;
409 break;
410 }
411 case L:
412 t0 = t1 = p->fmt;
413 for (i = 0; i < comm0; ++i)
414 if (s0[i] != t0[i]) break;
415 comm0 = i;
416
417 while (*t1 != '\0') ++t1;
418 for (i = 1; i <= comm1; ++i)
419 if (s1[-i] != t1[-i]) break;
420 comm1 = i-1;
421 if (t1-t0 < minlen) minlen = t1-t0;
422 if (t1-t0 > maxlen) maxlen = t1-t0;
423
424 if (p->right)
425 {
426 p = p->right;
427 code = A;
428 break;
429 }
430 case R:
431 if (p->parent && p->parent->left == p) code = L;
432 else code = R;
433 p = p->parent;
434 break;
435 }
436 }
437
438 if (comm0 + comm1 > minlen) comm1 = minlen - comm0;
439
440 for (i = 0; i < comm0; ++i)
441 printf("%c",s0[i]);
442 if (comm0 + comm1 < maxlen) printf("*");
443 for (i = comm1; i > 0; --i)
444 printf("%c",s1[-i]);
445 }
446
447 /*********************************************************************/
448
449 static void
splay(countnode * p)450 splay(countnode *p)
451 /* Splay the node p. It becomes the new root. */
452 {
453 countnode *q,*r,*s;
454 countnode *a,*b,*c,*d;
455 int code;
456
457 #define LCHILD(x,y) {(x)->left = y; if (y) (y)->parent = x;}
458 #define RCHILD(x,y) {(x)->right = y; if (y) (y)->parent = x;}
459
460 while (p->parent)
461 {
462 a = p->left;
463 b = p->right;
464 q = p->parent;
465 if (q->left == p)
466 {
467 code = L;
468 c = q->right;
469 }
470 else
471 {
472 code = R;
473 c = q->left;
474 }
475 r = q->parent;
476 if (r)
477 {
478 if (r->left == q) code = (code == L ? LL : LR);
479 else code = (code == L ? RL : RR);
480 s = r->parent;
481 p->parent = s;
482 if (s)
483 {
484 if (s->left == r) s->left = p;
485 else s->right = p;
486 }
487 }
488 else
489 {
490 p->parent = NULL;
491 }
492
493 switch (code)
494 {
495 case L:
496 RCHILD(p,q); LCHILD(q,b); break;
497 case R:
498 LCHILD(p,q); RCHILD(q,a); break;
499 case LL:
500 RCHILD(p,q); RCHILD(q,r); LCHILD(q,b); LCHILD(r,c); break;
501 case RR:
502 LCHILD(p,q); LCHILD(q,r); RCHILD(r,c); RCHILD(q,a); break;
503 case LR:
504 LCHILD(p,q); RCHILD(p,r); RCHILD(q,a); LCHILD(r,b); break;
505 case RL:
506 LCHILD(p,r); RCHILD(p,q); RCHILD(r,a); LCHILD(q,b); break;
507 }
508 }
509 }
510
511 /*********************************************************************/
512
513 static void
add_one(countnode ** to_root,char * fmt,integer pmod,int nval,number * val,int * valtype,int which,boolean numcompare)514 add_one(countnode **to_root, char *fmt, integer pmod, int nval,
515 number *val, int *valtype, int which, boolean numcompare)
516 /* Add one match to the node with the given format, creating it if it is new.
517 The tree is then splayed to ensure good efficiency. */
518 {
519 int i,cmp,len;
520 countnode *p,*ppar,*new_node;
521 integer w;
522
523 p = *to_root;
524 cmp = 0;
525
526 while (p != NULL)
527 {
528 cmp = (numcompare ? numstrcmp(fmt,p->fmt) : strcmp(fmt,p->fmt));
529 if (cmp == 0)
530 {
531 if (HAS(which,UNIQUE) && p->count == 1)
532 printf("ERROR: Multiple matches for %s",fmt);
533 for (i = 0; i < nval; ++i)
534 if (valtype[i] == D)
535 {INCR(p->total[i].d,val[i].d);}
536 else if (valtype[i] == X)
537 {if (val[i].d > p->total[i].d) p->total[i].d = val[i].d;}
538 else if (valtype[i] == P)
539 {w = val[i].d % pmod; INCR(p->total[i].d,w);
540 p->total[i].d %= pmod;}
541 #if GMP
542 else if (valtype[i] == M)
543 mpz_add(*(p->total[i].m),*(p->total[i].m),*(val[i].m));
544 #endif
545 else
546 p->total[i].f += val[i].f; /* F and V */
547 ++p->count;
548 splay(p);
549 *to_root = p;
550 return;
551 }
552 else if (cmp < 0)
553 {
554 ppar = p;
555 p = p->left;
556 }
557 else
558 {
559 ppar = p;
560 p = p->right;
561 }
562 }
563
564 if ((new_node = (countnode*)malloc(sizeof(countnode))) == NULL)
565 {
566 fprintf(stderr,">E malloc failed in add_one()\n");
567 exit(1);
568 }
569
570 if ((new_node->fmt = (char*)malloc(strlen(fmt)+1)) == NULL)
571 {
572 fprintf(stderr,">E malloc failed in add_one()\n");
573 exit(1);
574 }
575
576 new_node->count = 1;
577 strcpy(new_node->fmt,fmt);
578 for (i = 0; i < nval; ++i)
579 {
580 #if GMP
581 if (valtype[i] == M)
582 {
583 if ((new_node->total[i].m
584 = (mpz_t*)malloc(sizeof(mpz_t))) == NULL)
585 {
586 fprintf(stderr,"Malloc failed\n");
587 exit(1);
588 }
589 mpz_init_set(*(new_node->total[i].m),*(val[i].m));
590 }
591 else
592 #endif
593 new_node->total[i] = val[i];
594 }
595
596 new_node->left = new_node->right = NULL;
597
598 if (cmp == 0)
599 {
600 *to_root = new_node;
601 new_node->parent = NULL;
602 }
603 else if (cmp < 0)
604 {
605 ppar->left = new_node;
606 new_node->parent = ppar;
607 }
608 else
609 {
610 ppar->right = new_node;
611 new_node->parent = ppar;
612 }
613
614 splay(new_node);
615 *to_root = new_node;
616 }
617
618 /****************************************************************************/
619
620 static int
scanline(char * s,char * f,number * val,int * valtype,integer * seqno,char * outf)621 scanline(char *s, char *f, number *val, int *valtype,
622 integer *seqno, char *outf)
623 /* Perform sscanf-like scan of line.
624 The whole format must match. outf is set to be an output format
625 with unassigned values replaced by '*' and %s replaced by what
626 it matches. Assigned values except %s are put into val[] with
627 their types in valtype[]. The number of values (not counting %#)
628 is returned.
629 Integers matching %# are put into *seqno, with an error if there
630 are more than one, and -1 if there are none.
631 If the format doesn't match, -1 is returned.
632 WARNING: the gmp values are pointers to static data, so they
633 need to be copied if the values array is copied.
634 See the comments at the start of the program for more information.
635 */
636 {
637 int n; /* Number of values assigned */
638 int fracdigits,digit;
639 boolean doass,neednonsp,neg,oflow,badgmp;
640 integer ival;
641 double dval,digval;
642 char ends;
643 static boolean gmp_warning = FALSE;
644 #if GMP
645 char mp_line[MAXLINELEN+1],*mp;
646 #endif
647
648 n = 0;
649 *seqno = -1;
650 badgmp = oflow = FALSE;
651
652 while (*f != '\0')
653 {
654 if (*f == '%')
655 {
656 ++f;
657 if (*f == '*')
658 {
659 doass = FALSE;
660 ++f;
661 }
662 else
663 doass = TRUE;
664
665 if (*f == '%')
666 {
667 if (!doass)
668 {
669 fprintf(stderr,"Bad format item %%*\n");
670 exit(1);
671 }
672 if (*s++ != '%') return -1;
673 ++f;
674 *outf++ = '%';
675 *outf++ = '%';
676 }
677 else if (*f == '\n')
678 {
679 if (!doass)
680 {
681 fprintf(stderr,"Bad format item %%*\n");
682 exit(1);
683 }
684 while (*s != '\0')
685 {
686 if (*s != ' ' && *s != '\n') return -1;
687 ++s;
688 }
689 --s;
690 }
691 else if (*f == 'c')
692 {
693 if (*s == ' ' || *s == '\t' || *s == '\n') return -1;
694 if (doass) *outf++ = *s;
695 else *outf++ = '*';
696 ++f;
697 ++s;
698 }
699 else if (*f == 's')
700 {
701 ends = *(f+1);
702 if (ends == ' ')
703 {
704 while (*s == ' ' || *s == '\t')
705 {
706 if (doass) *outf++ = *s;
707 ++s;
708 }
709 }
710 while (*s != '\n' && *s != ends)
711 {
712 if (doass) *outf++ = *s;
713 ++s;
714 }
715 if (!doass) *outf++ = '*';
716 ++f;
717 }
718 #if GMP
719 else if (*f == 'd' || *f == 'x' || *f == 'p')
720 {
721 #else
722 else if (*f == 'd' || *f == 'x' || *f == 'p' || *f == 'm')
723 {
724 if (*f == 'm' && !gmp_warning)
725 {
726 fprintf(stderr,
727 ">W not compiled with GMP, treating %%m like %%d\n");
728 gmp_warning = TRUE;
729 }
730 #endif
731 while (*s == ' ' || *s == '\t') ++s;
732 if (!isdigit(*s) && *s != '-' && *s != '+') return -1;
733 neg = (*s == '-');
734 if (*s == '-' || *s == '+') ++s;
735 ival = 0;
736 while (isdigit(*s))
737 {
738 digit = *s++ - '0';
739 if (ival > (maxint-digit)/10)
740 oflow = TRUE;
741 else
742 ival = ival*10 + digit;
743 }
744 if (doass)
745 {
746 *outf++ = '%';
747 if (*f == 'd' || *f == 'm')
748 {
749 *outf++ = 'd';
750 valtype[n] = D;
751 }
752 else if (*f == 'x')
753 {
754 *outf++ = 'x';
755 valtype[n] = X;
756 }
757 else
758 {
759 *outf++ = 'p';
760 valtype[n] = P;
761 }
762 val[n++].d = (neg ? -ival : ival);
763 }
764 else
765 *outf++ = '*';
766 ++f;
767 }
768 #if GMP
769 else if (*f == 'm')
770 {
771 while (*s == ' ' || *s == '\t') ++s;
772 if (!isdigit(*s) && *s != '-' && *s != '+') return -1;
773 mp = mp_line;
774 if (*s == '-') *mp++ = *s++;
775 else if (*s == '+') s++;
776 while (isdigit(*s)) *mp++ = *s++;
777 *mp = '\0';
778 if (doass)
779 {
780 valtype[n] = M;
781 val[n].m = &mp_value[n];
782 if (mpz_set_str(mp_value[n],mp_line,10) < 0)
783 badgmp = TRUE;
784 ++n;
785 *outf++ = '%';
786 *outf++ = 'm';
787 }
788 else
789 *outf++ = '*';
790 ++f;
791 }
792 #endif
793 else if (*f == '#')
794 {
795 while (*s == ' ' || *s == '\t') ++s;
796 if (!isdigit(*s)) return -1;
797 ival = 0;
798 while (isdigit(*s))
799 {
800 digit = *s++ - '0';
801 if (ival > (maxint-digit)/10)
802 oflow = TRUE;
803 else
804 ival = ival*10 + digit;
805 }
806 if (*seqno >= 0)
807 {
808 fprintf(stderr,
809 ">E %# can only be used once per format\n");
810 exit(1);
811 }
812 *seqno = ival;
813 *outf++ = '#';
814 ++f;
815 }
816 else if (*f == 'f' || *f == 'v')
817 {
818 while (*s == ' ' || *s == '\t') ++s;
819
820 if (!isdigit(*s) && *s != '.' && *s != '-' && *s != '+')
821 return -1;
822 neg = (*s == '-');
823 if (*s == '-' || *s == '+') ++s;
824 dval = 0.0;
825 while (isdigit(*s)) dval = dval*10.0 + (*s++ - '0');
826 if (*s == '.')
827 {
828 digval = 1.0;
829 ++s;
830 while (isdigit(*s))
831 {
832 digval /= 10.0;
833 dval += (*s++ - '0') * digval;
834 }
835 }
836 if (doass)
837 {
838 valtype[n] = (*f == 'f' ? F : V);
839 val[n++].f = (neg ? -dval : dval);
840 *outf++ = '%';
841 *outf++ = *f;
842 }
843 else
844 *outf++ = '*';
845 ++f;
846 }
847 else if (*f == ' ')
848 {
849 while (*s == ' ' || *s == '\t') ++s;
850 *outf++ = ' ';
851 ++f;
852 }
853 else
854 {
855 fprintf(stderr,"Bad format item %%%c\n",*f);
856 exit(1);
857 }
858 }
859 else
860 {
861 if (*s != *f) return -1;
862 *outf++ = *f;
863 ++s;
864 ++f;
865 }
866 }
867
868 if (*s != '\0') return -1;
869
870 *outf = '\0';
871
872 if (oflow)
873 {
874 fprintf(stderr,"Integer too large\n");
875 exit(1);
876 }
877 if (badgmp)
878 {
879 fprintf(stderr,"Illegal multiprecision integer\n");
880 exit(1);
881 }
882
883 return n;
884 }
885
886 /****************************************************************************/
887
888 find_maxint(void)
889 {
890 /* Put the maximum possible integer value into maxint. */
891 integer x;
892
893 x = 1;
894 while (x > 0) x <<= 1;
895 x -= 1;
896
897 if (x <= 0)
898 {
899 fprintf(stderr,">E find_maxint() failed\n");
900 exit(1);
901 }
902
903 maxint = x;
904 }
905
906 /****************************************************************************/
907
908 static void
909 sort_formats(int *order, int numformats)
910 /* Make order[0..numformats-1] a permutation of 0..numformats-1 being
911 a good order to display the results. */
912 {
913 double score[MAXFORMATS];
914 int h,i,j,iw;
915
916 for (i = 0; i < numformats; ++i)
917 {
918 if (matching_lines[i] == 0)
919 score[i] = -1.0;
920 else
921 score[i] = i +
922 ((10.0*total_position[i]) / matching_lines[i]) * numformats;
923 order[i] = i;
924 }
925
926 j = numformats / 3;
927 h = 1;
928 do
929 h = 3 * h + 1;
930 while (h < j);
931
932 do
933 {
934 for (i = h; i < numformats; ++i)
935 {
936 iw = order[i];
937 for (j = i; score[order[j-h]] > score[iw]; )
938 {
939 order[j] = order[j-h];
940 if ((j -= h) < h) break;
941 }
942 order[j] = iw;
943 }
944 h /= 3;
945 }
946 while (h > 0);
947 }
948
949 /****************************************************************************/
950
951 static void
952 read_formats(char *filename, int *numformatsp, boolean mustexist)
953 /* Read formats from the given file. */
954 {
955 FILE *f;
956 int i,c,flags;
957 char flagname[52];
958 char line[MAXLINELEN+3];
959 integer pmod;
960 char *s;
961 boolean oflow,badpmod;
962 int digit;
963
964 if (strcmp(filename,"-") == 0)
965 f = stdin;
966 else if ((f = fopen(filename,"r")) == NULL)
967 {
968 if (mustexist)
969 {
970 fprintf(stderr,">E Can't open %s for reading.\n",filename);
971 exit(1);
972 }
973 return;
974 }
975
976 flagname[51] = line[MAXLINELEN+2] = '\0';
977
978 for (;;)
979 {
980 if ((c = getc(f)) == EOF) break;
981
982 while (c == ' ' || c == '\t') c = getc(f);
983 if (c == '\n') continue;
984 if (c == EOF) break;
985
986 if (c == '#')
987 {
988 while (c != '\n' && c != EOF) c = getc(f);
989 continue;
990 }
991
992 ungetc(c,f);
993
994 flags = 0;
995 pmod = 2;
996 for (;;)
997 {
998 while ((c = getc(f)) == ' '
999 || c == '|' || c == ',' || c == '\t') {}
1000 if (c == '#')
1001 while (c != '\n' && c != EOF) c = getc(f);
1002 if (c == '\n' || c == EOF) break;
1003
1004 ungetc(c,f);
1005 fscanf(f,"%50[A-Za-z0-9=]",flagname);
1006 if (strcmp(flagname,"DEFAULT") == 0) {}
1007 else if (strcmp(flagname,"FINAL") == 0) flags |= FINAL;
1008 else if (strcmp(flagname,"ERROR") == 0) flags |= ERROR;
1009 else if (strcmp(flagname,"UNIQUE") == 0) flags |= UNIQUE;
1010 else if (strcmp(flagname,"COUNT") == 0) flags |= COUNT;
1011 else if (strcmp(flagname,"CONTINUE") == 0) flags |= CONTINUE;
1012 else if (strcmp(flagname,"NUMERIC") == 0) flags |= NUMERIC;
1013 else if (strcmp(flagname,"SILENT") == 0) flags |= SILENT;
1014 else if (strcmp(flagname,"ENDFILE") == 0) flags |= ENDFILE;
1015 else if (flagname[0] == 'P' && flagname[1] == '=')
1016 {
1017 pmod = 0;
1018 oflow = FALSE;
1019 badpmod = (flagname[2] == '\0');
1020 for (s = flagname+2; *s != '\0'; ++s)
1021 {
1022 if (isdigit(*s))
1023 {
1024 digit = *s - '0';
1025 if (pmod > (maxint-digit)/10)
1026 oflow = TRUE;
1027 else
1028 pmod = pmod*10 + digit;
1029 }
1030 else
1031 badpmod = TRUE;
1032 }
1033 if (badpmod)
1034 {
1035 fprintf(stderr,">E Bad value for P= directive: %s\n",
1036 flagname+2);
1037 exit(1);
1038 }
1039 else if (oflow)
1040 {
1041 fprintf(stderr,">E Value for P= is too large\n");
1042 exit(1);
1043 }
1044
1045 }
1046 else
1047 {
1048 fprintf(stderr,">E Unknown flag %s in %s\n",
1049 flagname,filename);
1050 exit(1);
1051 }
1052 }
1053
1054 if (fgets(line,MAXLINELEN,f) == NULL)
1055 {
1056 fprintf(stderr,">E Missing format in %s\n",filename);
1057 exit(1);
1058 }
1059
1060 for (i = 0; i < *numformatsp; ++i)
1061 if (strcmp(line,format[i].fmt) == 0) break;
1062 if (i < *numformatsp) continue;
1063
1064 if (*numformatsp == MAXFORMATS)
1065 {
1066 fprintf(stderr,">E Increase MAXFORMATS\n");
1067 exit(1);
1068 }
1069
1070 format[*numformatsp].flags = flags;
1071 format[*numformatsp].pmod = pmod;
1072 if ((format[*numformatsp].fmt
1073 = (char*)malloc(strlen(line)+1)) == NULL)
1074 {
1075 fprintf(stderr,">E malloc() failed in read_formats()\n");
1076 exit(1);
1077 }
1078 strcpy(format[*numformatsp].fmt,line);
1079 ++*numformatsp;
1080 }
1081
1082 if (f != stdin) fclose(f);
1083 }
1084
1085 /****************************************************************************/
1086
1087 static void
1088 read_local_formats(int *numformatsp)
1089 /* Read formats from sumlines.fmt in current directory */
1090 {
1091 read_formats("sumlines.fmt",numformatsp,FALSE);
1092 }
1093
1094 /****************************************************************************/
1095
1096 static void
1097 read_global_formats(int *numformatsp)
1098 /* Read formats from sumlines.fmt in home directory */
1099 {
1100 struct passwd *pwd;
1101 char filename[257+12];
1102
1103 if ((pwd = getpwuid(getuid())) < 0)
1104 {
1105 fprintf(stderr,">E Can't find home directory\n");
1106 exit(1);
1107 }
1108
1109 sprintf(filename,"%s/sumlines.fmt",pwd->pw_dir);
1110
1111 read_formats(filename,numformatsp,FALSE);
1112 }
1113
1114 /****************************************************************************/
1115
1116 static void
1117 read_env_formats(int *numformatsp)
1118 /* Read formats from $SUMLINES.FMT if it exists */
1119 {
1120 char *filename;
1121
1122 if ((filename = getenv("SUMLINES.FMT")) != 0)
1123 read_formats(filename,numformatsp,FALSE);
1124 }
1125
1126 /****************************************************************************/
1127
1128 static boolean
1129 readline(FILE *f, char *line, int size, int *nulls)
1130 /* Get a line. Read at most size-1 chars until EOF or \n.
1131 If \n is read, it is stored. Then \0 is appended.
1132 *nulls is set to the number of NUL chars (which are also stored). */
1133 {
1134 int i,c;
1135
1136 *nulls = 0;
1137 for (i = 0; i < size-1; ++i)
1138 {
1139 c = getc(f);
1140 if (c == EOF) break;
1141 line[i] = c;
1142 if (c == '\0') ++*nulls;
1143 if (c == '\n') {++i; break;}
1144 }
1145 line[i] = '\0';
1146
1147 return i > 0;
1148 }
1149
1150 /****************************************************************************/
1151
1152 static int
1153 pnumstrcmp(const void *a, const void *b)
1154 /* numstrcmp on strings pointed at by a and b */
1155 {
1156 return numstrcmp(*(char**)a,*(char**)b);
1157 }
1158
1159 /****************************************************************************/
1160
1161 static void
1162 doglob(char *patt, glob_t *globlk)
1163 /* Find all files matching the given pattern, numeric sorting.
1164 Give a warning message if there are none. */
1165 {
1166 int ret;
1167
1168 ret = glob(patt,GLOB_FLAGS,NULL,globlk);
1169
1170 if (ret != 0) globlk->gl_pathc = 0;
1171
1172 if (ret == GLOB_NOSPACE)
1173 {
1174 fprintf(stderr,"ERROR: ran out of space during glob()\n");
1175 exit(1);
1176 }
1177 if (ret == GLOB_ERR)
1178 {
1179 fprintf(stderr,"ERROR: during glob(%s)\n",patt);
1180 exit(1);
1181 }
1182 if (ret != 0 && ret != GLOB_NOMATCH)
1183 {
1184 fprintf(stderr,"ERROR: value %d from glob(%s)\n",ret,patt);
1185 exit(1);
1186 }
1187
1188
1189 if (globlk->gl_pathc == 0) printf("WARNING: no files match %s\n",patt);
1190
1191 if (globlk->gl_pathc >= 2)
1192 qsort(globlk->gl_pathv,globlk->gl_pathc,sizeof(char*),pnumstrcmp);
1193 }
1194
1195 /****************************************************************************/
1196
1197 main(int argc, char *argv[])
1198 {
1199 int i,j,nvals,argnum;
1200 number val[MAXVALUES];
1201 int valtype[MAXVALUES];
1202 char line[MAXLINELEN+2];
1203 char outf[MAXLINELEN+MAXVALUES+6];
1204 unsigned long matched,unmatched,finalmatched;
1205 unsigned long errorlines,totalerrorlines;
1206 unsigned long line_number,nullcount,numfiles,ifile;
1207 char *filename;
1208 FILE *infile;
1209 int numformats,firstarg,nulls;
1210 boolean havefinal,nowarn,listformats,readfiles;
1211 integer seq;
1212 int order[MAXFORMATS];
1213 glob_t globlk,globlk_stdin,*pglob;
1214 char *glob_stdin_v[2];
1215 boolean printcounts;
1216
1217 HELP;
1218
1219 find_maxint();
1220
1221 firstarg = 1;
1222 numformats = 0;
1223 nowarn = FALSE;
1224 listformats = FALSE;
1225 readfiles = TRUE;
1226 printcounts = TRUE;
1227
1228 globlk_stdin.gl_pathc = 1;
1229 globlk_stdin.gl_pathv = glob_stdin_v;
1230 glob_stdin_v[0] = "-";
1231 glob_stdin_v[1] = NULL;
1232
1233 for (; firstarg < argc; ++firstarg)
1234 {
1235 if (argv[firstarg][0] == '-' && argv[firstarg][1] == 'f')
1236 {
1237 if (argv[firstarg][2] != '\0')
1238 read_formats(&argv[firstarg][2],&numformats,TRUE);
1239 else if (firstarg == argc - 1)
1240 {
1241 fprintf(stderr,">E No argument for -f\n");
1242 exit(1);
1243 }
1244 else
1245 {
1246 ++firstarg;
1247 read_formats(argv[firstarg],&numformats,TRUE);
1248 }
1249 }
1250 else if (strcmp(argv[firstarg],"-w") == 0)
1251 nowarn = TRUE;
1252 else if (strcmp(argv[firstarg],"-v") == 0)
1253 listformats = TRUE;
1254 else if (strcmp(argv[firstarg],"-d") == 0)
1255 readfiles = FALSE;
1256 else if (strcmp(argv[firstarg],"-n") == 0)
1257 printcounts = FALSE;
1258 else
1259 break;
1260 }
1261
1262 #if GMP
1263 for (i = 0; i < MAXVALUES; ++i) mpz_init(mp_value[i]);
1264 #endif
1265
1266 if (readfiles) read_local_formats(&numformats);
1267 if (readfiles) read_env_formats(&numformats);
1268 if (readfiles) read_global_formats(&numformats);
1269
1270 if (listformats)
1271 {
1272 printf("%d formats:\n",numformats);
1273 for (i = 0; i < numformats; ++i)
1274 printf("%03x %s",format[i].flags,format[i].fmt);
1275 }
1276
1277 if (numformats == 0)
1278 {
1279 fprintf(stderr,">E No formats\n");
1280 exit(1);
1281 }
1282
1283 havefinal = FALSE;
1284 for (i = 0; i < numformats; ++i)
1285 {
1286 count_root[i] = NULL;
1287 matching_lines[i] = 0;
1288 total_position[i] = 0;
1289 if (HAS(i,FINAL)) havefinal = TRUE;
1290 }
1291
1292 unmatched = totalerrorlines = 0;
1293 numfiles = 0;
1294
1295 for (argnum = firstarg;
1296 argnum < (argc == firstarg ? argc+1 : argc); ++argnum)
1297 {
1298 if (argnum >= argc || strcmp(argv[argnum],"-") == 0)
1299 pglob = &globlk_stdin;
1300 else
1301 {
1302 pglob = &globlk;
1303 doglob(argv[argnum],pglob);
1304 }
1305
1306 for (ifile = 0; ifile < pglob->gl_pathc; ++ifile)
1307 {
1308 matched = finalmatched = errorlines = 0;
1309 ++numfiles;
1310
1311 if (strcmp(pglob->gl_pathv[ifile],"-") == 0)
1312 {
1313 filename = "stdin";
1314 infile = stdin;
1315 }
1316 else
1317 {
1318 filename = pglob->gl_pathv[ifile];
1319 if ((infile = fopen(filename,"r")) == NULL)
1320 {
1321 fprintf(stderr,">E Can't open %s\n",filename);
1322 exit(1);
1323 }
1324 }
1325
1326 line_number = 0;
1327 nullcount = 0;
1328 while (readline(infile,line,MAXLINELEN,&nulls))
1329 {
1330 nullcount += nulls;
1331 line[MAXLINELEN] = '\n';
1332 line[MAXLINELEN+1] = '\0';
1333 if (line[0] == '\n') continue;
1334 ++line_number;
1335
1336 for (i = 0; i < numformats; ++i)
1337 {
1338 nvals
1339 = scanline(line,format[i].fmt,val,valtype,&seq,outf);
1340 if (nvals >= 0)
1341 {
1342 if (HAS(i,ENDFILE)) line_number = 0;
1343 ++matched;
1344 if (HAS(i,FINAL)) ++finalmatched;
1345 if (HAS(i,ERROR)) ++errorlines;
1346 ++matching_lines[i];
1347 total_position[i] += line_number;
1348 add_one(&count_root[i],outf,format[i].pmod,nvals,
1349 val,valtype,i,HAS(i,NUMERIC));
1350 if (matching_lines[i] > 1 && seq >= 0
1351 && seq != lastseq[i]+1)
1352 {
1353 printf("WARNING: Sequence number");
1354 if (seq == lastseq[i])
1355 {
1356 printf(" ");
1357 printf(DOUT,seq);
1358 printf(" is repeated.\n");
1359 }
1360 else if (seq != lastseq[i]+2)
1361 {
1362 printf("s ");
1363 printf(DOUT,lastseq[i]+1);
1364 printf("-");
1365 printf(DOUT,seq-1);
1366 printf(" are missing.\n");
1367 }
1368 else
1369 {
1370 printf(" ");
1371 printf(DOUT,seq-1);
1372 printf(" is missing.\n");
1373 }
1374 }
1375 lastseq[i] = seq;
1376 if (!HAS(i,CONTINUE)) break;
1377 }
1378 }
1379
1380 if (i == numformats) ++unmatched;
1381 }
1382 if (errorlines != 0)
1383 printf("ERRORS: Error lines in file %s\n",filename);
1384 else if (matched == 0 && !nowarn)
1385 printf("WARNING: No matching lines in file %s\n",filename);
1386 else if (finalmatched == 0 && havefinal && !nowarn)
1387 printf("WARNING: No final lines in file %s\n",filename);
1388 if (nullcount > 0)
1389 printf("WARNING: %ld NULs found in file %s\n",
1390 nullcount,filename);
1391 if (infile != stdin) fclose(infile);
1392
1393 totalerrorlines += errorlines;
1394 }
1395 if (pglob == &globlk) globfree(pglob);
1396 }
1397
1398 sort_formats(order,numformats);
1399
1400 for (j = 0; j < numformats; ++j)
1401 {
1402 i = order[j];
1403 if (HAS(i,SILENT)) continue;
1404
1405 if (HAS(i,COUNT))
1406 {
1407 if (matching_lines[i] > 0)
1408 printf("%5lu lines matched ",matching_lines[i]);
1409 print_common(count_root[i]);
1410 }
1411 else
1412 print_counts(count_root[i],printcounts);
1413 }
1414
1415 if (unmatched > 0)
1416 printf("%5lu non-empty lines not matched\n",unmatched);
1417 if (argc > firstarg) printf("%5lu files read altogether\n",numfiles);
1418 if (totalerrorlines > 0) printf("%5lu errors found\n",totalerrorlines);
1419
1420 exit(0);
1421 }
1422