1 /* sumlines.c - total the numbers appearing in various input lines. */
2 /* B. D. McKay.   May 19, 2003. */
3 
4 #ifndef GMP
5 #define GMP 1  /* Non-zero if gmp multi-precise integers are allowed.
6 		  In this case you need the GNU multi-precision library,
7 		  available with -lgmp if it is installed. */
8 #endif
9 
10 #define USAGE \
11 "sumlines [-w] [-v] [-d] [-n] [-f fmtfile]...  file file file ..."
12 
13 #define HELPTEXT \
14 "   Sum lines matching specified formats.\n\
15 \n\
16    Any number of input files can be given.  \"-\" means stdin.\n\
17    If there are no files given, just stdin is assumed.\n\
18    File names can contain wildcards, in which case all matching files\n\
19       are used in numerically sorted order.\n\
20 \n\
21    Formats are read from four sources in this order:\n\
22    (1) Any files mentioned with -f on the command line (any number).\n\
23    (2) The file named in the environment variable SUMLINES.FMT (if any)\n\
24    (3) The file sumlines.fmt in the current directory (if it exists)\n\
25    (4) The file sumlines.fmt in the home directory (if it exists)\n\
26    All these are read if they exist and the results concatenated.\n\
27    Formats exactly matching earlier formats (except perhaps for flags)\n\
28 	are not used.\n\
29 \n\
30    Each format occupies exactly two lines.  The first line gives a\n\
31    list of flags (DEFAULT FINAL ERROR UNIQUE COUNT CONTINUE NUMERIC\n\
32    SILENT ENDFILE P=# separated by spaces, commas or |s).\n\
33    The second line gives the format itself.\n\
34 \n\
35    Example.  This totals the summary lines of autoson runs:\n\
36      DEFAULT  # comment \n\
37      cpu=%fu,%fs,%fx  pf=%d\n\
38    There can also be blank lines and lines with only comments, but\n\
39    not between the flags line and the format itself.\n\
40 \n\
41    -d don't read sumlines.fmt or ~/sumlines.fmt or $SUMLINES.FMT \n\
42    -w suppresses warning messages about no matching lines or no\n\
43       matching final lines.\n\
44    -n don't write the number of matching lines for each format.\n\
45    -v produces a list of all the formats.\n"
46 
47 #define DEFAULT   0  /* No special flags */
48 #define FINAL     1  /* At least one of these must be in each input file */
49 #define ERROR     2  /* Must be none of these */
50 #define UNIQUE    4  /* The %s and %c parts must be unique over all inputs */
51 #define COUNT     8  /* The output only states how many lines matched */
52 #define CONTINUE 16  /* Try to match later formats too */
53 #define NUMERIC  32  /* Use numerical comparison (see numstrcmp() below) */
54 #define SILENT   64  /* Don't report, just check */
55 #define ENDFILE 128  /* Usually appears at end of output */
56 
57 /* The formats are tried against each input line one at a time, and the
58    first one that matches is accepted.  The entire line must match.
59    If the CONTINUE flag is present, the input line is also matched
60    against further formats.
61 
62    Except in the case of formats with the COUNT flag, each format that
63    matches any lines produces output giving the total value of each of the
64    integers %d or real numbers %f in the lines which match.  If there are
65    any %s or %c controls in the format, the output is given separately for
66    each value of the matching strings which appear in the input lines.
67    %m is like %d but allows arbitrarily large integers.
68    %x is like %d but the maximum rather than the sum is accumulated.
69    %p is like %d but the value is accumulated modulo some base.
70 
71    In the case of the COUNT flag, the program only reports the number of
72    input lines which matched the format.
73 
74    If a format has the UNIQUE flag, no two input lines may match with the
75    same values of the %s and %c controls present.  Otherwise a warning
76    message is written for each duplicate match.
77 
78    The sequence P=# where # is an integer value defined the base for the
79    %p directive.  There can be no spaces in the sequence "P=#".  The
80    default base is 2.
81 
82    %d  - matches an integer (small enough for 64 bits)
83    %x  - same as %d but accumulates maximum rather than the sum
84    %p  - same as %d but accumulates the value modulo a base
85    %m  - matches a integer of unbounded size (if GMP!=0)
86    %f  - matches a real number of the form ddddd.ddd or -ddddd.ddd
87    %v  - same as %f but reports the average rather than the sum
88    %sx - matches a string, where 'x' is any character.
89 	 If 'x' is not a space, match zero or more characters from the
90              current position up but not including the first 'x'.
91          If 'x' is a space, match one or more characters from the current
92              position up to and including the first non-space character
93              which is followed by a space.
94    %c  - matches a non-white character
95    %%  - matches the character '%'
96    %   - (with a space following the '%') matches zero or more spaces or
97             tabs, as many as appear in the input.  In the output, this
98             sequence appears as one space.
99    %   - (appearing exactly at the end of the format) matches zero or
100          more spaces at the end of the line.  In the output, nothing.
101    %*d, %*m, %*x, %*p, %*f, %*sx, %*c - these are similar to the versions
102          without the '*' except that the value is ignored (not used for
103          summing, and not used to divide the output).  In the output,
104          this field appears as a single character '*'.
105    %#  - matches an unsigned integer.  For each format containing this
106          control, a report is made of any breaks or duplicates in the
107          sequence of matching numbers.  (So this is useful for checking
108          a sequence of case numbers.)  At most one %# may appear in each
109          format.
110 
111   At least one FINAL format must match in each file or a warning is given
112   (unless -w is used, in which case no warning is given).
113 
114   A format marked ENDFILE will cause sumlines to act as if it started
115   reading from a new input file.  This can have some effects on the
116   order of output lines.
117 */
118 
119 #define HAS(i,flgs)  ((format[i].flags&(flgs)) != 0)
120 
121 #include <stdio.h>
122 #include <ctype.h>
123 #include <string.h>
124 #include <pwd.h>
125 #include <stdlib.h>
126 #include <glob.h>
127 
128 #if GMP
129 #include <gmp.h>
130 #endif
131 
132 #if defined(__alpha)
133 typedef long integer;
134 #define DOUT "%ld"       /* Formats used to output %d/%x/%p,%f,%v quantities */
135 #define FOUT "%.2f"
136 #define VOUT "%.4f"
137 #elif defined(__sun) || defined(__GNUC__) || (__STDC_VERSION__ > 199900L)
138 typedef long long integer;
139 #define DOUT "%lld"
140 #define FOUT "%.2f"
141 #define VOUT "%.4f"
142 #else
143 typedef long long integer;
144 #define DOUT "%Ld"
145 #define FOUT "%.2f"
146 #define VOUT "%.4f"
147 #endif
148 
149 static integer maxint;   /* set by find_maxint */
150 
151 #define INCR(x,inc) \
152      {if ((x) > 0 && maxint-(x) < (inc) || (x) < 0 && (maxint)+(x) < -(inc)) \
153            {fprintf(stderr,">E overflow with %d or %p format\n"); exit(1);} \
154       x += (inc);}    /*  x += inc   with safety check */
155 
156 typedef int boolean;
157 #define FALSE 0
158 #define TRUE  1
159 
160 typedef union
161 {
162     double f;
163     integer d;
164 #if GMP
165     mpz_t *m;
166 #endif
167 } number;
168 
169 #define D 0    /* Code for "integer" */
170 #define F 1    /* Code for "real" */
171 #define M 2    /* Code for "multiprecision integer" */
172 #define X 3    /* Code for "integer, take maximum" */
173 #define V 4    /* Code for "real, take average" */
174 #define P 5    /* Code for "integer, modulo some base" */
175 
176 #define MAXLINELEN 100000   /* Maximum input line size
177 		 	      (longer lines are broken in bits) */
178 #define MAXVALUES 16  /* Maximum  total number of
179                              %d,%x,%p,%m,%v or %f items in a format */
180 
181 #define MAXFORMATS 1000
182 
183 static struct fmt_st
184 {
185    integer pmod;
186    int flags;
187    char *fmt;
188 } format[MAXFORMATS];
189 
190 typedef struct countrec
191 {
192     struct countrec *left,*right,*parent;
193     char *fmt;
194     unsigned long count;
195     number total[MAXVALUES];
196 } countnode;
197 
198 static countnode *count_root[MAXFORMATS];
199 static unsigned long matching_lines[MAXFORMATS];
200 static integer total_position[MAXFORMATS];
201 static integer lastseq[MAXFORMATS];
202 
203 #if GMP
204 static mpz_t mp_value[MAXVALUES];
205 #endif
206 
207 #define A 0
208 #define L 1
209 #define R 2
210 #define LL 3
211 #define LR 4
212 #define RL 5
213 #define RR 6
214 
215 #ifndef GLOB_BRACE       /* Allow {} processing -- Linux extension */
216 #define GLOB_BRACE 0
217 #endif
218 
219 #ifndef GLOB_TILDE      /* Allow ~  processing -- Linux extension */
220 #define GLOB_TILDE 0
221 #endif
222 
223 #ifndef GLOB_NOMATCH
224 #define GLOB_NOMATCH 0  /* Some versions don't a special return for this */
225 #endif
226 
227 #define GLOB_FLAGS (GLOB_ERR|GLOB_NOSORT|GLOB_BRACE|GLOB_TILDE)
228 
229 #define HELP if (argc > 1 && (strcmp(argv[1],"-help")==0 \
230                            || strcmp(argv[1],"--help")==0)) \
231        { printf("\nUsage: %s\n\n%s",USAGE,HELPTEXT); return 0;}
232 
233 /****************************************************************************/
234 
235 static int
numstrcmp(char * s1,char * s2)236 numstrcmp(char *s1, char *s2)
237 /* Same behaviour as strcmp(), except that when an unsigned integer is
238    found in each string, the numerical values are compared instead of
239    the ascii values.   Overflow is impossible.  Leading spaces before
240    numbers are considered part of the numbers.  A number in one string
241    is considered less than a non-number in the other string. */
242 {
243 	int c1,c2;
244 	char *a1,*a2;
245 
246 	while (1)
247 	{
248 	    for (a1 = s1; *a1 == ' '; ++a1) {}
249 	    if (isdigit(*a1))
250 	    {
251 		for (s1 = a1+1; isdigit(*s1); ++s1) {}
252 	    }
253 	    else
254 	    {
255 		a1 = s1;
256 		++s1;
257 	    }
258 
259 	    for (a2 = s2; *a2 == ' '; ++a2) {}
260             if (isdigit(*a2))
261             {
262                 for (s2 = a2+1; isdigit(*s2); ++s2) {}
263             }
264             else
265             {
266                 a2 = s2;
267                 ++s2;
268             }
269 
270 	    if (!isdigit(*a1))
271 	    {
272 		if (!isdigit(*a2))
273 		{
274 		    if (*a1 < *a2) return -1;
275 		    if (*a1 > *a2) return 1;
276 		    if (*a1 == '\0') return 0;
277 		}
278 		else
279 		    return 1;
280 	    }
281 	    else
282 	    {
283 		if (!isdigit(*a2))
284 		    return -1;
285 		else
286 		{
287 		    for (; *a1 == '0'; ++a1) {}
288 		    for (; *a2 == '0'; ++a2) {}
289 
290 		    if (s1-a1 < s2-a2) return -1;
291 		    if (s1-a1 > s2-a2) return 1;
292 		    for (; a1 < s1 && *a1 == *a2; ++a1, ++a2) {}
293 		    if (a1 < s1)
294 		    {
295 			if (*a1 < *a2) return -1;
296 			else           return 1;
297 		    }
298 		}
299 	    }
300 	}
301 }
302 
303 /****************************************************************************/
304 
305 static void
writeline(char * outf,number * val,unsigned long count)306 writeline(char *outf, number *val, unsigned long count)
307 /* Write an output line with the given format and values */
308 {
309 	int n;
310 
311 	n = 0;
312 
313 	for (; *outf != '\0'; ++outf)
314 	{
315 	    if (*outf == '%')
316 	    {
317 		++outf;
318 		if (*outf == '%' || *outf == '#')
319 		    putchar(*outf);
320 		else if (*outf == 'd' || *outf == 'x' || *outf == 'p')
321 		    printf(DOUT,val[n++].d);
322 		else if (*outf == 'f')
323 		    printf(FOUT,val[n++].f);
324 		else if (*outf == 'v')
325 		    printf(VOUT,val[n++].f/count);
326 #if GMP
327 		else if (*outf == 'm')
328 		    mpz_out_str(NULL,10,*(val[n++].m));
329 #endif
330 		else
331 		{
332 		    fprintf(stderr,">E unknown output format %%%c\n",*outf);
333 		    exit(2);
334 		}
335 	    }
336 	    else
337 		putchar(*outf);
338 	}
339 }
340 
341 /*********************************************************************/
342 
343 static void
print_counts(countnode * root,boolean printcounts)344 print_counts(countnode *root, boolean printcounts)
345 /* Use a non-recursive inorder traversal to print the tree */
346 {
347     int code;
348     countnode *p;
349 
350     p = root;
351     code = A;
352 
353     while (p)
354     {
355 	switch (code)    /* deliberate flow-ons */
356 	{
357 	 case A:
358 	    if (p->left)
359 	    {
360 		p = p->left;
361 		break;
362 	    }
363 	 case L:
364 	    if (printcounts) printf("%5lu: ",p->count);
365 	    writeline(p->fmt,p->total,p->count);
366 	    if (p->right)
367 	    {
368 		p = p->right;
369 		code = A;
370 		break;
371 	    }
372 	 case R:
373 	    if (p->parent && p->parent->left == p) code = L;
374 	    else                                   code = R;
375 	    p = p->parent;
376 	    break;
377 	}
378     }
379 }
380 
381 /*********************************************************************/
382 
383 static void
print_common(countnode * root)384 print_common(countnode *root)
385 /* Print the common ends of the formats in the tree */
386 {
387     int code;
388     countnode *p;
389     char *s0,*s1,*t0,*t1;
390     int i,comm0,comm1,minlen,maxlen;
391 
392     if (root == NULL) return;
393 
394     p = root;
395     code = A;
396 
397     s0 = s1 = p->fmt;
398     while (*s1 != '\0') ++s1;
399     comm0 = comm1 = minlen = maxlen = s1-s0;
400 
401     while (p)
402     {
403         switch (code)    /* deliberate flow-ons */
404         {
405          case A:
406             if (p->left)
407             {
408                 p = p->left;
409                 break;
410             }
411          case L:
412 	    t0 = t1 = p->fmt;
413 	    for (i = 0; i < comm0; ++i)
414 		if (s0[i] != t0[i]) break;
415 	    comm0 = i;
416 
417 	    while (*t1 != '\0') ++t1;
418 	    for (i = 1; i <= comm1; ++i)
419 		if (s1[-i] != t1[-i]) break;
420 	    comm1 = i-1;
421 	    if (t1-t0 < minlen) minlen = t1-t0;
422 	    if (t1-t0 > maxlen) maxlen = t1-t0;
423 
424             if (p->right)
425             {
426                 p = p->right;
427                 code = A;
428                 break;
429             }
430          case R:
431             if (p->parent && p->parent->left == p) code = L;
432             else                                   code = R;
433             p = p->parent;
434             break;
435         }
436     }
437 
438     if (comm0 + comm1 > minlen) comm1 = minlen - comm0;
439 
440     for (i = 0; i < comm0; ++i)
441 	printf("%c",s0[i]);
442     if (comm0 + comm1 < maxlen) printf("*");
443     for (i = comm1; i > 0; --i)
444 	printf("%c",s1[-i]);
445 }
446 
447 /*********************************************************************/
448 
449 static void
splay(countnode * p)450 splay(countnode *p)
451 /* Splay the node p.  It becomes the new root. */
452 {
453     countnode *q,*r,*s;
454     countnode *a,*b,*c,*d;
455     int code;
456 
457 #define LCHILD(x,y) {(x)->left = y; if (y) (y)->parent = x;}
458 #define RCHILD(x,y) {(x)->right = y; if (y) (y)->parent = x;}
459 
460     while (p->parent)
461     {
462 	a = p->left;
463 	b = p->right;
464 	q = p->parent;
465 	if (q->left == p)
466 	{
467 	    code = L;
468 	    c = q->right;
469 	}
470 	else
471 	{
472 	    code = R;
473 	    c = q->left;
474 	}
475 	r = q->parent;
476 	if (r)
477 	{
478 	    if (r->left == q) code = (code == L ? LL : LR);
479 	    else              code = (code == L ? RL : RR);
480 	    s = r->parent;
481 	    p->parent = s;
482 	    if (s)
483 	    {
484 		if (s->left == r) s->left = p;
485 		else              s->right = p;
486 	    }
487 	}
488 	else
489 	{
490 	    p->parent = NULL;
491 	}
492 
493 	switch (code)
494 	{
495 	 case L:
496 	    RCHILD(p,q); LCHILD(q,b); break;
497 	 case R:
498 	    LCHILD(p,q); RCHILD(q,a); break;
499 	 case LL:
500 	    RCHILD(p,q); RCHILD(q,r); LCHILD(q,b); LCHILD(r,c); break;
501 	 case RR:
502 	    LCHILD(p,q); LCHILD(q,r); RCHILD(r,c); RCHILD(q,a); break;
503 	 case LR:
504 	    LCHILD(p,q); RCHILD(p,r); RCHILD(q,a); LCHILD(r,b); break;
505 	 case RL:
506 	    LCHILD(p,r); RCHILD(p,q); RCHILD(r,a); LCHILD(q,b); break;
507 	}
508     }
509 }
510 
511 /*********************************************************************/
512 
513 static void
add_one(countnode ** to_root,char * fmt,integer pmod,int nval,number * val,int * valtype,int which,boolean numcompare)514 add_one(countnode **to_root, char *fmt, integer pmod, int nval,
515         number *val, int *valtype, int which, boolean numcompare)
516 /* Add one match to the node with the given format, creating it if it is new.
517    The tree is then splayed to ensure good efficiency. */
518 {
519     int i,cmp,len;
520     countnode *p,*ppar,*new_node;
521     integer w;
522 
523     p = *to_root;
524     cmp = 0;
525 
526     while (p != NULL)
527     {
528         cmp = (numcompare ? numstrcmp(fmt,p->fmt) : strcmp(fmt,p->fmt));
529         if (cmp == 0)
530         {
531 	    if (HAS(which,UNIQUE) && p->count == 1)
532 		printf("ERROR: Multiple matches for %s",fmt);
533             for (i = 0; i < nval; ++i)
534 		if (valtype[i] == D)
535 		    {INCR(p->total[i].d,val[i].d);}
536 		else if (valtype[i] == X)
537 		    {if (val[i].d > p->total[i].d) p->total[i].d = val[i].d;}
538 		else if (valtype[i] == P)
539 		    {w = val[i].d % pmod; INCR(p->total[i].d,w);
540 		      p->total[i].d %= pmod;}
541 #if GMP
542 		else if (valtype[i] == M)
543 		    mpz_add(*(p->total[i].m),*(p->total[i].m),*(val[i].m));
544 #endif
545 		else
546                     p->total[i].f += val[i].f;   /* F and V */
547 	    ++p->count;
548 	    splay(p);
549 	    *to_root = p;
550             return;
551         }
552         else if (cmp < 0)
553         {
554             ppar = p;
555             p = p->left;
556         }
557         else
558         {
559             ppar = p;
560             p = p->right;
561         }
562     }
563 
564     if ((new_node = (countnode*)malloc(sizeof(countnode))) == NULL)
565     {
566         fprintf(stderr,">E malloc failed in add_one()\n");
567         exit(1);
568     }
569 
570     if ((new_node->fmt = (char*)malloc(strlen(fmt)+1)) == NULL)
571     {
572         fprintf(stderr,">E malloc failed in add_one()\n");
573         exit(1);
574     }
575 
576     new_node->count = 1;
577     strcpy(new_node->fmt,fmt);
578     for (i = 0; i < nval; ++i)
579     {
580 #if GMP
581 	if (valtype[i] == M)
582 	{
583 	    if ((new_node->total[i].m
584 				= (mpz_t*)malloc(sizeof(mpz_t))) == NULL)
585 	    {
586 		fprintf(stderr,"Malloc failed\n");
587 		exit(1);
588 	    }
589 	    mpz_init_set(*(new_node->total[i].m),*(val[i].m));
590 	}
591 	else
592 #endif
593 	    new_node->total[i] = val[i];
594     }
595 
596     new_node->left = new_node->right = NULL;
597 
598     if (cmp == 0)
599     {
600         *to_root = new_node;
601 	new_node->parent = NULL;
602     }
603     else if (cmp < 0)
604     {
605         ppar->left = new_node;
606 	new_node->parent = ppar;
607     }
608     else
609     {
610         ppar->right = new_node;
611 	new_node->parent = ppar;
612     }
613 
614     splay(new_node);
615     *to_root = new_node;
616 }
617 
618 /****************************************************************************/
619 
620 static int
scanline(char * s,char * f,number * val,int * valtype,integer * seqno,char * outf)621 scanline(char *s, char *f, number *val, int *valtype,
622          integer *seqno, char *outf)
623 /* Perform sscanf-like scan of line.
624    The whole format must match.  outf is set to be an output format
625    with unassigned values replaced by '*' and %s replaced by what
626    it matches.  Assigned values except %s are put into val[] with
627    their types in valtype[].  The number of values (not counting %#)
628    is returned.
629    Integers matching %# are put into *seqno, with an error if there
630    are more than one, and -1 if there are none.
631    If the format doesn't match, -1 is returned.
632    WARNING: the gmp values are pointers to static data, so they
633    need to be copied if the values array is copied.
634    See the comments at the start of the program for more information.
635 */
636 {
637 	int n;  		 /* Number of values assigned */
638 	int fracdigits,digit;
639 	boolean doass,neednonsp,neg,oflow,badgmp;
640 	integer ival;
641 	double dval,digval;
642 	char ends;
643 	static boolean gmp_warning = FALSE;
644 #if GMP
645 	char mp_line[MAXLINELEN+1],*mp;
646 #endif
647 
648 	n = 0;
649 	*seqno = -1;
650 	badgmp = oflow = FALSE;
651 
652 	while (*f != '\0')
653 	{
654 	    if (*f == '%')
655 	    {
656 		++f;
657 		if (*f == '*')
658 		{
659 		    doass = FALSE;
660 		    ++f;
661 		}
662 		else
663 		    doass = TRUE;
664 
665 		if (*f == '%')
666 		{
667 		    if (!doass)
668 		    {
669 			fprintf(stderr,"Bad format item %%*\n");
670 			exit(1);
671 		    }
672 		    if (*s++ != '%') return -1;
673 		    ++f;
674 		    *outf++ = '%';
675 		    *outf++ = '%';
676 		}
677 		else if (*f == '\n')
678 		{
679                     if (!doass)
680                     {
681                         fprintf(stderr,"Bad format item %%*\n");
682                         exit(1);
683                     }
684 		    while (*s != '\0')
685 		    {
686 			if (*s != ' ' && *s != '\n') return -1;
687 			++s;
688 		    }
689 		    --s;
690 		}
691 	        else if (*f == 'c')
692                 {
693 		    if (*s == ' ' || *s == '\t' || *s == '\n') return -1;
694                     if (doass) *outf++ = *s;
695 		    else       *outf++ = '*';
696                     ++f;
697                     ++s;
698                 }
699 		else if (*f == 's')
700 		{
701 		    ends = *(f+1);
702 		    if (ends == ' ')
703 		    {
704 			while (*s == ' ' || *s == '\t')
705 			{
706 			    if (doass) *outf++ = *s;
707                             ++s;
708                         }
709 		    }
710 		    while (*s != '\n' && *s != ends)
711 		    {
712 			if (doass) *outf++ = *s;
713 			++s;
714                     }
715 		    if (!doass) *outf++ = '*';
716 		    ++f;
717 		}
718 #if GMP
719 		else if (*f == 'd' || *f == 'x' || *f == 'p')
720 		{
721 #else
722 		else if (*f == 'd' || *f == 'x' || *f == 'p' || *f == 'm')
723 		{
724 		    if (*f == 'm' && !gmp_warning)
725 		    {
726 			fprintf(stderr,
727                          ">W not compiled with GMP, treating %%m like %%d\n");
728 			gmp_warning = TRUE;
729 		    }
730 #endif
731 		    while (*s == ' ' || *s == '\t') ++s;
732 		    if (!isdigit(*s) && *s != '-' && *s != '+') return -1;
733 		    neg = (*s == '-');
734 		    if (*s == '-' || *s == '+') ++s;
735 		    ival = 0;
736 		    while (isdigit(*s))
737 		    {
738 			digit =  *s++ - '0';
739 			if (ival > (maxint-digit)/10)
740 			    oflow = TRUE;
741 			else
742 			    ival = ival*10 + digit;
743 		    }
744 		    if (doass)
745 		    {
746 			*outf++ = '%';
747 			if (*f == 'd' || *f == 'm')
748 			{
749 			    *outf++ = 'd';
750 			    valtype[n] = D;
751 			}
752 			else if (*f == 'x')
753 			{
754 			    *outf++ = 'x';
755 			    valtype[n] = X;
756 			}
757 			else
758 			{
759 			    *outf++ = 'p';
760 			    valtype[n] = P;
761 			}
762 			val[n++].d = (neg ? -ival : ival);
763 		    }
764 		    else
765 			*outf++ = '*';
766 		    ++f;
767 		}
768 #if GMP
769 		else if (*f == 'm')
770 		{
771 		    while (*s == ' ' || *s == '\t') ++s;
772 		    if (!isdigit(*s) && *s != '-' && *s != '+') return -1;
773 		    mp = mp_line;
774 		    if      (*s == '-') *mp++ = *s++;
775 		    else if (*s == '+') s++;
776 		    while (isdigit(*s)) *mp++ = *s++;
777 		    *mp = '\0';
778 		    if (doass)
779 		    {
780 			valtype[n] = M;
781 			val[n].m = &mp_value[n];
782 			if (mpz_set_str(mp_value[n],mp_line,10) < 0)
783 			    badgmp = TRUE;
784 			++n;
785 			*outf++ = '%';
786                         *outf++ = 'm';
787                     }
788                     else
789                         *outf++ = '*';
790                     ++f;
791 		}
792 #endif
793                 else if (*f == '#')
794                 {
795                     while (*s == ' ' || *s == '\t') ++s;
796                     if (!isdigit(*s)) return -1;
797                     ival = 0;
798                     while (isdigit(*s))
799                     {
800                         digit =  *s++ - '0';
801                         if (ival > (maxint-digit)/10)
802                             oflow = TRUE;
803 			else
804                             ival = ival*10 + digit;
805                     }
806 		    if (*seqno >= 0)
807 		    {
808 			fprintf(stderr,
809 			        ">E %# can only be used once per format\n");
810 			exit(1);
811 		    }
812                     *seqno = ival;
813                     *outf++ = '#';
814                     ++f;
815                 }
816 		else if (*f == 'f' || *f == 'v')
817 		{
818 		    while (*s == ' ' || *s == '\t') ++s;
819 
820 		    if (!isdigit(*s) && *s != '.' && *s != '-' && *s != '+')
821 			return -1;
822                     neg = (*s == '-');
823                     if (*s == '-' || *s == '+') ++s;
824 		    dval = 0.0;
825 		    while (isdigit(*s)) dval = dval*10.0 + (*s++ - '0');
826 		    if (*s == '.')
827 		    {
828 			digval = 1.0;
829 			++s;
830 			while (isdigit(*s))
831 			{
832 			    digval /= 10.0;
833 			    dval += (*s++ - '0') * digval;
834 			}
835 		    }
836 		    if (doass)
837                     {
838 			valtype[n] = (*f == 'f' ? F : V);
839                         val[n++].f = (neg ? -dval : dval);
840                         *outf++ = '%';
841                         *outf++ = *f;
842                     }
843                     else
844                         *outf++ = '*';
845 		    ++f;
846 		}
847 		else if (*f == ' ')
848 		{
849 		    while (*s == ' ' || *s == '\t') ++s;
850 		    *outf++ = ' ';
851 		    ++f;
852 		}
853 		else
854 		{
855 		    fprintf(stderr,"Bad format item %%%c\n",*f);
856 		    exit(1);
857 		}
858 	    }
859 	    else
860 	    {
861 		if (*s != *f) return -1;
862 		*outf++ = *f;
863 		++s;
864 		++f;
865 	    }
866 	}
867 
868 	if (*s != '\0') return -1;
869 
870 	*outf = '\0';
871 
872 	if (oflow)
873 	{
874 	    fprintf(stderr,"Integer too large\n");
875 	    exit(1);
876 	}
877 	if (badgmp)
878 	{
879 	    fprintf(stderr,"Illegal multiprecision integer\n");
880 	    exit(1);
881 	}
882 
883 	return n;
884 }
885 
886 /****************************************************************************/
887 
888 find_maxint(void)
889 {
890 /* Put the maximum possible integer value into maxint. */
891 	integer x;
892 
893 	x = 1;
894 	while (x > 0) x <<= 1;
895 	x -= 1;
896 
897 	if (x <= 0)
898 	{
899 	    fprintf(stderr,">E find_maxint() failed\n");
900 	    exit(1);
901 	}
902 
903 	maxint = x;
904 }
905 
906 /****************************************************************************/
907 
908 static void
909 sort_formats(int *order, int numformats)
910 /* Make order[0..numformats-1] a permutation of 0..numformats-1 being
911    a good order to display the results. */
912 {
913 	double score[MAXFORMATS];
914 	int h,i,j,iw;
915 
916 	for (i = 0; i < numformats; ++i)
917 	{
918 	    if (matching_lines[i] == 0)
919 		score[i] = -1.0;
920 	    else
921 		score[i] = i +
922                    ((10.0*total_position[i]) / matching_lines[i]) * numformats;
923 	    order[i] = i;
924 	}
925 
926         j = numformats / 3;
927         h = 1;
928         do
929             h = 3 * h + 1;
930         while (h < j);
931 
932         do
933         {
934             for (i = h; i < numformats; ++i)
935             {
936                 iw = order[i];
937                 for (j = i; score[order[j-h]] > score[iw]; )
938                 {
939                     order[j] = order[j-h];
940                     if ((j -= h) < h) break;
941                 }
942                 order[j] = iw;
943             }
944             h /= 3;
945         }
946         while (h > 0);
947 }
948 
949 /****************************************************************************/
950 
951 static void
952 read_formats(char *filename, int *numformatsp, boolean mustexist)
953 /* Read formats from the given file. */
954 {
955 	FILE *f;
956 	int i,c,flags;
957 	char flagname[52];
958 	char line[MAXLINELEN+3];
959         integer pmod;
960         char *s;
961 	boolean oflow,badpmod;
962 	int digit;
963 
964 	if (strcmp(filename,"-") == 0)
965 	    f = stdin;
966 	else if ((f = fopen(filename,"r")) == NULL)
967 	{
968 	    if (mustexist)
969 	    {
970 		fprintf(stderr,">E Can't open %s for reading.\n",filename);
971 		exit(1);
972 	    }
973 	    return;
974 	}
975 
976 	flagname[51] = line[MAXLINELEN+2] = '\0';
977 
978 	for (;;)
979 	{
980 	    if ((c = getc(f)) == EOF) break;
981 
982 	    while (c == ' ' || c == '\t') c = getc(f);
983 	    if (c == '\n') continue;
984 	    if (c == EOF) break;
985 
986 	    if (c == '#')
987 	    {
988 		while (c != '\n' && c != EOF) c = getc(f);
989 		continue;
990 	    }
991 
992 	    ungetc(c,f);
993 
994 	    flags = 0;
995             pmod = 2;
996 	    for (;;)
997 	    {
998 		while ((c = getc(f)) == ' '
999 			            || c == '|' || c == ',' || c == '\t') {}
1000 		if (c == '#')
1001 		    while (c != '\n' && c != EOF) c = getc(f);
1002 		if (c == '\n' || c == EOF) break;
1003 
1004 		ungetc(c,f);
1005 		fscanf(f,"%50[A-Za-z0-9=]",flagname);
1006 		if      (strcmp(flagname,"DEFAULT") == 0)  {}
1007 		else if (strcmp(flagname,"FINAL") == 0)    flags |= FINAL;
1008 		else if (strcmp(flagname,"ERROR") == 0)    flags |= ERROR;
1009 		else if (strcmp(flagname,"UNIQUE") == 0)   flags |= UNIQUE;
1010 		else if (strcmp(flagname,"COUNT") == 0)    flags |= COUNT;
1011 		else if (strcmp(flagname,"CONTINUE") == 0) flags |= CONTINUE;
1012 		else if (strcmp(flagname,"NUMERIC") == 0)  flags |= NUMERIC;
1013 		else if (strcmp(flagname,"SILENT") == 0)   flags |= SILENT;
1014 		else if (strcmp(flagname,"ENDFILE") == 0)  flags |= ENDFILE;
1015 		else if (flagname[0] == 'P' && flagname[1] == '=')
1016 		{
1017 		    pmod = 0;
1018 		    oflow = FALSE;
1019 		    badpmod = (flagname[2] == '\0');
1020 		    for (s = flagname+2; *s != '\0'; ++s)
1021 		    {
1022 		        if (isdigit(*s))
1023 			{
1024 			    digit =  *s - '0';
1025 			    if (pmod > (maxint-digit)/10)
1026 				oflow = TRUE;
1027 			    else
1028 				pmod = pmod*10 + digit;
1029 			}
1030 			else
1031 			    badpmod = TRUE;
1032 		    }
1033 		    if (badpmod)
1034 		    {
1035 			fprintf(stderr,">E Bad value for P= directive: %s\n",
1036 				flagname+2);
1037 			exit(1);
1038 		    }
1039 		    else if (oflow)
1040 		    {
1041                         fprintf(stderr,">E Value for P= is too large\n");
1042                         exit(1);
1043                     }
1044 
1045 		}
1046 		else
1047 		{
1048 		    fprintf(stderr,">E Unknown flag %s in %s\n",
1049 			           flagname,filename);
1050 		    exit(1);
1051 		}
1052 	    }
1053 
1054 	    if (fgets(line,MAXLINELEN,f) == NULL)
1055 	    {
1056 		fprintf(stderr,">E Missing format in %s\n",filename);
1057                 exit(1);
1058             }
1059 
1060 	    for (i = 0; i < *numformatsp; ++i)
1061 		if (strcmp(line,format[i].fmt) == 0) break;
1062 	    if (i < *numformatsp) continue;
1063 
1064 	    if (*numformatsp == MAXFORMATS)
1065 	    {
1066 		fprintf(stderr,">E Increase MAXFORMATS\n");
1067 		exit(1);
1068 	    }
1069 
1070 	    format[*numformatsp].flags = flags;
1071 	    format[*numformatsp].pmod = pmod;
1072 	    if ((format[*numformatsp].fmt
1073 				= (char*)malloc(strlen(line)+1)) == NULL)
1074 	    {
1075 		fprintf(stderr,">E malloc() failed in read_formats()\n");
1076                 exit(1);
1077             }
1078 	    strcpy(format[*numformatsp].fmt,line);
1079 	    ++*numformatsp;
1080 	}
1081 
1082 	if (f != stdin) fclose(f);
1083 }
1084 
1085 /****************************************************************************/
1086 
1087 static void
1088 read_local_formats(int *numformatsp)
1089 /* Read formats from sumlines.fmt in current directory */
1090 {
1091 	read_formats("sumlines.fmt",numformatsp,FALSE);
1092 }
1093 
1094 /****************************************************************************/
1095 
1096 static void
1097 read_global_formats(int *numformatsp)
1098 /* Read formats from sumlines.fmt in home directory */
1099 {
1100 	struct passwd *pwd;
1101 	char filename[257+12];
1102 
1103 	if ((pwd = getpwuid(getuid())) < 0)
1104 	{
1105 	    fprintf(stderr,">E Can't find home directory\n");
1106 	    exit(1);
1107 	}
1108 
1109 	sprintf(filename,"%s/sumlines.fmt",pwd->pw_dir);
1110 
1111         read_formats(filename,numformatsp,FALSE);
1112 }
1113 
1114 /****************************************************************************/
1115 
1116 static void
1117 read_env_formats(int *numformatsp)
1118 /* Read formats from $SUMLINES.FMT if it exists */
1119 {
1120 	char *filename;
1121 
1122 	if ((filename = getenv("SUMLINES.FMT")) != 0)
1123             read_formats(filename,numformatsp,FALSE);
1124 }
1125 
1126 /****************************************************************************/
1127 
1128 static boolean
1129 readline(FILE *f, char *line, int size, int *nulls)
1130 /* Get a line.  Read at most size-1 chars until EOF or \n.
1131    If \n is read, it is stored.  Then \0 is appended.
1132    *nulls is set to the number of NUL chars (which are also stored). */
1133 {
1134 	int i,c;
1135 
1136 	*nulls = 0;
1137 	for (i = 0; i < size-1; ++i)
1138 	{
1139 	    c = getc(f);
1140 	    if (c == EOF) break;
1141 	    line[i] = c;
1142 	    if (c == '\0') ++*nulls;
1143 	    if (c == '\n') {++i; break;}
1144 	}
1145 	line[i] = '\0';
1146 
1147 	return i > 0;
1148 }
1149 
1150 /****************************************************************************/
1151 
1152 static int
1153 pnumstrcmp(const void *a, const void *b)
1154 /* numstrcmp on strings pointed at by a and b */
1155 {
1156 	return numstrcmp(*(char**)a,*(char**)b);
1157 }
1158 
1159 /****************************************************************************/
1160 
1161 static void
1162 doglob(char *patt, glob_t *globlk)
1163 /* Find all files matching the given pattern, numeric sorting.
1164    Give a warning message if there are none. */
1165 {
1166 	int ret;
1167 
1168 	ret = glob(patt,GLOB_FLAGS,NULL,globlk);
1169 
1170 	if (ret != 0) globlk->gl_pathc = 0;
1171 
1172 	if (ret == GLOB_NOSPACE)
1173 	{
1174 	    fprintf(stderr,"ERROR: ran out of space during glob()\n");
1175 	    exit(1);
1176 	}
1177         if (ret == GLOB_ERR)
1178         {
1179             fprintf(stderr,"ERROR: during glob(%s)\n",patt);
1180             exit(1);
1181         }
1182 	if (ret != 0 && ret != GLOB_NOMATCH)
1183 	{
1184             fprintf(stderr,"ERROR: value %d from glob(%s)\n",ret,patt);
1185             exit(1);
1186         }
1187 
1188 
1189 	if (globlk->gl_pathc == 0) printf("WARNING: no files match %s\n",patt);
1190 
1191 	if (globlk->gl_pathc >= 2)
1192 	    qsort(globlk->gl_pathv,globlk->gl_pathc,sizeof(char*),pnumstrcmp);
1193 }
1194 
1195 /****************************************************************************/
1196 
1197 main(int argc, char *argv[])
1198 {
1199 	int i,j,nvals,argnum;
1200 	number val[MAXVALUES];
1201 	int valtype[MAXVALUES];
1202 	char line[MAXLINELEN+2];
1203 	char outf[MAXLINELEN+MAXVALUES+6];
1204 	unsigned long matched,unmatched,finalmatched;
1205 	unsigned long errorlines,totalerrorlines;
1206 	unsigned long line_number,nullcount,numfiles,ifile;
1207 	char *filename;
1208 	FILE *infile;
1209 	int numformats,firstarg,nulls;
1210 	boolean havefinal,nowarn,listformats,readfiles;
1211 	integer seq;
1212 	int order[MAXFORMATS];
1213 	glob_t globlk,globlk_stdin,*pglob;
1214 	char *glob_stdin_v[2];
1215 	boolean printcounts;
1216 
1217 	HELP;
1218 
1219 	find_maxint();
1220 
1221 	firstarg = 1;
1222 	numformats = 0;
1223 	nowarn = FALSE;
1224 	listformats = FALSE;
1225 	readfiles = TRUE;
1226 	printcounts = TRUE;
1227 
1228 	globlk_stdin.gl_pathc = 1;
1229 	globlk_stdin.gl_pathv = glob_stdin_v;
1230 	glob_stdin_v[0] = "-";
1231 	glob_stdin_v[1] = NULL;
1232 
1233 	for (; firstarg < argc; ++firstarg)
1234 	{
1235 	    if (argv[firstarg][0] == '-' && argv[firstarg][1] == 'f')
1236 	    {
1237 		if (argv[firstarg][2] != '\0')
1238 		    read_formats(&argv[firstarg][2],&numformats,TRUE);
1239 		else if (firstarg == argc - 1)
1240 		{
1241 		    fprintf(stderr,">E No argument for -f\n");
1242 		    exit(1);
1243 		}
1244 		else
1245 		{
1246 		    ++firstarg;
1247 		    read_formats(argv[firstarg],&numformats,TRUE);
1248 		}
1249 	    }
1250 	    else if (strcmp(argv[firstarg],"-w") == 0)
1251 		nowarn = TRUE;
1252 	    else if (strcmp(argv[firstarg],"-v") == 0)
1253 		listformats = TRUE;
1254 	    else if (strcmp(argv[firstarg],"-d") == 0)
1255 		readfiles = FALSE;
1256 	    else if (strcmp(argv[firstarg],"-n") == 0)
1257 		printcounts = FALSE;
1258 	    else
1259 	        break;
1260 	}
1261 
1262 #if GMP
1263 	for (i = 0; i < MAXVALUES; ++i) mpz_init(mp_value[i]);
1264 #endif
1265 
1266 	if (readfiles) read_local_formats(&numformats);
1267 	if (readfiles) read_env_formats(&numformats);
1268 	if (readfiles) read_global_formats(&numformats);
1269 
1270 	if (listformats)
1271 	{
1272 	    printf("%d formats:\n",numformats);
1273 	    for (i = 0; i < numformats; ++i)
1274 	        printf("%03x %s",format[i].flags,format[i].fmt);
1275 	}
1276 
1277 	if (numformats == 0)
1278 	{
1279 	    fprintf(stderr,">E No formats\n");
1280 	    exit(1);
1281 	}
1282 
1283 	havefinal = FALSE;
1284 	for (i = 0; i < numformats; ++i)
1285 	{
1286 	    count_root[i] = NULL;
1287 	    matching_lines[i] = 0;
1288 	    total_position[i] = 0;
1289 	    if (HAS(i,FINAL)) havefinal = TRUE;
1290 	}
1291 
1292 	unmatched = totalerrorlines = 0;
1293 	numfiles = 0;
1294 
1295 	for (argnum = firstarg;
1296              argnum < (argc == firstarg ? argc+1 : argc); ++argnum)
1297 	{
1298 	    if (argnum >= argc || strcmp(argv[argnum],"-") == 0)
1299 		pglob = &globlk_stdin;
1300 	    else
1301 	    {
1302 		pglob = &globlk;
1303 		doglob(argv[argnum],pglob);
1304 	    }
1305 
1306 	    for (ifile = 0; ifile < pglob->gl_pathc; ++ifile)
1307 	    {
1308 		matched = finalmatched = errorlines = 0;
1309 		++numfiles;
1310 
1311 	   	if (strcmp(pglob->gl_pathv[ifile],"-") == 0)
1312 		{
1313 		    filename = "stdin";
1314 		    infile = stdin;
1315 		}
1316 	        else
1317 	        {
1318 		    filename = pglob->gl_pathv[ifile];
1319 		    if ((infile = fopen(filename,"r")) == NULL)
1320 	            {
1321 		        fprintf(stderr,">E Can't open %s\n",filename);
1322 		        exit(1);
1323 		    }
1324 	        }
1325 
1326 	        line_number = 0;
1327 	        nullcount = 0;
1328 	        while (readline(infile,line,MAXLINELEN,&nulls))
1329 	        {
1330 		    nullcount += nulls;
1331 		    line[MAXLINELEN] = '\n';
1332 		    line[MAXLINELEN+1] = '\0';
1333   	            if (line[0] == '\n') continue;
1334 		    ++line_number;
1335 
1336 	            for (i = 0; i < numformats; ++i)
1337 	            {
1338 		        nvals
1339 			  = scanline(line,format[i].fmt,val,valtype,&seq,outf);
1340 		        if (nvals >= 0)
1341 		        {
1342 			    if (HAS(i,ENDFILE)) line_number = 0;
1343 			    ++matched;
1344 			    if (HAS(i,FINAL)) ++finalmatched;
1345 			    if (HAS(i,ERROR)) ++errorlines;
1346 			    ++matching_lines[i];
1347 			    total_position[i] += line_number;
1348 		            add_one(&count_root[i],outf,format[i].pmod,nvals,
1349                                  val,valtype,i,HAS(i,NUMERIC));
1350 			    if (matching_lines[i] > 1 && seq >= 0
1351 			                           && seq != lastseq[i]+1)
1352 			    {
1353 			        printf("WARNING: Sequence number");
1354 			        if (seq == lastseq[i])
1355 			        {
1356                                     printf(" ");
1357                                     printf(DOUT,seq);
1358                                     printf(" is repeated.\n");
1359                                 }
1360 			        else if (seq != lastseq[i]+2)
1361 			        {
1362 				    printf("s ");
1363 				    printf(DOUT,lastseq[i]+1);
1364 				    printf("-");
1365 				    printf(DOUT,seq-1);
1366 				    printf(" are missing.\n");
1367 			        }
1368 			        else
1369                                 {
1370                                     printf(" ");
1371                                     printf(DOUT,seq-1);
1372 			            printf(" is missing.\n");
1373                                 }
1374 			    }
1375 			    lastseq[i] = seq;
1376 		            if (!HAS(i,CONTINUE)) break;
1377 		        }
1378 	            }
1379 
1380 	            if (i == numformats) ++unmatched;
1381 	        }
1382 	        if (errorlines != 0)
1383 	            printf("ERRORS: Error lines in file %s\n",filename);
1384 	        else if (matched == 0 && !nowarn)
1385 	            printf("WARNING: No matching lines in file %s\n",filename);
1386 	        else if (finalmatched == 0 && havefinal && !nowarn)
1387 	            printf("WARNING: No final lines in file %s\n",filename);
1388 	        if (nullcount > 0)
1389 	 	    printf("WARNING: %ld NULs found in file %s\n",
1390 							    nullcount,filename);
1391 	        if (infile != stdin) fclose(infile);
1392 
1393 	        totalerrorlines += errorlines;
1394 	    }
1395 	    if (pglob == &globlk) globfree(pglob);
1396 	}
1397 
1398 	sort_formats(order,numformats);
1399 
1400 	for (j = 0; j < numformats; ++j)
1401 	{
1402 	    i = order[j];
1403 	    if (HAS(i,SILENT)) continue;
1404 
1405 	    if (HAS(i,COUNT))
1406 	    {
1407 		if (matching_lines[i] > 0)
1408 		    printf("%5lu lines matched ",matching_lines[i]);
1409 		print_common(count_root[i]);
1410 	    }
1411 	    else
1412 	        print_counts(count_root[i],printcounts);
1413 	}
1414 
1415 	if (unmatched > 0)
1416 	    printf("%5lu non-empty lines not matched\n",unmatched);
1417 	if (argc > firstarg) printf("%5lu files read altogether\n",numfiles);
1418 	if (totalerrorlines > 0) printf("%5lu errors found\n",totalerrorlines);
1419 
1420 	exit(0);
1421 }
1422