1 /* mkid.c -- build an identifer database
2    Copyright (C) 1986, 1995-1996, 1999, 2007-2012 Free Software Foundation,
3    Inc.
4    Written by Greg McGary <gkm@gnu.ai.mit.edu>
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #include <config.h>
21 #include <stdlib.h>
22 #include <assert.h>
23 #include <stdio.h>
24 #include <errno.h>
25 #include <getopt.h>
26 #include <unistd.h>
27 #include <string.h>
28 #include <sys/stat.h>
29 #include <limits.h>
30 
31 #include "alloca.h"
32 #include "argv-iter.h"
33 #include "closeout.h"
34 #include "dirname.h"
35 #include "error.h"
36 #include "inttostr.h"
37 #include "pathmax.h"
38 #include "progname.h"
39 #include "quote.h"
40 #include "quotearg.h"
41 #include "xalloc.h"
42 
43 #include "xnls.h"
44 #include "idfile.h"
45 #include "idu-hash.h"
46 #include "scanners.h"
47 #include "iduglobal.h"
48 
49 struct summary
50 {
51   struct token **sum_tokens;
52   unsigned char const *sum_hits;
53   struct summary *sum_parent;
54   union {
55     struct summary *u_kids[8];	/* when sum_level > 0 */
56 #define sum_kids sum_u.u_kids
57     struct member_file *u_files[8];	/* when sum_level == 0 */
58   } sum_u;
59   unsigned long sum_tokens_size;
60   unsigned long sum_hits_count;
61   int sum_free_index;
62   int sum_level;
63 };
64 
65 void usage (void);
66 static int ceil_log_8 (unsigned long n);
67 static int ceil_log_2 (unsigned long n);
68 static void assert_writeable (char const *file_name);
69 static void scan_files (struct idhead const *idhp);
70 static void scan_member_file (struct member_file const *member);
71 static void scan_member_file_1 (get_token_func_t get_token,
72 				void const *args, FILE *source_FILE);
73 static void report_statistics (void);
74 static void write_id_file (struct idhead *idhp);
75 static unsigned long token_hash_1 (void const *key);
76 static unsigned long token_hash_2 (void const *key);
77 static int token_hash_cmp (void const *x, void const *y);
78 static int token_qsort_cmp (void const *x, void const *y);
79 static void bump_current_hits_signature (void);
80 static void init_hits_signature (int i);
81 static void free_summary_tokens (void);
82 static void summarize (void);
83 static void init_summary (void);
84 static struct summary *make_sibling_summary (struct summary *summary);
85 static int count_vec_size (struct summary *summary,
86 			   unsigned char const *tail_hits);
87 static int count_buf_size (struct summary *summary,
88 			   unsigned char const *tail_hits);
89 static int check_hits (struct summary* summary) _GL_ATTRIBUTE_PURE;
90 static void write_hits (FILE *fp, struct summary *summary,
91 			unsigned char const *tail_hits);
92 static void sign_token (struct token *token);
93 static void add_token_to_summary (struct summary *summary, struct token *token);
94 
95 static struct hash_table token_table;
96 
97 /* Miscellaneous statistics */
98 static unsigned long input_chars;
99 static unsigned long name_tokens;
100 static unsigned long number_tokens;
101 static unsigned long string_tokens;
102 static unsigned long literal_tokens;
103 static unsigned long comment_tokens;
104 static unsigned long occurrences;
105 static unsigned long hits_length = 0;
106 static unsigned long tokens_length = 0;
107 static unsigned long output_length = 0;
108 
109 static int verbose_flag = 0;
110 static int statistics_flag = 0;
111 
112 static int levels = 0;			/* ceil(log(8)) of file_name_count */
113 
114 static unsigned char *current_hits_signature;
115 #define INIT_TOKENS_SIZE(level) (1 << ((level) + 13))
116 static struct summary *summary_root;
117 static struct summary *summary_leaf;
118 
119 static char *lang_map_file_name = 0;
120 static int show_version = 0;
121 static int show_help = 0;
122 struct idhead idh;
123 static struct file_link *cw_dlink;
124 
125 void usage (void) __attribute__((__noreturn__));
126 void
usage(void)127 usage (void)
128 {
129   fprintf (stderr, _("Try `%s --help' for more information.\n"),
130 	   program_name);
131   exit (EXIT_FAILURE);
132 }
133 
134 /* For long options that have no equivalent short option, use a
135    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
136 enum
137 {
138   FILES0_FROM_OPTION = CHAR_MAX +1,
139 };
140 
141 static struct option const long_options[] =
142 {
143   { "file", required_argument, 0, 'f' },
144   { "output", required_argument, 0, 'o' },
145   { "include", required_argument, 0, 'i' },
146   { "exclude", required_argument, 0, 'x' },
147   { "lang-option", required_argument, 0, 'l' },
148   { "lang-map", required_argument, 0, 'm' },
149   { "default-lang", required_argument, 0, 'd' },
150   { "prune", required_argument, 0, 'p' },
151   { "verbose", no_argument, 0, 'v' },
152   { "statistics", no_argument, 0, 's' },
153   { "help", no_argument, &show_help, 1 },
154   { "version", no_argument, &show_version, 1 },
155   { "files0-from", required_argument, NULL, FILES0_FROM_OPTION },
156   {NULL, 0, NULL, 0}
157 };
158 
159 static void __attribute__((__noreturn__))
help_me(void)160 help_me (void)
161 {
162   printf (_("\
163 Usage: %s [OPTION]... [FILE]...\n\
164 "), program_name);
165 
166   printf (_("\
167 Build an identifier database.\n\
168   -o, --output=OUTFILE    file name of ID database output\n\
169   -f, --file=OUTFILE      synonym for --output\n\
170   -i, --include=LANGS     include languages in LANGS (default: \"C C++ asm\")\n\
171   -x, --exclude=LANGS     exclude languages in LANGS\n\
172   -l, --lang-option=L:OPT pass OPT as a default for language L (see below)\n\
173   -m, --lang-map=MAPFILE  use MAPFILE to map file names onto source language\n\
174   -d, --default-lang=LANG  make LANG the default source language\n\
175   -p, --prune=NAMES       exclude the named files and/or directories\n\
176   -v, --verbose           report per file statistics\n\
177   -s, --statistics        report statistics at end of run\n\
178 \n\
179       --files0-from=F     tokenize only the files specified by\n\
180                            NUL-terminated names in file F\n\
181 \n\
182        --help              display this help and exit\n\
183       --version           output version information and exit\n\
184 \n\
185 FILE may be a file name, or a directory name to recursively search.\n\
186 If no FILE is given, the current directory is searched by default.\n\
187 Note that the `--include' and `--exclude' options are mutually-exclusive.\n\
188 \n\
189 The following arguments apply to the language-specific scanners:\n\
190 "));
191   language_help_me ();
192   printf (_("\nReport bugs to " PACKAGE_BUGREPORT "\n\n"));
193   exit (EXIT_SUCCESS);
194 }
195 
196 static void *heap_initial;
197 static void *heap_after_walk;
198 static void *heap_after_scan;
199 
get_process_heap(void)200 static void *get_process_heap(void)
201 {
202 #if HAVE_SBRK
203   return sbrk(0);
204 #else
205   return 0;
206 #endif /* HAVE_SBRK */
207 }
208 
209 int
main(int argc,char ** argv)210 main (int argc, char **argv)
211 {
212   bool ok;
213   int nfiles;
214   char *files_from = NULL;
215 
216   set_program_name (argv[0]);
217   heap_initial = get_process_heap();
218   idh.idh_file_name = DEFAULT_ID_FILE_NAME;
219 
220 #if ENABLE_NLS
221   /* Set locale according to user's wishes.  */
222   setlocale (LC_ALL, "");
223 
224   /* Tell program which translations to use and where to find.  */
225   bindtextdomain (PACKAGE, LOCALEDIR);
226   textdomain (PACKAGE);
227 #endif
228 
229   atexit (close_stdout);
230 
231   for (;;)
232     {
233       int optc = getopt_long (argc, argv, "o:f:i:x:l:m:d:p:vVs",
234 			      long_options, (int *) 0);
235       if (optc < 0)
236 	break;
237       switch (optc)
238 	{
239 	case 0:
240 	  break;
241 
242 	case 'o':
243 	case 'f':
244 	  idh.idh_file_name = optarg;
245 	  break;
246 
247 	case 'i':
248 	  include_languages (optarg);
249 	  break;
250 
251 	case 'x':
252 	  exclude_languages (optarg);
253 	  break;
254 
255 	case 'l':
256 	  language_save_arg (optarg);
257 	  break;
258 
259 	case 'm':
260 	  lang_map_file_name = optarg;
261 	  break;
262 
263 	case 'd':
264 	  set_default_language (optarg);
265 	  break;
266 
267 	case 'p':
268 	  if (cw_dlink == 0)
269 	    cw_dlink = init_walker (&idh);
270 	  prune_file_names (optarg, cw_dlink);
271 	  break;
272 
273 	case FILES0_FROM_OPTION:
274 	  files_from = optarg;
275 	  break;
276 
277 	case 'V':
278 	  walker_verbose_flag = 1;
279 	case 'v':
280 	  verbose_flag = 1;
281 	case 's':
282 	  statistics_flag = 1;
283 	  break;
284 
285 	default:
286 	  usage ();
287 	}
288     }
289 
290   if (show_version)
291     {
292       printf ("%s - %s\n", program_name, PACKAGE_VERSION);
293       exit (EXIT_SUCCESS);
294     }
295 
296   if (show_help)
297     help_me ();
298 
299   nfiles = argc - optind;
300 
301   struct argv_iterator *ai;
302   if (files_from)
303     {
304       /* When using --files0-from=F, you may not specify any files
305 	 on the command-line.  */
306       if (nfiles != 0)
307 	{
308 	  error (0, 0, _("extra operand %s"), quote (argv[optind]));
309 	  fprintf (stderr, "%s\n",
310 		   _("file operands cannot be combined with --files0-from"));
311 	  usage();
312 	}
313 
314       if (! (strequ (files_from, "-") || freopen (files_from, "r", stdin)))
315 	error (EXIT_FAILURE, errno, _("cannot open %s for reading"),
316 	       quote (files_from));
317 
318       ai = argv_iter_init_stream (stdin);
319     }
320   else
321     {
322       static char *cwd_only[2];
323       cwd_only[0] = bad_cast (".");
324       cwd_only[1] = NULL;
325       char **files = (optind < argc ? argv + optind : cwd_only);
326       ai = argv_iter_init_argv (files);
327     }
328 
329   language_getopt ();
330   assert_writeable (idh.idh_file_name);
331   if (cw_dlink == 0)
332     cw_dlink = init_walker (&idh);
333   parse_language_map (lang_map_file_name);
334 
335   /* Walk the file and directory names given on the command line.  */
336   ok = true;
337   while (true)
338     {
339       bool skip_file = false;
340       enum argv_iter_err ai_err;
341       char *file_name = argv_iter (ai, &ai_err);
342       if (ai_err == AI_ERR_EOF)
343         break;
344       if (!file_name)
345         {
346           switch (ai_err)
347             {
348             case AI_ERR_READ:
349               error (0, errno, _("%s: read error"), quote (files_from));
350               continue;
351 
352             case AI_ERR_MEM:
353               xalloc_die ();
354 
355             default:
356               assert (!"unexpected error code from argv_iter");
357             }
358         }
359       if (files_from && STREQ (files_from, "-") && STREQ (file_name, "-"))
360         {
361           /* Give a better diagnostic in an unusual case:
362              printf - | du --files0-from=- */
363           error (0, 0, _("when reading file names from stdin, "
364                          "no file name of %s allowed"),
365                  quote (file_name));
366           skip_file = true;
367         }
368 
369       /* Report and skip any empty file names.  */
370       if (!file_name[0])
371         {
372           /* Diagnose a zero-length file name.  When it's one
373              among many, knowing the record number may help.
374              FIXME: currently print the record number only with
375              --files0-from=FILE.  Maybe do it for argv, too?  */
376           if (files_from == NULL)
377             error (0, 0, "%s", _("invalid zero-length file name"));
378           else
379             {
380               /* Using the standard `filename:line-number:' prefix here is
381                  not totally appropriate, since NUL is the separator, not NL,
382                  but it might be better than nothing.  */
383               unsigned long int file_number = argv_iter_n_args (ai);
384               error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
385                      file_number, _("invalid zero-length file name"));
386             }
387           skip_file = true;
388         }
389 
390       if (skip_file)
391         ok = false;
392       else
393         {
394           struct file_link *flink = parse_file_name (file_name, cw_dlink);
395           if (flink)
396             walk_flink (flink, 0);
397 	  /* FIXME: walk_flink can fail, so should return status.
398 	     Then caller can continue with other arguments.  */
399         }
400     }
401   argv_iter_free (ai);
402 
403   heap_after_walk = get_process_heap();
404 
405   mark_member_file_links (&idh);
406   log_8_member_files = ceil_log_8 (idh.idh_member_file_table.ht_fill);
407 
408   /* hack start: when scanning a single file, log_8_member_files results 0.
409      Adjust to 1 in this case, or xmalloc will be called for 0 bytes,
410      and later the struct token will have no 'hits' field.
411      This would cause a crash
412      (see testsuite/consistency, testsuite/single_file_token_bug.c)
413 
414      <claudio@gnu.org 2005>
415   */
416 
417   if (log_8_member_files == 0)
418     log_8_member_files = 1;
419 
420   /* hack end */
421 
422   current_hits_signature = xmalloc (log_8_member_files);
423 
424   /* If scannable files were given, then scan them.  */
425   if (idh.idh_member_file_table.ht_fill)
426     {
427       scan_files (&idh);
428       heap_after_scan = get_process_heap();
429 
430       free_summary_tokens ();
431       free (token_table.ht_vec);
432       chdir_to_link (cw_dlink);
433       write_id_file (&idh);
434 
435       if (statistics_flag)
436 	report_statistics ();
437     }
438   else
439     error (0, 0, _("nothing to do"));
440 
441   exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);
442 }
443 
444 /* Return the integer ceiling of the base-8 logarithm of N.  */
445 
446 static int _GL_ATTRIBUTE_CONST
ceil_log_8(unsigned long n)447 ceil_log_8 (unsigned long n)
448 {
449   int log_8 = 0;
450 
451   n--;
452   while (n)
453     {
454       log_8++;
455       n >>= 3;
456     }
457   return log_8;
458 }
459 
460 /* Return the integer ceiling of the base-2 logarithm of N.  */
461 
462 static int _GL_ATTRIBUTE_CONST
ceil_log_2(unsigned long n)463 ceil_log_2 (unsigned long n)
464 {
465   int log_2 = 0;
466 
467   n--;
468   while (n)
469     {
470       log_2++;
471       n >>= 1;
472     }
473   return log_2;
474 }
475 
476 /* Ensure that FILE_NAME can be written.  If it exists, make sure it's
477    writable.  If it doesn't exist, make sure it can be created.  Exit
478    with a diagnostic if this is not so.  */
479 
480 static void
assert_writeable(char const * filename)481 assert_writeable (char const *filename)
482 {
483   if (access (filename, 06) < 0)
484     {
485       if (errno == ENOENT)
486 	{
487 	  char *dirname = dir_name (filename);
488 	  if (access (dirname, 06) < 0)
489 	    error (EXIT_FAILURE, errno, _("can't create `%s' in `%s'"),
490 		   base_name (filename), dirname);
491 	  free(dirname);
492 	}
493       else
494 	error (EXIT_FAILURE, errno, _("can't modify `%s'"), filename);
495     }
496 }
497 
498 /* Iterate over all eligible files (the members of the set of scannable files).
499    Create a tree8 to store the set of files where a token occurs.  */
500 
501 static void
scan_files(struct idhead const * idhp)502 scan_files (struct idhead const *idhp)
503 {
504   struct member_file **members_0
505     = (struct member_file **) hash_dump (&idhp->idh_member_file_table,
506 					 0, member_file_qsort_compare);
507   struct member_file **end = &members_0[idhp->idh_member_file_table.ht_fill];
508   struct member_file **members = members_0;
509   int n = idhp->idh_member_file_table.ht_fill;
510 
511   n = n * ceil_log_2 (n) * 16;
512   if (n == 0)
513     n = 1024;
514   else if (n > 1024*1024)
515     n = 1024*1024;
516 
517   hash_init (&token_table, n, token_hash_1, token_hash_2, token_hash_cmp);
518   if (verbose_flag) {
519     char offstr[INT_BUFSIZE_BOUND(off_t)];
520 
521     printf ("files=%ld, largest=%s, slots=%lu\n",
522 	    idhp->idh_member_file_table.ht_fill,
523 	    offtostr(largest_member_file, offstr),
524 	    token_table.ht_size);
525   }
526   init_hits_signature (0);
527   init_summary ();
528   obstack_init (&tokens_obstack);
529 
530   if (largest_member_file > MAX_LARGEST_MEMBER_FILE)
531     largest_member_file = MAX_LARGEST_MEMBER_FILE;
532   scanner_buffer = xmalloc (largest_member_file + 1);
533 
534   for (;;)
535     {
536       const struct member_file *member = *members++;
537       scan_member_file (member);
538       if (members == end)
539 	break;
540       if (current_hits_signature[0] & 0x80)
541 	summarize ();
542       bump_current_hits_signature ();
543     }
544 
545   free (scanner_buffer);
546   free (members_0);
547 }
548 
549 /* Open a file and scan it.  */
550 
551 static void
scan_member_file(struct member_file const * member)552 scan_member_file (struct member_file const *member)
553 {
554   struct lang_args const *lang_args = member->mf_lang_args;
555   struct language const *lang = lang_args->la_language;
556   get_token_func_t get_token = lang->lg_get_token;
557   struct file_link *flink = member->mf_link;
558   struct stat st;
559   FILE *source_FILE;
560 
561   chdir_to_link (flink->fl_parent);
562   source_FILE = fopen (flink->fl_name, "r");
563   if (source_FILE)
564     {
565       char *file_name = alloca (PATH_MAX);
566       if (statistics_flag)
567 	{
568 	  if (fstat (fileno (source_FILE), &st) < 0)
569 	    {
570 	      maybe_relative_file_name (file_name, flink, cw_dlink);
571 	      error (0, errno, _("can't stat `%s'"), file_name);
572 	    }
573 	  else
574 	    input_chars += st.st_size;
575 	}
576       if (verbose_flag)
577 	{
578 	  maybe_relative_file_name (file_name, flink, cw_dlink);
579 	  printf ("%ld: %s: %s", member->mf_index, lang->lg_name, file_name);
580 	  fflush (stdout);
581 	}
582       scan_member_file_1 (get_token, lang_args->la_args_digested, source_FILE);
583       if (verbose_flag)
584 	putchar ('\n');
585       fclose (source_FILE);
586     }
587   else
588     error (0, errno, _("can't open `%s'"), flink->fl_name);
589 }
590 
591 /* Iterate over all tokens in the file, and merge the file's tree8
592    signature into the token table entry.  */
593 
594 static void
scan_member_file_1(get_token_func_t get_token,void const * args,FILE * source_FILE)595 scan_member_file_1 (get_token_func_t get_token, void const *args, FILE *source_FILE)
596 {
597   struct token **slot;
598   struct token *token;
599   int flags;
600   int new_tokens = 0;
601   int distinct_tokens = 0;
602 
603   while ((token = (*get_token) (source_FILE, args, &flags)) != NULL)
604     {
605       if (*TOKEN_NAME (token) == '\0') {
606 	obstack_free (&tokens_obstack, token);
607 	continue;
608       }
609       slot = (struct token **) hash_find_slot (&token_table, token);
610       if (HASH_VACANT (*slot))
611 	{
612 	  token->tok_flags = flags;
613 	  token->tok_count = 1;
614 	  memset (TOKEN_HITS (token), 0, log_8_member_files);
615 	  sign_token (token);
616 	  if (verbose_flag)
617 	    {
618 	      distinct_tokens++;
619 	      new_tokens++;
620 	    }
621 	  hash_insert_at (&token_table, token, slot);
622 	}
623       else
624 	{
625 	  obstack_free (&tokens_obstack, token);
626 	  token = *slot;
627 	  token->tok_flags |= flags;
628 	  if (token->tok_count < USHRT_MAX)
629 	    token->tok_count++;
630 	  if (!(TOKEN_HITS (token)[0] & current_hits_signature[0]))
631 	    {
632 	      sign_token (token);
633 	      if (verbose_flag)
634 		distinct_tokens++;
635 	    }
636 	}
637     }
638   if (verbose_flag)
639     {
640       printf (_("  new = %d/%d"), new_tokens, distinct_tokens);
641       if (distinct_tokens != 0)
642 	printf (" = %.0f%%", 100.0 * (double) new_tokens / (double) distinct_tokens);
643     }
644 }
645 
646 static void
report_statistics(void)647 report_statistics (void)
648 {
649   printf (_("Name=%ld, "), name_tokens);
650   printf (_("Number=%ld, "), number_tokens);
651   printf (_("String=%ld, "), string_tokens);
652   printf (_("Literal=%ld, "), literal_tokens);
653   printf (_("Comment=%ld\n"), comment_tokens);
654 
655   printf (_("Files=%ld, "), idh.idh_files);
656   printf (_("Tokens=%ld, "), occurrences);
657   printf (_("Bytes=%ld Kb, "), input_chars / 1024);
658   printf (_("Heap=%llu+%llu Kb, "),
659 	  (unsigned long long) ((char *) heap_after_scan
660 				- (char *) heap_after_walk) / 1024,
661 	  (unsigned long long) ((char *) heap_after_walk
662 				- (char *) heap_initial) / 1024);
663   printf (_("Output=%ld (%ld tok, %ld hit)\n"),
664 	  output_length, tokens_length, hits_length);
665 
666   hash_print_stats (&token_table, stdout);
667   printf (_(", Freq=%ld/%ld=%.2f\n"), occurrences, token_table.ht_fill,
668 	  (double) occurrences / (double) token_table.ht_fill);
669 }
670 
671 /* As the database is written, may need to adjust the file names.  If
672    we are generating the ID file in a remote directory, then adjust
673    the file names to be relative to the location of the ID database.
674 
675    (This would be a common useage if you want to make a database for a
676    directory which you have no write access to, so you cannot create
677    the ID file.)  */
678 
679 static void
write_id_file(struct idhead * idhp)680 write_id_file (struct idhead *idhp)
681 {
682   struct token **tokens;
683   int i;
684   int buf_size;
685   int vec_size;
686   int tok_size;
687   int max_buf_size = 0;
688   int max_vec_size = 0;
689 
690   if (verbose_flag)
691     printf (_("Sorting tokens...\n"));
692 
693   assert (summary_root->sum_hits_count == token_table.ht_fill);
694   tokens = xnrealloc (summary_root->sum_tokens,
695 		      token_table.ht_fill, sizeof *tokens);
696   qsort (tokens, token_table.ht_fill, sizeof (struct token *), token_qsort_cmp);
697 
698   if (verbose_flag)
699     printf (_("Writing `%s'...\n"), idhp->idh_file_name);
700   idhp->idh_FILE = fopen (idhp->idh_file_name, "w+b");
701   if (idhp->idh_FILE == NULL)
702     error (EXIT_FAILURE, errno, _("can't create `%s'"), idhp->idh_file_name);
703 
704   idhp->idh_magic[0] = IDH_MAGIC_0;
705   idhp->idh_magic[1] = IDH_MAGIC_1;
706   idhp->idh_version = IDH_VERSION;
707   idhp->idh_flags = IDH_COUNTS;
708 
709   /* write out the list of pathnames */
710 
711   fseek (idhp->idh_FILE, sizeof_idhead (), 0);
712   off_t off = ftello (idhp->idh_FILE);
713   if (UINT32_MAX < off)
714     error (EXIT_FAILURE, 0, _("internal limitation: offset of 2^32 or larger"));
715   idhp->idh_flinks_offset = off;
716   serialize_file_links (idhp);
717 
718   /* write out the list of identifiers */
719 
720   putc ('\0', idhp->idh_FILE);
721   putc ('\0', idhp->idh_FILE);
722   off = ftello (idhp->idh_FILE);
723   if (UINT32_MAX < off)
724     error (EXIT_FAILURE, 0, _("internal limitation: offset of 2^32 or larger"));
725   idhp->idh_tokens_offset = off;
726 
727   for (i = 0; i < token_table.ht_fill; i++, tokens++)
728     {
729       struct token *token = *tokens;
730 
731       occurrences += token->tok_count;
732       if (token->tok_flags & TOK_NUMBER)
733 	number_tokens++;
734       if (token->tok_flags & TOK_NAME)
735 	name_tokens++;
736       if (token->tok_flags & TOK_STRING)
737 	string_tokens++;
738       if (token->tok_flags & TOK_LITERAL)
739 	literal_tokens++;
740       if (token->tok_flags & TOK_COMMENT)
741 	comment_tokens++;
742 
743       fputs (TOKEN_NAME (token), idhp->idh_FILE);
744       putc ('\0', idhp->idh_FILE);
745       if (token->tok_count > 0xff)
746 	token->tok_flags |= TOK_SHORT_COUNT;
747       putc (token->tok_flags, idhp->idh_FILE);
748       putc (token->tok_count & 0xff, idhp->idh_FILE);
749       if (token->tok_flags & TOK_SHORT_COUNT)
750 	putc (token->tok_count >> 8, idhp->idh_FILE);
751 
752       vec_size = count_vec_size (summary_root, TOKEN_HITS (token) + levels);
753       buf_size = count_buf_size (summary_root, TOKEN_HITS (token) + levels);
754       hits_length += buf_size;
755       tok_size = strlen (TOKEN_NAME (token)) + 1;
756       tokens_length += tok_size;
757       buf_size += tok_size + sizeof (token->tok_flags) + sizeof (token->tok_count) + 2;
758       if (buf_size > max_buf_size)
759 	max_buf_size = buf_size;
760       if (vec_size > max_vec_size)
761 	max_vec_size = vec_size;
762 
763       write_hits (idhp->idh_FILE, summary_root, TOKEN_HITS (token) + levels);
764       putc ('\0', idhp->idh_FILE);
765       putc ('\0', idhp->idh_FILE);
766     }
767   assert (check_hits (summary_root) == 0);
768   idhp->idh_tokens = token_table.ht_fill;
769   off = ftello (idhp->idh_FILE);
770   if (UINT32_MAX < off)
771     error (EXIT_FAILURE, 0, _("internal limitation: offset of 2^32 or larger"));
772   output_length = off;
773   idhp->idh_end_offset = output_length - 2;
774   idhp->idh_buf_size = max_buf_size;
775   idhp->idh_vec_size = max_vec_size;
776 
777   write_idhead (&idh);
778   if (ferror (idhp->idh_FILE) || fclose (idhp->idh_FILE) != 0)
779     error (EXIT_FAILURE, errno, _("error closing `%s'"), idhp->idh_file_name);
780 }
781 
782 /* Define primary and secondary hash and comparison functions for the
783    token table.  */
784 
785 static unsigned long _GL_ATTRIBUTE_PURE
token_hash_1(void const * key)786 token_hash_1 (void const *key)
787 {
788   return_STRING_HASH_1 (TOKEN_NAME ((struct token const *) key));
789 }
790 
791 static unsigned long _GL_ATTRIBUTE_PURE
token_hash_2(void const * key)792 token_hash_2 (void const *key)
793 {
794   return_STRING_HASH_2 (TOKEN_NAME ((struct token const *) key));
795 }
796 
797 static int _GL_ATTRIBUTE_PURE
token_hash_cmp(void const * x,void const * y)798 token_hash_cmp (void const *x, void const *y)
799 {
800   return_STRING_COMPARE (TOKEN_NAME ((struct token const *) x),
801 			 TOKEN_NAME ((struct token const *) y));
802 }
803 
804 static int _GL_ATTRIBUTE_PURE
token_qsort_cmp(void const * x,void const * y)805 token_qsort_cmp (void const *x, void const *y)
806 {
807   return_STRING_COMPARE (TOKEN_NAME (*(struct token const *const *) x),
808 			 TOKEN_NAME (*(struct token const *const *) y));
809 }
810 
811 
812 /****************************************************************************/
813 
814 /* Advance the tree8 hit signature that corresponds to the file
815    we are now scanning.  */
816 
817 static void
bump_current_hits_signature(void)818 bump_current_hits_signature (void)
819 {
820   unsigned char *hits = current_hits_signature;
821   while (*hits & 0x80)
822     *hits++ = 1;
823   *hits <<= 1;
824 }
825 
826 /* Initialize the tree8 hit signature for the first file we scan.  */
827 
828 static void
init_hits_signature(int i)829 init_hits_signature (int i)
830 {
831   unsigned char *hits = current_hits_signature;
832   unsigned char const *end = current_hits_signature + log_8_member_files;
833   while (hits < end)
834     {
835       *hits = 1 << (i & 7);
836       i >>= 3;
837       hits++;
838     }
839 }
840 
841 static void
free_summary_tokens(void)842 free_summary_tokens (void)
843 {
844   struct summary *summary = summary_leaf;
845   while (summary != summary_root)
846     {
847       free (summary->sum_tokens);
848       summary = summary->sum_parent;
849     }
850 }
851 
852 static void
summarize(void)853 summarize (void)
854 {
855   unsigned char const *hits_sig = current_hits_signature;
856   struct summary *summary = summary_leaf;
857 
858   do
859     {
860       unsigned long count = summary->sum_hits_count;
861       unsigned char *hits = xmalloc (count + 1);
862       unsigned int level = summary->sum_level;
863       struct token **tokens = summary->sum_tokens;
864       unsigned long init_size = INIT_TOKENS_SIZE (summary->sum_level);
865 
866       if (verbose_flag)
867 	printf (_("level %d: %ld/%ld = %.0f%%\n"),
868 		summary->sum_level, count, init_size,
869 		100.0 * (double) count / (double) init_size);
870 
871       qsort (tokens, count, sizeof (struct token *), token_qsort_cmp);
872       summary->sum_hits = hits;
873       while (count--)
874 	{
875 	  unsigned char *hit = TOKEN_HITS (*tokens++) + level;
876 	  *hits++ = *hit;
877 	  *hit = 0;
878 	}
879       *hits++ = 0;
880       if (summary->sum_parent)
881 	{
882 	  free (summary->sum_tokens);
883 	  summary->sum_tokens = 0;
884 	}
885       summary = summary->sum_parent;
886     }
887   while (*++hits_sig & 0x80);
888   summary_leaf = make_sibling_summary (summary_leaf);
889 }
890 
891 static void
init_summary(void)892 init_summary (void)
893 {
894   unsigned long size = INIT_TOKENS_SIZE (0);
895   summary_root = summary_leaf = xcalloc (1, sizeof(struct summary));
896   summary_root->sum_tokens_size = size;
897   summary_root->sum_tokens = xmalloc (sizeof(struct token *) * size);
898 }
899 
900 static struct summary *
make_sibling_summary(struct summary * summary)901 make_sibling_summary (struct summary *summary)
902 {
903   struct summary *parent = summary->sum_parent;
904   size_t size;
905 
906   if (parent == NULL)
907     {
908       levels++;
909       summary_root = summary->sum_parent = parent
910 	= xcalloc (1, sizeof(struct summary));
911       parent->sum_level = levels;
912       parent->sum_kids[0] = summary;
913       parent->sum_hits_count = summary->sum_hits_count;
914       parent->sum_free_index = 1;
915       size = INIT_TOKENS_SIZE (levels);
916       if (summary->sum_tokens_size >= size)
917 	{
918 	  parent->sum_tokens_size = summary->sum_tokens_size;
919 	  parent->sum_tokens = summary->sum_tokens;
920 	}
921       else
922 	{
923 	  parent->sum_tokens_size = size;
924 	  parent->sum_tokens = xnrealloc (summary->sum_tokens, size,
925 					  sizeof *summary->sum_tokens);
926 	}
927       summary->sum_tokens = 0;
928     }
929   if (parent->sum_free_index == 8)
930     parent = make_sibling_summary (parent);
931   summary = xcalloc (1, sizeof(struct summary));
932   summary->sum_level = parent->sum_level - 1;
933   parent->sum_kids[parent->sum_free_index++] = summary;
934   summary->sum_parent = parent;
935   size = INIT_TOKENS_SIZE (summary->sum_level);
936   summary->sum_tokens_size = size;
937   summary->sum_tokens = xmalloc (sizeof(struct token *) * size);
938   return summary;
939 }
940 
941 static int _GL_ATTRIBUTE_PURE
count_vec_size(struct summary * summary,unsigned char const * tail_hits)942 count_vec_size (struct summary *summary, unsigned char const *tail_hits)
943 {
944   struct summary **kids;
945   unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits);
946 
947   kids = summary->sum_kids;
948   if (*kids == NULL)
949     {
950       static char bits_per_nybble[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 };
951       return bits_per_nybble[hits & 0xf] + bits_per_nybble[hits >> 4];
952     }
953   else
954     {
955       int bit;
956       int count = 0;
957       --tail_hits;
958       for (bit = 1; bit & 0xff; bit <<= 1, ++kids)
959 	if (bit & hits)
960 	  count += count_vec_size (*kids, tail_hits);
961       return count;
962     }
963 }
964 
965 static int _GL_ATTRIBUTE_PURE
count_buf_size(struct summary * summary,unsigned char const * tail_hits)966 count_buf_size (struct summary *summary, unsigned char const *tail_hits)
967 {
968   struct summary **kids;
969   unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits);
970 
971   kids = summary->sum_kids;
972   if (*kids == NULL)
973     return 1;
974   else
975     {
976       int bit;
977       int count = 1;
978       --tail_hits;
979       for (bit = 1; bit & 0xff; bit <<= 1, ++kids)
980 	if (bit & hits)
981 	  count += count_buf_size (*kids, tail_hits);
982       return count;
983     }
984 }
985 
986 /* Sanity-check hit counts.  Return nonzero if there's a problem.
987    Otherwise, return 0.  */
988 static int
check_hits(struct summary * summary)989 check_hits (struct summary* summary)
990 {
991   struct summary **kids = summary->sum_kids;
992   struct summary **end = &kids[8];
993 
994   if ( ! (summary->sum_hits == NULL || *summary->sum_hits == 0))
995     return 1;
996 
997   if (end[-1] == 0)
998     while (*--end == 0)
999       ;
1000   while (kids < end)
1001     if (check_hits (*kids++))
1002       return 1;
1003 
1004   return 0;
1005 }
1006 
1007 static void
write_hits(FILE * fp,struct summary * summary,unsigned char const * tail_hits)1008 write_hits (FILE *fp, struct summary *summary, unsigned char const *tail_hits)
1009 {
1010   struct summary **kids;
1011   unsigned int hits = (summary->sum_hits ? *summary->sum_hits++ : *tail_hits);
1012 
1013   assert (hits);
1014   putc (hits, fp);
1015 
1016   kids = summary->sum_kids;
1017   if (*kids)
1018     {
1019       int bit;
1020       --tail_hits;
1021       for (bit = 1; (bit & 0xff) && *kids; bit <<= 1, ++kids)
1022 	if (bit & hits)
1023 	  write_hits (fp, *kids, tail_hits);
1024     }
1025 }
1026 
1027 static void
sign_token(struct token * token)1028 sign_token (struct token *token)
1029 {
1030   unsigned char *tok_hits = TOKEN_HITS (token);
1031   unsigned char *hits_sig = current_hits_signature;
1032   unsigned char *end = current_hits_signature + log_8_member_files;
1033   struct summary *summary = summary_leaf;
1034 
1035   while (summary)
1036     {
1037       if (*tok_hits == 0)
1038 	add_token_to_summary (summary, token);
1039       if (*tok_hits & *hits_sig)
1040 	break;
1041       *tok_hits |= *hits_sig;
1042       summary = summary->sum_parent;
1043       tok_hits++;
1044       hits_sig++;
1045     }
1046   while (hits_sig < end)
1047     {
1048       if (*tok_hits & *hits_sig)
1049 	break;
1050       *tok_hits |= *hits_sig;
1051       tok_hits++;
1052       hits_sig++;
1053     }
1054 }
1055 
1056 static void
add_token_to_summary(struct summary * summary,struct token * token)1057 add_token_to_summary (struct summary *summary, struct token *token)
1058 {
1059   size_t size = summary->sum_tokens_size;
1060 
1061   if (summary->sum_hits_count >= size)
1062     {
1063       summary->sum_tokens = x2nrealloc (summary->sum_tokens, &size,
1064 					sizeof *summary->sum_tokens);
1065       summary->sum_tokens_size = size;
1066     }
1067   summary->sum_tokens[summary->sum_hits_count++] = token;
1068 }
1069