1 /* mkid.c -- build an identifer database
2 Copyright (C) 1986, 1995-1996, 1999, 2007-2012 Free Software Foundation,
3 Inc.
4 Written by Greg McGary <gkm@gnu.ai.mit.edu>
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include <config.h>
21 #include <stdlib.h>
22 #include <assert.h>
23 #include <stdio.h>
24 #include <errno.h>
25 #include <getopt.h>
26 #include <unistd.h>
27 #include <string.h>
28 #include <sys/stat.h>
29 #include <limits.h>
30
31 #include "alloca.h"
32 #include "argv-iter.h"
33 #include "closeout.h"
34 #include "dirname.h"
35 #include "error.h"
36 #include "inttostr.h"
37 #include "pathmax.h"
38 #include "progname.h"
39 #include "quote.h"
40 #include "quotearg.h"
41 #include "xalloc.h"
42
43 #include "xnls.h"
44 #include "idfile.h"
45 #include "idu-hash.h"
46 #include "scanners.h"
47 #include "iduglobal.h"
48
49 struct summary
50 {
51 struct token **sum_tokens;
52 unsigned char const *sum_hits;
53 struct summary *sum_parent;
54 union {
55 struct summary *u_kids[8]; /* when sum_level > 0 */
56 #define sum_kids sum_u.u_kids
57 struct member_file *u_files[8]; /* when sum_level == 0 */
58 } sum_u;
59 unsigned long sum_tokens_size;
60 unsigned long sum_hits_count;
61 int sum_free_index;
62 int sum_level;
63 };
64
65 void usage (void);
66 static int ceil_log_8 (unsigned long n);
67 static int ceil_log_2 (unsigned long n);
68 static void assert_writeable (char const *file_name);
69 static void scan_files (struct idhead const *idhp);
70 static void scan_member_file (struct member_file const *member);
71 static void scan_member_file_1 (get_token_func_t get_token,
72 void const *args, FILE *source_FILE);
73 static void report_statistics (void);
74 static void write_id_file (struct idhead *idhp);
75 static unsigned long token_hash_1 (void const *key);
76 static unsigned long token_hash_2 (void const *key);
77 static int token_hash_cmp (void const *x, void const *y);
78 static int token_qsort_cmp (void const *x, void const *y);
79 static void bump_current_hits_signature (void);
80 static void init_hits_signature (int i);
81 static void free_summary_tokens (void);
82 static void summarize (void);
83 static void init_summary (void);
84 static struct summary *make_sibling_summary (struct summary *summary);
85 static int count_vec_size (struct summary *summary,
86 unsigned char const *tail_hits);
87 static int count_buf_size (struct summary *summary,
88 unsigned char const *tail_hits);
89 static int check_hits (struct summary* summary) _GL_ATTRIBUTE_PURE;
90 static void write_hits (FILE *fp, struct summary *summary,
91 unsigned char const *tail_hits);
92 static void sign_token (struct token *token);
93 static void add_token_to_summary (struct summary *summary, struct token *token);
94
95 static struct hash_table token_table;
96
97 /* Miscellaneous statistics */
98 static unsigned long input_chars;
99 static unsigned long name_tokens;
100 static unsigned long number_tokens;
101 static unsigned long string_tokens;
102 static unsigned long literal_tokens;
103 static unsigned long comment_tokens;
104 static unsigned long occurrences;
105 static unsigned long hits_length = 0;
106 static unsigned long tokens_length = 0;
107 static unsigned long output_length = 0;
108
109 static int verbose_flag = 0;
110 static int statistics_flag = 0;
111
112 static int levels = 0; /* ceil(log(8)) of file_name_count */
113
114 static unsigned char *current_hits_signature;
115 #define INIT_TOKENS_SIZE(level) (1 << ((level) + 13))
116 static struct summary *summary_root;
117 static struct summary *summary_leaf;
118
119 static char *lang_map_file_name = 0;
120 static int show_version = 0;
121 static int show_help = 0;
122 struct idhead idh;
123 static struct file_link *cw_dlink;
124
125 void usage (void) __attribute__((__noreturn__));
126 void
usage(void)127 usage (void)
128 {
129 fprintf (stderr, _("Try `%s --help' for more information.\n"),
130 program_name);
131 exit (EXIT_FAILURE);
132 }
133
134 /* For long options that have no equivalent short option, use a
135 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
136 enum
137 {
138 FILES0_FROM_OPTION = CHAR_MAX +1,
139 };
140
141 static struct option const long_options[] =
142 {
143 { "file", required_argument, 0, 'f' },
144 { "output", required_argument, 0, 'o' },
145 { "include", required_argument, 0, 'i' },
146 { "exclude", required_argument, 0, 'x' },
147 { "lang-option", required_argument, 0, 'l' },
148 { "lang-map", required_argument, 0, 'm' },
149 { "default-lang", required_argument, 0, 'd' },
150 { "prune", required_argument, 0, 'p' },
151 { "verbose", no_argument, 0, 'v' },
152 { "statistics", no_argument, 0, 's' },
153 { "help", no_argument, &show_help, 1 },
154 { "version", no_argument, &show_version, 1 },
155 { "files0-from", required_argument, NULL, FILES0_FROM_OPTION },
156 {NULL, 0, NULL, 0}
157 };
158
159 static void __attribute__((__noreturn__))
help_me(void)160 help_me (void)
161 {
162 printf (_("\
163 Usage: %s [OPTION]... [FILE]...\n\
164 "), program_name);
165
166 printf (_("\
167 Build an identifier database.\n\
168 -o, --output=OUTFILE file name of ID database output\n\
169 -f, --file=OUTFILE synonym for --output\n\
170 -i, --include=LANGS include languages in LANGS (default: \"C C++ asm\")\n\
171 -x, --exclude=LANGS exclude languages in LANGS\n\
172 -l, --lang-option=L:OPT pass OPT as a default for language L (see below)\n\
173 -m, --lang-map=MAPFILE use MAPFILE to map file names onto source language\n\
174 -d, --default-lang=LANG make LANG the default source language\n\
175 -p, --prune=NAMES exclude the named files and/or directories\n\
176 -v, --verbose report per file statistics\n\
177 -s, --statistics report statistics at end of run\n\
178 \n\
179 --files0-from=F tokenize only the files specified by\n\
180 NUL-terminated names in file F\n\
181 \n\
182 --help display this help and exit\n\
183 --version output version information and exit\n\
184 \n\
185 FILE may be a file name, or a directory name to recursively search.\n\
186 If no FILE is given, the current directory is searched by default.\n\
187 Note that the `--include' and `--exclude' options are mutually-exclusive.\n\
188 \n\
189 The following arguments apply to the language-specific scanners:\n\
190 "));
191 language_help_me ();
192 printf (_("\nReport bugs to " PACKAGE_BUGREPORT "\n\n"));
193 exit (EXIT_SUCCESS);
194 }
195
196 static void *heap_initial;
197 static void *heap_after_walk;
198 static void *heap_after_scan;
199
get_process_heap(void)200 static void *get_process_heap(void)
201 {
202 #if HAVE_SBRK
203 return sbrk(0);
204 #else
205 return 0;
206 #endif /* HAVE_SBRK */
207 }
208
209 int
main(int argc,char ** argv)210 main (int argc, char **argv)
211 {
212 bool ok;
213 int nfiles;
214 char *files_from = NULL;
215
216 set_program_name (argv[0]);
217 heap_initial = get_process_heap();
218 idh.idh_file_name = DEFAULT_ID_FILE_NAME;
219
220 #if ENABLE_NLS
221 /* Set locale according to user's wishes. */
222 setlocale (LC_ALL, "");
223
224 /* Tell program which translations to use and where to find. */
225 bindtextdomain (PACKAGE, LOCALEDIR);
226 textdomain (PACKAGE);
227 #endif
228
229 atexit (close_stdout);
230
231 for (;;)
232 {
233 int optc = getopt_long (argc, argv, "o:f:i:x:l:m:d:p:vVs",
234 long_options, (int *) 0);
235 if (optc < 0)
236 break;
237 switch (optc)
238 {
239 case 0:
240 break;
241
242 case 'o':
243 case 'f':
244 idh.idh_file_name = optarg;
245 break;
246
247 case 'i':
248 include_languages (optarg);
249 break;
250
251 case 'x':
252 exclude_languages (optarg);
253 break;
254
255 case 'l':
256 language_save_arg (optarg);
257 break;
258
259 case 'm':
260 lang_map_file_name = optarg;
261 break;
262
263 case 'd':
264 set_default_language (optarg);
265 break;
266
267 case 'p':
268 if (cw_dlink == 0)
269 cw_dlink = init_walker (&idh);
270 prune_file_names (optarg, cw_dlink);
271 break;
272
273 case FILES0_FROM_OPTION:
274 files_from = optarg;
275 break;
276
277 case 'V':
278 walker_verbose_flag = 1;
279 case 'v':
280 verbose_flag = 1;
281 case 's':
282 statistics_flag = 1;
283 break;
284
285 default:
286 usage ();
287 }
288 }
289
290 if (show_version)
291 {
292 printf ("%s - %s\n", program_name, PACKAGE_VERSION);
293 exit (EXIT_SUCCESS);
294 }
295
296 if (show_help)
297 help_me ();
298
299 nfiles = argc - optind;
300
301 struct argv_iterator *ai;
302 if (files_from)
303 {
304 /* When using --files0-from=F, you may not specify any files
305 on the command-line. */
306 if (nfiles != 0)
307 {
308 error (0, 0, _("extra operand %s"), quote (argv[optind]));
309 fprintf (stderr, "%s\n",
310 _("file operands cannot be combined with --files0-from"));
311 usage();
312 }
313
314 if (! (strequ (files_from, "-") || freopen (files_from, "r", stdin)))
315 error (EXIT_FAILURE, errno, _("cannot open %s for reading"),
316 quote (files_from));
317
318 ai = argv_iter_init_stream (stdin);
319 }
320 else
321 {
322 static char *cwd_only[2];
323 cwd_only[0] = bad_cast (".");
324 cwd_only[1] = NULL;
325 char **files = (optind < argc ? argv + optind : cwd_only);
326 ai = argv_iter_init_argv (files);
327 }
328
329 language_getopt ();
330 assert_writeable (idh.idh_file_name);
331 if (cw_dlink == 0)
332 cw_dlink = init_walker (&idh);
333 parse_language_map (lang_map_file_name);
334
335 /* Walk the file and directory names given on the command line. */
336 ok = true;
337 while (true)
338 {
339 bool skip_file = false;
340 enum argv_iter_err ai_err;
341 char *file_name = argv_iter (ai, &ai_err);
342 if (ai_err == AI_ERR_EOF)
343 break;
344 if (!file_name)
345 {
346 switch (ai_err)
347 {
348 case AI_ERR_READ:
349 error (0, errno, _("%s: read error"), quote (files_from));
350 continue;
351
352 case AI_ERR_MEM:
353 xalloc_die ();
354
355 default:
356 assert (!"unexpected error code from argv_iter");
357 }
358 }
359 if (files_from && STREQ (files_from, "-") && STREQ (file_name, "-"))
360 {
361 /* Give a better diagnostic in an unusual case:
362 printf - | du --files0-from=- */
363 error (0, 0, _("when reading file names from stdin, "
364 "no file name of %s allowed"),
365 quote (file_name));
366 skip_file = true;
367 }
368
369 /* Report and skip any empty file names. */
370 if (!file_name[0])
371 {
372 /* Diagnose a zero-length file name. When it's one
373 among many, knowing the record number may help.
374 FIXME: currently print the record number only with
375 --files0-from=FILE. Maybe do it for argv, too? */
376 if (files_from == NULL)
377 error (0, 0, "%s", _("invalid zero-length file name"));
378 else
379 {
380 /* Using the standard `filename:line-number:' prefix here is
381 not totally appropriate, since NUL is the separator, not NL,
382 but it might be better than nothing. */
383 unsigned long int file_number = argv_iter_n_args (ai);
384 error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
385 file_number, _("invalid zero-length file name"));
386 }
387 skip_file = true;
388 }
389
390 if (skip_file)
391 ok = false;
392 else
393 {
394 struct file_link *flink = parse_file_name (file_name, cw_dlink);
395 if (flink)
396 walk_flink (flink, 0);
397 /* FIXME: walk_flink can fail, so should return status.
398 Then caller can continue with other arguments. */
399 }
400 }
401 argv_iter_free (ai);
402
403 heap_after_walk = get_process_heap();
404
405 mark_member_file_links (&idh);
406 log_8_member_files = ceil_log_8 (idh.idh_member_file_table.ht_fill);
407
408 /* hack start: when scanning a single file, log_8_member_files results 0.
409 Adjust to 1 in this case, or xmalloc will be called for 0 bytes,
410 and later the struct token will have no 'hits' field.
411 This would cause a crash
412 (see testsuite/consistency, testsuite/single_file_token_bug.c)
413
414 <claudio@gnu.org 2005>
415 */
416
417 if (log_8_member_files == 0)
418 log_8_member_files = 1;
419
420 /* hack end */
421
422 current_hits_signature = xmalloc (log_8_member_files);
423
424 /* If scannable files were given, then scan them. */
425 if (idh.idh_member_file_table.ht_fill)
426 {
427 scan_files (&idh);
428 heap_after_scan = get_process_heap();
429
430 free_summary_tokens ();
431 free (token_table.ht_vec);
432 chdir_to_link (cw_dlink);
433 write_id_file (&idh);
434
435 if (statistics_flag)
436 report_statistics ();
437 }
438 else
439 error (0, 0, _("nothing to do"));
440
441 exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);
442 }
443
444 /* Return the integer ceiling of the base-8 logarithm of N. */
445
446 static int _GL_ATTRIBUTE_CONST
ceil_log_8(unsigned long n)447 ceil_log_8 (unsigned long n)
448 {
449 int log_8 = 0;
450
451 n--;
452 while (n)
453 {
454 log_8++;
455 n >>= 3;
456 }
457 return log_8;
458 }
459
460 /* Return the integer ceiling of the base-2 logarithm of N. */
461
462 static int _GL_ATTRIBUTE_CONST
ceil_log_2(unsigned long n)463 ceil_log_2 (unsigned long n)
464 {
465 int log_2 = 0;
466
467 n--;
468 while (n)
469 {
470 log_2++;
471 n >>= 1;
472 }
473 return log_2;
474 }
475
476 /* Ensure that FILE_NAME can be written. If it exists, make sure it's
477 writable. If it doesn't exist, make sure it can be created. Exit
478 with a diagnostic if this is not so. */
479
480 static void
assert_writeable(char const * filename)481 assert_writeable (char const *filename)
482 {
483 if (access (filename, 06) < 0)
484 {
485 if (errno == ENOENT)
486 {
487 char *dirname = dir_name (filename);
488 if (access (dirname, 06) < 0)
489 error (EXIT_FAILURE, errno, _("can't create `%s' in `%s'"),
490 base_name (filename), dirname);
491 free(dirname);
492 }
493 else
494 error (EXIT_FAILURE, errno, _("can't modify `%s'"), filename);
495 }
496 }
497
498 /* Iterate over all eligible files (the members of the set of scannable files).
499 Create a tree8 to store the set of files where a token occurs. */
500
501 static void
scan_files(struct idhead const * idhp)502 scan_files (struct idhead const *idhp)
503 {
504 struct member_file **members_0
505 = (struct member_file **) hash_dump (&idhp->idh_member_file_table,
506 0, member_file_qsort_compare);
507 struct member_file **end = &members_0[idhp->idh_member_file_table.ht_fill];
508 struct member_file **members = members_0;
509 int n = idhp->idh_member_file_table.ht_fill;
510
511 n = n * ceil_log_2 (n) * 16;
512 if (n == 0)
513 n = 1024;
514 else if (n > 1024*1024)
515 n = 1024*1024;
516
517 hash_init (&token_table, n, token_hash_1, token_hash_2, token_hash_cmp);
518 if (verbose_flag) {
519 char offstr[INT_BUFSIZE_BOUND(off_t)];
520
521 printf ("files=%ld, largest=%s, slots=%lu\n",
522 idhp->idh_member_file_table.ht_fill,
523 offtostr(largest_member_file, offstr),
524 token_table.ht_size);
525 }
526 init_hits_signature (0);
527 init_summary ();
528 obstack_init (&tokens_obstack);
529
530 if (largest_member_file > MAX_LARGEST_MEMBER_FILE)
531 largest_member_file = MAX_LARGEST_MEMBER_FILE;
532 scanner_buffer = xmalloc (largest_member_file + 1);
533
534 for (;;)
535 {
536 const struct member_file *member = *members++;
537 scan_member_file (member);
538 if (members == end)
539 break;
540 if (current_hits_signature[0] & 0x80)
541 summarize ();
542 bump_current_hits_signature ();
543 }
544
545 free (scanner_buffer);
546 free (members_0);
547 }
548
549 /* Open a file and scan it. */
550
551 static void
scan_member_file(struct member_file const * member)552 scan_member_file (struct member_file const *member)
553 {
554 struct lang_args const *lang_args = member->mf_lang_args;
555 struct language const *lang = lang_args->la_language;
556 get_token_func_t get_token = lang->lg_get_token;
557 struct file_link *flink = member->mf_link;
558 struct stat st;
559 FILE *source_FILE;
560
561 chdir_to_link (flink->fl_parent);
562 source_FILE = fopen (flink->fl_name, "r");
563 if (source_FILE)
564 {
565 char *file_name = alloca (PATH_MAX);
566 if (statistics_flag)
567 {
568 if (fstat (fileno (source_FILE), &st) < 0)
569 {
570 maybe_relative_file_name (file_name, flink, cw_dlink);
571 error (0, errno, _("can't stat `%s'"), file_name);
572 }
573 else
574 input_chars += st.st_size;
575 }
576 if (verbose_flag)
577 {
578 maybe_relative_file_name (file_name, flink, cw_dlink);
579 printf ("%ld: %s: %s", member->mf_index, lang->lg_name, file_name);
580 fflush (stdout);
581 }
582 scan_member_file_1 (get_token, lang_args->la_args_digested, source_FILE);
583 if (verbose_flag)
584 putchar ('\n');
585 fclose (source_FILE);
586 }
587 else
588 error (0, errno, _("can't open `%s'"), flink->fl_name);
589 }
590
591 /* Iterate over all tokens in the file, and merge the file's tree8
592 signature into the token table entry. */
593
594 static void
scan_member_file_1(get_token_func_t get_token,void const * args,FILE * source_FILE)595 scan_member_file_1 (get_token_func_t get_token, void const *args, FILE *source_FILE)
596 {
597 struct token **slot;
598 struct token *token;
599 int flags;
600 int new_tokens = 0;
601 int distinct_tokens = 0;
602
603 while ((token = (*get_token) (source_FILE, args, &flags)) != NULL)
604 {
605 if (*TOKEN_NAME (token) == '\0') {
606 obstack_free (&tokens_obstack, token);
607 continue;
608 }
609 slot = (struct token **) hash_find_slot (&token_table, token);
610 if (HASH_VACANT (*slot))
611 {
612 token->tok_flags = flags;
613 token->tok_count = 1;
614 memset (TOKEN_HITS (token), 0, log_8_member_files);
615 sign_token (token);
616 if (verbose_flag)
617 {
618 distinct_tokens++;
619 new_tokens++;
620 }
621 hash_insert_at (&token_table, token, slot);
622 }
623 else
624 {
625 obstack_free (&tokens_obstack, token);
626 token = *slot;
627 token->tok_flags |= flags;
628 if (token->tok_count < USHRT_MAX)
629 token->tok_count++;
630 if (!(TOKEN_HITS (token)[0] & current_hits_signature[0]))
631 {
632 sign_token (token);
633 if (verbose_flag)
634 distinct_tokens++;
635 }
636 }
637 }
638 if (verbose_flag)
639 {
640 printf (_(" new = %d/%d"), new_tokens, distinct_tokens);
641 if (distinct_tokens != 0)
642 printf (" = %.0f%%", 100.0 * (double) new_tokens / (double) distinct_tokens);
643 }
644 }
645
646 static void
report_statistics(void)647 report_statistics (void)
648 {
649 printf (_("Name=%ld, "), name_tokens);
650 printf (_("Number=%ld, "), number_tokens);
651 printf (_("String=%ld, "), string_tokens);
652 printf (_("Literal=%ld, "), literal_tokens);
653 printf (_("Comment=%ld\n"), comment_tokens);
654
655 printf (_("Files=%ld, "), idh.idh_files);
656 printf (_("Tokens=%ld, "), occurrences);
657 printf (_("Bytes=%ld Kb, "), input_chars / 1024);
658 printf (_("Heap=%llu+%llu Kb, "),
659 (unsigned long long) ((char *) heap_after_scan
660 - (char *) heap_after_walk) / 1024,
661 (unsigned long long) ((char *) heap_after_walk
662 - (char *) heap_initial) / 1024);
663 printf (_("Output=%ld (%ld tok, %ld hit)\n"),
664 output_length, tokens_length, hits_length);
665
666 hash_print_stats (&token_table, stdout);
667 printf (_(", Freq=%ld/%ld=%.2f\n"), occurrences, token_table.ht_fill,
668 (double) occurrences / (double) token_table.ht_fill);
669 }
670
671 /* As the database is written, may need to adjust the file names. If
672 we are generating the ID file in a remote directory, then adjust
673 the file names to be relative to the location of the ID database.
674
675 (This would be a common useage if you want to make a database for a
676 directory which you have no write access to, so you cannot create
677 the ID file.) */
678
679 static void
write_id_file(struct idhead * idhp)680 write_id_file (struct idhead *idhp)
681 {
682 struct token **tokens;
683 int i;
684 int buf_size;
685 int vec_size;
686 int tok_size;
687 int max_buf_size = 0;
688 int max_vec_size = 0;
689
690 if (verbose_flag)
691 printf (_("Sorting tokens...\n"));
692
693 assert (summary_root->sum_hits_count == token_table.ht_fill);
694 tokens = xnrealloc (summary_root->sum_tokens,
695 token_table.ht_fill, sizeof *tokens);
696 qsort (tokens, token_table.ht_fill, sizeof (struct token *), token_qsort_cmp);
697
698 if (verbose_flag)
699 printf (_("Writing `%s'...\n"), idhp->idh_file_name);
700 idhp->idh_FILE = fopen (idhp->idh_file_name, "w+b");
701 if (idhp->idh_FILE == NULL)
702 error (EXIT_FAILURE, errno, _("can't create `%s'"), idhp->idh_file_name);
703
704 idhp->idh_magic[0] = IDH_MAGIC_0;
705 idhp->idh_magic[1] = IDH_MAGIC_1;
706 idhp->idh_version = IDH_VERSION;
707 idhp->idh_flags = IDH_COUNTS;
708
709 /* write out the list of pathnames */
710
711 fseek (idhp->idh_FILE, sizeof_idhead (), 0);
712 off_t off = ftello (idhp->idh_FILE);
713 if (UINT32_MAX < off)
714 error (EXIT_FAILURE, 0, _("internal limitation: offset of 2^32 or larger"));
715 idhp->idh_flinks_offset = off;
716 serialize_file_links (idhp);
717
718 /* write out the list of identifiers */
719
720 putc ('\0', idhp->idh_FILE);
721 putc ('\0', idhp->idh_FILE);
722 off = ftello (idhp->idh_FILE);
723 if (UINT32_MAX < off)
724 error (EXIT_FAILURE, 0, _("internal limitation: offset of 2^32 or larger"));
725 idhp->idh_tokens_offset = off;
726
727 for (i = 0; i < token_table.ht_fill; i++, tokens++)
728 {
729 struct token *token = *tokens;
730
731 occurrences += token->tok_count;
732 if (token->tok_flags & TOK_NUMBER)
733 number_tokens++;
734 if (token->tok_flags & TOK_NAME)
735 name_tokens++;
736 if (token->tok_flags & TOK_STRING)
737 string_tokens++;
738 if (token->tok_flags & TOK_LITERAL)
739 literal_tokens++;
740 if (token->tok_flags & TOK_COMMENT)
741 comment_tokens++;
742
743 fputs (TOKEN_NAME (token), idhp->idh_FILE);
744 putc ('\0', idhp->idh_FILE);
745 if (token->tok_count > 0xff)
746 token->tok_flags |= TOK_SHORT_COUNT;
747 putc (token->tok_flags, idhp->idh_FILE);
748 putc (token->tok_count & 0xff, idhp->idh_FILE);
749 if (token->tok_flags & TOK_SHORT_COUNT)
750 putc (token->tok_count >> 8, idhp->idh_FILE);
751
752 vec_size = count_vec_size (summary_root, TOKEN_HITS (token) + levels);
753 buf_size = count_buf_size (summary_root, TOKEN_HITS (token) + levels);
754 hits_length += buf_size;
755 tok_size = strlen (TOKEN_NAME (token)) + 1;
756 tokens_length += tok_size;
757 buf_size += tok_size + sizeof (token->tok_flags) + sizeof (token->tok_count) + 2;
758 if (buf_size > max_buf_size)
759 max_buf_size = buf_size;
760 if (vec_size > max_vec_size)
761 max_vec_size = vec_size;
762
763 write_hits (idhp->idh_FILE, summary_root, TOKEN_HITS (token) + levels);
764 putc ('\0', idhp->idh_FILE);
765 putc ('\0', idhp->idh_FILE);
766 }
767 assert (check_hits (summary_root) == 0);
768 idhp->idh_tokens = token_table.ht_fill;
769 off = ftello (idhp->idh_FILE);
770 if (UINT32_MAX < off)
771 error (EXIT_FAILURE, 0, _("internal limitation: offset of 2^32 or larger"));
772 output_length = off;
773 idhp->idh_end_offset = output_length - 2;
774 idhp->idh_buf_size = max_buf_size;
775 idhp->idh_vec_size = max_vec_size;
776
777 write_idhead (&idh);
778 if (ferror (idhp->idh_FILE) || fclose (idhp->idh_FILE) != 0)
779 error (EXIT_FAILURE, errno, _("error closing `%s'"), idhp->idh_file_name);
780 }
781
782 /* Define primary and secondary hash and comparison functions for the
783 token table. */
784
785 static unsigned long _GL_ATTRIBUTE_PURE
token_hash_1(void const * key)786 token_hash_1 (void const *key)
787 {
788 return_STRING_HASH_1 (TOKEN_NAME ((struct token const *) key));
789 }
790
791 static unsigned long _GL_ATTRIBUTE_PURE
token_hash_2(void const * key)792 token_hash_2 (void const *key)
793 {
794 return_STRING_HASH_2 (TOKEN_NAME ((struct token const *) key));
795 }
796
797 static int _GL_ATTRIBUTE_PURE
token_hash_cmp(void const * x,void const * y)798 token_hash_cmp (void const *x, void const *y)
799 {
800 return_STRING_COMPARE (TOKEN_NAME ((struct token const *) x),
801 TOKEN_NAME ((struct token const *) y));
802 }
803
804 static int _GL_ATTRIBUTE_PURE
token_qsort_cmp(void const * x,void const * y)805 token_qsort_cmp (void const *x, void const *y)
806 {
807 return_STRING_COMPARE (TOKEN_NAME (*(struct token const *const *) x),
808 TOKEN_NAME (*(struct token const *const *) y));
809 }
810
811
812 /****************************************************************************/
813
814 /* Advance the tree8 hit signature that corresponds to the file
815 we are now scanning. */
816
817 static void
bump_current_hits_signature(void)818 bump_current_hits_signature (void)
819 {
820 unsigned char *hits = current_hits_signature;
821 while (*hits & 0x80)
822 *hits++ = 1;
823 *hits <<= 1;
824 }
825
826 /* Initialize the tree8 hit signature for the first file we scan. */
827
828 static void
init_hits_signature(int i)829 init_hits_signature (int i)
830 {
831 unsigned char *hits = current_hits_signature;
832 unsigned char const *end = current_hits_signature + log_8_member_files;
833 while (hits < end)
834 {
835 *hits = 1 << (i & 7);
836 i >>= 3;
837 hits++;
838 }
839 }
840
841 static void
free_summary_tokens(void)842 free_summary_tokens (void)
843 {
844 struct summary *summary = summary_leaf;
845 while (summary != summary_root)
846 {
847 free (summary->sum_tokens);
848 summary = summary->sum_parent;
849 }
850 }
851
852 static void
summarize(void)853 summarize (void)
854 {
855 unsigned char const *hits_sig = current_hits_signature;
856 struct summary *summary = summary_leaf;
857
858 do
859 {
860 unsigned long count = summary->sum_hits_count;
861 unsigned char *hits = xmalloc (count + 1);
862 unsigned int level = summary->sum_level;
863 struct token **tokens = summary->sum_tokens;
864 unsigned long init_size = INIT_TOKENS_SIZE (summary->sum_level);
865
866 if (verbose_flag)
867 printf (_("level %d: %ld/%ld = %.0f%%\n"),
868 summary->sum_level, count, init_size,
869 100.0 * (double) count / (double) init_size);
870
871 qsort (tokens, count, sizeof (struct token *), token_qsort_cmp);
872 summary->sum_hits = hits;
873 while (count--)
874 {
875 unsigned char *hit = TOKEN_HITS (*tokens++) + level;
876 *hits++ = *hit;
877 *hit = 0;
878 }
879 *hits++ = 0;
880 if (summary->sum_parent)
881 {
882 free (summary->sum_tokens);
883 summary->sum_tokens = 0;
884 }
885 summary = summary->sum_parent;
886 }
887 while (*++hits_sig & 0x80);
888 summary_leaf = make_sibling_summary (summary_leaf);
889 }
890
891 static void
init_summary(void)892 init_summary (void)
893 {
894 unsigned long size = INIT_TOKENS_SIZE (0);
895 summary_root = summary_leaf = xcalloc (1, sizeof(struct summary));
896 summary_root->sum_tokens_size = size;
897 summary_root->sum_tokens = xmalloc (sizeof(struct token *) * size);
898 }
899
900 static struct summary *
make_sibling_summary(struct summary * summary)901 make_sibling_summary (struct summary *summary)
902 {
903 struct summary *parent = summary->sum_parent;
904 size_t size;
905
906 if (parent == NULL)
907 {
908 levels++;
909 summary_root = summary->sum_parent = parent
910 = xcalloc (1, sizeof(struct summary));
911 parent->sum_level = levels;
912 parent->sum_kids[0] = summary;
913 parent->sum_hits_count = summary->sum_hits_count;
914 parent->sum_free_index = 1;
915 size = INIT_TOKENS_SIZE (levels);
916 if (summary->sum_tokens_size >= size)
917 {
918 parent->sum_tokens_size = summary->sum_tokens_size;
919 parent->sum_tokens = summary->sum_tokens;
920 }
921 else
922 {
923 parent->sum_tokens_size = size;
924 parent->sum_tokens = xnrealloc (summary->sum_tokens, size,
925 sizeof *summary->sum_tokens);
926 }
927 summary->sum_tokens = 0;
928 }
929 if (parent->sum_free_index == 8)
930 parent = make_sibling_summary (parent);
931 summary = xcalloc (1, sizeof(struct summary));
932 summary->sum_level = parent->sum_level - 1;
933 parent->sum_kids[parent->sum_free_index++] = summary;
934 summary->sum_parent = parent;
935 size = INIT_TOKENS_SIZE (summary->sum_level);
936 summary->sum_tokens_size = size;
937 summary->sum_tokens = xmalloc (sizeof(struct token *) * size);
938 return summary;
939 }
940
941 static int _GL_ATTRIBUTE_PURE
count_vec_size(struct summary * summary,unsigned char const * tail_hits)942 count_vec_size (struct summary *summary, unsigned char const *tail_hits)
943 {
944 struct summary **kids;
945 unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits);
946
947 kids = summary->sum_kids;
948 if (*kids == NULL)
949 {
950 static char bits_per_nybble[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 };
951 return bits_per_nybble[hits & 0xf] + bits_per_nybble[hits >> 4];
952 }
953 else
954 {
955 int bit;
956 int count = 0;
957 --tail_hits;
958 for (bit = 1; bit & 0xff; bit <<= 1, ++kids)
959 if (bit & hits)
960 count += count_vec_size (*kids, tail_hits);
961 return count;
962 }
963 }
964
965 static int _GL_ATTRIBUTE_PURE
count_buf_size(struct summary * summary,unsigned char const * tail_hits)966 count_buf_size (struct summary *summary, unsigned char const *tail_hits)
967 {
968 struct summary **kids;
969 unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits);
970
971 kids = summary->sum_kids;
972 if (*kids == NULL)
973 return 1;
974 else
975 {
976 int bit;
977 int count = 1;
978 --tail_hits;
979 for (bit = 1; bit & 0xff; bit <<= 1, ++kids)
980 if (bit & hits)
981 count += count_buf_size (*kids, tail_hits);
982 return count;
983 }
984 }
985
986 /* Sanity-check hit counts. Return nonzero if there's a problem.
987 Otherwise, return 0. */
988 static int
check_hits(struct summary * summary)989 check_hits (struct summary* summary)
990 {
991 struct summary **kids = summary->sum_kids;
992 struct summary **end = &kids[8];
993
994 if ( ! (summary->sum_hits == NULL || *summary->sum_hits == 0))
995 return 1;
996
997 if (end[-1] == 0)
998 while (*--end == 0)
999 ;
1000 while (kids < end)
1001 if (check_hits (*kids++))
1002 return 1;
1003
1004 return 0;
1005 }
1006
1007 static void
write_hits(FILE * fp,struct summary * summary,unsigned char const * tail_hits)1008 write_hits (FILE *fp, struct summary *summary, unsigned char const *tail_hits)
1009 {
1010 struct summary **kids;
1011 unsigned int hits = (summary->sum_hits ? *summary->sum_hits++ : *tail_hits);
1012
1013 assert (hits);
1014 putc (hits, fp);
1015
1016 kids = summary->sum_kids;
1017 if (*kids)
1018 {
1019 int bit;
1020 --tail_hits;
1021 for (bit = 1; (bit & 0xff) && *kids; bit <<= 1, ++kids)
1022 if (bit & hits)
1023 write_hits (fp, *kids, tail_hits);
1024 }
1025 }
1026
1027 static void
sign_token(struct token * token)1028 sign_token (struct token *token)
1029 {
1030 unsigned char *tok_hits = TOKEN_HITS (token);
1031 unsigned char *hits_sig = current_hits_signature;
1032 unsigned char *end = current_hits_signature + log_8_member_files;
1033 struct summary *summary = summary_leaf;
1034
1035 while (summary)
1036 {
1037 if (*tok_hits == 0)
1038 add_token_to_summary (summary, token);
1039 if (*tok_hits & *hits_sig)
1040 break;
1041 *tok_hits |= *hits_sig;
1042 summary = summary->sum_parent;
1043 tok_hits++;
1044 hits_sig++;
1045 }
1046 while (hits_sig < end)
1047 {
1048 if (*tok_hits & *hits_sig)
1049 break;
1050 *tok_hits |= *hits_sig;
1051 tok_hits++;
1052 hits_sig++;
1053 }
1054 }
1055
1056 static void
add_token_to_summary(struct summary * summary,struct token * token)1057 add_token_to_summary (struct summary *summary, struct token *token)
1058 {
1059 size_t size = summary->sum_tokens_size;
1060
1061 if (summary->sum_hits_count >= size)
1062 {
1063 summary->sum_tokens = x2nrealloc (summary->sum_tokens, &size,
1064 sizeof *summary->sum_tokens);
1065 summary->sum_tokens_size = size;
1066 }
1067 summary->sum_tokens[summary->sum_hits_count++] = token;
1068 }
1069