1 // -*- C++ -*-
2 /* Copyright (C) 1989-2018 Free Software Foundation, Inc.
3 Written by James Clark (jjc@jclark.com)
4
5 This file is part of groff.
6
7 groff is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 groff is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19
20 #include "lib.h"
21
22 #include <stdlib.h>
23 #include <assert.h>
24 #include <errno.h>
25
26 #include "posix.h"
27 #include "errarg.h"
28 #include "error.h"
29 #include "stringclass.h"
30 #include "cset.h"
31 #include "cmap.h"
32
33 #include "defs.h"
34 #include "index.h"
35
36 #include "nonposix.h"
37
38 extern "C" const char *Version_string;
39
40 #define DEFAULT_HASH_TABLE_SIZE 997
41 #define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
42
43 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
44
45 #define MALLOC_OVERHEAD 16
46
47 #ifdef BLOCK_SIZE
48 #undef BLOCK_SIZE
49 #endif
50
51 const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
52 - sizeof(int)) / sizeof(int));
53 struct block {
54 block *next;
55 int used;
56 int v[BLOCK_SIZE];
57
blockblock58 block(block *p = 0) : next(p), used(0) { }
59 };
60
61 struct block;
62
63 union table_entry {
64 block *ptr;
65 int count;
66 };
67
68 struct word_list {
69 word_list *next;
70 char *str;
71 int len;
72 word_list(const char *, int, word_list *);
73 };
74
75 table_entry *hash_table;
76 int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
77 // We make this the same size as hash_table so we only have to do one
78 // mod per key.
79 static word_list **common_words_table = 0;
80 char *key_buffer;
81
82 FILE *indxfp;
83 int ntags = 0;
84 string filenames;
85 char *temp_index_file = 0;
86
87 const char *ignore_fields = "XYZ";
88 const char *common_words_file = COMMON_WORDS_FILE;
89 int n_ignore_words = 100;
90 int truncate_len = 6;
91 int shortest_len = 3;
92 int max_keys_per_item = 100;
93
94 static void usage(FILE *stream);
95 static void write_hash_table();
96 static void init_hash_table();
97 static void read_common_words_file();
98 static int store_key(char *s, int len);
99 static void possibly_store_key(char *s, int len);
100 static int do_whole_file(const char *filename);
101 static int do_file(const char *filename);
102 static void store_reference(int filename_index, int pos, int len);
103 static void check_integer_arg(char opt, const char *arg, int min, int *res);
104 static void store_filename(const char *);
105 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
106 static char *get_cwd();
107
108 extern "C" {
109 void cleanup();
110 void catch_fatal_signals();
111 void ignore_fatal_signals();
112 }
113
main(int argc,char ** argv)114 int main(int argc, char **argv)
115 {
116 program_name = argv[0];
117 static char stderr_buf[BUFSIZ];
118 setbuf(stderr, stderr_buf);
119
120 const char *base_name = 0;
121 typedef int (*parser_t)(const char *);
122 parser_t parser = do_file;
123 const char *directory = 0;
124 const char *foption = 0;
125 int opt;
126 static const struct option long_options[] = {
127 { "help", no_argument, 0, CHAR_MAX + 1 },
128 { "version", no_argument, 0, 'v' },
129 { NULL, 0, 0, 0 }
130 };
131 while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw",
132 long_options, NULL))
133 != EOF)
134 switch (opt) {
135 case 'c':
136 common_words_file = optarg;
137 break;
138 case 'd':
139 directory = optarg;
140 break;
141 case 'f':
142 foption = optarg;
143 break;
144 case 'h':
145 check_integer_arg('h', optarg, 1, &hash_table_size);
146 if (!is_prime(hash_table_size)) {
147 while (!is_prime(++hash_table_size))
148 ;
149 warning("%1 not prime: using %2 instead", optarg, hash_table_size);
150 }
151 break;
152 case 'i':
153 ignore_fields = optarg;
154 break;
155 case 'k':
156 check_integer_arg('k', optarg, 1, &max_keys_per_item);
157 break;
158 case 'l':
159 check_integer_arg('l', optarg, 0, &shortest_len);
160 break;
161 case 'n':
162 check_integer_arg('n', optarg, 0, &n_ignore_words);
163 break;
164 case 'o':
165 base_name = optarg;
166 break;
167 case 't':
168 check_integer_arg('t', optarg, 1, &truncate_len);
169 break;
170 case 'w':
171 parser = do_whole_file;
172 break;
173 case 'v':
174 printf("GNU indxbib (groff) version %s\n", Version_string);
175 exit(0);
176 break;
177 case CHAR_MAX + 1: // --help
178 usage(stdout);
179 exit(0);
180 break;
181 case '?':
182 usage(stderr);
183 exit(1);
184 break;
185 default:
186 assert(0);
187 break;
188 }
189 if (optind >= argc && foption == 0)
190 fatal("no files and no -f option");
191 if (!directory) {
192 char *path = get_cwd();
193 store_filename(path);
194 a_delete path;
195 }
196 else
197 store_filename(directory);
198 init_hash_table();
199 store_filename(common_words_file);
200 store_filename(ignore_fields);
201 key_buffer = new char[truncate_len];
202 read_common_words_file();
203 if (!base_name)
204 base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
205 const char *p = strrchr(base_name, DIR_SEPS[0]), *p1;
206 const char *sep = &DIR_SEPS[1];
207 while (*sep) {
208 p1 = strrchr(base_name, *sep);
209 if (p1 && (!p || p1 > p))
210 p = p1;
211 sep++;
212 }
213 size_t name_max;
214 if (p) {
215 char *dir = strsave(base_name);
216 dir[p - base_name] = '\0';
217 name_max = file_name_max(dir);
218 a_delete dir;
219 }
220 else
221 name_max = file_name_max(".");
222 const char *filename = p ? p + 1 : base_name;
223 if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
224 fatal("'%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
225 if (p) {
226 p++;
227 temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)];
228 memcpy(temp_index_file, base_name, p - base_name);
229 strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE);
230 }
231 else {
232 temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
233 }
234 catch_fatal_signals();
235 int fd = mkstemp(temp_index_file);
236 if (fd < 0)
237 fatal("can't create temporary index file: %1", strerror(errno));
238 indxfp = fdopen(fd, FOPEN_WB);
239 if (indxfp == 0)
240 fatal("fdopen failed");
241 if (fseek(indxfp, sizeof(index_header), 0) < 0)
242 fatal("can't seek past index header: %1", strerror(errno));
243 int failed = 0;
244 if (foption) {
245 FILE *fp = stdin;
246 if (strcmp(foption, "-") != 0) {
247 errno = 0;
248 fp = fopen(foption, "r");
249 if (!fp)
250 fatal("can't open '%1': %2", foption, strerror(errno));
251 }
252 string path;
253 int lineno = 1;
254 for (;;) {
255 int c;
256 for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
257 if (c == '\0')
258 error_with_file_and_line(foption, lineno,
259 "nul character in pathname ignored");
260 else
261 path += c;
262 }
263 if (path.length() > 0) {
264 path += '\0';
265 if (!(*parser)(path.contents()))
266 failed = 1;
267 path.clear();
268 }
269 if (c == EOF)
270 break;
271 lineno++;
272 }
273 if (fp != stdin)
274 fclose(fp);
275 }
276 for (int i = optind; i < argc; i++)
277 if (!(*parser)(argv[i]))
278 failed = 1;
279 write_hash_table();
280 if (fclose(indxfp) < 0)
281 fatal("error closing temporary index file: %1", strerror(errno));
282 char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)];
283 strcpy(index_file, base_name);
284 strcat(index_file, INDEX_SUFFIX);
285 #ifdef HAVE_RENAME
286 #ifdef __EMX__
287 if (access(index_file, R_OK) == 0)
288 unlink(index_file);
289 #endif /* __EMX__ */
290 if (rename(temp_index_file, index_file) < 0) {
291 #ifdef __MSDOS__
292 // RENAME could fail on plain MSDOS filesystems because
293 // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
294 char *fname = p ? index_file + (p - base_name) : 0;
295 char *dot = 0;
296
297 // Replace the dot with an underscore and try again.
298 if (fname
299 && (dot = strchr(fname, '.')) != 0
300 && strcmp(dot, INDEX_SUFFIX) != 0)
301 *dot = '_';
302 if (rename(temp_index_file, index_file) < 0)
303 #endif
304 fatal("can't rename temporary index file: %1", strerror(errno));
305 }
306 #else /* not HAVE_RENAME */
307 ignore_fatal_signals();
308 if (unlink(index_file) < 0) {
309 if (errno != ENOENT)
310 fatal("can't unlink '%1': %2", index_file, strerror(errno));
311 }
312 if (link(temp_index_file, index_file) < 0)
313 fatal("can't link temporary index file: %1", strerror(errno));
314 if (unlink(temp_index_file) < 0)
315 fatal("can't unlink temporary index file: %1", strerror(errno));
316 #endif /* not HAVE_RENAME */
317 temp_index_file = 0;
318 return failed;
319 }
320
usage(FILE * stream)321 static void usage(FILE *stream)
322 {
323 fprintf(stream,
324 "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
325 " [-l n] [-n n] [-o base] [-t n] [files...]\n",
326 program_name);
327 }
328
check_integer_arg(char opt,const char * arg,int min,int * res)329 static void check_integer_arg(char opt, const char *arg, int min, int *res)
330 {
331 char *ptr;
332 long n = strtol(arg, &ptr, 10);
333 if (n == 0 && ptr == arg)
334 error("argument to -%1 not an integer", opt);
335 else if (n < min)
336 error("argument to -%1 must not be less than %2", opt, min);
337 else {
338 if (n > INT_MAX)
339 error("argument to -%1 greater than maximum integer", opt);
340 else if (*ptr != '\0')
341 error("junk after integer argument to -%1", opt);
342 *res = int(n);
343 }
344 }
345
get_cwd()346 static char *get_cwd()
347 {
348 char *buf;
349 int size = 12;
350
351 for (;;) {
352 buf = new char[size];
353 if (getcwd(buf, size))
354 break;
355 if (errno != ERANGE)
356 fatal("cannot get current working directory: %1", strerror(errno));
357 a_delete buf;
358 if (size == INT_MAX)
359 fatal("current working directory longer than INT_MAX");
360 if (size > INT_MAX/2)
361 size = INT_MAX;
362 else
363 size *= 2;
364 }
365 return buf;
366 }
367
word_list(const char * s,int n,word_list * p)368 word_list::word_list(const char *s, int n, word_list *p)
369 : next(p), len(n)
370 {
371 str = new char[n];
372 memcpy(str, s, n);
373 }
374
read_common_words_file()375 static void read_common_words_file()
376 {
377 if (n_ignore_words <= 0)
378 return;
379 errno = 0;
380 FILE *fp = fopen(common_words_file, "r");
381 if (!fp)
382 fatal("can't open '%1': %2", common_words_file, strerror(errno));
383 common_words_table = new word_list * [hash_table_size];
384 for (int i = 0; i < hash_table_size; i++)
385 common_words_table[i] = 0;
386 int count = 0;
387 int key_len = 0;
388 for (;;) {
389 int c = getc(fp);
390 while (c != EOF && !csalnum(c))
391 c = getc(fp);
392 if (c == EOF)
393 break;
394 do {
395 if (key_len < truncate_len)
396 key_buffer[key_len++] = cmlower(c);
397 c = getc(fp);
398 } while (c != EOF && csalnum(c));
399 if (key_len >= shortest_len) {
400 int h = hash(key_buffer, key_len) % hash_table_size;
401 common_words_table[h] = new word_list(key_buffer, key_len,
402 common_words_table[h]);
403 }
404 if (++count >= n_ignore_words)
405 break;
406 key_len = 0;
407 if (c == EOF)
408 break;
409 }
410 n_ignore_words = count;
411 fclose(fp);
412 }
413
do_whole_file(const char * filename)414 static int do_whole_file(const char *filename)
415 {
416 errno = 0;
417 FILE *fp = fopen(filename, "r");
418 if (!fp) {
419 error("can't open '%1': %2", filename, strerror(errno));
420 return 0;
421 }
422 int count = 0;
423 int key_len = 0;
424 int c;
425 while ((c = getc(fp)) != EOF) {
426 if (csalnum(c)) {
427 key_len = 1;
428 key_buffer[0] = c;
429 while ((c = getc(fp)) != EOF) {
430 if (!csalnum(c))
431 break;
432 if (key_len < truncate_len)
433 key_buffer[key_len++] = c;
434 }
435 if (store_key(key_buffer, key_len)) {
436 if (++count >= max_keys_per_item)
437 break;
438 }
439 if (c == EOF)
440 break;
441 }
442 }
443 store_reference(filenames.length(), 0, 0);
444 store_filename(filename);
445 fclose(fp);
446 return 1;
447 }
448
do_file(const char * filename)449 static int do_file(const char *filename)
450 {
451 errno = 0;
452 // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
453 // byte counts to be consistent with fseek.
454 FILE *fp = fopen(filename, FOPEN_RB);
455 if (fp == 0) {
456 error("can't open '%1': %2", filename, strerror(errno));
457 return 0;
458 }
459 int filename_index = filenames.length();
460 store_filename(filename);
461
462 enum {
463 START, // at the start of the file; also in between references
464 BOL, // in the middle of a reference, at the beginning of the line
465 PERCENT, // seen a percent at the beginning of the line
466 IGNORE, // ignoring a field
467 IGNORE_BOL, // at the beginning of a line ignoring a field
468 KEY, // in the middle of a key
469 DISCARD, // after truncate_len bytes of a key
470 MIDDLE // in between keys
471 } state = START;
472
473 // In states START, BOL, IGNORE_BOL, space_count how many spaces at
474 // the beginning have been seen. In states PERCENT, IGNORE, KEY,
475 // MIDDLE space_count must be 0.
476 int space_count = 0;
477 int byte_count = 0; // bytes read
478 int key_len = 0;
479 int ref_start = -1; // position of start of current reference
480 for (;;) {
481 int c = getc(fp);
482 if (c == EOF)
483 break;
484 // We opened the file in binary mode, so we need to skip
485 // every CR character before a Newline.
486 if (c == '\r') {
487 int peek = getc(fp);
488 if (peek == '\n') {
489 byte_count++;
490 c = peek;
491 }
492 else
493 ungetc(peek, fp);
494 }
495 #if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
496 else if (c == 0x1a) // ^Z means EOF in text files
497 break;
498 #endif
499 byte_count++;
500 switch (state) {
501 case START:
502 if (c == ' ' || c == '\t') {
503 space_count++;
504 break;
505 }
506 if (c == '\n') {
507 space_count = 0;
508 break;
509 }
510 ref_start = byte_count - space_count - 1;
511 space_count = 0;
512 if (c == '%')
513 state = PERCENT;
514 else if (csalnum(c)) {
515 state = KEY;
516 key_buffer[0] = c;
517 key_len = 1;
518 }
519 else
520 state = MIDDLE;
521 break;
522 case BOL:
523 switch (c) {
524 case '%':
525 if (space_count > 0) {
526 space_count = 0;
527 state = MIDDLE;
528 }
529 else
530 state = PERCENT;
531 break;
532 case ' ':
533 case '\t':
534 space_count++;
535 break;
536 case '\n':
537 store_reference(filename_index, ref_start,
538 byte_count - 1 - space_count - ref_start);
539 state = START;
540 space_count = 0;
541 break;
542 default:
543 space_count = 0;
544 if (csalnum(c)) {
545 state = KEY;
546 key_buffer[0] = c;
547 key_len = 1;
548 }
549 else
550 state = MIDDLE;
551 }
552 break;
553 case PERCENT:
554 if (strchr(ignore_fields, c) != 0)
555 state = IGNORE;
556 else if (c == '\n')
557 state = BOL;
558 else
559 state = MIDDLE;
560 break;
561 case IGNORE:
562 if (c == '\n')
563 state = IGNORE_BOL;
564 break;
565 case IGNORE_BOL:
566 switch (c) {
567 case '%':
568 if (space_count > 0) {
569 state = IGNORE;
570 space_count = 0;
571 }
572 else
573 state = PERCENT;
574 break;
575 case ' ':
576 case '\t':
577 space_count++;
578 break;
579 case '\n':
580 store_reference(filename_index, ref_start,
581 byte_count - 1 - space_count - ref_start);
582 state = START;
583 space_count = 0;
584 break;
585 default:
586 space_count = 0;
587 state = IGNORE;
588 }
589 break;
590 case KEY:
591 if (csalnum(c)) {
592 if (key_len < truncate_len)
593 key_buffer[key_len++] = c;
594 else
595 state = DISCARD;
596 }
597 else {
598 possibly_store_key(key_buffer, key_len);
599 key_len = 0;
600 if (c == '\n')
601 state = BOL;
602 else
603 state = MIDDLE;
604 }
605 break;
606 case DISCARD:
607 if (!csalnum(c)) {
608 possibly_store_key(key_buffer, key_len);
609 key_len = 0;
610 if (c == '\n')
611 state = BOL;
612 else
613 state = MIDDLE;
614 }
615 break;
616 case MIDDLE:
617 if (csalnum(c)) {
618 state = KEY;
619 key_buffer[0] = c;
620 key_len = 1;
621 }
622 else if (c == '\n')
623 state = BOL;
624 break;
625 default:
626 assert(0);
627 }
628 }
629 switch (state) {
630 case START:
631 break;
632 case DISCARD:
633 case KEY:
634 possibly_store_key(key_buffer, key_len);
635 // fall through
636 case BOL:
637 case PERCENT:
638 case IGNORE_BOL:
639 case IGNORE:
640 case MIDDLE:
641 store_reference(filename_index, ref_start,
642 byte_count - ref_start - space_count);
643 break;
644 default:
645 assert(0);
646 }
647 fclose(fp);
648 return 1;
649 }
650
store_reference(int filename_index,int pos,int len)651 static void store_reference(int filename_index, int pos, int len)
652 {
653 tag t;
654 t.filename_index = filename_index;
655 t.start = pos;
656 t.length = len;
657 fwrite_or_die(&t, sizeof(t), 1, indxfp);
658 ntags++;
659 }
660
store_filename(const char * fn)661 static void store_filename(const char *fn)
662 {
663 filenames += fn;
664 filenames += '\0';
665 }
666
init_hash_table()667 static void init_hash_table()
668 {
669 hash_table = new table_entry[hash_table_size];
670 for (int i = 0; i < hash_table_size; i++)
671 hash_table[i].ptr = 0;
672 }
673
possibly_store_key(char * s,int len)674 static void possibly_store_key(char *s, int len)
675 {
676 static int last_tagno = -1;
677 static int key_count;
678 if (last_tagno != ntags) {
679 last_tagno = ntags;
680 key_count = 0;
681 }
682 if (key_count < max_keys_per_item) {
683 if (store_key(s, len))
684 key_count++;
685 }
686 }
687
store_key(char * s,int len)688 static int store_key(char *s, int len)
689 {
690 if (len < shortest_len)
691 return 0;
692 int is_number = 1;
693 for (int i = 0; i < len; i++)
694 if (!csdigit(s[i])) {
695 is_number = 0;
696 s[i] = cmlower(s[i]);
697 }
698 if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
699 return 0;
700 int h = hash(s, len) % hash_table_size;
701 if (common_words_table) {
702 for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
703 if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
704 return 0;
705 }
706 table_entry *pp = hash_table + h;
707 if (!pp->ptr)
708 pp->ptr = new block;
709 else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
710 return 1;
711 else if (pp->ptr->used >= BLOCK_SIZE)
712 pp->ptr = new block(pp->ptr);
713 pp->ptr->v[(pp->ptr->used)++] = ntags;
714 return 1;
715 }
716
write_hash_table()717 static void write_hash_table()
718 {
719 const int minus_one = -1;
720 int li = 0;
721 for (int i = 0; i < hash_table_size; i++) {
722 block *ptr = hash_table[i].ptr;
723 if (!ptr)
724 hash_table[i].count = -1;
725 else {
726 hash_table[i].count = li;
727 block *rev = 0;
728 while (ptr) {
729 block *tem = ptr;
730 ptr = ptr->next;
731 tem->next = rev;
732 rev = tem;
733 }
734 while (rev) {
735 fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
736 li += rev->used;
737 block *tem = rev;
738 rev = rev->next;
739 delete tem;
740 }
741 fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
742 li += 1;
743 }
744 }
745 if (sizeof(table_entry) == sizeof(int))
746 fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
747 else {
748 // write it out word by word
749 for (int i = 0; i < hash_table_size; i++)
750 fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
751 }
752 fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
753 if (fseek(indxfp, 0, 0) < 0)
754 fatal("error seeking on index file: %1", strerror(errno));
755 index_header h;
756 h.magic = INDEX_MAGIC;
757 h.version = INDEX_VERSION;
758 h.tags_size = ntags;
759 h.lists_size = li;
760 h.table_size = hash_table_size;
761 h.strings_size = filenames.length();
762 h.truncate = truncate_len;
763 h.shortest = shortest_len;
764 h.common = n_ignore_words;
765 fwrite_or_die(&h, sizeof(h), 1, indxfp);
766 }
767
fwrite_or_die(const void * ptr,int size,int nitems,FILE * fp)768 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
769 {
770 if (fwrite(ptr, size, nitems, fp) != (size_t)nitems)
771 fatal("fwrite failed: %1", strerror(errno));
772 }
773
fatal_error_exit()774 void fatal_error_exit()
775 {
776 cleanup();
777 exit(3);
778 }
779
780 extern "C" {
781
cleanup()782 void cleanup()
783 {
784 if (temp_index_file)
785 unlink(temp_index_file);
786 }
787
788 }
789