1 // -*- C++ -*-
2 /* Copyright (C) 1989-2018 Free Software Foundation, Inc.
3      Written by James Clark (jjc@jclark.com)
4 
5 This file is part of groff.
6 
7 groff is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11 
12 groff is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
19 
20 #include "lib.h"
21 
22 #include <stdlib.h>
23 #include <assert.h>
24 #include <errno.h>
25 
26 #include "posix.h"
27 #include "errarg.h"
28 #include "error.h"
29 #include "stringclass.h"
30 #include "cset.h"
31 #include "cmap.h"
32 
33 #include "defs.h"
34 #include "index.h"
35 
36 #include "nonposix.h"
37 
38 extern "C" const char *Version_string;
39 
40 #define DEFAULT_HASH_TABLE_SIZE 997
41 #define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
42 
43 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
44 
45 #define MALLOC_OVERHEAD 16
46 
47 #ifdef BLOCK_SIZE
48 #undef BLOCK_SIZE
49 #endif
50 
51 const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
52 			 - sizeof(int)) / sizeof(int));
53 struct block {
54   block *next;
55   int used;
56   int v[BLOCK_SIZE];
57 
blockblock58   block(block *p = 0) : next(p), used(0) { }
59 };
60 
61 struct block;
62 
63 union table_entry {
64   block *ptr;
65   int count;
66 };
67 
68 struct word_list {
69   word_list *next;
70   char *str;
71   int len;
72   word_list(const char *, int, word_list *);
73 };
74 
75 table_entry *hash_table;
76 int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
77 // We make this the same size as hash_table so we only have to do one
78 // mod per key.
79 static word_list **common_words_table = 0;
80 char *key_buffer;
81 
82 FILE *indxfp;
83 int ntags = 0;
84 string filenames;
85 char *temp_index_file = 0;
86 
87 const char *ignore_fields = "XYZ";
88 const char *common_words_file = COMMON_WORDS_FILE;
89 int n_ignore_words = 100;
90 int truncate_len = 6;
91 int shortest_len = 3;
92 int max_keys_per_item = 100;
93 
94 static void usage(FILE *stream);
95 static void write_hash_table();
96 static void init_hash_table();
97 static void read_common_words_file();
98 static int store_key(char *s, int len);
99 static void possibly_store_key(char *s, int len);
100 static int do_whole_file(const char *filename);
101 static int do_file(const char *filename);
102 static void store_reference(int filename_index, int pos, int len);
103 static void check_integer_arg(char opt, const char *arg, int min, int *res);
104 static void store_filename(const char *);
105 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
106 static char *get_cwd();
107 
108 extern "C" {
109   void cleanup();
110   void catch_fatal_signals();
111   void ignore_fatal_signals();
112 }
113 
main(int argc,char ** argv)114 int main(int argc, char **argv)
115 {
116   program_name = argv[0];
117   static char stderr_buf[BUFSIZ];
118   setbuf(stderr, stderr_buf);
119 
120   const char *base_name = 0;
121   typedef int (*parser_t)(const char *);
122   parser_t parser = do_file;
123   const char *directory = 0;
124   const char *foption = 0;
125   int opt;
126   static const struct option long_options[] = {
127     { "help", no_argument, 0, CHAR_MAX + 1 },
128     { "version", no_argument, 0, 'v' },
129     { NULL, 0, 0, 0 }
130   };
131   while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw",
132 			    long_options, NULL))
133 	 != EOF)
134     switch (opt) {
135     case 'c':
136       common_words_file = optarg;
137       break;
138     case 'd':
139       directory = optarg;
140       break;
141     case 'f':
142       foption = optarg;
143       break;
144     case 'h':
145       check_integer_arg('h', optarg, 1, &hash_table_size);
146       if (!is_prime(hash_table_size)) {
147 	while (!is_prime(++hash_table_size))
148 	  ;
149 	warning("%1 not prime: using %2 instead", optarg, hash_table_size);
150       }
151       break;
152     case 'i':
153       ignore_fields = optarg;
154       break;
155     case 'k':
156       check_integer_arg('k', optarg, 1, &max_keys_per_item);
157       break;
158     case 'l':
159       check_integer_arg('l', optarg, 0, &shortest_len);
160       break;
161     case 'n':
162       check_integer_arg('n', optarg, 0, &n_ignore_words);
163       break;
164     case 'o':
165       base_name = optarg;
166       break;
167     case 't':
168       check_integer_arg('t', optarg, 1, &truncate_len);
169       break;
170     case 'w':
171       parser = do_whole_file;
172       break;
173     case 'v':
174       printf("GNU indxbib (groff) version %s\n", Version_string);
175       exit(0);
176       break;
177     case CHAR_MAX + 1: // --help
178       usage(stdout);
179       exit(0);
180       break;
181     case '?':
182       usage(stderr);
183       exit(1);
184       break;
185     default:
186       assert(0);
187       break;
188     }
189   if (optind >= argc && foption == 0)
190     fatal("no files and no -f option");
191   if (!directory) {
192     char *path = get_cwd();
193     store_filename(path);
194     a_delete path;
195   }
196   else
197     store_filename(directory);
198   init_hash_table();
199   store_filename(common_words_file);
200   store_filename(ignore_fields);
201   key_buffer = new char[truncate_len];
202   read_common_words_file();
203   if (!base_name)
204     base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
205   const char *p = strrchr(base_name, DIR_SEPS[0]), *p1;
206   const char *sep = &DIR_SEPS[1];
207   while (*sep) {
208     p1 = strrchr(base_name, *sep);
209     if (p1 && (!p || p1 > p))
210       p = p1;
211     sep++;
212   }
213   size_t name_max;
214   if (p) {
215     char *dir = strsave(base_name);
216     dir[p - base_name] = '\0';
217     name_max = file_name_max(dir);
218     a_delete dir;
219   }
220   else
221     name_max = file_name_max(".");
222   const char *filename = p ? p + 1 : base_name;
223   if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
224     fatal("'%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
225   if (p) {
226     p++;
227     temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)];
228     memcpy(temp_index_file, base_name, p - base_name);
229     strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE);
230   }
231   else {
232     temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
233   }
234   catch_fatal_signals();
235   int fd = mkstemp(temp_index_file);
236   if (fd < 0)
237     fatal("can't create temporary index file: %1", strerror(errno));
238   indxfp = fdopen(fd, FOPEN_WB);
239   if (indxfp == 0)
240     fatal("fdopen failed");
241   if (fseek(indxfp, sizeof(index_header), 0) < 0)
242     fatal("can't seek past index header: %1", strerror(errno));
243   int failed = 0;
244   if (foption) {
245     FILE *fp = stdin;
246     if (strcmp(foption, "-") != 0) {
247       errno = 0;
248       fp = fopen(foption, "r");
249       if (!fp)
250 	fatal("can't open '%1': %2", foption, strerror(errno));
251     }
252     string path;
253     int lineno = 1;
254     for (;;) {
255       int c;
256       for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
257 	if (c == '\0')
258 	  error_with_file_and_line(foption, lineno,
259 				   "nul character in pathname ignored");
260 	else
261 	  path += c;
262       }
263       if (path.length() > 0) {
264 	path += '\0';
265 	if (!(*parser)(path.contents()))
266 	  failed = 1;
267 	path.clear();
268       }
269       if (c == EOF)
270 	break;
271       lineno++;
272     }
273     if (fp != stdin)
274       fclose(fp);
275   }
276   for (int i = optind; i < argc; i++)
277     if (!(*parser)(argv[i]))
278       failed = 1;
279   write_hash_table();
280   if (fclose(indxfp) < 0)
281     fatal("error closing temporary index file: %1", strerror(errno));
282   char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)];
283   strcpy(index_file, base_name);
284   strcat(index_file, INDEX_SUFFIX);
285 #ifdef HAVE_RENAME
286 #ifdef __EMX__
287   if (access(index_file, R_OK) == 0)
288     unlink(index_file);
289 #endif /* __EMX__ */
290   if (rename(temp_index_file, index_file) < 0) {
291 #ifdef __MSDOS__
292     // RENAME could fail on plain MSDOS filesystems because
293     // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
294     char *fname = p ? index_file + (p - base_name) : 0;
295     char *dot = 0;
296 
297     // Replace the dot with an underscore and try again.
298     if (fname
299         && (dot = strchr(fname, '.')) != 0
300         && strcmp(dot, INDEX_SUFFIX) != 0)
301       *dot = '_';
302     if (rename(temp_index_file, index_file) < 0)
303 #endif
304     fatal("can't rename temporary index file: %1", strerror(errno));
305   }
306 #else /* not HAVE_RENAME */
307   ignore_fatal_signals();
308   if (unlink(index_file) < 0) {
309     if (errno != ENOENT)
310       fatal("can't unlink '%1': %2", index_file, strerror(errno));
311   }
312   if (link(temp_index_file, index_file) < 0)
313     fatal("can't link temporary index file: %1", strerror(errno));
314   if (unlink(temp_index_file) < 0)
315     fatal("can't unlink temporary index file: %1", strerror(errno));
316 #endif /* not HAVE_RENAME */
317   temp_index_file = 0;
318   return failed;
319 }
320 
usage(FILE * stream)321 static void usage(FILE *stream)
322 {
323   fprintf(stream,
324 "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
325 "       [-l n] [-n n] [-o base] [-t n] [files...]\n",
326 	  program_name);
327 }
328 
check_integer_arg(char opt,const char * arg,int min,int * res)329 static void check_integer_arg(char opt, const char *arg, int min, int *res)
330 {
331   char *ptr;
332   long n = strtol(arg, &ptr, 10);
333   if (n == 0 && ptr == arg)
334     error("argument to -%1 not an integer", opt);
335   else if (n < min)
336     error("argument to -%1 must not be less than %2", opt, min);
337   else {
338     if (n > INT_MAX)
339       error("argument to -%1 greater than maximum integer", opt);
340     else if (*ptr != '\0')
341       error("junk after integer argument to -%1", opt);
342     *res = int(n);
343   }
344 }
345 
get_cwd()346 static char *get_cwd()
347 {
348   char *buf;
349   int size = 12;
350 
351   for (;;) {
352     buf = new char[size];
353     if (getcwd(buf, size))
354       break;
355     if (errno != ERANGE)
356       fatal("cannot get current working directory: %1", strerror(errno));
357     a_delete buf;
358     if (size == INT_MAX)
359       fatal("current working directory longer than INT_MAX");
360     if (size > INT_MAX/2)
361       size = INT_MAX;
362     else
363       size *= 2;
364   }
365   return buf;
366 }
367 
word_list(const char * s,int n,word_list * p)368 word_list::word_list(const char *s, int n, word_list *p)
369 : next(p), len(n)
370 {
371   str = new char[n];
372   memcpy(str, s, n);
373 }
374 
read_common_words_file()375 static void read_common_words_file()
376 {
377   if (n_ignore_words <= 0)
378     return;
379   errno = 0;
380   FILE *fp = fopen(common_words_file, "r");
381   if (!fp)
382     fatal("can't open '%1': %2", common_words_file, strerror(errno));
383   common_words_table = new word_list * [hash_table_size];
384   for (int i = 0; i < hash_table_size; i++)
385     common_words_table[i] = 0;
386   int count = 0;
387   int key_len = 0;
388   for (;;) {
389     int c = getc(fp);
390     while (c != EOF && !csalnum(c))
391       c = getc(fp);
392     if (c == EOF)
393       break;
394     do {
395       if (key_len < truncate_len)
396 	key_buffer[key_len++] = cmlower(c);
397       c = getc(fp);
398     } while (c != EOF && csalnum(c));
399     if (key_len >= shortest_len) {
400       int h = hash(key_buffer, key_len) % hash_table_size;
401       common_words_table[h] = new word_list(key_buffer, key_len,
402 					    common_words_table[h]);
403     }
404     if (++count >= n_ignore_words)
405       break;
406     key_len = 0;
407     if (c == EOF)
408       break;
409   }
410   n_ignore_words = count;
411   fclose(fp);
412 }
413 
do_whole_file(const char * filename)414 static int do_whole_file(const char *filename)
415 {
416   errno = 0;
417   FILE *fp = fopen(filename, "r");
418   if (!fp) {
419     error("can't open '%1': %2", filename, strerror(errno));
420     return 0;
421   }
422   int count = 0;
423   int key_len = 0;
424   int c;
425   while ((c = getc(fp)) != EOF) {
426     if (csalnum(c)) {
427       key_len = 1;
428       key_buffer[0] = c;
429       while ((c = getc(fp)) != EOF) {
430 	if (!csalnum(c))
431 	  break;
432 	if (key_len < truncate_len)
433 	  key_buffer[key_len++] = c;
434       }
435       if (store_key(key_buffer, key_len)) {
436 	if (++count >= max_keys_per_item)
437 	  break;
438       }
439       if (c == EOF)
440 	break;
441     }
442   }
443   store_reference(filenames.length(), 0, 0);
444   store_filename(filename);
445   fclose(fp);
446   return 1;
447 }
448 
do_file(const char * filename)449 static int do_file(const char *filename)
450 {
451   errno = 0;
452   // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
453   // byte counts to be consistent with fseek.
454   FILE *fp = fopen(filename, FOPEN_RB);
455   if (fp == 0) {
456     error("can't open '%1': %2", filename, strerror(errno));
457     return 0;
458   }
459   int filename_index = filenames.length();
460   store_filename(filename);
461 
462   enum {
463     START,	// at the start of the file; also in between references
464     BOL,	// in the middle of a reference, at the beginning of the line
465     PERCENT,	// seen a percent at the beginning of the line
466     IGNORE,	// ignoring a field
467     IGNORE_BOL,	// at the beginning of a line ignoring a field
468     KEY,	// in the middle of a key
469     DISCARD,	// after truncate_len bytes of a key
470     MIDDLE	// in between keys
471   } state = START;
472 
473   // In states START, BOL, IGNORE_BOL, space_count how many spaces at
474   // the beginning have been seen.  In states PERCENT, IGNORE, KEY,
475   // MIDDLE space_count must be 0.
476   int space_count = 0;
477   int byte_count = 0;		// bytes read
478   int key_len = 0;
479   int ref_start = -1;		// position of start of current reference
480   for (;;) {
481     int c = getc(fp);
482     if (c == EOF)
483       break;
484     // We opened the file in binary mode, so we need to skip
485     // every CR character before a Newline.
486     if (c == '\r') {
487       int peek = getc(fp);
488       if (peek == '\n') {
489 	byte_count++;
490 	c = peek;
491       }
492       else
493 	ungetc(peek, fp);
494     }
495 #if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
496     else if (c == 0x1a)	// ^Z means EOF in text files
497       break;
498 #endif
499     byte_count++;
500     switch (state) {
501     case START:
502       if (c == ' ' || c == '\t') {
503 	space_count++;
504 	break;
505       }
506       if (c == '\n') {
507 	space_count = 0;
508 	break;
509       }
510       ref_start = byte_count - space_count - 1;
511       space_count = 0;
512       if (c == '%')
513 	state = PERCENT;
514       else if (csalnum(c)) {
515 	state = KEY;
516 	key_buffer[0] = c;
517 	key_len = 1;
518       }
519       else
520 	state = MIDDLE;
521       break;
522     case BOL:
523       switch (c) {
524       case '%':
525 	if (space_count > 0) {
526 	  space_count = 0;
527 	  state = MIDDLE;
528 	}
529 	else
530 	  state = PERCENT;
531 	break;
532       case ' ':
533       case '\t':
534 	space_count++;
535 	break;
536       case '\n':
537 	store_reference(filename_index, ref_start,
538 			byte_count - 1 - space_count - ref_start);
539 	state = START;
540 	space_count = 0;
541 	break;
542       default:
543 	space_count = 0;
544 	if (csalnum(c)) {
545 	  state = KEY;
546 	  key_buffer[0] = c;
547 	  key_len = 1;
548 	}
549 	else
550 	  state = MIDDLE;
551       }
552       break;
553     case PERCENT:
554       if (strchr(ignore_fields, c) != 0)
555 	state = IGNORE;
556       else if (c == '\n')
557 	state = BOL;
558       else
559 	state = MIDDLE;
560       break;
561     case IGNORE:
562       if (c == '\n')
563 	state = IGNORE_BOL;
564       break;
565     case IGNORE_BOL:
566       switch (c) {
567       case '%':
568 	if (space_count > 0) {
569 	  state = IGNORE;
570 	  space_count = 0;
571 	}
572 	else
573 	  state = PERCENT;
574 	break;
575       case ' ':
576       case '\t':
577 	space_count++;
578 	break;
579       case '\n':
580 	store_reference(filename_index, ref_start,
581 			byte_count - 1 - space_count - ref_start);
582 	state = START;
583 	space_count = 0;
584 	break;
585       default:
586 	space_count = 0;
587 	state = IGNORE;
588       }
589       break;
590     case KEY:
591       if (csalnum(c)) {
592 	if (key_len < truncate_len)
593 	  key_buffer[key_len++] = c;
594 	else
595 	  state = DISCARD;
596       }
597       else {
598 	possibly_store_key(key_buffer, key_len);
599 	key_len = 0;
600 	if (c == '\n')
601 	  state = BOL;
602 	else
603 	  state = MIDDLE;
604       }
605       break;
606     case DISCARD:
607       if (!csalnum(c)) {
608 	possibly_store_key(key_buffer, key_len);
609 	key_len = 0;
610 	if (c == '\n')
611 	  state = BOL;
612 	else
613 	  state = MIDDLE;
614       }
615       break;
616     case MIDDLE:
617       if (csalnum(c)) {
618 	state = KEY;
619 	key_buffer[0] = c;
620 	key_len = 1;
621       }
622       else if (c == '\n')
623 	state = BOL;
624       break;
625     default:
626       assert(0);
627     }
628   }
629   switch (state) {
630   case START:
631     break;
632   case DISCARD:
633   case KEY:
634     possibly_store_key(key_buffer, key_len);
635     // fall through
636   case BOL:
637   case PERCENT:
638   case IGNORE_BOL:
639   case IGNORE:
640   case MIDDLE:
641     store_reference(filename_index, ref_start,
642 		    byte_count - ref_start - space_count);
643     break;
644   default:
645     assert(0);
646   }
647   fclose(fp);
648   return 1;
649 }
650 
store_reference(int filename_index,int pos,int len)651 static void store_reference(int filename_index, int pos, int len)
652 {
653   tag t;
654   t.filename_index = filename_index;
655   t.start = pos;
656   t.length = len;
657   fwrite_or_die(&t, sizeof(t), 1, indxfp);
658   ntags++;
659 }
660 
store_filename(const char * fn)661 static void store_filename(const char *fn)
662 {
663   filenames += fn;
664   filenames += '\0';
665 }
666 
init_hash_table()667 static void init_hash_table()
668 {
669   hash_table = new table_entry[hash_table_size];
670   for (int i = 0; i < hash_table_size; i++)
671     hash_table[i].ptr = 0;
672 }
673 
possibly_store_key(char * s,int len)674 static void possibly_store_key(char *s, int len)
675 {
676   static int last_tagno = -1;
677   static int key_count;
678   if (last_tagno != ntags) {
679     last_tagno = ntags;
680     key_count = 0;
681   }
682   if (key_count < max_keys_per_item) {
683     if (store_key(s, len))
684       key_count++;
685   }
686 }
687 
store_key(char * s,int len)688 static int store_key(char *s, int len)
689 {
690   if (len < shortest_len)
691     return 0;
692   int is_number = 1;
693   for (int i = 0; i < len; i++)
694     if (!csdigit(s[i])) {
695       is_number = 0;
696       s[i] = cmlower(s[i]);
697     }
698   if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
699     return 0;
700   int h = hash(s, len) % hash_table_size;
701   if (common_words_table) {
702     for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
703       if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
704 	return 0;
705   }
706   table_entry *pp =  hash_table + h;
707   if (!pp->ptr)
708     pp->ptr = new block;
709   else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
710     return 1;
711   else if (pp->ptr->used >= BLOCK_SIZE)
712     pp->ptr = new block(pp->ptr);
713   pp->ptr->v[(pp->ptr->used)++] = ntags;
714   return 1;
715 }
716 
write_hash_table()717 static void write_hash_table()
718 {
719   const int minus_one = -1;
720   int li = 0;
721   for (int i = 0; i < hash_table_size; i++) {
722     block *ptr = hash_table[i].ptr;
723     if (!ptr)
724       hash_table[i].count = -1;
725     else {
726       hash_table[i].count = li;
727       block *rev = 0;
728       while (ptr) {
729 	block *tem = ptr;
730 	ptr = ptr->next;
731 	tem->next = rev;
732 	rev = tem;
733       }
734       while (rev) {
735 	fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
736 	li += rev->used;
737 	block *tem = rev;
738 	rev = rev->next;
739 	delete tem;
740       }
741       fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
742       li += 1;
743     }
744   }
745   if (sizeof(table_entry) == sizeof(int))
746     fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
747   else {
748     // write it out word by word
749     for (int i = 0; i < hash_table_size; i++)
750       fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
751   }
752   fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
753   if (fseek(indxfp, 0, 0) < 0)
754     fatal("error seeking on index file: %1", strerror(errno));
755   index_header h;
756   h.magic = INDEX_MAGIC;
757   h.version = INDEX_VERSION;
758   h.tags_size = ntags;
759   h.lists_size = li;
760   h.table_size = hash_table_size;
761   h.strings_size = filenames.length();
762   h.truncate = truncate_len;
763   h.shortest = shortest_len;
764   h.common = n_ignore_words;
765   fwrite_or_die(&h, sizeof(h), 1, indxfp);
766 }
767 
fwrite_or_die(const void * ptr,int size,int nitems,FILE * fp)768 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
769 {
770   if (fwrite(ptr, size, nitems, fp) != (size_t)nitems)
771     fatal("fwrite failed: %1", strerror(errno));
772 }
773 
fatal_error_exit()774 void fatal_error_exit()
775 {
776   cleanup();
777   exit(3);
778 }
779 
780 extern "C" {
781 
cleanup()782 void cleanup()
783 {
784   if (temp_index_file)
785     unlink(temp_index_file);
786 }
787 
788 }
789