1 /*
2  * Insert an index between "<!--begin-index-->" and "<!--end-index-->",
3  * or replacing the comment "<!--index-->"
4  *
5  * The index links to elements with ID attributes as well as with
6  * empty <A NAME> elements.
7  *
8  * Any <A> tags with a class of "bctarget" are not copied, but
9  * regenerated. They are assumed to be backwards-compatible versions
10  * of ID attributes on their parent elements. But if the option -t or
11  * -x is given, those <A> elements are removed.
12  *
13  * There's a limit of 100000 index terms (10^(MAXIDLEN-1)).
14  *
15  * Index terms are elements with a class of "index", "index-inst" or
16  * "index-def", as well as all <dfn> elements. The contents of the
17  * element is the index term, unless the element has a title
18  * attribute. The title attribute can contain "|" and "!!":
19  *
20  * "term"
21  * "term1|term2|term3|..."
22  * "term!!subterm!!subsubterm!!..."
23  * "term1!!subterm1|term2!!subterm2|..."
24  * etc.
25  *
26  * For backward compatibility with an earlier Perl program, "::" is
27  * accepted as an alternative for "!!", but it is better not to use
28  * both separators in the same project, since the sorting may be
29  * adversely affected.
30  *
31  * Class "index-def" results in a bold entry in the index, "index" in
32  * a normal one. "index-inst" is an alias for "index", provided for
33  * backward compatibility.
34  *
35  * To do: an option to split the index at each new first letter.
36  *
37  * Copyright © 1994-2005 World Wide Web Consortium
38  * See http://www.w3.org/Consortium/Legal/copyright-software
39  *
40  * Author: Bert Bos <bert@w3.org>
41  * Created: 11 Apr 2000
42  * Version: $Id: hxindex.c,v 1.24 2018/02/23 19:05:04 bbos Exp $
43  *
44  **/
45 #include "config.h"
46 #include <assert.h>
47 #include <locale.h>
48 #include <wchar.h>
49 #include <ctype.h>
50 #include <stdlib.h>
51 #include <stdio.h>
52 #include <iconv.h>
53 #include <unistd.h>
54 #include <err.h>
55 #include <stdbool.h>
56 #if STDC_HEADERS
57 # include <string.h>
58 #else
59 # ifndef HAVE_STRCHR
60 #  define strchr index
61 #  define strrchr rindex
62 # endif
63 # ifndef HAVE_STRSTR
64 #  include "strstr.e"
65 # endif
66 #endif
67 #ifdef HAVE_ERRNO_H
68 #  include <errno.h>
69 #else
70 extern int errno;
71 char *strerror(int errnum);
72 int strerror_r(int errnum, char *buf, size_t n);
73 #endif
74 
75 #ifdef HAVE_SEARCH_H
76 #  include <search.h>
77 #else
78 #  include "search-freebsd.h"
79 #endif
80 #include "export.h"
81 #include "types.e"
82 #include "heap.e"
83 #include "tree.e"
84 #include "html.e"
85 #include "scan.e"
86 #include "dict.e"
87 #include "openurl.e"
88 #include "genid.e"
89 #include "class.e"
90 
/* Data attributes were only a proposal in HTML5: keep "data-index" disabled */
#undef USE_DATA_ATTRIBUTE

/* Text of the comments that mark where the index goes */
#define INDEX "index"                   /* <!--index--> (also class="index") */
#define BEGIN_INDEX "begin-index"       /* <!--begin-index--> */
#define END_INDEX "end-index"           /* <!--end-index--> */

/* Class names that are looked for on elements */
#define INDEX_INST "index-inst"         /* class="index-inst" */
#define INDEX_DEF "index-def"           /* class="index-def" */
#define TARGET "bctarget"               /* class="...bctarget..." */
#define SECNO "secno"                   /* Elements that define a section # */
#define NO_NUM "no-num"                 /* Elements without a section # */

#define MAXSUBS 20                      /* Max. depth of subterms */
103 
104 typedef struct _indexterm {
105   string url;
106   int importance;		/* 1 (low) or 2 (high) */
107   string secno;			/* For option -n */
108   string sectitle;		/* For option -N */
109   string doctitle;
110   string *terms;		/* Array of subterms */
111   wchar_t **sortkeys;		/* Array of normalized subterms */
112   int nrkeys;			/* Length of term and sortkeys arrays */
113 } *Indexterm;
114 
115 static Tree tree;
116 static bool xml = false;	/* Use <empty /> convention */
117 static string base = NULL;	/* (Rel.) URL of output file */
118 static string indexdb = NULL;	/* Persistent store of terms */
119 static string* userclassnames = NULL;	/* Persistent store of class names */
120 static FILE *globalfile;	/* Must be global for twalk */
121 static Indexterm globalprevious; /* Must be global for twalk */
122 static string globalurlprevious;/* Must be global for twalk */
123 static bool bctarget = true;	/* Add <A name=> after IDs */
124 static bool use_secno = false;	/* Anchor text is "#" instead of section # */
125 static bool use_sectitle = false; /* Anchor text is section title, not # */
126 static bool final = false;	/* Leave used attributes in document */
127 static bool trim_punct = true;	/* Remove trailing punctuation from terms */
128 static string section_name = NULL; /* Term meaning "section %s" */
129 static string unknown_name = NULL; /* Term meaning "without number" */
130 static string* exclude_elts = NULL; /* Don't index these elements */
131 static string* only_elts = NULL; /* Only index these elements */
132 
133 
134 /* handle_error -- called when a parse error occurred */
handle_error(void * clientdata,const string s,int lineno)135 static void handle_error(void *clientdata, const string s, int lineno)
136 {
137   (void) fprintf(stderr, "%d: %s\n", lineno, s);
138 }
139 
140 /* start -- called before the first event is reported */
start(void)141 static void* start(void)
142 {
143   tree = create();
144   return NULL;
145 }
146 
/* end -- parser callback: nothing to do after the last event */
static void end(void *clientdata)
{
  (void) clientdata;                    /* Not used */
}
152 
153 /* handle_comment -- called after a comment is parsed */
handle_comment(void * clientdata,string commenttext)154 static void handle_comment(void *clientdata, string commenttext)
155 {
156   tree = append_comment(tree, commenttext);
157 }
158 
159 /* handle_text -- called after a tex chunk is parsed */
handle_text(void * clientdata,string text)160 static void handle_text(void *clientdata, string text)
161 {
162   tree = append_text(tree, text);
163 }
164 
165 /* handle_declaration -- called after a declaration is parsed */
handle_decl(void * clientdata,string gi,string fpi,string url)166 static void handle_decl(void *clientdata, string gi,
167 			string fpi, string url)
168 {
169   tree = append_declaration(tree, gi, fpi, url);
170 }
171 
172 /* handle_proc_instr -- called after a PI is parsed */
handle_pi(void * clientdata,string pi_text)173 static void handle_pi(void *clientdata, string pi_text)
174 {
175   tree = append_procins(tree, pi_text);
176 }
177 
178 /* handle_starttag -- called after a start tag is parsed */
handle_starttag(void * clientdata,string name,pairlist attribs)179 static void handle_starttag(void *clientdata, string name, pairlist attribs)
180 {
181   conststring id;
182 
183   tree = html_push(tree, name, attribs);
184 
185   /* If it has an ID, store it (so we don't accidentally generate it) */
186   if ((id = pairlist_get(attribs, "id"))) storeID(id);
187 }
188 
189 /* handle_emptytag -- called after an empty tag is parsed */
handle_emptytag(void * clientdata,string name,pairlist attribs)190 static void handle_emptytag(void *clientdata, string name, pairlist attribs)
191 {
192   conststring id;
193 
194   tree = html_push(tree, name, attribs);
195 
196   /* If it has an ID, store it (so we don't accidentally generate it) */
197   if ((id = pairlist_get(attribs, "id"))) storeID(id);
198 }
199 
200 /* handle_endtag -- called after an endtag is parsed (name may be "") */
handle_endtag(void * clientdata,string name)201 static void handle_endtag(void *clientdata, string name)
202 {
203   tree = html_pop(tree, name);
204 }
205 
206 /* trim -- remove leading and trailing white space, collapse white space */
trim(string s)207 static void trim(string s)
208 {
209   string t;
210   int i, j;
211 
212   if (!s) return;
213   t = newstring(s);
214   for (i = 0; isspace(t[i]); i++); /* Skip leading white space */
215   for (j = 0; t[i]; i++)
216     if (!isspace(t[i])) s[j++] = t[i];
217     else if (!isspace(t[i-1])) s[j++] = ' ';
218   if (j == 0) s[j] = '\0';
219   else if (isspace(s[j-1])) s[j-1] = '\0';
220   else s[j] = '\0';
221   dispose(t);
222 }
223 
224 /* parse_subterms -- parse s to create terms & sortkeys array in an Indexterm */
parse_subterms(const Indexterm term,const conststring s)225 static void parse_subterms(const Indexterm term, const conststring s)
226 {
227   enum {TEXT, TAG, DQUOTE, SQUOTE} state;
228   string h, k, p, q;
229   iconv_t cd;
230   size_t len, len2;
231   int i;
232 
233   /* Create the terms array and count the number of subterms */
234   h = newstring(s);
235   trim(h);
236   term->nrkeys = 1;
237   newarray(term->terms, 1);
238   term->terms[0] = h;
239   while ((k = strstr(h, "!!")) || (k = strstr(h, "::"))) {
240     h = k + 2;
241     *k = '\0';
242     renewarray(term->terms, term->nrkeys + 1);
243     trim(h);				/* Remove leading & trailing space */
244     term->terms[term->nrkeys] = h;	/* All terms point into h */
245     term->nrkeys++;
246   }
247 
248   /* Create the sortkeys array by normalizing each term */
249   newarray(term->sortkeys, term->nrkeys);
250   for (i = 0; i < term->nrkeys; i++) {
251 
252     /* First remove mark-up and expand the standard XML entities */
253     h = newstring(term->terms[i]);
254     for (state = TEXT, p = q = h; *p; p++) {
255       switch (state) {
256       case TEXT:
257 	if (*p == '<') state = TAG;
258 	else if (hasprefix(p, "&lt;"))   {*(q++) = '<'; p += 3;}
259 	else if (hasprefix(p, "&#60;"))  {*(q++) = '<'; p += 4;}
260 	else if (hasprefix(p, "&#x3c;")) {*(q++) = '<'; p += 5;}
261 	else if (hasprefix(p, "&#x3C;")) {*(q++) = '<'; p += 5;}
262 	else if (hasprefix(p, "&gt;"))   {*(q++) = '>'; p += 3;}
263 	else if (hasprefix(p, "&#62;"))  {*(q++) = '>'; p += 4;}
264 	else if (hasprefix(p, "&#x3e;")) {*(q++) = '>'; p += 5;}
265 	else if (hasprefix(p, "&#x3E;")) {*(q++) = '>'; p += 5;}
266 	else if (hasprefix(p, "&quot;")) {*(q++) = '"'; p += 5;}
267 	else if (hasprefix(p, "&#34;"))  {*(q++) = '"'; p += 4;}
268 	else if (hasprefix(p, "&#x22;")) {*(q++) = '"'; p += 5;}
269 	else if (hasprefix(p, "&amp;"))  {*(q++) = '&'; p += 4;}
270 	else if (hasprefix(p, "&#38;"))  {*(q++) = '&'; p += 4;}
271 	else if (hasprefix(p, "&#x26;")) {*(q++) = '&'; p += 5;}
272 	else *(q++) = tolower(*p);
273 	break;
274       case TAG:
275 	if (*p == '>') state = TEXT;
276 	else if (*p == '"') state = DQUOTE;
277 	else if (*p == '\'') state = SQUOTE;
278 	break;
279       case DQUOTE:
280 	if (*p == '"') state = TAG;
281 	break;
282       case SQUOTE:
283 	if (*p == '\'') state = TAG;
284 	break;
285       default:
286 	assert(!"Cannot happen!");
287       }
288     }
289     *q = '\0';
290 
291     if (trim_punct) {
292       /* Remove some trailing white space and punctuation */
293       for (q--; q != h && strspn(q, " \r\n\t\f\v,:;!?"); q--) *q = '\0';
294 
295       /* Remove final '.' only if it is the only '.' in the term */
296       if ((q = strrchr(h, '.')) && !*(q+1) && q == strchr(h, '.')) *q = '\0';
297     }
298 
299     /* Then convert from UTF-8 to wchar_t */
300     cd = iconv_open("wchar_t", "UTF-8");
301     if (cd == (iconv_t)(-1)) {perror("hxindex"); exit(1);}
302     len = strlen(h) + 1;
303     newarray(term->sortkeys[i], len); /* Large enough */
304     p = (string) term->sortkeys[i];
305     len2 = len * sizeof(term->sortkeys[i][0]);
306     if (iconv(cd, &h, &len, &p, &len2) == (size_t)(-1)) {
307       perror("hxindex"); exit(1);
308     }
309     /* *p = L'\0'; */
310     if (iconv_close(cd) == -1) {perror("hxindex"); exit(1);}
311   }
312 }
313 
/*
 * folding_cmp -- compare two arrays of sort keys
 *
 * The keys are compared pairwise with wcscoll() and the first difference
 * decides. If one array is a prefix of the other, the shorter one sorts
 * first. Returns <0, 0 or >0.
 */
static int folding_cmp(wchar_t **a, const int alen, wchar_t **b, const int blen)
{
  int idx = 0, order;

  assert(a && alen >= 0);
  assert(b && blen >= 0);

  while (idx < alen && idx < blen) {
    assert(a[idx] && b[idx]);
    order = wcscoll(a[idx], b[idx]);
    if (order != 0) return order;
    idx++;
  }

  /* At least one of the arrays is exhausted */
  if (idx == alen) return idx == blen ? 0 : -1;
  return 1;
}
329 
/* indent -- start a new line on stdout and indent it by n times 2 spaces */
static void indent(int n)
{
  putchar('\n');
  while (n-- > 0) fputs("  ", stdout);
}
336 
337 /* print_escaped -- print s escaped for use in an attribute */
print_escaped(const conststring s)338 static void print_escaped(const conststring s)
339 {
340   conststring h;
341 
342   for (h = s; *h; h++) if (*h == '"') printf("&quot;"); else putchar(*h);
343 }
344 
345 /* print_title -- print a TITLE attribute */
print_title(const Indexterm term)346 static void print_title(const Indexterm term)
347 {
348   enum {TEXT, TAG, DQUOTE, SQUOTE} state;
349   string h;
350 
351   assert(use_secno);
352   fputs(" title=\"", stdout);
353   if (base[0]) {		/* Only add document titles if needed */
354     for (state = TEXT, h = term->doctitle; *h; h++) {
355       switch (state) {
356       case TEXT:
357 	if (*h == '<') state = TAG;
358 	else if (*h == '"') fputs("&quot;", stdout);
359 	else putchar(*h);
360 	break;
361       case TAG:
362 	if (*h == '>') state = TEXT;
363 	else if (*h == '"') state = DQUOTE;
364 	else if (*h == '\'') state = SQUOTE;
365 	break;
366       case DQUOTE:
367 	if (*h == '"') state = TAG;
368 	break;
369       case SQUOTE:
370 	if (*h == '\'') state = TAG;
371 	break;
372       default:
373 	assert(!"Cannot happen!");
374       }
375     }
376     fputs(", ", stdout);
377   }
378   for (h = section_name; *h; h++)
379     switch (*h) {
380     case '"': fputs("&quot;", stdout); break;
381     case '>': fputs("&gt;", stdout); break;
382     case '<': fputs("&lt;", stdout); break;
383     case '%':
384       if (*(h+1) == '%') {putchar('%'); h++;}
385       else if (*(h+1) != 's') putchar('%');
386       else if (term->secno) {print_escaped(term->secno); h++;}
387       else {print_escaped(unknown_name); h++;}
388       break;
389     default: putchar(*h);
390     }
391   putchar('"');
392 }
393 
394 /* write_index_item -- write one item in the list of index terms */
write_index_item(const void * term1,const VISIT which,const int depth)395 static void write_index_item(const void *term1, const VISIT which,
396 			     const int depth)
397 {
398   Indexterm term = *(Indexterm*)term1;
399   int i, j;
400 
401   if (which != postorder && which != leaf) return;
402 
403   /* Count how many subterms are equal to the previous entry */
404   i = 0;
405   while (i < min(term->nrkeys, globalprevious->nrkeys) &&
406 	 !folding_cmp(term->sortkeys + i, 1, globalprevious->sortkeys + i, 1))
407     i++;
408 
409   /* Close lists as needed */
410   for (j = globalprevious->nrkeys - 1; j > i; j--) {
411     indent(j);
412     printf("</ul>");
413   }
414 
415   /* Open a list if needed */
416   if (term->nrkeys > globalprevious->nrkeys && globalprevious->nrkeys == i) {
417     indent(i);
418     printf("<ul>");
419   }
420 
421   /* Print new subterms, if any */
422   for (j = i; j < term->nrkeys; j++) {
423     indent(j);
424     printf("<li>%s", term->terms[j]);
425     if (j != term->nrkeys - 1) {
426       indent(j + 1);
427       printf("<ul>");
428     }
429   }
430 
431   /* Print a link */
432   printf(", ");
433   printf("<a href=\"");
434   print_escaped(term->url);
435   printf("\"");
436   if (use_secno) print_title(term);
437   if (term->importance == 2) printf("><strong>"); else printf(">");
438   if (use_sectitle)
439     printf("%s", term->sectitle ? term->sectitle : term->doctitle);
440   else if (!use_secno) putchar('#');
441   else if (term->secno) print_escaped(term->secno);
442   else print_escaped(unknown_name);
443   if (term->importance == 2) printf("</strong></a>"); else printf("</a>");
444 
445   /* Remember this term */
446   globalprevious = term;
447   globalurlprevious = term->url;
448 
449 }
450 
451 /* mkindex -- write out an index */
mkindex(Indexterm terms)452 static void mkindex(Indexterm terms)
453 {
454   int i;
455 
456   printf("<ul class=\"indexlist\">");
457 
458   /* Initialize globalprevious to a term with an unlikely sortkey */
459   new(globalprevious);
460   globalprevious->nrkeys = 1;
461   newarray(globalprevious->sortkeys, globalprevious->nrkeys);
462   newarray(globalprevious->sortkeys[0], 15);
463   wcscpy(globalprevious->sortkeys[0], L"zzzzzzzzzzzzzz");
464 
465   twalk(terms, write_index_item);
466 
467   /* Close all open lists */
468   for (i = 0; i < globalprevious->nrkeys; i++) printf("\n</ul>");
469 }
470 
471 /* expand -- write the tree, add <A NAME> if needed and replace <!--index--> */
expand(Tree t,bool * write,Indexterm terms)472 static void expand(Tree t, bool *write, Indexterm terms)
473 {
474   conststring val;
475   Tree h;
476   pairlist a;
477   string s;
478   bool do_tag;
479 
480   for (h = t->children; h != NULL; h = h->sister) {
481     switch (h->tp) {
482       case Text:
483 	if (*write) printf("%s", h->text);
484 	break;
485       case Comment:
486 	s = newstring(h->text);
487 	trim(s);
488 	if (eq(s, INDEX) || eq(s, BEGIN_INDEX)) {
489 	  if (!final) printf("<!--%s-->\n", BEGIN_INDEX);
490 	  mkindex(terms);
491 	  if (!final) printf("<!--%s-->", END_INDEX);
492 	  if (eq(s, BEGIN_INDEX)) *write = false;	/* Skip old index */
493 	} else if (eq(s, END_INDEX)) {
494 	  *write = true;
495 	} else {
496 	  printf("<!--%s-->", h->text);
497 	}
498 	dispose(s);
499 	break;
500       case Declaration:
501 	printf("<!DOCTYPE %s", h->name);
502 	if (h->text) printf(" PUBLIC \"%s\"", h->text);
503 	if (h->url) printf(" %s\"%s\"", h->text ? "" : "SYSTEM ", h->url);
504 	printf(">");
505 	break;
506       case Procins:
507 	if (*write) printf("<?%s>", h->text);
508 	break;
509       case Element:
510 	if (*write) {
511 	  /* If an <a> was inserted by index itself, remove it */
512 	  do_tag = !eq(h->name, "a") || !has_class(h->attribs, TARGET);
513 	  if (do_tag) {
514 	    printf("<%s", h->name);
515 	    for (a = h->attribs; a != NULL; a = a->next) {
516 	      printf(" %s", a->name);
517 	      if (a->value != NULL) printf("=\"%s\"", a->value);
518 	    }
519 	    assert(! is_empty(h->name) || h->children == NULL);
520 	    printf(xml && is_empty(h->name) ? " />" : ">");
521 	    /* Insert an <A NAME> if element has an ID and is not <A> */
522 	    if (bctarget && is_mixed(h->name) && (val = get_attrib(h, "id"))
523 		&& !eq(h->name, "a") && ! xml)
524 	      printf("<a class=\"%s\" name=\"%s\"></a>", TARGET, val);
525 	  }
526 	  expand(h, write, terms);
527 	  if (do_tag && ! is_empty(h->name)) printf("</%s>", h->name);
528 	}
529 	break;
530       case Root:
531 	assert(! "Cannot happen");
532 	break;
533       default:
534 	assert(! "Cannot happen");
535     }
536   }
537 }
538 
539 /* termcmp -- comparison routine for Indexterms */
termcmp(const void * a1,const void * b1)540 static int termcmp(const void *a1, const void *b1)
541 {
542   Indexterm a = (Indexterm)a1, b = (Indexterm)b1;
543   int r;
544 
545   assert(a);
546   assert(b);
547   assert(a->sortkeys);
548   assert(b->sortkeys);
549   assert(a->nrkeys > 0);
550   assert(b->nrkeys > 0);
551 
552   r = folding_cmp(a->sortkeys, a->nrkeys, b->sortkeys, b->nrkeys);
553   if (r != 0) return r;
554   return strcmp(a->url, b->url); /* Terms are equal, compare URL instead */
555 }
556 
557 /* copy_contents -- recursively expand contents of element t into a string */
copy_contents(Tree t,string * s)558 static void copy_contents(Tree t, string *s)
559 {
560   Tree h;
561   int i;
562   pairlist a;
563   string p;
564 
565   for (h = t->children; h != NULL; h = h->sister) {
566     switch (h->tp) {
567       case Text:
568 	i = *s ? strlen(*s) : 0;
569 	renewarray(*s, i + strlen(h->text) + 1);
570 	/* Copy, but transform all whitespace to spaces */
571 	for (p = h->text; *p; p++, i++) (*s)[i] = isspace(*p) ? ' ' : *p;
572 	(*s)[i] = '\0';
573 	break;
574       case Comment: break;
575       case Declaration: break;
576       case Procins: break;
577       case Element:
578 	/* Only certain tags are retained */
579 	if (eq(h->name, "span") || eq(h->name, "code") || eq(h->name, "tt")
580 	    || eq(h->name, "acronym") || eq(h->name, "abbr")
581 	    || eq(h->name, "bdo") || eq(h->name, "kbd") || eq(h->name, "samp")
582 	    || eq(h->name, "sub") || eq(h->name, "sup")
583 	    || eq(h->name, "var")) {
584 	  strapp(s, "<", h->name, NULL);
585 	  for (a = h->attribs; a != NULL; a = a->next) {
586 	    if (! a->value) strapp(s, " ", a->name, NULL);
587 	    else strapp(s, " ", a->name, "=\"", a->value, "\"", NULL);
588 	  }
589 	  assert(! is_empty(h->name) || h->children == NULL);
590 	  if (is_empty(h->name)) {
591 	    strapp(s, xml ? " />" : ">", NULL);
592 	  } else {
593 	    strapp(s, ">", NULL);
594 	    copy_contents(h, s);
595 	    strapp(s, "</", h->name, ">", NULL);
596 	  }
597 	} else {				/* Ignore tag, copy contents */
598 	  copy_contents(h, s);
599 	}
600 	break;
601       case Root: assert(! "Cannot happen"); break;
602       default: assert(! "Cannot happen");
603     }
604   }
605 }
606 
607 /* copy_to_index -- copy the contents of element h to the index db */
copy_to_index(Tree t,Indexterm * terms,int importance,conststring secno,conststring sectitle,conststring doctitle)608 static void copy_to_index(Tree t, Indexterm *terms, int importance,
609 			  conststring secno, conststring sectitle,
610 			  conststring doctitle)
611 {
612   conststring id, title;
613   string h;
614   Indexterm term;
615   int i, n;
616 
617   id = get_attrib(t, "id");
618 #ifdef USE_DATA_ATTRIBUTE
619   if (! (title = get_attrib(t, "data-index")))
620 #endif
621     title = get_attrib(t, "title");
622 
623   /* Get term either from title attribute or contents */
624   if (title) {
625 
626     i = 0;
627     while (title[i]) {
628       n = strcspn(title + i, "|");		/* Find | or \0 */
629       new(term);
630       term->importance = importance;
631       term->secno = secno ? newstring(secno) : NULL;
632       term->sectitle = sectitle ? newstring(sectitle) : NULL;
633       term->doctitle = newstring(doctitle);
634       term->url = NULL;
635       strapp(&term->url, base, "#", id, NULL);
636       h = newnstring(title + i, n);
637       parse_subterms(term, h);
638       if (! tsearch(term, (void**)terms, termcmp))
639 	errx(1, "Out of memory while parsing term %s\n", h);
640       i += n;
641       if (title[i]) i++;			/* Skip '|' */
642     }
643     if (final)					/* Remove used attribute */
644 #ifdef USE_DATA_ATTRIBUTE
645       if (!delete_attrib(t, "data-index"))
646 #endif
647 	delete_attrib(t, "title");
648 
649   } else {					/* Recursively copy contents */
650 
651     h = NULL;
652     copy_contents(t, &h);
653     if (h) {					/* Non-empty contents */
654       new(term);
655       term->importance = importance;
656       term->secno = secno ? newstring(secno) : NULL;
657       term->sectitle = sectitle ? newstring(sectitle) : NULL;
658       term->doctitle = newstring(doctitle);
659       term->url = NULL;
660       strapp(&term->url, base, "#", id, NULL);
661       parse_subterms(term, h);
662       if (! tsearch(term, (void**)terms, termcmp))
663 	errx(1, "Out of memory while parsing term %s", h);
664     }
665 
666   }
667 }
668 
669 /* in_list -- check if word occurs in array list */
in_list(const string word,const string * list)670 static bool in_list(const string word, const string *list)
671 {
672   int i;
673 
674   for (i = 0; list[i]; i++) if (eq(word, list[i])) return true;
675   return false;
676 }
677 
678 /* collect -- collect index terms, add IDs where needed */
collect(Tree t,Indexterm * terms,string * secno,string * sectitle,string * doctitle)679 static void collect(Tree t, Indexterm *terms, string *secno,
680 		    string *sectitle, string *doctitle)
681 {
682   int importance;
683   Tree h;
684 
685   for (h = t->children; h != NULL; h = h->sister) {
686     switch (h->tp) {
687       case Text: case Comment: case Declaration: case Procins: break;
688       case Element:
689 	if (eq(h->name, "title")) {
690 	  dispose(*doctitle);
691 	  copy_contents(h, doctitle);
692 	}
693 	if (has_class(h->attribs, SECNO)) {
694 	  dispose(*secno);
695 	  copy_contents(h, secno);
696 	  trim(*secno);
697 	} else if (has_class(h->attribs, NO_NUM)) {
698 	  dispose(*secno);
699 	  *secno = newstring(unknown_name);
700 	}
701 	if (eq(h->name, "h1") || eq(h->name, "h2") || eq(h->name, "h3") ||
702 	    eq(h->name, "h4") || eq(h->name, "h5") || eq(h->name, "h5")) {
703 	  dispose(*sectitle);
704 	  copy_contents(h, sectitle);
705 	  trim(*sectitle);
706 	}
707 	if (eq(h->name, "dfn")) importance = 2;
708 	else if (exclude_elts && in_list(h->name, exclude_elts)) importance = 0;
709 	else if (only_elts && !in_list(h->name, only_elts)) importance = 0;
710 	else if (has_class(h->attribs,INDEX)||has_class(h->attribs,INDEX_INST))
711 	  importance = 1;
712 	else if (userclassnames && has_class_in_list(h->attribs, userclassnames))
713 	  importance = 1;
714 	else if (has_class(h->attribs, INDEX_DEF)) importance = 2;
715 	else importance = 0;
716 	if (importance != 0) {
717 	  /* Give it an ID, if it doesn't have one */
718 	  if (! get_attrib(h, "id")) set_attrib(h, "id", gen_id(h));
719 	  copy_to_index(h, terms, importance, *secno, *sectitle, *doctitle);
720 	} else {
721 	  collect(h, terms, secno, sectitle, doctitle);
722 	}
723 	break;
724       case Root: assert(! "Cannot happen"); break;
725       default: assert(! "Cannot happen");
726     }
727   }
728 }
729 
730 /* load_index -- read persistent term db from file */
load_index(const string indexdb,Indexterm * terms)731 static void load_index(const string indexdb, Indexterm *terms)
732 {
733   FILE *f;
734   int n1, n2, n3, n4, n5, n6;
735   char *line = NULL;
736   size_t linesize = 0;
737   Indexterm term;
738   string h;
739 
740   if (! (f = fopen(indexdb, "r"))) return;	/* Assume file not found... */
741 
742   while (getline(&line, &linesize, f) != -1) {
743     n1 = strcspn(line, "\t");
744     if (line[n1] != '\t') errx(1, "Illegal syntax in %s", indexdb);
745     n2 = n1 + 1 + strcspn(line + n1 + 1, "\t");
746     if (line[n2] != '\t') errx(1, "Illegal syntax in %s", indexdb);
747     n3 = n2 + 1 + strcspn(line + n2 + 1, "\t");
748     if (line[n3] != '\t') errx(1, "Illegal syntax in %s", indexdb);
749     n4 = n3 + 1 + strcspn(line + n3 + 1, "\t");
750     if (line[n4] != '\t') errx(1, "Illegal syntax in %s", indexdb);
751     n5 = n4 + 1 + strcspn(line + n4 + 1, "\t");
752     if (line[n5] != '\t') errx(1, "Illegal syntax in %s", indexdb);
753     n6 = n5 + 1 + strcspn(line + n5 + 1, "\t\n");
754     if (line[n6] != '\n') errx(1, "Illegal syntax in %s", indexdb);
755     new(term);
756     h = newnstring(line, n1);
757     switch (line[n1 + 1]) {
758       case '1': term->importance = 1; break;
759       case '2': term->importance = 2; break;
760     default: errx(1, "Error in %s (column 2 must be '1' or '2')", indexdb);
761     }
762     term->url = newnstring(line + n2 + 1, n3 - n2 - 1);
763     term->secno = newnstring(line + n3 + 1, n4 - n3 - 1);
764     term->sectitle = newnstring(line + n4 + 1, n5 - n4 - 1);
765     term->doctitle = newnstring(line + n5 + 1, n6 - n5 - 1);
766     parse_subterms(term, h);
767     if (! tsearch(term, (void**)terms, termcmp))
768       errx(1, "Out of memory while loading %s", indexdb);
769   }
770 
771   fclose(f);
772   free(line);
773 }
774 
775 /* save_a_term -- write one term to globalfile */
save_a_term(const void * term1,const VISIT which,const int dp)776 static void save_a_term(const void *term1, const VISIT which, const int dp)
777 {
778   Indexterm term = *(Indexterm*)term1;
779   int i;
780 
781   if (which == endorder || which == leaf) {
782     for (i = 0; i < term->nrkeys; i++) {
783       if (i > 0) fprintf(globalfile, "!!");
784       fprintf(globalfile, "%s", term->terms[i]);
785     }
786     fprintf(globalfile, "\t%d\t%s\t%s\t%s\t%s\n", term->importance, term->url,
787 	    term->secno ? term->secno : (use_secno ? unknown_name : "#"),
788 	    term->sectitle ? term->sectitle : term->doctitle,
789 	    term->doctitle);
790   }
791 }
792 
793 /* save_index -- write terms to file */
save_index(const string indexdb,Indexterm terms)794 static void save_index(const string indexdb, Indexterm terms)
795 {
796   if (! (globalfile = fopen(indexdb, "w")))
797     errx(1, "%s: %s", indexdb, strerror(errno));
798   twalk(terms, save_a_term);
799   fclose(globalfile);
800 }
801 
802 /* usage -- print usage message and exit */
usage(string name)803 static void usage(string name)
804 {
805   errx(1, "Version %s\nUsage: %s [-i indexdb] [-b base] [-x] [-t] [-n] [-c userclass] [-s template] [-u phrase] [html-file]",
806 	  VERSION, name);
807 }
808 
809 /* tokenize -- split string s into tokens at each comma, return an array */
tokenize(string s)810 static string * tokenize(string s)
811 {
812   string * t;
813   int i, n;
814 
815   assert(s && s[0]);
816   for (t = NULL, n = 0; *s; s += i + 1, n++) {
817     i = strcspn(s, ",");
818     renewarray(t, n + 1);
819     t[n] = newnstring(s, i);
820   }
821   renewarray(t, n + 1);		/* Make final item NULL */
822   t[n] = NULL;
823   return t;
824 }
825 
826 /* main */
main(int argc,char * argv[])827 int main(int argc, char *argv[])
828 {
829   bool write = true;
830   Indexterm termtree = NULL;	/* Sorted tree of terms */
831   string secno, doctitle, sectitle;
832   int c, status = 200;
833 
834   /* Bind the parser callback routines to our handlers */
835   set_error_handler(handle_error);
836   set_start_handler(start);
837   set_end_handler(end);
838   set_comment_handler(handle_comment);
839   set_text_handler(handle_text);
840   set_decl_handler(handle_decl);
841   set_pi_handler(handle_pi);
842   set_starttag_handler(handle_starttag);
843   set_emptytag_handler(handle_emptytag);
844   set_endtag_handler(handle_endtag);
845 
846   yyin = NULL;
847 
848   while ((c = getopt(argc, argv, "txb:i:cnNfrs:u:O:X:")) != -1)
849   switch (c) {
850   case 't': bctarget = false; break; /* Don't write <a name> after each ID */
851   case 'x': xml = true; break;	/* Output as XML */
852   case 'b': base = newstring(optarg); break; /* Set base of URL */
853   case 'i': indexdb = newstring(optarg); break;	/* Set name of index db */
854   case 'c': userclassnames = tokenize(optarg); break; /* Set class names */
855   case 'n': use_secno = true; use_sectitle = false; break;
856   case 'N': use_sectitle = true; use_secno = false; break;
857   case 'f': final = true; break; /* "Final": remove used attributes */
858   case 'r': trim_punct = false; break; /* Do not remove trailing punctuation */
859   case 's': section_name = newstring(optarg); break;
860   case 'u': unknown_name = newstring(optarg); break;
861   case 'O': only_elts = tokenize(optarg); break; /* Index only these elements */
862   case 'X': exclude_elts = tokenize(optarg); break; /* Don't index these */
863   default: usage(argv[0]);
864   }
865   if (optind == argc) yyin = stdin;
866   else if (argc > optind + 1) usage(argv[0]);
867   else if (eq(argv[optind], "-")) yyin = stdin;
868   else yyin = fopenurl(argv[optind], "r", &status);
869 
870   if (yyin == NULL) {perror(argv[optind]); exit(1);}
871   if (status != 200) errx(1, "%s : %s", argv[optind], http_strerror(status));
872 
873   if (!base) base = newstring("");
874   if (!section_name) section_name = newstring("section %s");
875   if (!unknown_name) unknown_name = newstring("??");
876 
877   /* Apply user's locale */
878   setlocale(LC_ALL, "");
879 
880   /* Read the index DB into memory */
881   if (indexdb) load_index(indexdb, &termtree);
882 
883   /* Parse, build tree, collect existing IDs */
884   if (yyparse() != 0) exit(3);
885 
886   /* Scan for index terms, add them to the tree, add IDs where needed */
887   secno = NULL;
888   sectitle = NULL;
889   doctitle = newstring("");
890   collect(get_root(tree), &termtree, &secno, &sectitle, &doctitle);
891 
892   /* Write out the document, adding <A NAME> and replacing <!--index--> */
893   expand(get_root(tree), &write, termtree);
894 
895   /* Store terms to file */
896   if (indexdb) save_index(indexdb, termtree);
897 
898   fclose(yyin);
899   return 0;
900 }
901