1 /*
2 * Insert an index between "<!--begin-index-->" and "<!--end-index-->",
3 * or replacing the comment "<!--index-->"
4 *
5 * The index links to elements with ID attributes as well as with
6 * empty <A NAME> elements.
7 *
 * Any <A> tags with a class of "bctarget" are not copied, but
 * regenerated. They are assumed to be backwards-compatible versions
 * of ID attributes on their parent elements. But if option -t or
 * -x is given, those <A> elements are removed and not regenerated.
12 *
13 * There's a limit of 100000 index terms (10^(MAXIDLEN-1)).
14 *
15 * Index terms are elements with a class of "index", "index-inst" or
16 * "index-def", as well as all <dfn> elements. The contents of the
17 * element is the index term, unless the element has a title
18 * attribute. The title attribute can contain "|" and "!!":
19 *
20 * "term"
21 * "term1|term2|term3|..."
22 * "term!!subterm!!subsubterm!!..."
23 * "term1!!subterm1|term2!!subterm2|..."
24 * etc.
25 *
 * For backward compatibility with an earlier Perl program, "::" is
 * accepted as an alternative for "!!", but it is better not to use
 * both separators in the same project, since the sorting may be
 * adversely affected.
30 *
31 * Class "index-def" results in a bold entry in the index, "index" in
32 * a normal one. "index-inst" is an alias for "index", provided for
33 * backward compatibility.
34 *
35 * To do: an option to split the index at each new first letter.
36 *
37 * Copyright © 1994-2005 World Wide Web Consortium
38 * See http://www.w3.org/Consortium/Legal/copyright-software
39 *
40 * Author: Bert Bos <bert@w3.org>
41 * Created: 11 Apr 2000
42 * Version: $Id: hxindex.c,v 1.24 2018/02/23 19:05:04 bbos Exp $
43 *
44 **/
45 #include "config.h"
46 #include <assert.h>
47 #include <locale.h>
48 #include <wchar.h>
49 #include <ctype.h>
50 #include <stdlib.h>
51 #include <stdio.h>
52 #include <iconv.h>
53 #include <unistd.h>
54 #include <err.h>
55 #include <stdbool.h>
56 #if STDC_HEADERS
57 # include <string.h>
58 #else
59 # ifndef HAVE_STRCHR
60 # define strchr index
61 # define strrchr rindex
62 # endif
63 # ifndef HAVE_STRSTR
64 # include "strstr.e"
65 # endif
66 #endif
67 #ifdef HAVE_ERRNO_H
68 # include <errno.h>
69 #else
70 extern int errno;
71 char *strerror(int errnum);
72 int strerror_r(int errnum, char *buf, size_t n);
73 #endif
74
75 #ifdef HAVE_SEARCH_H
76 # include <search.h>
77 #else
78 # include "search-freebsd.h"
79 #endif
80 #include "export.h"
81 #include "types.e"
82 #include "heap.e"
83 #include "tree.e"
84 #include "html.e"
85 #include "scan.e"
86 #include "dict.e"
87 #include "openurl.e"
88 #include "genid.e"
89 #include "class.e"
90
/* While USE_DATA_ATTRIBUTE is undefined, index terms are only read from
 * the title attribute, not from a data-index attribute. */
#undef USE_DATA_ATTRIBUTE /* Data attributes are a proposal in HTML5 */

/* Text of the comments that mark where the index goes */
#define BEGIN_INDEX "begin-index" /* <!--begin-index--> */
#define END_INDEX "end-index" /* <!--end-index--> */
#define INDEX "index" /* <!--index--> */
/* Class names: INDEX (as a class) and INDEX_INST mark normal index terms,
 * INDEX_DEF marks important ones, TARGET marks generated <a name> anchors */
#define INDEX_INST "index-inst" /* class="index-inst" */
#define INDEX_DEF "index-def" /* class="index-def" */
#define TARGET "bctarget" /* CLASS="...bctarget..." */

#define MAXSUBS 20 /* Max. depth of subterms */
#define SECNO "secno" /* Class of elements that define section # */
#define NO_NUM "no-num" /* Class of elements without a section # */
103
/* Indexterm -- one occurrence of an index term: where it is and how it sorts */
typedef struct _indexterm {
  string url; /* Link target: base + "#" + ID of the element */
  int importance; /* 1 (low) or 2 (high); 2 is printed in <strong> */
  string secno; /* For option -n */
  string sectitle; /* For option -N */
  string doctitle; /* Contents of the document's <title> */
  string *terms; /* Array of subterms */
  wchar_t **sortkeys; /* Array of normalized subterms */
  int nrkeys; /* Length of terms and sortkeys arrays */
} *Indexterm;
114
static Tree tree; /* Parse tree built by the handlers */
static bool xml = false; /* Option -x: use <empty /> convention */
static string base = NULL; /* Option -b: (rel.) URL of output file */
static string indexdb = NULL; /* Option -i: persistent store of terms */
static string* userclassnames = NULL; /* Option -c: extra classes that mark index terms */
static FILE *globalfile; /* Must be global for twalk */
static Indexterm globalprevious; /* Must be global for twalk */
static string globalurlprevious;/* Must be global for twalk */
static bool bctarget = true; /* Add <A name=> after IDs (off with -t) */
static bool use_secno = false; /* Option -n: anchor text is section #, not "#" */
static bool use_sectitle = false; /* Option -N: anchor text is section title */
static bool final = false; /* Option -f: remove used attributes from document */
static bool trim_punct = true; /* Remove trailing punctuation from terms (off with -r) */
static string section_name = NULL; /* Option -s: term meaning "section %s" */
static string unknown_name = NULL; /* Option -u: term meaning "without number" */
static string* exclude_elts = NULL; /* Option -X: don't index these elements */
static string* only_elts = NULL; /* Option -O: only index these elements */
132
133
134 /* handle_error -- called when a parse error occurred */
handle_error(void * clientdata,const string s,int lineno)135 static void handle_error(void *clientdata, const string s, int lineno)
136 {
137 (void) fprintf(stderr, "%d: %s\n", lineno, s);
138 }
139
140 /* start -- called before the first event is reported */
start(void)141 static void* start(void)
142 {
143 tree = create();
144 return NULL;
145 }
146
/* end -- parser callback, called after the last event: nothing to do */
static void end(void *clientdata)
{
  (void) clientdata;                    /* Not used */
}
152
153 /* handle_comment -- called after a comment is parsed */
handle_comment(void * clientdata,string commenttext)154 static void handle_comment(void *clientdata, string commenttext)
155 {
156 tree = append_comment(tree, commenttext);
157 }
158
159 /* handle_text -- called after a tex chunk is parsed */
handle_text(void * clientdata,string text)160 static void handle_text(void *clientdata, string text)
161 {
162 tree = append_text(tree, text);
163 }
164
165 /* handle_declaration -- called after a declaration is parsed */
handle_decl(void * clientdata,string gi,string fpi,string url)166 static void handle_decl(void *clientdata, string gi,
167 string fpi, string url)
168 {
169 tree = append_declaration(tree, gi, fpi, url);
170 }
171
172 /* handle_proc_instr -- called after a PI is parsed */
handle_pi(void * clientdata,string pi_text)173 static void handle_pi(void *clientdata, string pi_text)
174 {
175 tree = append_procins(tree, pi_text);
176 }
177
178 /* handle_starttag -- called after a start tag is parsed */
handle_starttag(void * clientdata,string name,pairlist attribs)179 static void handle_starttag(void *clientdata, string name, pairlist attribs)
180 {
181 conststring id;
182
183 tree = html_push(tree, name, attribs);
184
185 /* If it has an ID, store it (so we don't accidentally generate it) */
186 if ((id = pairlist_get(attribs, "id"))) storeID(id);
187 }
188
189 /* handle_emptytag -- called after an empty tag is parsed */
handle_emptytag(void * clientdata,string name,pairlist attribs)190 static void handle_emptytag(void *clientdata, string name, pairlist attribs)
191 {
192 conststring id;
193
194 tree = html_push(tree, name, attribs);
195
196 /* If it has an ID, store it (so we don't accidentally generate it) */
197 if ((id = pairlist_get(attribs, "id"))) storeID(id);
198 }
199
200 /* handle_endtag -- called after an endtag is parsed (name may be "") */
handle_endtag(void * clientdata,string name)201 static void handle_endtag(void *clientdata, string name)
202 {
203 tree = html_pop(tree, name);
204 }
205
206 /* trim -- remove leading and trailing white space, collapse white space */
trim(string s)207 static void trim(string s)
208 {
209 string t;
210 int i, j;
211
212 if (!s) return;
213 t = newstring(s);
214 for (i = 0; isspace(t[i]); i++); /* Skip leading white space */
215 for (j = 0; t[i]; i++)
216 if (!isspace(t[i])) s[j++] = t[i];
217 else if (!isspace(t[i-1])) s[j++] = ' ';
218 if (j == 0) s[j] = '\0';
219 else if (isspace(s[j-1])) s[j-1] = '\0';
220 else s[j] = '\0';
221 dispose(t);
222 }
223
224 /* parse_subterms -- parse s to create terms & sortkeys array in an Indexterm */
parse_subterms(const Indexterm term,const conststring s)225 static void parse_subterms(const Indexterm term, const conststring s)
226 {
227 enum {TEXT, TAG, DQUOTE, SQUOTE} state;
228 string h, k, p, q;
229 iconv_t cd;
230 size_t len, len2;
231 int i;
232
233 /* Create the terms array and count the number of subterms */
234 h = newstring(s);
235 trim(h);
236 term->nrkeys = 1;
237 newarray(term->terms, 1);
238 term->terms[0] = h;
239 while ((k = strstr(h, "!!")) || (k = strstr(h, "::"))) {
240 h = k + 2;
241 *k = '\0';
242 renewarray(term->terms, term->nrkeys + 1);
243 trim(h); /* Remove leading & trailing space */
244 term->terms[term->nrkeys] = h; /* All terms point into h */
245 term->nrkeys++;
246 }
247
248 /* Create the sortkeys array by normalizing each term */
249 newarray(term->sortkeys, term->nrkeys);
250 for (i = 0; i < term->nrkeys; i++) {
251
252 /* First remove mark-up and expand the standard XML entities */
253 h = newstring(term->terms[i]);
254 for (state = TEXT, p = q = h; *p; p++) {
255 switch (state) {
256 case TEXT:
257 if (*p == '<') state = TAG;
258 else if (hasprefix(p, "<")) {*(q++) = '<'; p += 3;}
259 else if (hasprefix(p, "<")) {*(q++) = '<'; p += 4;}
260 else if (hasprefix(p, "<")) {*(q++) = '<'; p += 5;}
261 else if (hasprefix(p, "<")) {*(q++) = '<'; p += 5;}
262 else if (hasprefix(p, ">")) {*(q++) = '>'; p += 3;}
263 else if (hasprefix(p, ">")) {*(q++) = '>'; p += 4;}
264 else if (hasprefix(p, ">")) {*(q++) = '>'; p += 5;}
265 else if (hasprefix(p, ">")) {*(q++) = '>'; p += 5;}
266 else if (hasprefix(p, """)) {*(q++) = '"'; p += 5;}
267 else if (hasprefix(p, """)) {*(q++) = '"'; p += 4;}
268 else if (hasprefix(p, """)) {*(q++) = '"'; p += 5;}
269 else if (hasprefix(p, "&")) {*(q++) = '&'; p += 4;}
270 else if (hasprefix(p, "&")) {*(q++) = '&'; p += 4;}
271 else if (hasprefix(p, "&")) {*(q++) = '&'; p += 5;}
272 else *(q++) = tolower(*p);
273 break;
274 case TAG:
275 if (*p == '>') state = TEXT;
276 else if (*p == '"') state = DQUOTE;
277 else if (*p == '\'') state = SQUOTE;
278 break;
279 case DQUOTE:
280 if (*p == '"') state = TAG;
281 break;
282 case SQUOTE:
283 if (*p == '\'') state = TAG;
284 break;
285 default:
286 assert(!"Cannot happen!");
287 }
288 }
289 *q = '\0';
290
291 if (trim_punct) {
292 /* Remove some trailing white space and punctuation */
293 for (q--; q != h && strspn(q, " \r\n\t\f\v,:;!?"); q--) *q = '\0';
294
295 /* Remove final '.' only if it is the only '.' in the term */
296 if ((q = strrchr(h, '.')) && !*(q+1) && q == strchr(h, '.')) *q = '\0';
297 }
298
299 /* Then convert from UTF-8 to wchar_t */
300 cd = iconv_open("wchar_t", "UTF-8");
301 if (cd == (iconv_t)(-1)) {perror("hxindex"); exit(1);}
302 len = strlen(h) + 1;
303 newarray(term->sortkeys[i], len); /* Large enough */
304 p = (string) term->sortkeys[i];
305 len2 = len * sizeof(term->sortkeys[i][0]);
306 if (iconv(cd, &h, &len, &p, &len2) == (size_t)(-1)) {
307 perror("hxindex"); exit(1);
308 }
309 /* *p = L'\0'; */
310 if (iconv_close(cd) == -1) {perror("hxindex"); exit(1);}
311 }
312 }
313
/* folding_cmp -- compare two arrays of sort keys
 *
 * Compares a and b key by key with wcscoll() and returns the first
 * non-zero result. If one array is a prefix of the other, the shorter
 * one sorts first (-1 or 1); equal arrays yield 0.
 */
static int folding_cmp(wchar_t **a, const int alen, wchar_t **b, const int blen)
{
  int shared, k, diff;

  assert(a && alen >= 0);
  assert(b && blen >= 0);

  shared = alen < blen ? alen : blen;   /* Number of keys both arrays have */
  for (k = 0; k < shared; k++) {
    assert(a[k] && b[k]);
    diff = wcscoll(a[k], b[k]);
    if (diff != 0) return diff;
  }
  if (alen == blen) return 0;
  return alen < blen ? -1 : 1;
}
329
/* indent -- print newline and n times 2 spaces
 *
 * Writes to stdout. Nothing but the newline is printed if n <= 0.
 */
static void indent(int n)
{
  putchar('\n');
  for (; n > 0; n--) fputs("  ", stdout);       /* Two spaces per level */
}
336
337 /* print_escaped -- print s escaped for use in an attribute */
print_escaped(const conststring s)338 static void print_escaped(const conststring s)
339 {
340 conststring h;
341
342 for (h = s; *h; h++) if (*h == '"') printf("""); else putchar(*h);
343 }
344
345 /* print_title -- print a TITLE attribute */
print_title(const Indexterm term)346 static void print_title(const Indexterm term)
347 {
348 enum {TEXT, TAG, DQUOTE, SQUOTE} state;
349 string h;
350
351 assert(use_secno);
352 fputs(" title=\"", stdout);
353 if (base[0]) { /* Only add document titles if needed */
354 for (state = TEXT, h = term->doctitle; *h; h++) {
355 switch (state) {
356 case TEXT:
357 if (*h == '<') state = TAG;
358 else if (*h == '"') fputs(""", stdout);
359 else putchar(*h);
360 break;
361 case TAG:
362 if (*h == '>') state = TEXT;
363 else if (*h == '"') state = DQUOTE;
364 else if (*h == '\'') state = SQUOTE;
365 break;
366 case DQUOTE:
367 if (*h == '"') state = TAG;
368 break;
369 case SQUOTE:
370 if (*h == '\'') state = TAG;
371 break;
372 default:
373 assert(!"Cannot happen!");
374 }
375 }
376 fputs(", ", stdout);
377 }
378 for (h = section_name; *h; h++)
379 switch (*h) {
380 case '"': fputs(""", stdout); break;
381 case '>': fputs(">", stdout); break;
382 case '<': fputs("<", stdout); break;
383 case '%':
384 if (*(h+1) == '%') {putchar('%'); h++;}
385 else if (*(h+1) != 's') putchar('%');
386 else if (term->secno) {print_escaped(term->secno); h++;}
387 else {print_escaped(unknown_name); h++;}
388 break;
389 default: putchar(*h);
390 }
391 putchar('"');
392 }
393
394 /* write_index_item -- write one item in the list of index terms */
write_index_item(const void * term1,const VISIT which,const int depth)395 static void write_index_item(const void *term1, const VISIT which,
396 const int depth)
397 {
398 Indexterm term = *(Indexterm*)term1;
399 int i, j;
400
401 if (which != postorder && which != leaf) return;
402
403 /* Count how many subterms are equal to the previous entry */
404 i = 0;
405 while (i < min(term->nrkeys, globalprevious->nrkeys) &&
406 !folding_cmp(term->sortkeys + i, 1, globalprevious->sortkeys + i, 1))
407 i++;
408
409 /* Close lists as needed */
410 for (j = globalprevious->nrkeys - 1; j > i; j--) {
411 indent(j);
412 printf("</ul>");
413 }
414
415 /* Open a list if needed */
416 if (term->nrkeys > globalprevious->nrkeys && globalprevious->nrkeys == i) {
417 indent(i);
418 printf("<ul>");
419 }
420
421 /* Print new subterms, if any */
422 for (j = i; j < term->nrkeys; j++) {
423 indent(j);
424 printf("<li>%s", term->terms[j]);
425 if (j != term->nrkeys - 1) {
426 indent(j + 1);
427 printf("<ul>");
428 }
429 }
430
431 /* Print a link */
432 printf(", ");
433 printf("<a href=\"");
434 print_escaped(term->url);
435 printf("\"");
436 if (use_secno) print_title(term);
437 if (term->importance == 2) printf("><strong>"); else printf(">");
438 if (use_sectitle)
439 printf("%s", term->sectitle ? term->sectitle : term->doctitle);
440 else if (!use_secno) putchar('#');
441 else if (term->secno) print_escaped(term->secno);
442 else print_escaped(unknown_name);
443 if (term->importance == 2) printf("</strong></a>"); else printf("</a>");
444
445 /* Remember this term */
446 globalprevious = term;
447 globalurlprevious = term->url;
448
449 }
450
451 /* mkindex -- write out an index */
mkindex(Indexterm terms)452 static void mkindex(Indexterm terms)
453 {
454 int i;
455
456 printf("<ul class=\"indexlist\">");
457
458 /* Initialize globalprevious to a term with an unlikely sortkey */
459 new(globalprevious);
460 globalprevious->nrkeys = 1;
461 newarray(globalprevious->sortkeys, globalprevious->nrkeys);
462 newarray(globalprevious->sortkeys[0], 15);
463 wcscpy(globalprevious->sortkeys[0], L"zzzzzzzzzzzzzz");
464
465 twalk(terms, write_index_item);
466
467 /* Close all open lists */
468 for (i = 0; i < globalprevious->nrkeys; i++) printf("\n</ul>");
469 }
470
471 /* expand -- write the tree, add <A NAME> if needed and replace <!--index--> */
expand(Tree t,bool * write,Indexterm terms)472 static void expand(Tree t, bool *write, Indexterm terms)
473 {
474 conststring val;
475 Tree h;
476 pairlist a;
477 string s;
478 bool do_tag;
479
480 for (h = t->children; h != NULL; h = h->sister) {
481 switch (h->tp) {
482 case Text:
483 if (*write) printf("%s", h->text);
484 break;
485 case Comment:
486 s = newstring(h->text);
487 trim(s);
488 if (eq(s, INDEX) || eq(s, BEGIN_INDEX)) {
489 if (!final) printf("<!--%s-->\n", BEGIN_INDEX);
490 mkindex(terms);
491 if (!final) printf("<!--%s-->", END_INDEX);
492 if (eq(s, BEGIN_INDEX)) *write = false; /* Skip old index */
493 } else if (eq(s, END_INDEX)) {
494 *write = true;
495 } else {
496 printf("<!--%s-->", h->text);
497 }
498 dispose(s);
499 break;
500 case Declaration:
501 printf("<!DOCTYPE %s", h->name);
502 if (h->text) printf(" PUBLIC \"%s\"", h->text);
503 if (h->url) printf(" %s\"%s\"", h->text ? "" : "SYSTEM ", h->url);
504 printf(">");
505 break;
506 case Procins:
507 if (*write) printf("<?%s>", h->text);
508 break;
509 case Element:
510 if (*write) {
511 /* If an <a> was inserted by index itself, remove it */
512 do_tag = !eq(h->name, "a") || !has_class(h->attribs, TARGET);
513 if (do_tag) {
514 printf("<%s", h->name);
515 for (a = h->attribs; a != NULL; a = a->next) {
516 printf(" %s", a->name);
517 if (a->value != NULL) printf("=\"%s\"", a->value);
518 }
519 assert(! is_empty(h->name) || h->children == NULL);
520 printf(xml && is_empty(h->name) ? " />" : ">");
521 /* Insert an <A NAME> if element has an ID and is not <A> */
522 if (bctarget && is_mixed(h->name) && (val = get_attrib(h, "id"))
523 && !eq(h->name, "a") && ! xml)
524 printf("<a class=\"%s\" name=\"%s\"></a>", TARGET, val);
525 }
526 expand(h, write, terms);
527 if (do_tag && ! is_empty(h->name)) printf("</%s>", h->name);
528 }
529 break;
530 case Root:
531 assert(! "Cannot happen");
532 break;
533 default:
534 assert(! "Cannot happen");
535 }
536 }
537 }
538
539 /* termcmp -- comparison routine for Indexterms */
termcmp(const void * a1,const void * b1)540 static int termcmp(const void *a1, const void *b1)
541 {
542 Indexterm a = (Indexterm)a1, b = (Indexterm)b1;
543 int r;
544
545 assert(a);
546 assert(b);
547 assert(a->sortkeys);
548 assert(b->sortkeys);
549 assert(a->nrkeys > 0);
550 assert(b->nrkeys > 0);
551
552 r = folding_cmp(a->sortkeys, a->nrkeys, b->sortkeys, b->nrkeys);
553 if (r != 0) return r;
554 return strcmp(a->url, b->url); /* Terms are equal, compare URL instead */
555 }
556
557 /* copy_contents -- recursively expand contents of element t into a string */
copy_contents(Tree t,string * s)558 static void copy_contents(Tree t, string *s)
559 {
560 Tree h;
561 int i;
562 pairlist a;
563 string p;
564
565 for (h = t->children; h != NULL; h = h->sister) {
566 switch (h->tp) {
567 case Text:
568 i = *s ? strlen(*s) : 0;
569 renewarray(*s, i + strlen(h->text) + 1);
570 /* Copy, but transform all whitespace to spaces */
571 for (p = h->text; *p; p++, i++) (*s)[i] = isspace(*p) ? ' ' : *p;
572 (*s)[i] = '\0';
573 break;
574 case Comment: break;
575 case Declaration: break;
576 case Procins: break;
577 case Element:
578 /* Only certain tags are retained */
579 if (eq(h->name, "span") || eq(h->name, "code") || eq(h->name, "tt")
580 || eq(h->name, "acronym") || eq(h->name, "abbr")
581 || eq(h->name, "bdo") || eq(h->name, "kbd") || eq(h->name, "samp")
582 || eq(h->name, "sub") || eq(h->name, "sup")
583 || eq(h->name, "var")) {
584 strapp(s, "<", h->name, NULL);
585 for (a = h->attribs; a != NULL; a = a->next) {
586 if (! a->value) strapp(s, " ", a->name, NULL);
587 else strapp(s, " ", a->name, "=\"", a->value, "\"", NULL);
588 }
589 assert(! is_empty(h->name) || h->children == NULL);
590 if (is_empty(h->name)) {
591 strapp(s, xml ? " />" : ">", NULL);
592 } else {
593 strapp(s, ">", NULL);
594 copy_contents(h, s);
595 strapp(s, "</", h->name, ">", NULL);
596 }
597 } else { /* Ignore tag, copy contents */
598 copy_contents(h, s);
599 }
600 break;
601 case Root: assert(! "Cannot happen"); break;
602 default: assert(! "Cannot happen");
603 }
604 }
605 }
606
607 /* copy_to_index -- copy the contents of element h to the index db */
copy_to_index(Tree t,Indexterm * terms,int importance,conststring secno,conststring sectitle,conststring doctitle)608 static void copy_to_index(Tree t, Indexterm *terms, int importance,
609 conststring secno, conststring sectitle,
610 conststring doctitle)
611 {
612 conststring id, title;
613 string h;
614 Indexterm term;
615 int i, n;
616
617 id = get_attrib(t, "id");
618 #ifdef USE_DATA_ATTRIBUTE
619 if (! (title = get_attrib(t, "data-index")))
620 #endif
621 title = get_attrib(t, "title");
622
623 /* Get term either from title attribute or contents */
624 if (title) {
625
626 i = 0;
627 while (title[i]) {
628 n = strcspn(title + i, "|"); /* Find | or \0 */
629 new(term);
630 term->importance = importance;
631 term->secno = secno ? newstring(secno) : NULL;
632 term->sectitle = sectitle ? newstring(sectitle) : NULL;
633 term->doctitle = newstring(doctitle);
634 term->url = NULL;
635 strapp(&term->url, base, "#", id, NULL);
636 h = newnstring(title + i, n);
637 parse_subterms(term, h);
638 if (! tsearch(term, (void**)terms, termcmp))
639 errx(1, "Out of memory while parsing term %s\n", h);
640 i += n;
641 if (title[i]) i++; /* Skip '|' */
642 }
643 if (final) /* Remove used attribute */
644 #ifdef USE_DATA_ATTRIBUTE
645 if (!delete_attrib(t, "data-index"))
646 #endif
647 delete_attrib(t, "title");
648
649 } else { /* Recursively copy contents */
650
651 h = NULL;
652 copy_contents(t, &h);
653 if (h) { /* Non-empty contents */
654 new(term);
655 term->importance = importance;
656 term->secno = secno ? newstring(secno) : NULL;
657 term->sectitle = sectitle ? newstring(sectitle) : NULL;
658 term->doctitle = newstring(doctitle);
659 term->url = NULL;
660 strapp(&term->url, base, "#", id, NULL);
661 parse_subterms(term, h);
662 if (! tsearch(term, (void**)terms, termcmp))
663 errx(1, "Out of memory while parsing term %s", h);
664 }
665
666 }
667 }
668
669 /* in_list -- check if word occurs in array list */
in_list(const string word,const string * list)670 static bool in_list(const string word, const string *list)
671 {
672 int i;
673
674 for (i = 0; list[i]; i++) if (eq(word, list[i])) return true;
675 return false;
676 }
677
678 /* collect -- collect index terms, add IDs where needed */
collect(Tree t,Indexterm * terms,string * secno,string * sectitle,string * doctitle)679 static void collect(Tree t, Indexterm *terms, string *secno,
680 string *sectitle, string *doctitle)
681 {
682 int importance;
683 Tree h;
684
685 for (h = t->children; h != NULL; h = h->sister) {
686 switch (h->tp) {
687 case Text: case Comment: case Declaration: case Procins: break;
688 case Element:
689 if (eq(h->name, "title")) {
690 dispose(*doctitle);
691 copy_contents(h, doctitle);
692 }
693 if (has_class(h->attribs, SECNO)) {
694 dispose(*secno);
695 copy_contents(h, secno);
696 trim(*secno);
697 } else if (has_class(h->attribs, NO_NUM)) {
698 dispose(*secno);
699 *secno = newstring(unknown_name);
700 }
701 if (eq(h->name, "h1") || eq(h->name, "h2") || eq(h->name, "h3") ||
702 eq(h->name, "h4") || eq(h->name, "h5") || eq(h->name, "h5")) {
703 dispose(*sectitle);
704 copy_contents(h, sectitle);
705 trim(*sectitle);
706 }
707 if (eq(h->name, "dfn")) importance = 2;
708 else if (exclude_elts && in_list(h->name, exclude_elts)) importance = 0;
709 else if (only_elts && !in_list(h->name, only_elts)) importance = 0;
710 else if (has_class(h->attribs,INDEX)||has_class(h->attribs,INDEX_INST))
711 importance = 1;
712 else if (userclassnames && has_class_in_list(h->attribs, userclassnames))
713 importance = 1;
714 else if (has_class(h->attribs, INDEX_DEF)) importance = 2;
715 else importance = 0;
716 if (importance != 0) {
717 /* Give it an ID, if it doesn't have one */
718 if (! get_attrib(h, "id")) set_attrib(h, "id", gen_id(h));
719 copy_to_index(h, terms, importance, *secno, *sectitle, *doctitle);
720 } else {
721 collect(h, terms, secno, sectitle, doctitle);
722 }
723 break;
724 case Root: assert(! "Cannot happen"); break;
725 default: assert(! "Cannot happen");
726 }
727 }
728 }
729
730 /* load_index -- read persistent term db from file */
load_index(const string indexdb,Indexterm * terms)731 static void load_index(const string indexdb, Indexterm *terms)
732 {
733 FILE *f;
734 int n1, n2, n3, n4, n5, n6;
735 char *line = NULL;
736 size_t linesize = 0;
737 Indexterm term;
738 string h;
739
740 if (! (f = fopen(indexdb, "r"))) return; /* Assume file not found... */
741
742 while (getline(&line, &linesize, f) != -1) {
743 n1 = strcspn(line, "\t");
744 if (line[n1] != '\t') errx(1, "Illegal syntax in %s", indexdb);
745 n2 = n1 + 1 + strcspn(line + n1 + 1, "\t");
746 if (line[n2] != '\t') errx(1, "Illegal syntax in %s", indexdb);
747 n3 = n2 + 1 + strcspn(line + n2 + 1, "\t");
748 if (line[n3] != '\t') errx(1, "Illegal syntax in %s", indexdb);
749 n4 = n3 + 1 + strcspn(line + n3 + 1, "\t");
750 if (line[n4] != '\t') errx(1, "Illegal syntax in %s", indexdb);
751 n5 = n4 + 1 + strcspn(line + n4 + 1, "\t");
752 if (line[n5] != '\t') errx(1, "Illegal syntax in %s", indexdb);
753 n6 = n5 + 1 + strcspn(line + n5 + 1, "\t\n");
754 if (line[n6] != '\n') errx(1, "Illegal syntax in %s", indexdb);
755 new(term);
756 h = newnstring(line, n1);
757 switch (line[n1 + 1]) {
758 case '1': term->importance = 1; break;
759 case '2': term->importance = 2; break;
760 default: errx(1, "Error in %s (column 2 must be '1' or '2')", indexdb);
761 }
762 term->url = newnstring(line + n2 + 1, n3 - n2 - 1);
763 term->secno = newnstring(line + n3 + 1, n4 - n3 - 1);
764 term->sectitle = newnstring(line + n4 + 1, n5 - n4 - 1);
765 term->doctitle = newnstring(line + n5 + 1, n6 - n5 - 1);
766 parse_subterms(term, h);
767 if (! tsearch(term, (void**)terms, termcmp))
768 errx(1, "Out of memory while loading %s", indexdb);
769 }
770
771 fclose(f);
772 free(line);
773 }
774
775 /* save_a_term -- write one term to globalfile */
save_a_term(const void * term1,const VISIT which,const int dp)776 static void save_a_term(const void *term1, const VISIT which, const int dp)
777 {
778 Indexterm term = *(Indexterm*)term1;
779 int i;
780
781 if (which == endorder || which == leaf) {
782 for (i = 0; i < term->nrkeys; i++) {
783 if (i > 0) fprintf(globalfile, "!!");
784 fprintf(globalfile, "%s", term->terms[i]);
785 }
786 fprintf(globalfile, "\t%d\t%s\t%s\t%s\t%s\n", term->importance, term->url,
787 term->secno ? term->secno : (use_secno ? unknown_name : "#"),
788 term->sectitle ? term->sectitle : term->doctitle,
789 term->doctitle);
790 }
791 }
792
793 /* save_index -- write terms to file */
save_index(const string indexdb,Indexterm terms)794 static void save_index(const string indexdb, Indexterm terms)
795 {
796 if (! (globalfile = fopen(indexdb, "w")))
797 errx(1, "%s: %s", indexdb, strerror(errno));
798 twalk(terms, save_a_term);
799 fclose(globalfile);
800 }
801
802 /* usage -- print usage message and exit */
usage(string name)803 static void usage(string name)
804 {
805 errx(1, "Version %s\nUsage: %s [-i indexdb] [-b base] [-x] [-t] [-n] [-c userclass] [-s template] [-u phrase] [html-file]",
806 VERSION, name);
807 }
808
809 /* tokenize -- split string s into tokens at each comma, return an array */
tokenize(string s)810 static string * tokenize(string s)
811 {
812 string * t;
813 int i, n;
814
815 assert(s && s[0]);
816 for (t = NULL, n = 0; *s; s += i + 1, n++) {
817 i = strcspn(s, ",");
818 renewarray(t, n + 1);
819 t[n] = newnstring(s, i);
820 }
821 renewarray(t, n + 1); /* Make final item NULL */
822 t[n] = NULL;
823 return t;
824 }
825
826 /* main */
main(int argc,char * argv[])827 int main(int argc, char *argv[])
828 {
829 bool write = true;
830 Indexterm termtree = NULL; /* Sorted tree of terms */
831 string secno, doctitle, sectitle;
832 int c, status = 200;
833
834 /* Bind the parser callback routines to our handlers */
835 set_error_handler(handle_error);
836 set_start_handler(start);
837 set_end_handler(end);
838 set_comment_handler(handle_comment);
839 set_text_handler(handle_text);
840 set_decl_handler(handle_decl);
841 set_pi_handler(handle_pi);
842 set_starttag_handler(handle_starttag);
843 set_emptytag_handler(handle_emptytag);
844 set_endtag_handler(handle_endtag);
845
846 yyin = NULL;
847
848 while ((c = getopt(argc, argv, "txb:i:cnNfrs:u:O:X:")) != -1)
849 switch (c) {
850 case 't': bctarget = false; break; /* Don't write <a name> after each ID */
851 case 'x': xml = true; break; /* Output as XML */
852 case 'b': base = newstring(optarg); break; /* Set base of URL */
853 case 'i': indexdb = newstring(optarg); break; /* Set name of index db */
854 case 'c': userclassnames = tokenize(optarg); break; /* Set class names */
855 case 'n': use_secno = true; use_sectitle = false; break;
856 case 'N': use_sectitle = true; use_secno = false; break;
857 case 'f': final = true; break; /* "Final": remove used attributes */
858 case 'r': trim_punct = false; break; /* Do not remove trailing punctuation */
859 case 's': section_name = newstring(optarg); break;
860 case 'u': unknown_name = newstring(optarg); break;
861 case 'O': only_elts = tokenize(optarg); break; /* Index only these elements */
862 case 'X': exclude_elts = tokenize(optarg); break; /* Don't index these */
863 default: usage(argv[0]);
864 }
865 if (optind == argc) yyin = stdin;
866 else if (argc > optind + 1) usage(argv[0]);
867 else if (eq(argv[optind], "-")) yyin = stdin;
868 else yyin = fopenurl(argv[optind], "r", &status);
869
870 if (yyin == NULL) {perror(argv[optind]); exit(1);}
871 if (status != 200) errx(1, "%s : %s", argv[optind], http_strerror(status));
872
873 if (!base) base = newstring("");
874 if (!section_name) section_name = newstring("section %s");
875 if (!unknown_name) unknown_name = newstring("??");
876
877 /* Apply user's locale */
878 setlocale(LC_ALL, "");
879
880 /* Read the index DB into memory */
881 if (indexdb) load_index(indexdb, &termtree);
882
883 /* Parse, build tree, collect existing IDs */
884 if (yyparse() != 0) exit(3);
885
886 /* Scan for index terms, add them to the tree, add IDs where needed */
887 secno = NULL;
888 sectitle = NULL;
889 doctitle = newstring("");
890 collect(get_root(tree), &termtree, &secno, §itle, &doctitle);
891
892 /* Write out the document, adding <A NAME> and replacing <!--index--> */
893 expand(get_root(tree), &write, termtree);
894
895 /* Store terms to file */
896 if (indexdb) save_index(indexdb, termtree);
897
898 fclose(yyin);
899 return 0;
900 }
901