1 /*
2  * Program to (semi-)automatically link instances of terms and phrases
3  * in an HTML file to their definitions.
4  *
5  * The program collects all <dfn> elements, and stores either their
6  * title attribute, or if there is none, their content (without
7  * mark-up). Then it looks for occurrences of the same text and makes
8  * a link from the occurrence to the corresponding <dfn> element. The
9  * occurrences that are checked are the contents of all inline
10  * elements, such as <em> and <span>. HTML unfortunately forbids
11  * nested links, so the program doesn't look for occurrences inside an
12  * <a>.
13  *
14  * The program can store the <dfn> elements (the terms they define,
15  * the file they occur in and their ID) in a file, so that
16  * cross-references among several files are possible, by running the
17  * program on each of the files. It may be necessary to run the
18  * program twice on a series of files, to create all the references.
19  *
20  * Copyright © 2000-2012 World Wide Web Consortium
21  * See http://www.w3.org/Consortium/Legal/copyright-software
22  *
23  * Author: Bert Bos <bert@w3.org>
24  * Created: 4 August 2000
25  * Version: $Id: hxref.c,v 1.14 2017/11/24 09:50:25 bbos Exp $
26  **/
27 
28 #include "config.h"
29 #include <assert.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <ctype.h>
33 #include <stdbool.h>
34 
35 #ifdef HAVE_ERRNO_H
36 #  include <errno.h>
37 #endif
38 #ifdef HAVE_SEARCH_H
39 #  include <search.h>
40 #else
41 #  include "hash.e"
42 #endif
43 
44 #if STDC_HEADERS
45 # include <string.h>
46 #else
47 # ifndef HAVE_STRCHR
48 #  define strchr index
49 #  define strrchr rindex
50 # endif
51 # ifndef HAVE_STRSTR
52 #  include "strstr.e"
53 # endif
54 #endif
55 #include "heap.e"
56 #include "types.e"
57 #include "html.e"
58 #include "scan.e"
59 #include "tree.e"
60 #include "dict.e"
61 #include "openurl.e"
62 #include "genid.e"
63 #include "errexit.e"
64 
65 
66 /* Warning: arbitrary limit! */
67 #define MAXLINE 4096				/* Max. len. of url + term */
68 #define HASHSIZE 4096				/* Size of hash table */
69 
70 
71 static Tree tree;
72 static string base = NULL, progname;
73 static bool do_xml = false;
74 static bool use_language = false;
75 static char *extras = "-_@()";			/* Significant characters */
76 
77 
78 /* handle_error -- called when a parse error occurred */
handle_error(void * clientdata,const string s,int lineno)79 static void handle_error(void *clientdata, const string s, int lineno)
80 {
81   fprintf(stderr, "%d: %s\n", lineno, s);
82 }
83 
84 
85 /* start -- called before the first event is reported */
start(void)86 static void* start(void)
87 {
88   tree = create();
89   return NULL;
90 }
91 
92 
/* end -- parser callback, called after the last event; nothing to do */
static void end(void *clientdata)
{
  (void) clientdata;                    /* Not used */
}
98 
99 
100 /* handle_comment -- called after a comment is parsed */
handle_comment(void * clientdata,string commenttext)101 static void handle_comment(void *clientdata, string commenttext)
102 {
103   tree = append_comment(tree, commenttext);
104 }
105 
106 
107 /* handle_text -- called after a tex chunk is parsed */
handle_text(void * clientdata,string text)108 static void handle_text(void *clientdata, string text)
109 {
110   tree = append_text(tree, text);
111 }
112 
113 
114 /* handle_declaration -- called after a declaration is parsed */
handle_decl(void * clientdata,string gi,string fpi,string url)115 static void handle_decl(void *clientdata, string gi, string fpi, string url)
116 {
117   tree = append_declaration(tree, gi, fpi, url);
118 }
119 
120 
121 /* handle_proc_instr -- called after a PI is parsed */
handle_pi(void * clientdata,string pi_text)122 static void handle_pi(void *clientdata, string pi_text)
123 {
124   tree = append_procins(tree, pi_text);
125 }
126 
127 
128 /* handle_starttag -- called after a start tag is parsed */
handle_starttag(void * clientdata,string name,pairlist attribs)129 static void handle_starttag(void *clientdata, string name, pairlist attribs)
130 {
131   conststring id;
132 
133   tree = html_push(tree, name, attribs);
134 
135   /* If it has an ID, store it (so we don't accidentally generate it) */
136   if ((id = pairlist_get(attribs, "id"))) storeID(id);
137 }
138 
139 
140 /* handle_emptytag -- called after an empty tag is parsed */
handle_emptytag(void * clientdata,string name,pairlist attribs)141 static void handle_emptytag(void *clientdata, string name, pairlist attribs)
142 {
143   handle_starttag(clientdata, name, attribs);
144 }
145 
146 
147 /* handle_endtag -- called after an endtag is parsed (name may be "") */
handle_endtag(void * clientdata,string name)148 static void handle_endtag(void *clientdata, string name)
149 {
150   tree = html_pop(tree, name);
151 }
152 
153 
154 /* load_definitions -- read already defined terms from file */
load_definitions(FILE * f)155 static void load_definitions(FILE *f)
156 {
157   char buf[MAXLINE];
158   ENTRY entry;
159   string h;
160 
161   while (fgets(buf, sizeof(buf), f)) {		/* Format is PHRASE\tURL\n */
162     h = strchr(buf, '\t');
163     if (! h) errexit("%s: index file not in correct format\n", progname);
164     chomp(h);
165     entry.key = newnstring(buf, h - buf);
166     entry.data = newstring(h + 1);
167     hsearch(entry, ENTER);
168   }
169 }
170 
171 
172 /* get_contents -- collect all text content of an elt into a single string */
get_contents(Tree t)173 static string get_contents(Tree t)
174 {
175   Node *h;
176   string contents = NULL, k;
177 
178   assert(t->tp == Element);
179   for (h = t->children; h; h = h->sister) {
180     if (h->tp == Text) {
181       strapp(&contents, h->text, NULL);
182     } else if (h->tp == Element && !eq(h->name, "a") && !eq(h->name, "dfn")
183 	       && (k = get_contents(h))) {
184       strapp(&contents, k, NULL);
185       dispose(k);
186     }
187   }
188   return contents;
189 }
190 
191 
192 /* normalize -- collapse whitespace, trim, lowercase (modifies s) */
normalize(string s)193 static string normalize(string s)
194 {
195   int i = 0, j;
196 
197   if (!s) return newstring("");
198 
199   for (j = 0; isspace(s[j]); j++) ;		/* Skip initial whitespace */
200 
201   for (; s[j]; j++)
202     if (isupper(s[j])) s[i++] = tolower(s[j]);	/* Upper -> lowercase */
203     else if (isalnum(s[j])) s[i++] = s[j];	/* Keep these */
204     else if (strchr(extras, s[j])) s[i++] = s[j]; /* Keep these, too */
205     else if (! isspace(s[j])) ;			/* Skip rest, except spaces */
206     else if (s[i-1] != ' ') s[i++] = ' ';	/* Collapse whitespace */
207 
208   for (; i > 0 && s[i-1] == ' '; i--) ;		/* Remove trailing spaces */
209 
210   s[i] = '\0';
211   return s;
212 }
213 
214 
215 /* search -- search a matching string in the hash table */
search(string key,const conststring language)216 static ENTRY* search(string key, const conststring language)
217 {
218   ENTRY entry, *e;
219   int n;
220   string t;
221 
222   /* Assumes key has already passed normalize() */
223 
224   /* First try the key as it is */
225   entry.key = key;
226   if ((e = hsearch(entry, FIND))) return e;
227 
228   /* Should we try language-specific modifications to the key? */
229   if (!language || !use_language) return NULL;
230 
231   if (eq(language, "en") || hasprefix(language, "en-")) { /* English */
232 
233     /* Remove plural s */
234     if ((n = strlen(key)) > 1 && key[n-1] == 's' && islower(key[n-2])) {
235       t = newnstring(key, n - 1);
236       entry.key = t;
237       e = hsearch(entry, FIND);
238       dispose(t);
239       if (e) return e;
240     }
241     /* Remove plural es */
242     if (n > 2 && key[n-1] == 's' && key[n-2] == 'e' && islower(key[n-3])) {
243       t = newnstring(key, n - 2);
244       entry.key = t;
245       e = hsearch(entry, FIND);
246       dispose(t);
247       if (e) return e;
248     }
249     /* Replace plural ies by singular y */
250     if (n > 3 && hasaffix(key, "ies") && islower(key[n-4])) {
251       t = newnstring(key, n - 3);
252       strapp(&t, "y", NULL);
253       entry.key = t;
254       e = hsearch(entry, FIND);
255       dispose(t);
256       if (e) return e;
257     }
258   }
259 
260   return NULL;
261 }
262 
263 
264 /* collect_terms -- walk the document tree looking for <dfn> elements */
collect_terms(Tree tree,FILE * db)265 static void collect_terms(Tree tree, FILE *db)
266 {
267   conststring id, title;
268   string url = NULL, s;
269   ENTRY entry, *e;
270   int i, n;
271   Node *h;
272 
273   switch (tree->tp) {
274     case Text:
275     case Comment:
276     case Declaration:
277     case Procins:
278       break;
279     case Root:
280       for (h = tree->children; h; h = h->sister) collect_terms(h, db);
281       break;
282     case Element:
283       if (! eq(tree->name, "dfn")) {
284 	for (h = tree->children; h; h = h->sister) collect_terms(h, db);
285       } else {
286 	if (! (id = get_attrib(tree, "id"))) {	/* Make sure there's an ID */
287 	  id = gen_id(tree);
288 	  set_attrib(tree, "id", id);
289 	}
290 	if ((title = get_attrib(tree, "title")))  /* Use title if it exists */
291 	  s = newstring(title);			/* Don't normalize yet */
292 	else					/* otherwise grab contents */
293 	  s = normalize(get_contents(tree));	/* Normalize, also removes "|" */
294 
295 	entry.data = strapp(&url, base ? base : (string)"", "#", id, NULL);
296 	for (i = 0; s[i];) {			/* Loop over |-separated terms */
297 	  n = strcspn(s + i, "|");
298 	  entry.key = normalize(newnstring(s + i, n));
299 	  /* Add to hash table and to db file, if not already there */
300 	  if (! (e = hsearch(entry, FIND))
301 	      || ! eq((string)e->data, (string)entry.data)) {
302 	    hsearch(entry, ENTER);
303 	    if (db) fprintf(db, "%s\t%s\n", entry.key, (char*)entry.data);
304 	  }
305 	  i += n;
306 	  if (s[i]) i++;			/* Skip "|" */
307 	}
308       }
309       break;
310     default:
311       assert(!"Cannot happen");
312   }
313 }
314 
315 
316 /* find_instances -- walk tree, make instances of defined terms into links */
find_instances(Tree tree,const conststring language)317 static void find_instances(Tree tree, const conststring language)
318 {
319   ENTRY *e;
320   conststring title, lang;
321   string key;
322 
323   if (!tree) return;
324 
325   switch (tree->tp) {
326     case Text: case Comment: case Declaration: case Procins:
327       find_instances(tree->sister, language);
328       break;
329     case Root:
330       find_instances(tree->children, language);	/* Recurse over children */
331       find_instances(tree->sister, language);	/* Recurse over siblings */
332       break;
333     case Element:
334       if (!(lang = get_attrib(tree, "lang")) &&
335 	  !(lang = get_attrib(tree, "xml:lang")))
336 	lang = language;
337       if (eq(tree->name, "a") || eq(tree->name, "dfn"))
338 	;					/* Don't descend into these */
339       else if (eq(tree->name, "abbr") || eq(tree->name, "acronym")
340 	       || eq(tree->name, "b") || eq(tree->name, "bdo")
341 	       || eq(tree->name, "big") /*|| eq(tree->name, "cite")*/
342 	       || eq(tree->name, "code") || eq(tree->name, "del")
343 	       /*|| eq(tree->name, "dt")*/ || eq(tree->name, "em")
344 	       || eq(tree->name, "i") || eq(tree->name, "ins")
345 	       || eq(tree->name, "kbd") || eq(tree->name, "label")
346 	       || eq(tree->name, "legend") || eq(tree->name, "q")
347 	       || eq(tree->name, "samp") || eq(tree->name, "small")
348 	       || eq(tree->name, "span") || eq(tree->name, "strong")
349 	       || eq(tree->name, "sub") || eq(tree->name, "sup")
350 	       || eq(tree->name, "tt") || eq(tree->name, "var")) {
351 	if ((title = get_attrib(tree, "title"))) /* Use title if it exists */
352 	  key = newstring(title);
353 	else					/* Get flattened contents */
354 	  key = get_contents(tree);
355 	if (!(e = search(normalize(key), lang))) { /* If not an instance */
356 	  find_instances(tree->children, lang); /* Recurse over children */
357 	} else if (eq(tree->name, "span")) {	/* Found an instance */
358 	  rename_elt(tree, "a");		/* Turn the span into an a */
359 	  set_attrib(tree, "href", e->data);
360 	} else {
361 	  tree = wrap_elt(tree, "a", NULL);	/* Wrap element in an <a> */
362 	  set_attrib(tree, "href", e->data);
363 	}
364 	dispose(key);
365       } else {					/* Not an inline element */
366 	find_instances(tree->children, lang);	/* Recurse over children */
367       }
368       find_instances(tree->sister, language);	/* Recurse over siblings */
369       break;
370     default:
371       assert(!"Cannot happen");
372   }
373 }
374 
375 
376 /* write_doc -- write the tree to a file */
write_doc(Tree n,bool do_xml,FILE * f)377 static void write_doc(Tree n, bool do_xml, FILE *f)
378 {
379   pairlist h;
380   Tree l;
381 
382   switch (n->tp) {
383     case Root:
384       for (l = n->children; l; l = l->sister) write_doc(l, do_xml, f);
385       break;
386     case Text:
387       fprintf(f, "%s", n->text);
388       break;
389     case Comment:
390       fprintf(f, "<!--%s-->", n->text);
391       break;
392     case Declaration:
393       fprintf(f, "<!DOCTYPE %s", n->name);
394       if (n->text) fprintf(f, " PUBLIC \"%s\"", n->text);
395       if (n->url) fprintf(f, " %s\"%s\"", n->text ? "" : "SYSTEM ", n->url);
396       fprintf(f, ">");
397       break;
398     case Procins:
399       fprintf(f, "<?%s>", n->text);
400       break;
401     case Element:
402       fprintf(f, "<%s", n->name);
403       for (h = n->attribs; h != NULL; h = h->next) {
404 	fprintf(f, " %s", h->name);
405 	if (h->value != NULL) fprintf(f, "=\"%s\"", h->value);
406 	else if (do_xml) fprintf(f, "=\"%s\"", h->name);
407       }
408       if (is_empty(n->name)) {
409 	assert(n->children == NULL);
410 	fprintf(f, do_xml ? " />" : ">");
411       } else {
412 	fprintf(f, ">");
413 	for (l = n->children; l; l = l->sister) write_doc(l, do_xml, f);
414 	fprintf(f, "</%s>", n->name);
415       }
416       break;
417     default:
418       assert(!"Cannot happen");
419   }
420 }
421 
422 
423 /* usage -- print usage message and exit */
usage(void)424 static void usage(void)
425 {
426   fprintf(stderr,
427 	  "Usage: %s [-v] [-b base] [-i index] [-x] [-l] [--] [input [output]]\n",
428 	  progname);
429   exit(1);
430 }
431 
432 
433 /* main -- main body of xref */
main(int argc,char * argv[])434 int main(int argc, char *argv[])
435 {
436   int i, status = 200;
437   FILE *outfile = NULL, *db = NULL;
438 
439   /* Bind the parser callback routines to our handlers */
440   set_error_handler(handle_error);
441   set_start_handler(start);
442   set_end_handler(end);
443   set_comment_handler(handle_comment);
444   set_text_handler(handle_text);
445   set_decl_handler(handle_decl);
446   set_pi_handler(handle_pi);
447   set_starttag_handler(handle_starttag);
448   set_emptytag_handler(handle_emptytag);
449   set_endtag_handler(handle_endtag);
450 
451   /* Parse command line */
452   progname = argv[0];
453   yyin = NULL;
454   for (i = 1; i < argc && argv[i][0] == '-' && !eq(argv[i], "--"); i++) {
455     switch (argv[i][1]) {
456       case 'b':
457 	if (!argv[i][2] && i + 1 == argc) usage(); /* Missing argument */
458 	if (base) usage();			/* Option was already set */
459 	base = argv[i][2] ? argv[i] + 2 : argv[++i];
460 	break;
461       case 'x':
462 	if (do_xml) usage();			/* Option was already set */
463 	do_xml = true;
464 	break;
465       case 'i':
466 	if (!argv[i][2] && i + 1 == argc) usage(); /* Missing argument */
467 	if (db) usage();			/* Index was already set */
468 	db = fopen(argv[i][2] ? argv[i] + 2 : argv[++i], "a+");
469 	if (! db) errexit("%s: %s\n", argv[i], strerror(errno));
470 	break;
471       case 'l':
472 	if (use_language) usage(); 		/* Option was already set */
473 	use_language = true;
474 	break;
475       case 'v':
476 	printf("Version: %s %s\n", PACKAGE, VERSION);
477 	return 0;
478       case '\0':
479 	if (!yyin) yyin = stdin;
480 	else if (!outfile) outfile = stdout;
481 	else usage();				/* Was already set */
482 	break;
483       default:
484       usage();					/* Unknown option */
485     }
486   }
487   if (i < argc && eq(argv[i], "--")) i++;
488 
489   if (i < argc) {
490     if (yyin) usage();				/* Input was already set */
491     if (eq(argv[i], "-")) yyin = stdin;
492     else yyin = fopenurl(argv[i], "r", &status);
493     if (! yyin) errexit("%s: %s\n", argv[i], strerror(errno));
494     if (status != 200) errexit("%s : %s\n", argv[i], http_strerror(status));
495   }
496   if (++i < argc) {
497     if (outfile) usage();			/* Output was already set */
498     if (eq(argv[i], "-")) outfile = stdout;
499     else outfile = fopen(argv[i], "w");
500     if (! outfile) perror(argv[i]);
501   }
502   if (++i < argc) usage();			/* Too many args */
503 
504   if (! yyin) yyin = stdin;
505   if (! outfile) outfile = stdout;
506 
507   if (! hcreate(HASHSIZE))
508     errexit("%s: cannot create hash table (out of memory?)\n", argv[0]);
509 
510   if (db) {
511     if (fseek(db, 0L, SEEK_SET) == -1)
512       errexit("%s: %s\n", progname, strerror(errno));
513     load_definitions(db);
514   }
515 
516   if (yyparse() != 0) exit(3);
517 
518   tree = get_root(tree);
519   collect_terms(tree, db);
520   find_instances(tree, NULL);
521 
522   if (db) fclose(db);
523 
524   write_doc(tree, do_xml, outfile);
525 
526   return 0;
527 }
528