1 /*
2 * Program to (semi-)automatically link instances of terms and phrases
3 * in an HTML file to their definitions.
4 *
5 * The program collects all <dfn> elements, and stores either their
6 * title attribute, or if there is none, their content (without
7 * mark-up). Then it looks for occurrences of the same text and makes
8 * a link from the occurrence to the corresponding <dfn> element. The
9 * occurrences that are checked are the contents of all inline
10 * elements, such as <em> and <span>. HTML unfortunately forbids
11 * nested links, so the program doesn't look for occurrences inside an
12 * <a>.
13 *
14 * The program can store the <dfn> elements (the terms they define,
15 * the file they occur in and their ID) in a file, so that
16 * cross-references among several files are possible, by running the
17 * program on each of the files. It may be necessary to run the
18 * program twice on a series of files, to create all the references.
19 *
20 * Copyright © 2000-2012 World Wide Web Consortium
21 * See http://www.w3.org/Consortium/Legal/copyright-software
22 *
23 * Author: Bert Bos <bert@w3.org>
24 * Created: 4 August 2000
25 * Version: $Id: hxref.c,v 1.14 2017/11/24 09:50:25 bbos Exp $
26 **/
27
28 #include "config.h"
29 #include <assert.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <ctype.h>
33 #include <stdbool.h>
34
35 #ifdef HAVE_ERRNO_H
36 # include <errno.h>
37 #endif
38 #ifdef HAVE_SEARCH_H
39 # include <search.h>
40 #else
41 # include "hash.e"
42 #endif
43
44 #if STDC_HEADERS
45 # include <string.h>
46 #else
47 # ifndef HAVE_STRCHR
48 # define strchr index
49 # define strrchr rindex
50 # endif
51 # ifndef HAVE_STRSTR
52 # include "strstr.e"
53 # endif
54 #endif
55 #include "heap.e"
56 #include "types.e"
57 #include "html.e"
58 #include "scan.e"
59 #include "tree.e"
60 #include "dict.e"
61 #include "openurl.e"
62 #include "genid.e"
63 #include "errexit.e"
64
65
66 /* Warning: arbitrary limit! */
67 #define MAXLINE 4096 /* Max. len. of url + term */
68 #define HASHSIZE 4096 /* Size of hash table */
69
70
71 static Tree tree;
72 static string base = NULL, progname;
73 static bool do_xml = false;
74 static bool use_language = false;
75 static char *extras = "-_@()"; /* Significant characters */
76
77
78 /* handle_error -- called when a parse error occurred */
handle_error(void * clientdata,const string s,int lineno)79 static void handle_error(void *clientdata, const string s, int lineno)
80 {
81 fprintf(stderr, "%d: %s\n", lineno, s);
82 }
83
84
85 /* start -- called before the first event is reported */
start(void)86 static void* start(void)
87 {
88 tree = create();
89 return NULL;
90 }
91
92
/* end -- parser callback: called after the last event is reported
 *
 * Nothing needs to be done here.
 */
static void end(void *clientdata)
{
  (void) clientdata;			/* Not used */
}
98
99
100 /* handle_comment -- called after a comment is parsed */
handle_comment(void * clientdata,string commenttext)101 static void handle_comment(void *clientdata, string commenttext)
102 {
103 tree = append_comment(tree, commenttext);
104 }
105
106
107 /* handle_text -- called after a tex chunk is parsed */
handle_text(void * clientdata,string text)108 static void handle_text(void *clientdata, string text)
109 {
110 tree = append_text(tree, text);
111 }
112
113
114 /* handle_declaration -- called after a declaration is parsed */
handle_decl(void * clientdata,string gi,string fpi,string url)115 static void handle_decl(void *clientdata, string gi, string fpi, string url)
116 {
117 tree = append_declaration(tree, gi, fpi, url);
118 }
119
120
121 /* handle_proc_instr -- called after a PI is parsed */
handle_pi(void * clientdata,string pi_text)122 static void handle_pi(void *clientdata, string pi_text)
123 {
124 tree = append_procins(tree, pi_text);
125 }
126
127
128 /* handle_starttag -- called after a start tag is parsed */
handle_starttag(void * clientdata,string name,pairlist attribs)129 static void handle_starttag(void *clientdata, string name, pairlist attribs)
130 {
131 conststring id;
132
133 tree = html_push(tree, name, attribs);
134
135 /* If it has an ID, store it (so we don't accidentally generate it) */
136 if ((id = pairlist_get(attribs, "id"))) storeID(id);
137 }
138
139
140 /* handle_emptytag -- called after an empty tag is parsed */
handle_emptytag(void * clientdata,string name,pairlist attribs)141 static void handle_emptytag(void *clientdata, string name, pairlist attribs)
142 {
143 handle_starttag(clientdata, name, attribs);
144 }
145
146
147 /* handle_endtag -- called after an endtag is parsed (name may be "") */
handle_endtag(void * clientdata,string name)148 static void handle_endtag(void *clientdata, string name)
149 {
150 tree = html_pop(tree, name);
151 }
152
153
154 /* load_definitions -- read already defined terms from file */
load_definitions(FILE * f)155 static void load_definitions(FILE *f)
156 {
157 char buf[MAXLINE];
158 ENTRY entry;
159 string h;
160
161 while (fgets(buf, sizeof(buf), f)) { /* Format is PHRASE\tURL\n */
162 h = strchr(buf, '\t');
163 if (! h) errexit("%s: index file not in correct format\n", progname);
164 chomp(h);
165 entry.key = newnstring(buf, h - buf);
166 entry.data = newstring(h + 1);
167 hsearch(entry, ENTER);
168 }
169 }
170
171
172 /* get_contents -- collect all text content of an elt into a single string */
get_contents(Tree t)173 static string get_contents(Tree t)
174 {
175 Node *h;
176 string contents = NULL, k;
177
178 assert(t->tp == Element);
179 for (h = t->children; h; h = h->sister) {
180 if (h->tp == Text) {
181 strapp(&contents, h->text, NULL);
182 } else if (h->tp == Element && !eq(h->name, "a") && !eq(h->name, "dfn")
183 && (k = get_contents(h))) {
184 strapp(&contents, k, NULL);
185 dispose(k);
186 }
187 }
188 return contents;
189 }
190
191
192 /* normalize -- collapse whitespace, trim, lowercase (modifies s) */
normalize(string s)193 static string normalize(string s)
194 {
195 int i = 0, j;
196
197 if (!s) return newstring("");
198
199 for (j = 0; isspace(s[j]); j++) ; /* Skip initial whitespace */
200
201 for (; s[j]; j++)
202 if (isupper(s[j])) s[i++] = tolower(s[j]); /* Upper -> lowercase */
203 else if (isalnum(s[j])) s[i++] = s[j]; /* Keep these */
204 else if (strchr(extras, s[j])) s[i++] = s[j]; /* Keep these, too */
205 else if (! isspace(s[j])) ; /* Skip rest, except spaces */
206 else if (s[i-1] != ' ') s[i++] = ' '; /* Collapse whitespace */
207
208 for (; i > 0 && s[i-1] == ' '; i--) ; /* Remove trailing spaces */
209
210 s[i] = '\0';
211 return s;
212 }
213
214
215 /* search -- search a matching string in the hash table */
search(string key,const conststring language)216 static ENTRY* search(string key, const conststring language)
217 {
218 ENTRY entry, *e;
219 int n;
220 string t;
221
222 /* Assumes key has already passed normalize() */
223
224 /* First try the key as it is */
225 entry.key = key;
226 if ((e = hsearch(entry, FIND))) return e;
227
228 /* Should we try language-specific modifications to the key? */
229 if (!language || !use_language) return NULL;
230
231 if (eq(language, "en") || hasprefix(language, "en-")) { /* English */
232
233 /* Remove plural s */
234 if ((n = strlen(key)) > 1 && key[n-1] == 's' && islower(key[n-2])) {
235 t = newnstring(key, n - 1);
236 entry.key = t;
237 e = hsearch(entry, FIND);
238 dispose(t);
239 if (e) return e;
240 }
241 /* Remove plural es */
242 if (n > 2 && key[n-1] == 's' && key[n-2] == 'e' && islower(key[n-3])) {
243 t = newnstring(key, n - 2);
244 entry.key = t;
245 e = hsearch(entry, FIND);
246 dispose(t);
247 if (e) return e;
248 }
249 /* Replace plural ies by singular y */
250 if (n > 3 && hasaffix(key, "ies") && islower(key[n-4])) {
251 t = newnstring(key, n - 3);
252 strapp(&t, "y", NULL);
253 entry.key = t;
254 e = hsearch(entry, FIND);
255 dispose(t);
256 if (e) return e;
257 }
258 }
259
260 return NULL;
261 }
262
263
264 /* collect_terms -- walk the document tree looking for <dfn> elements */
collect_terms(Tree tree,FILE * db)265 static void collect_terms(Tree tree, FILE *db)
266 {
267 conststring id, title;
268 string url = NULL, s;
269 ENTRY entry, *e;
270 int i, n;
271 Node *h;
272
273 switch (tree->tp) {
274 case Text:
275 case Comment:
276 case Declaration:
277 case Procins:
278 break;
279 case Root:
280 for (h = tree->children; h; h = h->sister) collect_terms(h, db);
281 break;
282 case Element:
283 if (! eq(tree->name, "dfn")) {
284 for (h = tree->children; h; h = h->sister) collect_terms(h, db);
285 } else {
286 if (! (id = get_attrib(tree, "id"))) { /* Make sure there's an ID */
287 id = gen_id(tree);
288 set_attrib(tree, "id", id);
289 }
290 if ((title = get_attrib(tree, "title"))) /* Use title if it exists */
291 s = newstring(title); /* Don't normalize yet */
292 else /* otherwise grab contents */
293 s = normalize(get_contents(tree)); /* Normalize, also removes "|" */
294
295 entry.data = strapp(&url, base ? base : (string)"", "#", id, NULL);
296 for (i = 0; s[i];) { /* Loop over |-separated terms */
297 n = strcspn(s + i, "|");
298 entry.key = normalize(newnstring(s + i, n));
299 /* Add to hash table and to db file, if not already there */
300 if (! (e = hsearch(entry, FIND))
301 || ! eq((string)e->data, (string)entry.data)) {
302 hsearch(entry, ENTER);
303 if (db) fprintf(db, "%s\t%s\n", entry.key, (char*)entry.data);
304 }
305 i += n;
306 if (s[i]) i++; /* Skip "|" */
307 }
308 }
309 break;
310 default:
311 assert(!"Cannot happen");
312 }
313 }
314
315
316 /* find_instances -- walk tree, make instances of defined terms into links */
find_instances(Tree tree,const conststring language)317 static void find_instances(Tree tree, const conststring language)
318 {
319 ENTRY *e;
320 conststring title, lang;
321 string key;
322
323 if (!tree) return;
324
325 switch (tree->tp) {
326 case Text: case Comment: case Declaration: case Procins:
327 find_instances(tree->sister, language);
328 break;
329 case Root:
330 find_instances(tree->children, language); /* Recurse over children */
331 find_instances(tree->sister, language); /* Recurse over siblings */
332 break;
333 case Element:
334 if (!(lang = get_attrib(tree, "lang")) &&
335 !(lang = get_attrib(tree, "xml:lang")))
336 lang = language;
337 if (eq(tree->name, "a") || eq(tree->name, "dfn"))
338 ; /* Don't descend into these */
339 else if (eq(tree->name, "abbr") || eq(tree->name, "acronym")
340 || eq(tree->name, "b") || eq(tree->name, "bdo")
341 || eq(tree->name, "big") /*|| eq(tree->name, "cite")*/
342 || eq(tree->name, "code") || eq(tree->name, "del")
343 /*|| eq(tree->name, "dt")*/ || eq(tree->name, "em")
344 || eq(tree->name, "i") || eq(tree->name, "ins")
345 || eq(tree->name, "kbd") || eq(tree->name, "label")
346 || eq(tree->name, "legend") || eq(tree->name, "q")
347 || eq(tree->name, "samp") || eq(tree->name, "small")
348 || eq(tree->name, "span") || eq(tree->name, "strong")
349 || eq(tree->name, "sub") || eq(tree->name, "sup")
350 || eq(tree->name, "tt") || eq(tree->name, "var")) {
351 if ((title = get_attrib(tree, "title"))) /* Use title if it exists */
352 key = newstring(title);
353 else /* Get flattened contents */
354 key = get_contents(tree);
355 if (!(e = search(normalize(key), lang))) { /* If not an instance */
356 find_instances(tree->children, lang); /* Recurse over children */
357 } else if (eq(tree->name, "span")) { /* Found an instance */
358 rename_elt(tree, "a"); /* Turn the span into an a */
359 set_attrib(tree, "href", e->data);
360 } else {
361 tree = wrap_elt(tree, "a", NULL); /* Wrap element in an <a> */
362 set_attrib(tree, "href", e->data);
363 }
364 dispose(key);
365 } else { /* Not an inline element */
366 find_instances(tree->children, lang); /* Recurse over children */
367 }
368 find_instances(tree->sister, language); /* Recurse over siblings */
369 break;
370 default:
371 assert(!"Cannot happen");
372 }
373 }
374
375
376 /* write_doc -- write the tree to a file */
write_doc(Tree n,bool do_xml,FILE * f)377 static void write_doc(Tree n, bool do_xml, FILE *f)
378 {
379 pairlist h;
380 Tree l;
381
382 switch (n->tp) {
383 case Root:
384 for (l = n->children; l; l = l->sister) write_doc(l, do_xml, f);
385 break;
386 case Text:
387 fprintf(f, "%s", n->text);
388 break;
389 case Comment:
390 fprintf(f, "<!--%s-->", n->text);
391 break;
392 case Declaration:
393 fprintf(f, "<!DOCTYPE %s", n->name);
394 if (n->text) fprintf(f, " PUBLIC \"%s\"", n->text);
395 if (n->url) fprintf(f, " %s\"%s\"", n->text ? "" : "SYSTEM ", n->url);
396 fprintf(f, ">");
397 break;
398 case Procins:
399 fprintf(f, "<?%s>", n->text);
400 break;
401 case Element:
402 fprintf(f, "<%s", n->name);
403 for (h = n->attribs; h != NULL; h = h->next) {
404 fprintf(f, " %s", h->name);
405 if (h->value != NULL) fprintf(f, "=\"%s\"", h->value);
406 else if (do_xml) fprintf(f, "=\"%s\"", h->name);
407 }
408 if (is_empty(n->name)) {
409 assert(n->children == NULL);
410 fprintf(f, do_xml ? " />" : ">");
411 } else {
412 fprintf(f, ">");
413 for (l = n->children; l; l = l->sister) write_doc(l, do_xml, f);
414 fprintf(f, "</%s>", n->name);
415 }
416 break;
417 default:
418 assert(!"Cannot happen");
419 }
420 }
421
422
423 /* usage -- print usage message and exit */
usage(void)424 static void usage(void)
425 {
426 fprintf(stderr,
427 "Usage: %s [-v] [-b base] [-i index] [-x] [-l] [--] [input [output]]\n",
428 progname);
429 exit(1);
430 }
431
432
433 /* main -- main body of xref */
main(int argc,char * argv[])434 int main(int argc, char *argv[])
435 {
436 int i, status = 200;
437 FILE *outfile = NULL, *db = NULL;
438
439 /* Bind the parser callback routines to our handlers */
440 set_error_handler(handle_error);
441 set_start_handler(start);
442 set_end_handler(end);
443 set_comment_handler(handle_comment);
444 set_text_handler(handle_text);
445 set_decl_handler(handle_decl);
446 set_pi_handler(handle_pi);
447 set_starttag_handler(handle_starttag);
448 set_emptytag_handler(handle_emptytag);
449 set_endtag_handler(handle_endtag);
450
451 /* Parse command line */
452 progname = argv[0];
453 yyin = NULL;
454 for (i = 1; i < argc && argv[i][0] == '-' && !eq(argv[i], "--"); i++) {
455 switch (argv[i][1]) {
456 case 'b':
457 if (!argv[i][2] && i + 1 == argc) usage(); /* Missing argument */
458 if (base) usage(); /* Option was already set */
459 base = argv[i][2] ? argv[i] + 2 : argv[++i];
460 break;
461 case 'x':
462 if (do_xml) usage(); /* Option was already set */
463 do_xml = true;
464 break;
465 case 'i':
466 if (!argv[i][2] && i + 1 == argc) usage(); /* Missing argument */
467 if (db) usage(); /* Index was already set */
468 db = fopen(argv[i][2] ? argv[i] + 2 : argv[++i], "a+");
469 if (! db) errexit("%s: %s\n", argv[i], strerror(errno));
470 break;
471 case 'l':
472 if (use_language) usage(); /* Option was already set */
473 use_language = true;
474 break;
475 case 'v':
476 printf("Version: %s %s\n", PACKAGE, VERSION);
477 return 0;
478 case '\0':
479 if (!yyin) yyin = stdin;
480 else if (!outfile) outfile = stdout;
481 else usage(); /* Was already set */
482 break;
483 default:
484 usage(); /* Unknown option */
485 }
486 }
487 if (i < argc && eq(argv[i], "--")) i++;
488
489 if (i < argc) {
490 if (yyin) usage(); /* Input was already set */
491 if (eq(argv[i], "-")) yyin = stdin;
492 else yyin = fopenurl(argv[i], "r", &status);
493 if (! yyin) errexit("%s: %s\n", argv[i], strerror(errno));
494 if (status != 200) errexit("%s : %s\n", argv[i], http_strerror(status));
495 }
496 if (++i < argc) {
497 if (outfile) usage(); /* Output was already set */
498 if (eq(argv[i], "-")) outfile = stdout;
499 else outfile = fopen(argv[i], "w");
500 if (! outfile) perror(argv[i]);
501 }
502 if (++i < argc) usage(); /* Too many args */
503
504 if (! yyin) yyin = stdin;
505 if (! outfile) outfile = stdout;
506
507 if (! hcreate(HASHSIZE))
508 errexit("%s: cannot create hash table (out of memory?)\n", argv[0]);
509
510 if (db) {
511 if (fseek(db, 0L, SEEK_SET) == -1)
512 errexit("%s: %s\n", progname, strerror(errno));
513 load_definitions(db);
514 }
515
516 if (yyparse() != 0) exit(3);
517
518 tree = get_root(tree);
519 collect_terms(tree, db);
520 find_instances(tree, NULL);
521
522 if (db) fclose(db);
523
524 write_doc(tree, do_xml, outfile);
525
526 return 0;
527 }
528