1 /*
2  * cite - adds hyperlinks to bibliographic references in HTML
3  *
4  * The program looks for strings of the form [[name]] (i.e., a
5  * bibliographic label inside a double pair of square brackets), e.g.,
6  * [[Knuth84]] or [[LieBos97]]. The label will be looked up in a
7  * bibliography database and if it is found, the string will be
8  * replaced by a pattern which is typically of the form <a
9  * href="...">[name]</a>, but the pattern can be changed
10  * with a command line option.
11  *
12  * If the string is of the form {{name}}, the name will be looked up,
13  * but the string will be copied unchanged.
14  *
15  * If the label is not found, a warning is printed and the string is
16  * left unchanged.
17  *
18  * All labels that are found are also stored, one label per line, in a
19  * separate file with extension .aux. This file can be used by mkbib
20  * to create the bibliography by extracting the corresponding
21  * bibliographic entries from the database.
22  *
23  * The bibliography database must be a refer-style database. Though
24  * for the purposes of this program all lines that don't start with
25  * "%L" or %K are ignored. Lines with "%L" are assumed to contain a
26  * label. Lines with %K are assumed to contain whitespace separated
27  * keywords, which are effectively aliases for the label. Entries must
28  * have one %L line and one or zero %K lines.
29  *
30  * Options:
31  *
32  * -b base
33  *     Give the value for %b in the pattern.
34  *
35  * -p pattern
36  *     The replacement for the string [[label]]. The default is
37  *
38  *     <a href=\"%b#%L\" rel=\"biblioentry\">[%L]<!--{{%m%L}}--></a>
39  *
40  *     %L will be replaced by the label, %b by the value of the -b
41  *     option and %m by the marker (-m option).
42  *
43  * -a auxfile
44  *     The name of the file in which the list of labels will be stored.
45  *     Default is the name of the file given as argument, minus its
46  *     extension, plus ".aux". If no file is given (input comes from
47  *     stdin), the default name is "aux.aux".
48  *
49  * -m marker
50  *     By default, the program looks for "[[name]]", but it can be
51  *     made to look for "[[Xname]]" where X is some string, usually a
52  *     symbol such as '!' or '='. This allows references to be
53  *     classified, e.g., "[[!name]]" for normative references and
54  *     "[[name]]" for non-normative references.
55  *
56  * -c
57  *     Assume that every pair "<!--" and "-->" delimit a comment and
58  *     do not process any [[label]] that occurs between them. Any
59  *     "{{label}}" is processed as normal. This does not actually
60  *     parse the input as HTML or XML and thus the program will
61  *     mistake occurrences of these two strings inside CDATA sections
62  *     or attribute values for comment delimiters.
63  *
64  * Copyright © 1994-2012 World Wide Web Consortium
65  * See http://www.w3.org/Consortium/Legal/copyright-software
66  *
67  * Author: Bert Bos <bert@w3.org>
68  * Created: 18 March 2000
69  * Version: $Id: hxcite.c,v 1.11 2018/02/15 19:02:36 bbos Exp $
70  **/
71 
72 #include "config.h"
73 #ifdef HAVE_UNISTD_H
74 #  include <unistd.h>
75 #endif
76 #include <assert.h>
77 #include <stdlib.h>
78 #include <errno.h>
79 #include <stdio.h>
80 #if STDC_HEADERS
81 # include <string.h>
82 #else
83 # ifndef HAVE_STRCHR
84 #  define strchr index
85 #  define strrchr rindex
86 # endif
87 # ifndef HAVE_STRSTR
88 #  include "strstr.e"
89 # endif
90 #endif
91 
92 #ifdef HAVE_SEARCH_H
93 #  include <search.h>
94 #else
95 #  include "hash.e"
96 #endif
97 
98 #include <ctype.h>
99 #include <stdbool.h>
100 #include "export.h"
101 #include "heap.e"
102 #include "types.e"
103 #include "errexit.e"
104 
105 
106 /* Warning: arbitrary limits! */
107 #define LINESIZE 32768
108 #define HASHSIZE 4096				/* Size of hash table */
109 
110 #define WS " \t\r\n\f"				/* Separates %K keywords */
111 
112 static string base = "";			/* URL of bibilography */
113 static string mark = "";			/* Flag after "'[[" */
114 static size_t marklen = 0;			/* Length of mark */
115 static string prog;				/* = argv[0] */
116 static string pattern =
117   "<a href=\"%b#%L\" rel=\"biblioentry\">[%L]<!--{{%m%L}}--></a>";
118 static FILE *aux;
119 static bool skip_comments = false; /* Whether to skip [[ inside <!----> */
120 
121 
122 /* get_label -- get the label for the keyword, or NULL */
get_label(const string keyword)123 static string get_label(const string keyword)
124 {
125   ENTRY *result, e = {keyword, NULL};
126 
127   result = hsearch(e, FIND);
128   return result ? (string) result->data : NULL;
129 }
130 
131 
132 /* valid_label -- check if the label is well-formed */
/* valid_label -- check if the label is well-formed
 *
 * Returns true if every character of label is alphanumeric (according
 * to isalnum() in the current locale) or one of '-', '_' and '.'. The
 * empty string is considered well-formed.
 *
 * The string is scanned as unsigned char: passing a negative plain
 * char to isalnum() is undefined behavior, and labels may contain
 * bytes >= 0x80.
 */
static bool valid_label(const string label)
{
  const unsigned char *s;

  for (s = (const unsigned char *) label; *s; s++)
    if (! isalnum(*s)
        && *s != '-'
        && *s != '_'
        && *s != '.') return false;
  return true;
}
144 
145 
146 /* expand_ref -- print the reformatted reference */
expand_ref(const string label)147 static void expand_ref(const string label)
148 {
149   int i;
150 
151   /* ToDo: somehow allow sequence numbers for references [1], [2], etc. */
152   for (i = 0; pattern[i]; i++) {
153     if (pattern[i] != '%') {
154       putchar(pattern[i]);
155     } else {
156       switch (pattern[++i]) {
157 	case '%': putchar('%'); break;		/* Literal '%' */
158 	case 'b': printf("%s", base); break;	/* Base URL */
159 	case 'L': printf("%s", label); break;	/* Label */
160 	case 'm': printf("%s", mark); break;	/* Mark (-m option) */
161 	default: break;				/* Error in pattern */
162       }
163     }
164   }
165 }
166 
167 
168 /* process_line -- look for citations in a line */
process_line(const string text,const string fname,int lineno,bool * in_comment)169 EXPORT void process_line(const string text, const string fname, int lineno,
170 			 bool *in_comment)
171 {
172   string h = text, p, q, label = NULL, key;
173   char c;
174 
175   /* Loop over occurrences of "[[" + mark + label + "]]"
176    and "{{" + mark + label + "}}" */
177 
178   while (*in_comment ? (p = strpbrk(h, "-{")) : (p = strpbrk(h, "[{<"))) {
179 
180     while (h != p) putchar(*(h++));		/* Print text up to here */
181 
182     if (strncmp(p, "-->", 3) == 0) {		/* End of comment */
183       putchar(*(h++));
184       *in_comment = false;
185       continue;
186     }
187     if (strncmp(p, "<!--", 4) == 0) {		/* Begin of comment */
188       putchar(*(h++));
189       *in_comment = skip_comments;
190       continue;
191     }
192     if (strncmp(p, "{{", 2) && strncmp(p, "[[", 2)) { /* Not {{ or [[ */
193       putchar(*(h++));
194       continue;
195     }
196 
197     /* Is there a corresponding closing bracket? */
198     if (! (q = strstr(p + 2, *p == '[' ? "]]" : "}}"))) break;
199 
200     c = *p;					/* Remember [ or { */
201 
202     if (marklen == 0 || strncmp(p + 2, mark, marklen) == 0) {
203 
204       p += 2 + marklen;				/* Skip "[["/"{{" + mark */
205       key = newnstring(p, q - p);		/* Extract the key */
206 
207       if (! valid_label(key)) {			/* Cannot be a key */
208 	while (h != q) putchar(*(h++));		/* Copy unchanged */
209 	putchar(*q); putchar(*(q+1));
210       } else if (!(label = get_label(key))) {	/* No citation found: warn */
211 	while (h != q) putchar(*(h++));		/* Copy unchanged */
212 	putchar(*q); putchar(*(q+1));
213 	fprintf(stderr, "%s:%d: warning: no bib entry found for %s\n",
214 		fname ? fname : (string)"<stdin>", lineno, key);
215       } else if (c == '[') {			/* Key found: expand */
216 	expand_ref(label);			/* Insert full reference */
217 	fprintf(aux, "%s\n", label);		/* Store label */
218       } else {					/* "{{" so don't expand */
219 	while (h != q) putchar(*(h++));		/* Copy unchanged */
220 	putchar(*q); putchar(*(q+1));
221 	fprintf(aux, "%s\n", label);		/* Store label */
222       }
223       dispose(key);
224 
225     } else {					/* No valid mark */
226 
227       while (h != q) putchar(*(h++));		/* Copy unchanged */
228       putchar(*q); putchar(*(q+1));
229     }
230     h = q + 2;
231   }
232 
233   printf("%s", h);				/* Print rest of text */
234 }
235 
236 
237 /* store_labels_and_keywords -- store label in hash table */
store_labels_and_keywords(const string label,const string keys)238 static void store_labels_and_keywords(const string label, const string keys)
239 {
240   string label1, h, b;
241   ENTRY entry;
242 
243   assert(label);
244   label1 = strtok_r(label, WS, &b);		/* Remove white space */
245   if (!label1) return;				/* Empty label */
246   entry.key = newstring(label1);
247   entry.data = newstring(label1);
248   if (!hsearch(entry, ENTER)) errexit("%s: %s\n", prog, strerror(errno));
249   if (keys) {
250     for (h = strtok_r(keys, WS, &b); h; h = strtok_r(NULL, WS, &b)) {
251       entry.key = newstring(h);
252       entry.data = newstring(label1);
253       if (!hsearch(entry, ENTER)) errexit("%s: %s\n", prog, strerror(errno));
254     }
255   }
256 }
257 
258 
259 /* parse_db -- extract all labels from the refer-style database */
parse_db(const string db)260 static void parse_db(const string db)
261 {
262   char line[LINESIZE];
263   FILE *f;
264   int e;
265   string label = NULL, keywords = NULL;
266 
267   if (!(f = fopen(db,"r"))) errexit("%s: %s: %s\n", prog, db, strerror(errno));
268 
269   /* Initialize the hash table */
270   if (! hcreate(HASHSIZE)) errexit("%s: %s\n", prog, strerror(errno));
271 
272   /* Search for %L lines */
273   clearerr(f);
274   while (fgets(line, sizeof(line), f)) {
275     if (line[0] != '%') {	/* We're outside an entry */
276       if (label) store_labels_and_keywords(label, keywords);
277       dispose(label);
278       dispose(keywords);
279     } else if (strncmp(line, "%L ", 3) == 0) {
280       label = newstring(line + 3);
281     } else if (strncmp(line, "%K ", 3) == 0) {
282       keywords = newstring(line + 3);
283     }
284   }
285   if (label) store_labels_and_keywords(label, keywords);
286 
287   if ((e = ferror(f))) errexit("%s: %s: %s\n", prog, db, strerror(e));
288 
289   if (fclose(f) != 0) errexit("%s: %s: %s\n", prog, db, strerror(errno));
290 }
291 
292 
293 /* usage -- print usage message and exit */
usage(void)294 static void usage(void)
295 {
296   errexit("Usage: %s [-b base] [-p pattern] [-a auxfile] [-c] [-v] bib-file [HTML-file]\n",
297 	  prog);
298 }
299 
300 
main(int argc,char * argv[])301 int main(int argc, char *argv[])
302 {
303   char line[LINESIZE];
304   string h, auxfile = NULL, dbfile = NULL, infile = NULL;
305   bool in_comment = false;
306   int e, lineno, c;
307   FILE *f;
308 
309   /* Parse command line arguments */
310   prog = argv[0];
311   while ((c = getopt(argc, argv, "b:p:a:m:cv")) != -1) {
312     switch (c) {
313     case 'b': base = optarg; break;		/* Set base of URL */
314     case 'p': pattern = optarg; break;		/* Form of expanded ref */
315     case 'a': auxfile = optarg; break;		/* Name of auxfile */
316     case 'm': mark = optarg; marklen = strlen(mark); break; /* After "[[" */
317     case 'c': skip_comments = true; break;	/* Skip [[ in comments */
318     case 'v': printf("Version: %s %s\n", PACKAGE, VERSION); return 0;
319     default: usage();
320     }
321   }
322   if (optind == argc || argc > optind + 2) usage();
323 
324   dbfile = argv[optind++];
325   if (optind != argc) infile = argv[optind++];
326 
327   /* Read the labels from the bibliography database */
328   parse_db(dbfile);
329 
330   /* Construct auxfile */
331   if (! auxfile) {
332     if (infile) {
333       newarray(auxfile, strlen(infile) + 5);
334       strcpy(auxfile, infile);
335       if ((h = strrchr(auxfile, '.'))) *h = '\0';
336       strcat(auxfile, ".aux");
337     } else {
338       auxfile = "aux.aux";
339     }
340   }
341   if (! (aux = fopen(auxfile, "w")))
342     errexit("%s: %s: %s\n", prog, auxfile, strerror(errno));
343 
344   /* Open input file or use stdin */
345   f = infile ? fopen(infile, "r") : stdin;
346   if (!f) errexit("%s: %s: %s\n", prog, infile, strerror(errno));
347 
348   /* Read input line by line */
349   clearerr(f);
350   lineno = 1;
351   while (fgets(line, sizeof(line), f))
352     process_line(line, infile, lineno++, &in_comment);
353   if ((e = ferror(f))) errexit("%s: %s\n", prog, strerror(e));
354 
355   fclose(aux);
356   fclose(f);
357   return 0;
358 }
359