xref: /dragonfly/contrib/mdocml/html.c (revision cecb9aae)
1 /*	$Id: html.c,v 1.147 2011/05/24 21:40:14 kristaps Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 
22 #include <sys/types.h>
23 
24 #include <assert.h>
25 #include <ctype.h>
26 #include <stdarg.h>
27 #include <stdio.h>
28 #include <stdint.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <unistd.h>
32 
33 #include "mandoc.h"
34 #include "libmandoc.h"
35 #include "out.h"
36 #include "html.h"
37 #include "main.h"
38 
/*
 * Per-element layout flags stored in htmldata.flags:
 *  HTML_CLRLINE   - print_otag() and print_ctag() follow the tag with a
 *                   newline, and print_otag() prints no leading space.
 *  HTML_NOSTACK   - print_otag() does not push the tag onto h->tags.
 *  HTML_AUTOCLOSE - print_otag() writes "/>" in XHTML mode and always
 *                   follows the tag with a newline.
 */
#define	HTML_CLRLINE	 (1 << 0)
#define	HTML_NOSTACK	 (1 << 1)
#define	HTML_AUTOCLOSE	 (1 << 2) /* Tag has auto-closure. */

/* Static description of one HTML element. */
struct	htmldata {
	const char	 *name;		/* element name as written out */
	int		  flags;	/* bitmask of the HTML_* flags above */
};
46 
47 static	const struct htmldata htmltags[TAG_MAX] = {
48 	{"html",	HTML_CLRLINE}, /* TAG_HTML */
49 	{"head",	HTML_CLRLINE}, /* TAG_HEAD */
50 	{"body",	HTML_CLRLINE}, /* TAG_BODY */
51 	{"meta",	HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */
52 	{"title",	HTML_CLRLINE}, /* TAG_TITLE */
53 	{"div",		HTML_CLRLINE}, /* TAG_DIV */
54 	{"h1",		0}, /* TAG_H1 */
55 	{"h2",		0}, /* TAG_H2 */
56 	{"span",	0}, /* TAG_SPAN */
57 	{"link",	HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */
58 	{"br",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */
59 	{"a",		0}, /* TAG_A */
60 	{"table",	HTML_CLRLINE}, /* TAG_TABLE */
61 	{"tbody",	HTML_CLRLINE}, /* TAG_TBODY */
62 	{"col",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */
63 	{"tr",		HTML_CLRLINE}, /* TAG_TR */
64 	{"td",		HTML_CLRLINE}, /* TAG_TD */
65 	{"li",		HTML_CLRLINE}, /* TAG_LI */
66 	{"ul",		HTML_CLRLINE}, /* TAG_UL */
67 	{"ol",		HTML_CLRLINE}, /* TAG_OL */
68 	{"dl",		HTML_CLRLINE}, /* TAG_DL */
69 	{"dt",		HTML_CLRLINE}, /* TAG_DT */
70 	{"dd",		HTML_CLRLINE}, /* TAG_DD */
71 	{"blockquote",	HTML_CLRLINE}, /* TAG_BLOCKQUOTE */
72 	{"p",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */
73 	{"pre",		HTML_CLRLINE }, /* TAG_PRE */
74 	{"b",		0 }, /* TAG_B */
75 	{"i",		0 }, /* TAG_I */
76 	{"code",	0 }, /* TAG_CODE */
77 	{"small",	0 }, /* TAG_SMALL */
78 };
79 
80 static	const char	*const htmlattrs[ATTR_MAX] = {
81 	"http-equiv", /* ATTR_HTTPEQUIV */
82 	"content", /* ATTR_CONTENT */
83 	"name", /* ATTR_NAME */
84 	"rel", /* ATTR_REL */
85 	"href", /* ATTR_HREF */
86 	"type", /* ATTR_TYPE */
87 	"media", /* ATTR_MEDIA */
88 	"class", /* ATTR_CLASS */
89 	"style", /* ATTR_STYLE */
90 	"width", /* ATTR_WIDTH */
91 	"id", /* ATTR_ID */
92 	"summary", /* ATTR_SUMMARY */
93 	"align", /* ATTR_ALIGN */
94 	"colspan", /* ATTR_COLSPAN */
95 };
96 
97 static	const char	*const roffscales[SCALE_MAX] = {
98 	"cm", /* SCALE_CM */
99 	"in", /* SCALE_IN */
100 	"pc", /* SCALE_PC */
101 	"pt", /* SCALE_PT */
102 	"em", /* SCALE_EM */
103 	"em", /* SCALE_MM */
104 	"ex", /* SCALE_EN */
105 	"ex", /* SCALE_BU */
106 	"em", /* SCALE_VS */
107 	"ex", /* SCALE_FS */
108 };
109 
110 static	void	 bufncat(struct html *, const char *, size_t);
111 static	void	 print_ctag(struct html *, enum htmltag);
112 static	int	 print_encode(struct html *, const char *, int);
113 static	void	 print_metaf(struct html *, enum mandoc_esc);
114 static	void	 print_attr(struct html *, const char *, const char *);
115 static	void	 *ml_alloc(char *, enum htmltype);
116 
117 static void *
118 ml_alloc(char *outopts, enum htmltype type)
119 {
120 	struct html	*h;
121 	const char	*toks[4];
122 	char		*v;
123 
124 	toks[0] = "style";
125 	toks[1] = "man";
126 	toks[2] = "includes";
127 	toks[3] = NULL;
128 
129 	h = mandoc_calloc(1, sizeof(struct html));
130 
131 	h->type = type;
132 	h->tags.head = NULL;
133 	h->symtab = mchars_alloc();
134 
135 	while (outopts && *outopts)
136 		switch (getsubopt(&outopts, UNCONST(toks), &v)) {
137 		case (0):
138 			h->style = v;
139 			break;
140 		case (1):
141 			h->base_man = v;
142 			break;
143 		case (2):
144 			h->base_includes = v;
145 			break;
146 		default:
147 			break;
148 		}
149 
150 	return(h);
151 }
152 
153 void *
154 html_alloc(char *outopts)
155 {
156 
157 	return(ml_alloc(outopts, HTML_HTML_4_01_STRICT));
158 }
159 
160 
161 void *
162 xhtml_alloc(char *outopts)
163 {
164 
165 	return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT));
166 }
167 
168 
169 void
170 html_free(void *p)
171 {
172 	struct tag	*tag;
173 	struct html	*h;
174 
175 	h = (struct html *)p;
176 
177 	while ((tag = h->tags.head) != NULL) {
178 		h->tags.head = tag->next;
179 		free(tag);
180 	}
181 
182 	if (h->symtab)
183 		mchars_free(h->symtab);
184 
185 	free(h);
186 }
187 
188 
189 void
190 print_gen_head(struct html *h)
191 {
192 	struct htmlpair	 tag[4];
193 
194 	tag[0].key = ATTR_HTTPEQUIV;
195 	tag[0].val = "Content-Type";
196 	tag[1].key = ATTR_CONTENT;
197 	tag[1].val = "text/html; charset=utf-8";
198 	print_otag(h, TAG_META, 2, tag);
199 
200 	tag[0].key = ATTR_NAME;
201 	tag[0].val = "resource-type";
202 	tag[1].key = ATTR_CONTENT;
203 	tag[1].val = "document";
204 	print_otag(h, TAG_META, 2, tag);
205 
206 	if (h->style) {
207 		tag[0].key = ATTR_REL;
208 		tag[0].val = "stylesheet";
209 		tag[1].key = ATTR_HREF;
210 		tag[1].val = h->style;
211 		tag[2].key = ATTR_TYPE;
212 		tag[2].val = "text/css";
213 		tag[3].key = ATTR_MEDIA;
214 		tag[3].val = "all";
215 		print_otag(h, TAG_LINK, 4, tag);
216 	}
217 }
218 
219 static void
220 print_metaf(struct html *h, enum mandoc_esc deco)
221 {
222 	enum htmlfont	 font;
223 
224 	switch (deco) {
225 	case (ESCAPE_FONTPREV):
226 		font = h->metal;
227 		break;
228 	case (ESCAPE_FONTITALIC):
229 		font = HTMLFONT_ITALIC;
230 		break;
231 	case (ESCAPE_FONTBOLD):
232 		font = HTMLFONT_BOLD;
233 		break;
234 	case (ESCAPE_FONT):
235 		/* FALLTHROUGH */
236 	case (ESCAPE_FONTROMAN):
237 		font = HTMLFONT_NONE;
238 		break;
239 	default:
240 		abort();
241 		/* NOTREACHED */
242 	}
243 
244 	if (h->metaf) {
245 		print_tagq(h, h->metaf);
246 		h->metaf = NULL;
247 	}
248 
249 	h->metal = h->metac;
250 	h->metac = font;
251 
252 	if (HTMLFONT_NONE != font)
253 		h->metaf = HTMLFONT_BOLD == font ?
254 			print_otag(h, TAG_B, 0, NULL) :
255 			print_otag(h, TAG_I, 0, NULL);
256 }
257 
258 int
259 html_strlen(const char *cp)
260 {
261 	int		 ssz, sz;
262 	const char	*seq, *p;
263 
264 	/*
265 	 * Account for escaped sequences within string length
266 	 * calculations.  This follows the logic in term_strlen() as we
267 	 * must calculate the width of produced strings.
268 	 * Assume that characters are always width of "1".  This is
269 	 * hacky, but it gets the job done for approximation of widths.
270 	 */
271 
272 	sz = 0;
273 	while (NULL != (p = strchr(cp, '\\'))) {
274 		sz += (int)(p - cp);
275 		++cp;
276 		switch (mandoc_escape(&cp, &seq, &ssz)) {
277 		case (ESCAPE_ERROR):
278 			return(sz);
279 		case (ESCAPE_UNICODE):
280 			/* FALLTHROUGH */
281 		case (ESCAPE_NUMBERED):
282 			/* FALLTHROUGH */
283 		case (ESCAPE_SPECIAL):
284 			sz++;
285 			break;
286 		default:
287 			break;
288 		}
289 	}
290 
291 	assert(sz >= 0);
292 	return(sz + strlen(cp));
293 }
294 
295 static int
296 print_encode(struct html *h, const char *p, int norecurse)
297 {
298 	size_t		 sz;
299 	int		 c, len, nospace;
300 	const char	*seq;
301 	enum mandoc_esc	 esc;
302 	static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' };
303 
304 	nospace = 0;
305 
306 	while ('\0' != *p) {
307 		sz = strcspn(p, rejs);
308 
309 		fwrite(p, 1, sz, stdout);
310 		p += (int)sz;
311 
312 		if ('\0' == *p)
313 			break;
314 
315 		switch (*p++) {
316 		case ('<'):
317 			printf("&lt;");
318 			continue;
319 		case ('>'):
320 			printf("&gt;");
321 			continue;
322 		case ('&'):
323 			printf("&amp;");
324 			continue;
325 		case (ASCII_HYPH):
326 			putchar('-');
327 			continue;
328 		default:
329 			break;
330 		}
331 
332 		esc = mandoc_escape(&p, &seq, &len);
333 		if (ESCAPE_ERROR == esc)
334 			break;
335 
336 		switch (esc) {
337 		case (ESCAPE_UNICODE):
338 			/* Skip passed "u" header. */
339 			c = mchars_num2uc(seq + 1, len - 1);
340 			if ('\0' != c)
341 				printf("&#x%x;", c);
342 			break;
343 		case (ESCAPE_NUMBERED):
344 			c = mchars_num2char(seq, len);
345 			if ('\0' != c)
346 				putchar(c);
347 			break;
348 		case (ESCAPE_SPECIAL):
349 			c = mchars_spec2cp(h->symtab, seq, len);
350 			if (c > 0)
351 				printf("&#%d;", c);
352 			else if (-1 == c && 1 == len)
353 				putchar((int)*seq);
354 			break;
355 		case (ESCAPE_FONT):
356 			/* FALLTHROUGH */
357 		case (ESCAPE_FONTPREV):
358 			/* FALLTHROUGH */
359 		case (ESCAPE_FONTBOLD):
360 			/* FALLTHROUGH */
361 		case (ESCAPE_FONTITALIC):
362 			/* FALLTHROUGH */
363 		case (ESCAPE_FONTROMAN):
364 			if (norecurse)
365 				break;
366 			print_metaf(h, esc);
367 			break;
368 		case (ESCAPE_NOSPACE):
369 			if ('\0' == *p)
370 				nospace = 1;
371 			break;
372 		default:
373 			break;
374 		}
375 	}
376 
377 	return(nospace);
378 }
379 
380 
/*
 * Write one attribute in the form ` key="val"` (with a leading space).
 * The value is passed through print_encode() with font escapes
 * disabled; the key is written as given.
 */
static void
print_attr(struct html *h, const char *key, const char *val)
{

	putchar(' ');
	fputs(key, stdout);
	fputs("=\"", stdout);
	(void)print_encode(h, val, 1);
	putchar('"');
}
388 
389 
390 struct tag *
391 print_otag(struct html *h, enum htmltag tag,
392 		int sz, const struct htmlpair *p)
393 {
394 	int		 i;
395 	struct tag	*t;
396 
397 	/* Push this tags onto the stack of open scopes. */
398 
399 	if ( ! (HTML_NOSTACK & htmltags[tag].flags)) {
400 		t = mandoc_malloc(sizeof(struct tag));
401 		t->tag = tag;
402 		t->next = h->tags.head;
403 		h->tags.head = t;
404 	} else
405 		t = NULL;
406 
407 	if ( ! (HTML_NOSPACE & h->flags))
408 		if ( ! (HTML_CLRLINE & htmltags[tag].flags)) {
409 			/* Manage keeps! */
410 			if ( ! (HTML_KEEP & h->flags)) {
411 				if (HTML_PREKEEP & h->flags)
412 					h->flags |= HTML_KEEP;
413 				putchar(' ');
414 			} else
415 				printf("&#160;");
416 		}
417 
418 	if ( ! (h->flags & HTML_NONOSPACE))
419 		h->flags &= ~HTML_NOSPACE;
420 	else
421 		h->flags |= HTML_NOSPACE;
422 
423 	/* Print out the tag name and attributes. */
424 
425 	printf("<%s", htmltags[tag].name);
426 	for (i = 0; i < sz; i++)
427 		print_attr(h, htmlattrs[p[i].key], p[i].val);
428 
429 	/* Add non-overridable attributes. */
430 
431 	if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) {
432 		print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml");
433 		print_attr(h, "xml:lang", "en");
434 		print_attr(h, "lang", "en");
435 	}
436 
437 	/* Accommodate for XML "well-formed" singleton escaping. */
438 
439 	if (HTML_AUTOCLOSE & htmltags[tag].flags)
440 		switch (h->type) {
441 		case (HTML_XHTML_1_0_STRICT):
442 			putchar('/');
443 			break;
444 		default:
445 			break;
446 		}
447 
448 	putchar('>');
449 
450 	h->flags |= HTML_NOSPACE;
451 
452 	if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags)
453 		putchar('\n');
454 
455 	return(t);
456 }
457 
458 
459 static void
460 print_ctag(struct html *h, enum htmltag tag)
461 {
462 
463 	printf("</%s>", htmltags[tag].name);
464 	if (HTML_CLRLINE & htmltags[tag].flags) {
465 		h->flags |= HTML_NOSPACE;
466 		putchar('\n');
467 	}
468 }
469 
470 void
471 print_gen_decls(struct html *h)
472 {
473 	const char	*doctype;
474 	const char	*dtd;
475 	const char	*name;
476 
477 	switch (h->type) {
478 	case (HTML_HTML_4_01_STRICT):
479 		name = "HTML";
480 		doctype = "-//W3C//DTD HTML 4.01//EN";
481 		dtd = "http://www.w3.org/TR/html4/strict.dtd";
482 		break;
483 	default:
484 		puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
485 		name = "html";
486 		doctype = "-//W3C//DTD XHTML 1.0 Strict//EN";
487 		dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
488 		break;
489 	}
490 
491 	printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n",
492 			name, doctype, dtd);
493 }
494 
495 void
496 print_text(struct html *h, const char *word)
497 {
498 
499 	if ( ! (HTML_NOSPACE & h->flags)) {
500 		/* Manage keeps! */
501 		if ( ! (HTML_KEEP & h->flags)) {
502 			if (HTML_PREKEEP & h->flags)
503 				h->flags |= HTML_KEEP;
504 			putchar(' ');
505 		} else
506 			printf("&#160;");
507 	}
508 
509 	assert(NULL == h->metaf);
510 	if (HTMLFONT_NONE != h->metac)
511 		h->metaf = HTMLFONT_BOLD == h->metac ?
512 			print_otag(h, TAG_B, 0, NULL) :
513 			print_otag(h, TAG_I, 0, NULL);
514 
515 	assert(word);
516 	if ( ! print_encode(h, word, 0))
517 		if ( ! (h->flags & HTML_NONOSPACE))
518 			h->flags &= ~HTML_NOSPACE;
519 
520 	if (h->metaf) {
521 		print_tagq(h, h->metaf);
522 		h->metaf = NULL;
523 	}
524 
525 	h->flags &= ~HTML_IGNDELIM;
526 }
527 
528 
529 void
530 print_tagq(struct html *h, const struct tag *until)
531 {
532 	struct tag	*tag;
533 
534 	while ((tag = h->tags.head) != NULL) {
535 		/*
536 		 * Remember to close out and nullify the current
537 		 * meta-font and table, if applicable.
538 		 */
539 		if (tag == h->metaf)
540 			h->metaf = NULL;
541 		if (tag == h->tblt)
542 			h->tblt = NULL;
543 		print_ctag(h, tag->tag);
544 		h->tags.head = tag->next;
545 		free(tag);
546 		if (until && tag == until)
547 			return;
548 	}
549 }
550 
551 
552 void
553 print_stagq(struct html *h, const struct tag *suntil)
554 {
555 	struct tag	*tag;
556 
557 	while ((tag = h->tags.head) != NULL) {
558 		if (suntil && tag == suntil)
559 			return;
560 		/*
561 		 * Remember to close out and nullify the current
562 		 * meta-font and table, if applicable.
563 		 */
564 		if (tag == h->metaf)
565 			h->metaf = NULL;
566 		if (tag == h->tblt)
567 			h->tblt = NULL;
568 		print_ctag(h, tag->tag);
569 		h->tags.head = tag->next;
570 		free(tag);
571 	}
572 }
573 
574 void
575 bufinit(struct html *h)
576 {
577 
578 	h->buf[0] = '\0';
579 	h->buflen = 0;
580 }
581 
/*
 * Append one declaration of the form "key:val;" to the scratch buffer.
 */
void
bufcat_style(struct html *h, const char *key, const char *val)
{
	const char	*parts[4];
	int		 i;

	parts[0] = key;
	parts[1] = ":";
	parts[2] = val;
	parts[3] = ";";

	for (i = 0; i < 4; i++)
		bufcat(h, parts[i]);
}
591 
592 void
593 bufcat(struct html *h, const char *p)
594 {
595 
596 	h->buflen = strlcat(h->buf, p, BUFSIZ);
597 	assert(h->buflen < BUFSIZ);
598 	h->buflen--;
599 }
600 
601 void
602 bufcat_fmt(struct html *h, const char *fmt, ...)
603 {
604 	va_list		 ap;
605 
606 	va_start(ap, fmt);
607 	(void)vsnprintf(h->buf + (int)h->buflen,
608 			BUFSIZ - h->buflen - 1, fmt, ap);
609 	va_end(ap);
610 	h->buflen = strlen(h->buf);
611 }
612 
613 static void
614 bufncat(struct html *h, const char *p, size_t sz)
615 {
616 
617 	assert(h->buflen + sz + 1 < BUFSIZ);
618 	strncat(h->buf, p, sz);
619 	h->buflen += sz;
620 }
621 
622 void
623 buffmt_includes(struct html *h, const char *name)
624 {
625 	const char	*p, *pp;
626 
627 	pp = h->base_includes;
628 
629 	bufinit(h);
630 	while (NULL != (p = strchr(pp, '%'))) {
631 		bufncat(h, pp, (size_t)(p - pp));
632 		switch (*(p + 1)) {
633 		case('I'):
634 			bufcat(h, name);
635 			break;
636 		default:
637 			bufncat(h, p, 2);
638 			break;
639 		}
640 		pp = p + 2;
641 	}
642 	if (pp)
643 		bufcat(h, pp);
644 }
645 
646 void
647 buffmt_man(struct html *h,
648 		const char *name, const char *sec)
649 {
650 	const char	*p, *pp;
651 
652 	pp = h->base_man;
653 
654 	bufinit(h);
655 	while (NULL != (p = strchr(pp, '%'))) {
656 		bufncat(h, pp, (size_t)(p - pp));
657 		switch (*(p + 1)) {
658 		case('S'):
659 			bufcat(h, sec ? sec : "1");
660 			break;
661 		case('N'):
662 			bufcat_fmt(h, name);
663 			break;
664 		default:
665 			bufncat(h, p, 2);
666 			break;
667 		}
668 		pp = p + 2;
669 	}
670 	if (pp)
671 		bufcat(h, pp);
672 }
673 
674 void
675 bufcat_su(struct html *h, const char *p, const struct roffsu *su)
676 {
677 	double		 v;
678 
679 	v = su->scale;
680 	if (SCALE_MM == su->unit && 0.0 == (v /= 100.0))
681 		v = 1.0;
682 
683 	bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]);
684 }
685 
/*
 * Append "src" to the scratch buffer as an identifier, writing every
 * byte as two lowercase hexadecimal digits.
 */
void
bufcat_id(struct html *h, const char *src)
{

	/* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */

	/*
	 * Go through unsigned char: where plain char is signed, a byte
	 * above 0x7f would otherwise be sign-extended and printed as
	 * eight hex digits instead of two.
	 */
	while ('\0' != *src)
		bufcat_fmt(h, "%.2x", (unsigned int)(unsigned char)*src++);
}
695