xref: /minix/external/bsd/mdocml/dist/html.c (revision 6c8f7fc3)
1 /*	$Vendor-Id: html.c,v 1.150 2011/10/05 21:35:17 kristaps Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 
22 #include <sys/types.h>
23 
24 #include <assert.h>
25 #include <ctype.h>
26 #include <stdarg.h>
27 #include <stdio.h>
28 #include <stdint.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <unistd.h>
32 
33 #include "mandoc.h"
34 #include "libmandoc.h"
35 #include "out.h"
36 #include "html.h"
37 #include "main.h"
38 
/*
 * Static description of one HTML element: the name written into the
 * output plus a set of HTML_* flags that steer how print_otag() and
 * print_ctag() emit it.
 */
struct	htmldata {
	const char	 *name; /* Element name as printed after '<'. */
	int		  flags; /* Bitwise OR of the HTML_* flags below. */
#define	HTML_CLRLINE	 (1 << 0) /* Newline after tag, no space before. */
#define	HTML_NOSTACK	 (1 << 1) /* Not pushed onto the open-tag stack. */
#define	HTML_AUTOCLOSE	 (1 << 2) /* Tag has auto-closure. */
};
46 
/*
 * Element table indexed by enum htmltag: print_otag() and print_ctag()
 * look up the element name and its HTML_* flags here.  The entries are
 * positional, so their order must follow the enum (declared outside
 * this file); each entry is annotated with the constant it belongs to.
 */
static	const struct htmldata htmltags[TAG_MAX] = {
	{"html",	HTML_CLRLINE}, /* TAG_HTML */
	{"head",	HTML_CLRLINE}, /* TAG_HEAD */
	{"body",	HTML_CLRLINE}, /* TAG_BODY */
	{"meta",	HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */
	{"title",	HTML_CLRLINE}, /* TAG_TITLE */
	{"div",		HTML_CLRLINE}, /* TAG_DIV */
	{"h1",		0}, /* TAG_H1 */
	{"h2",		0}, /* TAG_H2 */
	{"span",	0}, /* TAG_SPAN */
	{"link",	HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */
	{"br",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */
	{"a",		0}, /* TAG_A */
	{"table",	HTML_CLRLINE}, /* TAG_TABLE */
	{"tbody",	HTML_CLRLINE}, /* TAG_TBODY */
	{"col",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */
	{"tr",		HTML_CLRLINE}, /* TAG_TR */
	{"td",		HTML_CLRLINE}, /* TAG_TD */
	{"li",		HTML_CLRLINE}, /* TAG_LI */
	{"ul",		HTML_CLRLINE}, /* TAG_UL */
	{"ol",		HTML_CLRLINE}, /* TAG_OL */
	{"dl",		HTML_CLRLINE}, /* TAG_DL */
	{"dt",		HTML_CLRLINE}, /* TAG_DT */
	{"dd",		HTML_CLRLINE}, /* TAG_DD */
	{"blockquote",	HTML_CLRLINE}, /* TAG_BLOCKQUOTE */
	{"p",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */
	{"pre",		HTML_CLRLINE }, /* TAG_PRE */
	{"b",		0 }, /* TAG_B */
	{"i",		0 }, /* TAG_I */
	{"code",	0 }, /* TAG_CODE */
	{"small",	0 }, /* TAG_SMALL */
};
79 
/*
 * Attribute names, indexed by the "key" member of a struct htmlpair;
 * print_otag() uses this table to turn each key into the text printed
 * before the '='.  The entries are positional and are annotated with
 * the ATTR_* constant they belong to.
 */
static	const char	*const htmlattrs[ATTR_MAX] = {
	"http-equiv", /* ATTR_HTTPEQUIV */
	"content", /* ATTR_CONTENT */
	"name", /* ATTR_NAME */
	"rel", /* ATTR_REL */
	"href", /* ATTR_HREF */
	"type", /* ATTR_TYPE */
	"media", /* ATTR_MEDIA */
	"class", /* ATTR_CLASS */
	"style", /* ATTR_STYLE */
	"width", /* ATTR_WIDTH */
	"id", /* ATTR_ID */
	"summary", /* ATTR_SUMMARY */
	"align", /* ATTR_ALIGN */
	"colspan", /* ATTR_COLSPAN */
};
96 
/*
 * Unit suffixes, indexed by the "unit" member of a struct roffsu;
 * bufcat_su() appends the selected suffix to the formatted number.
 * Only cm, in, pc, pt and em map onto a suffix of the same name; the
 * remaining roff units are approximated with "em" or "ex".  Note that
 * SCALE_MM maps to "em" and that bufcat_su() divides such values by
 * 100 before printing them.
 */
static	const char	*const roffscales[SCALE_MAX] = {
	"cm", /* SCALE_CM */
	"in", /* SCALE_IN */
	"pc", /* SCALE_PC */
	"pt", /* SCALE_PT */
	"em", /* SCALE_EM */
	"em", /* SCALE_MM */
	"ex", /* SCALE_EN */
	"ex", /* SCALE_BU */
	"em", /* SCALE_VS */
	"ex", /* SCALE_FS */
};
109 
/* Forward declarations of the helpers that are private to this file. */
static	void	*ml_alloc(char *outopts, enum htmltype type);
static	void	 print_attr(struct html *h, const char *key,
			const char *val);
static	void	 print_ctag(struct html *h, enum htmltag tag);
static	int	 print_encode(struct html *h, const char *p,
			int norecurse);
static	void	 print_metaf(struct html *h, enum mandoc_esc deco);
static	void	 bufncat(struct html *h, const char *p, size_t sz);
116 
117 static void *
118 ml_alloc(char *outopts, enum htmltype type)
119 {
120 	struct html	*h;
121 	const char	*toks[5];
122 	char		*v;
123 
124 	toks[0] = "style";
125 	toks[1] = "man";
126 	toks[2] = "includes";
127 	toks[3] = "fragment";
128 	toks[4] = NULL;
129 
130 	h = mandoc_calloc(1, sizeof(struct html));
131 
132 	h->type = type;
133 	h->tags.head = NULL;
134 	h->symtab = mchars_alloc();
135 
136 	while (outopts && *outopts)
137 		switch (getsubopt(&outopts, UNCONST(toks), &v)) {
138 		case (0):
139 			h->style = v;
140 			break;
141 		case (1):
142 			h->base_man = v;
143 			break;
144 		case (2):
145 			h->base_includes = v;
146 			break;
147 		case (3):
148 			h->oflags |= HTML_FRAGMENT;
149 			break;
150 		default:
151 			break;
152 		}
153 
154 	return(h);
155 }
156 
157 void *
158 html_alloc(char *outopts)
159 {
160 
161 	return(ml_alloc(outopts, HTML_HTML_4_01_STRICT));
162 }
163 
164 
165 void *
166 xhtml_alloc(char *outopts)
167 {
168 
169 	return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT));
170 }
171 
172 
173 void
174 html_free(void *p)
175 {
176 	struct tag	*tag;
177 	struct html	*h;
178 
179 	h = (struct html *)p;
180 
181 	while ((tag = h->tags.head) != NULL) {
182 		h->tags.head = tag->next;
183 		free(tag);
184 	}
185 
186 	if (h->symtab)
187 		mchars_free(h->symtab);
188 
189 	free(h);
190 }
191 
192 
193 void
194 print_gen_head(struct html *h)
195 {
196 	struct htmlpair	 tag[4];
197 
198 	tag[0].key = ATTR_HTTPEQUIV;
199 	tag[0].val = "Content-Type";
200 	tag[1].key = ATTR_CONTENT;
201 	tag[1].val = "text/html; charset=utf-8";
202 	print_otag(h, TAG_META, 2, tag);
203 
204 	tag[0].key = ATTR_NAME;
205 	tag[0].val = "resource-type";
206 	tag[1].key = ATTR_CONTENT;
207 	tag[1].val = "document";
208 	print_otag(h, TAG_META, 2, tag);
209 
210 	if (h->style) {
211 		tag[0].key = ATTR_REL;
212 		tag[0].val = "stylesheet";
213 		tag[1].key = ATTR_HREF;
214 		tag[1].val = h->style;
215 		tag[2].key = ATTR_TYPE;
216 		tag[2].val = "text/css";
217 		tag[3].key = ATTR_MEDIA;
218 		tag[3].val = "all";
219 		print_otag(h, TAG_LINK, 4, tag);
220 	}
221 }
222 
223 static void
224 print_metaf(struct html *h, enum mandoc_esc deco)
225 {
226 	enum htmlfont	 font;
227 
228 	switch (deco) {
229 	case (ESCAPE_FONTPREV):
230 		font = h->metal;
231 		break;
232 	case (ESCAPE_FONTITALIC):
233 		font = HTMLFONT_ITALIC;
234 		break;
235 	case (ESCAPE_FONTBOLD):
236 		font = HTMLFONT_BOLD;
237 		break;
238 	case (ESCAPE_FONT):
239 		/* FALLTHROUGH */
240 	case (ESCAPE_FONTROMAN):
241 		font = HTMLFONT_NONE;
242 		break;
243 	default:
244 		abort();
245 		/* NOTREACHED */
246 	}
247 
248 	if (h->metaf) {
249 		print_tagq(h, h->metaf);
250 		h->metaf = NULL;
251 	}
252 
253 	h->metal = h->metac;
254 	h->metac = font;
255 
256 	if (HTMLFONT_NONE != font)
257 		h->metaf = HTMLFONT_BOLD == font ?
258 			print_otag(h, TAG_B, 0, NULL) :
259 			print_otag(h, TAG_I, 0, NULL);
260 }
261 
262 int
263 html_strlen(const char *cp)
264 {
265 	int		 ssz, sz;
266 	const char	*seq, *p;
267 
268 	/*
269 	 * Account for escaped sequences within string length
270 	 * calculations.  This follows the logic in term_strlen() as we
271 	 * must calculate the width of produced strings.
272 	 * Assume that characters are always width of "1".  This is
273 	 * hacky, but it gets the job done for approximation of widths.
274 	 */
275 
276 	sz = 0;
277 	while (NULL != (p = strchr(cp, '\\'))) {
278 		sz += (int)(p - cp);
279 		++cp;
280 		switch (mandoc_escape(&cp, &seq, &ssz)) {
281 		case (ESCAPE_ERROR):
282 			return(sz);
283 		case (ESCAPE_UNICODE):
284 			/* FALLTHROUGH */
285 		case (ESCAPE_NUMBERED):
286 			/* FALLTHROUGH */
287 		case (ESCAPE_SPECIAL):
288 			sz++;
289 			break;
290 		default:
291 			break;
292 		}
293 	}
294 
295 	assert(sz >= 0);
296 	return(sz + strlen(cp));
297 }
298 
299 static int
300 print_encode(struct html *h, const char *p, int norecurse)
301 {
302 	size_t		 sz;
303 	int		 c, len, nospace;
304 	const char	*seq;
305 	enum mandoc_esc	 esc;
306 	static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' };
307 
308 	nospace = 0;
309 
310 	while ('\0' != *p) {
311 		sz = strcspn(p, rejs);
312 
313 		fwrite(p, 1, sz, stdout);
314 		p += (int)sz;
315 
316 		if ('\0' == *p)
317 			break;
318 
319 		switch (*p++) {
320 		case ('<'):
321 			printf("&lt;");
322 			continue;
323 		case ('>'):
324 			printf("&gt;");
325 			continue;
326 		case ('&'):
327 			printf("&amp;");
328 			continue;
329 		case (ASCII_HYPH):
330 			putchar('-');
331 			continue;
332 		default:
333 			break;
334 		}
335 
336 		esc = mandoc_escape(&p, &seq, &len);
337 		if (ESCAPE_ERROR == esc)
338 			break;
339 
340 		switch (esc) {
341 		case (ESCAPE_UNICODE):
342 			/* Skip passed "u" header. */
343 			c = mchars_num2uc(seq + 1, len - 1);
344 			if ('\0' != c)
345 				printf("&#x%x;", c);
346 			break;
347 		case (ESCAPE_NUMBERED):
348 			c = mchars_num2char(seq, len);
349 			if ('\0' != c)
350 				putchar(c);
351 			break;
352 		case (ESCAPE_SPECIAL):
353 			c = mchars_spec2cp(h->symtab, seq, len);
354 			if (c > 0)
355 				printf("&#%d;", c);
356 			else if (-1 == c && 1 == len)
357 				putchar((int)*seq);
358 			break;
359 		case (ESCAPE_FONT):
360 			/* FALLTHROUGH */
361 		case (ESCAPE_FONTPREV):
362 			/* FALLTHROUGH */
363 		case (ESCAPE_FONTBOLD):
364 			/* FALLTHROUGH */
365 		case (ESCAPE_FONTITALIC):
366 			/* FALLTHROUGH */
367 		case (ESCAPE_FONTROMAN):
368 			if (norecurse)
369 				break;
370 			print_metaf(h, esc);
371 			break;
372 		case (ESCAPE_NOSPACE):
373 			if ('\0' == *p)
374 				nospace = 1;
375 			break;
376 		default:
377 			break;
378 		}
379 	}
380 
381 	return(nospace);
382 }
383 
384 
/*
 * Print one attribute of an opening tag as
 *	<space>key="value"
 * The key is written as given; the value goes through print_encode()
 * with font escapes disabled, and that function's return value is of
 * no interest here.
 */
static void
print_attr(struct html *h, const char *key, const char *val)
{
	static const int	 quote = '"';

	printf(" %s=\"", key);
	(void)print_encode(h, val, 1);
	putchar(quote);
}
392 
393 
394 struct tag *
395 print_otag(struct html *h, enum htmltag tag,
396 		int sz, const struct htmlpair *p)
397 {
398 	int		 i;
399 	struct tag	*t;
400 
401 	/* Push this tags onto the stack of open scopes. */
402 
403 	if ( ! (HTML_NOSTACK & htmltags[tag].flags)) {
404 		t = mandoc_malloc(sizeof(struct tag));
405 		t->tag = tag;
406 		t->next = h->tags.head;
407 		h->tags.head = t;
408 	} else
409 		t = NULL;
410 
411 	if ( ! (HTML_NOSPACE & h->flags))
412 		if ( ! (HTML_CLRLINE & htmltags[tag].flags)) {
413 			/* Manage keeps! */
414 			if ( ! (HTML_KEEP & h->flags)) {
415 				if (HTML_PREKEEP & h->flags)
416 					h->flags |= HTML_KEEP;
417 				putchar(' ');
418 			} else
419 				printf("&#160;");
420 		}
421 
422 	if ( ! (h->flags & HTML_NONOSPACE))
423 		h->flags &= ~HTML_NOSPACE;
424 	else
425 		h->flags |= HTML_NOSPACE;
426 
427 	/* Print out the tag name and attributes. */
428 
429 	printf("<%s", htmltags[tag].name);
430 	for (i = 0; i < sz; i++)
431 		print_attr(h, htmlattrs[p[i].key], p[i].val);
432 
433 	/* Add non-overridable attributes. */
434 
435 	if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) {
436 		print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml");
437 		print_attr(h, "xml:lang", "en");
438 		print_attr(h, "lang", "en");
439 	}
440 
441 	/* Accommodate for XML "well-formed" singleton escaping. */
442 
443 	if (HTML_AUTOCLOSE & htmltags[tag].flags)
444 		switch (h->type) {
445 		case (HTML_XHTML_1_0_STRICT):
446 			putchar('/');
447 			break;
448 		default:
449 			break;
450 		}
451 
452 	putchar('>');
453 
454 	h->flags |= HTML_NOSPACE;
455 
456 	if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags)
457 		putchar('\n');
458 
459 	return(t);
460 }
461 
462 
463 static void
464 print_ctag(struct html *h, enum htmltag tag)
465 {
466 
467 	printf("</%s>", htmltags[tag].name);
468 	if (HTML_CLRLINE & htmltags[tag].flags) {
469 		h->flags |= HTML_NOSPACE;
470 		putchar('\n');
471 	}
472 }
473 
474 void
475 print_gen_decls(struct html *h)
476 {
477 	const char	*doctype;
478 	const char	*dtd;
479 	const char	*name;
480 
481 	switch (h->type) {
482 	case (HTML_HTML_4_01_STRICT):
483 		name = "HTML";
484 		doctype = "-//W3C//DTD HTML 4.01//EN";
485 		dtd = "http://www.w3.org/TR/html4/strict.dtd";
486 		break;
487 	default:
488 		puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
489 		name = "html";
490 		doctype = "-//W3C//DTD XHTML 1.0 Strict//EN";
491 		dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
492 		break;
493 	}
494 
495 	printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n",
496 			name, doctype, dtd);
497 }
498 
499 void
500 print_text(struct html *h, const char *word)
501 {
502 
503 	if ( ! (HTML_NOSPACE & h->flags)) {
504 		/* Manage keeps! */
505 		if ( ! (HTML_KEEP & h->flags)) {
506 			if (HTML_PREKEEP & h->flags)
507 				h->flags |= HTML_KEEP;
508 			putchar(' ');
509 		} else
510 			printf("&#160;");
511 	}
512 
513 	assert(NULL == h->metaf);
514 	if (HTMLFONT_NONE != h->metac)
515 		h->metaf = HTMLFONT_BOLD == h->metac ?
516 			print_otag(h, TAG_B, 0, NULL) :
517 			print_otag(h, TAG_I, 0, NULL);
518 
519 	assert(word);
520 	if ( ! print_encode(h, word, 0)) {
521 		if ( ! (h->flags & HTML_NONOSPACE))
522 			h->flags &= ~HTML_NOSPACE;
523 	} else
524 		h->flags |= HTML_NOSPACE;
525 
526 	if (h->metaf) {
527 		print_tagq(h, h->metaf);
528 		h->metaf = NULL;
529 	}
530 
531 	h->flags &= ~HTML_IGNDELIM;
532 }
533 
534 
535 void
536 print_tagq(struct html *h, const struct tag *until)
537 {
538 	struct tag	*tag;
539 
540 	while ((tag = h->tags.head) != NULL) {
541 		/*
542 		 * Remember to close out and nullify the current
543 		 * meta-font and table, if applicable.
544 		 */
545 		if (tag == h->metaf)
546 			h->metaf = NULL;
547 		if (tag == h->tblt)
548 			h->tblt = NULL;
549 		print_ctag(h, tag->tag);
550 		h->tags.head = tag->next;
551 		free(tag);
552 		if (until && tag == until)
553 			return;
554 	}
555 }
556 
557 
558 void
559 print_stagq(struct html *h, const struct tag *suntil)
560 {
561 	struct tag	*tag;
562 
563 	while ((tag = h->tags.head) != NULL) {
564 		if (suntil && tag == suntil)
565 			return;
566 		/*
567 		 * Remember to close out and nullify the current
568 		 * meta-font and table, if applicable.
569 		 */
570 		if (tag == h->metaf)
571 			h->metaf = NULL;
572 		if (tag == h->tblt)
573 			h->tblt = NULL;
574 		print_ctag(h, tag->tag);
575 		h->tags.head = tag->next;
576 		free(tag);
577 	}
578 }
579 
580 void
581 bufinit(struct html *h)
582 {
583 
584 	h->buf[0] = '\0';
585 	h->buflen = 0;
586 }
587 
/*
 * Append "key:val;" to the scratch buffer.
 */
void
bufcat_style(struct html *h, const char *key, const char *val)
{
	const char	*part[4];
	int		 i;

	part[0] = key;
	part[1] = ":";
	part[2] = val;
	part[3] = ";";

	for (i = 0; i < 4; i++)
		bufcat(h, part[i]);
}
597 
598 void
599 bufcat(struct html *h, const char *p)
600 {
601 
602 	h->buflen = strlcat(h->buf, p, BUFSIZ);
603 	assert(h->buflen < BUFSIZ);
604 }
605 
606 void
607 bufcat_fmt(struct html *h, const char *fmt, ...)
608 {
609 	va_list		 ap;
610 
611 	va_start(ap, fmt);
612 	(void)vsnprintf(h->buf + (int)h->buflen,
613 			BUFSIZ - h->buflen - 1, fmt, ap);
614 	va_end(ap);
615 	h->buflen = strlen(h->buf);
616 }
617 
618 static void
619 bufncat(struct html *h, const char *p, size_t sz)
620 {
621 
622 	assert(h->buflen + sz + 1 < BUFSIZ);
623 	strncat(h->buf, p, sz);
624 	h->buflen += sz;
625 }
626 
627 void
628 buffmt_includes(struct html *h, const char *name)
629 {
630 	const char	*p, *pp;
631 
632 	pp = h->base_includes;
633 
634 	bufinit(h);
635 	while (NULL != (p = strchr(pp, '%'))) {
636 		bufncat(h, pp, (size_t)(p - pp));
637 		switch (*(p + 1)) {
638 		case('I'):
639 			bufcat(h, name);
640 			break;
641 		default:
642 			bufncat(h, p, 2);
643 			break;
644 		}
645 		pp = p + 2;
646 	}
647 	if (pp)
648 		bufcat(h, pp);
649 }
650 
651 void
652 buffmt_man(struct html *h,
653 		const char *name, const char *sec)
654 {
655 	const char	*p, *pp;
656 
657 	pp = h->base_man;
658 
659 	bufinit(h);
660 	while (NULL != (p = strchr(pp, '%'))) {
661 		bufncat(h, pp, (size_t)(p - pp));
662 		switch (*(p + 1)) {
663 		case('S'):
664 			bufcat(h, sec ? sec : "1");
665 			break;
666 		case('N'):
667 			bufcat_fmt(h, name);
668 			break;
669 		default:
670 			bufncat(h, p, 2);
671 			break;
672 		}
673 		pp = p + 2;
674 	}
675 	if (pp)
676 		bufcat(h, pp);
677 }
678 
679 void
680 bufcat_su(struct html *h, const char *p, const struct roffsu *su)
681 {
682 	double		 v;
683 
684 	v = su->scale;
685 	if (SCALE_MM == su->unit && 0.0 == (v /= 100.0))
686 		v = 1.0;
687 
688 	bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]);
689 }
690 
/*
 * Append "src" to the scratch buffer as an identifier, writing each
 * byte as two lower-case hexadecimal digits.
 */
void
bufcat_id(struct html *h, const char *src)
{
	const unsigned char	*cp;

	/* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */

	/*
	 * Go through unsigned char: where plain char is signed, a byte
	 * of 0x80 or above would be sign-extended and come out as
	 * eight digits instead of two.
	 */
	for (cp = (const unsigned char *)src; '\0' != *cp; cp++)
		bufcat_fmt(h, "%.2x", (unsigned int)*cp);
}
700