xref: /openbsd/usr.bin/mandoc/html.c (revision 73471bf0)
1 /* $OpenBSD: html.c,v 1.146 2021/09/09 14:45:18 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011-2015, 2017-2021 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  *
18  * Common functions for mandoc(1) HTML formatters.
19  * For use by individual formatters and by the main program.
20  */
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 
24 #include <assert.h>
25 #include <ctype.h>
26 #include <stdarg.h>
27 #include <stddef.h>
28 #include <stdio.h>
29 #include <stdint.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <unistd.h>
33 
34 #include "mandoc_aux.h"
35 #include "mandoc_ohash.h"
36 #include "mandoc.h"
37 #include "roff.h"
38 #include "out.h"
39 #include "html.h"
40 #include "manconf.h"
41 #include "main.h"
42 
/*
 * Bits for the "flags" member of struct htmldata.
 */
#define	HTML_INPHRASE	 0x001  /* May occur in phrasing context. */
#define	HTML_TOPHRASE	 0x002  /* Sets up a phrasing context. */
#define	HTML_NOSTACK	 0x004  /* Has no end tag. */
#define	HTML_NLBEFORE	 0x008  /* Line break before the start tag. */
#define	HTML_NLBEGIN	 0x010  /* Line break after the start tag. */
#define	HTML_NLEND	 0x020  /* Line break before the end tag. */
#define	HTML_NLAFTER	 0x040  /* Line break after the end tag. */
#define	HTML_INDENT	 0x080  /* Content is indented by two spaces. */
#define	HTML_NOINDENT	 0x100  /* Content is never indented. */

/* Frequently used combinations of the line break bits. */
#define	HTML_NLAROUND	 (HTML_NLBEFORE | HTML_NLAFTER)
#define	HTML_NLINSIDE	 (HTML_NLBEGIN | HTML_NLEND)
#define	HTML_NLALL	 (HTML_NLAROUND | HTML_NLINSIDE)

/*
 * Static description of one HTML element type.
 */
struct	htmldata {
	const char	 *name;   /* Element name as written in tags. */
	int		  flags;  /* Bitwise OR of the bits above. */
};
59 
60 static	const struct htmldata htmltags[TAG_MAX] = {
61 	{"html",	HTML_NLALL},
62 	{"head",	HTML_NLALL | HTML_INDENT},
63 	{"meta",	HTML_NOSTACK | HTML_NLALL},
64 	{"link",	HTML_NOSTACK | HTML_NLALL},
65 	{"style",	HTML_NLALL | HTML_INDENT},
66 	{"title",	HTML_NLAROUND},
67 	{"body",	HTML_NLALL},
68 	{"div",		HTML_NLAROUND},
69 	{"section",	HTML_NLALL},
70 	{"table",	HTML_NLALL | HTML_INDENT},
71 	{"tr",		HTML_NLALL | HTML_INDENT},
72 	{"td",		HTML_NLAROUND},
73 	{"li",		HTML_NLAROUND | HTML_INDENT},
74 	{"ul",		HTML_NLALL | HTML_INDENT},
75 	{"ol",		HTML_NLALL | HTML_INDENT},
76 	{"dl",		HTML_NLALL | HTML_INDENT},
77 	{"dt",		HTML_NLAROUND},
78 	{"dd",		HTML_NLAROUND | HTML_INDENT},
79 	{"h1",		HTML_TOPHRASE | HTML_NLAROUND},
80 	{"h2",		HTML_TOPHRASE | HTML_NLAROUND},
81 	{"p",		HTML_TOPHRASE | HTML_NLAROUND | HTML_INDENT},
82 	{"pre",		HTML_TOPHRASE | HTML_NLAROUND | HTML_NOINDENT},
83 	{"a",		HTML_INPHRASE | HTML_TOPHRASE},
84 	{"b",		HTML_INPHRASE | HTML_TOPHRASE},
85 	{"cite",	HTML_INPHRASE | HTML_TOPHRASE},
86 	{"code",	HTML_INPHRASE | HTML_TOPHRASE},
87 	{"i",		HTML_INPHRASE | HTML_TOPHRASE},
88 	{"small",	HTML_INPHRASE | HTML_TOPHRASE},
89 	{"span",	HTML_INPHRASE | HTML_TOPHRASE},
90 	{"var",		HTML_INPHRASE | HTML_TOPHRASE},
91 	{"br",		HTML_INPHRASE | HTML_NOSTACK | HTML_NLALL},
92 	{"hr",		HTML_INPHRASE | HTML_NOSTACK},
93 	{"mark",	HTML_INPHRASE },
94 	{"math",	HTML_INPHRASE | HTML_NLALL | HTML_INDENT},
95 	{"mrow",	0},
96 	{"mi",		0},
97 	{"mn",		0},
98 	{"mo",		0},
99 	{"msup",	0},
100 	{"msub",	0},
101 	{"msubsup",	0},
102 	{"mfrac",	0},
103 	{"msqrt",	0},
104 	{"mfenced",	0},
105 	{"mtable",	0},
106 	{"mtr",		0},
107 	{"mtd",		0},
108 	{"munderover",	0},
109 	{"munder",	0},
110 	{"mover",	0},
111 };
112 
113 /* Avoid duplicate HTML id= attributes. */
114 
115 struct	id_entry {
116 	int	 ord;	/* Ordinal number of the latest occurrence. */
117 	char	 id[];	/* The id= attribute without any ordinal suffix. */
118 };
119 static	struct ohash	 id_unique;
120 
/* Forward declarations of the file-local functions. */
static	void	 html_reset_internal(struct html *);
static	void	 print_byte(struct html *, char);
static	void	 print_ctag(struct html *, struct tag *);
static	int	 print_encode(struct html *, const char *, const char *, int);
static	void	 print_endword(struct html *);
static	int	 print_escape(struct html *, char);
static	void	 print_href(struct html *, const char *, const char *, int);
static	void	 print_indent(struct html *);
static	void	 print_metaf(struct html *);
static	void	 print_word(struct html *, const char *);
132 
133 
134 void *
135 html_alloc(const struct manoutput *outopts)
136 {
137 	struct html	*h;
138 
139 	h = mandoc_calloc(1, sizeof(struct html));
140 
141 	h->tag = NULL;
142 	h->metac = h->metal = ESCAPE_FONTROMAN;
143 	h->style = outopts->style;
144 	if ((h->base_man1 = outopts->man) == NULL)
145 		h->base_man2 = NULL;
146 	else if ((h->base_man2 = strchr(h->base_man1, ';')) != NULL)
147 		*h->base_man2++ = '\0';
148 	h->base_includes = outopts->includes;
149 	if (outopts->fragment)
150 		h->oflags |= HTML_FRAGMENT;
151 	if (outopts->toc)
152 		h->oflags |= HTML_TOC;
153 
154 	mandoc_ohash_init(&id_unique, 4, offsetof(struct id_entry, id));
155 
156 	return h;
157 }
158 
159 static void
160 html_reset_internal(struct html *h)
161 {
162 	struct tag	*tag;
163 	struct id_entry	*entry;
164 	unsigned int	 slot;
165 
166 	while ((tag = h->tag) != NULL) {
167 		h->tag = tag->next;
168 		free(tag);
169 	}
170 	entry = ohash_first(&id_unique, &slot);
171 	while (entry != NULL) {
172 		free(entry);
173 		entry = ohash_next(&id_unique, &slot);
174 	}
175 	ohash_delete(&id_unique);
176 }
177 
178 void
179 html_reset(void *p)
180 {
181 	html_reset_internal(p);
182 	mandoc_ohash_init(&id_unique, 4, offsetof(struct id_entry, id));
183 }
184 
/*
 * Discard the per-document state, then the formatter object itself.
 */
void
html_free(void *p)
{
	struct html	*h = p;

	html_reset_internal(h);
	free(h);
}
191 
192 void
193 print_gen_head(struct html *h)
194 {
195 	struct tag	*t;
196 
197 	print_otag(h, TAG_META, "?", "charset", "utf-8");
198 	print_otag(h, TAG_META, "??", "name", "viewport",
199 	    "content", "width=device-width, initial-scale=1.0");
200 	if (h->style != NULL) {
201 		print_otag(h, TAG_LINK, "?h??", "rel", "stylesheet",
202 		    h->style, "type", "text/css", "media", "all");
203 		return;
204 	}
205 
206 	/*
207 	 * Print a minimal embedded style sheet.
208 	 */
209 
210 	t = print_otag(h, TAG_STYLE, "");
211 	print_text(h, "table.head, table.foot { width: 100%; }");
212 	print_endline(h);
213 	print_text(h, "td.head-rtitle, td.foot-os { text-align: right; }");
214 	print_endline(h);
215 	print_text(h, "td.head-vol { text-align: center; }");
216 	print_endline(h);
217 	print_text(h, ".Nd, .Bf, .Op { display: inline; }");
218 	print_endline(h);
219 	print_text(h, ".Pa, .Ad { font-style: italic; }");
220 	print_endline(h);
221 	print_text(h, ".Ms { font-weight: bold; }");
222 	print_endline(h);
223 	print_text(h, ".Bl-diag ");
224 	print_byte(h, '>');
225 	print_text(h, " dt { font-weight: bold; }");
226 	print_endline(h);
227 	print_text(h, "code.Nm, .Fl, .Cm, .Ic, code.In, .Fd, .Fn, .Cd "
228 	    "{ font-weight: bold; font-family: inherit; }");
229 	print_tagq(h, t);
230 }
231 
232 int
233 html_setfont(struct html *h, enum mandoc_esc font)
234 {
235 	switch (font) {
236 	case ESCAPE_FONTPREV:
237 		font = h->metal;
238 		break;
239 	case ESCAPE_FONTITALIC:
240 	case ESCAPE_FONTBOLD:
241 	case ESCAPE_FONTBI:
242 	case ESCAPE_FONTROMAN:
243 	case ESCAPE_FONTCR:
244 	case ESCAPE_FONTCB:
245 	case ESCAPE_FONTCI:
246 		break;
247 	case ESCAPE_FONT:
248 		font = ESCAPE_FONTROMAN;
249 		break;
250 	default:
251 		return 0;
252 	}
253 	h->metal = h->metac;
254 	h->metac = font;
255 	return 1;
256 }
257 
258 static void
259 print_metaf(struct html *h)
260 {
261 	if (h->metaf) {
262 		print_tagq(h, h->metaf);
263 		h->metaf = NULL;
264 	}
265 	switch (h->metac) {
266 	case ESCAPE_FONTITALIC:
267 		h->metaf = print_otag(h, TAG_I, "");
268 		break;
269 	case ESCAPE_FONTBOLD:
270 		h->metaf = print_otag(h, TAG_B, "");
271 		break;
272 	case ESCAPE_FONTBI:
273 		h->metaf = print_otag(h, TAG_B, "");
274 		print_otag(h, TAG_I, "");
275 		break;
276 	case ESCAPE_FONTCR:
277 		h->metaf = print_otag(h, TAG_SPAN, "c", "Li");
278 		break;
279 	case ESCAPE_FONTCB:
280 		h->metaf = print_otag(h, TAG_SPAN, "c", "Li");
281 		print_otag(h, TAG_B, "");
282 		break;
283 	case ESCAPE_FONTCI:
284 		h->metaf = print_otag(h, TAG_SPAN, "c", "Li");
285 		print_otag(h, TAG_I, "");
286 		break;
287 	default:
288 		break;
289 	}
290 }
291 
292 void
293 html_close_paragraph(struct html *h)
294 {
295 	struct tag	*this, *next;
296 	int		 flags;
297 
298 	this = h->tag;
299 	for (;;) {
300 		next = this->next;
301 		flags = htmltags[this->tag].flags;
302 		if (flags & (HTML_INPHRASE | HTML_TOPHRASE))
303 			print_ctag(h, this);
304 		if ((flags & HTML_INPHRASE) == 0)
305 			break;
306 		this = next;
307 	}
308 }
309 
310 /*
311  * ROFF_nf switches to no-fill mode, ROFF_fi to fill mode.
312  * TOKEN_NONE does not switch.  The old mode is returned.
313  */
314 enum roff_tok
315 html_fillmode(struct html *h, enum roff_tok want)
316 {
317 	struct tag	*t;
318 	enum roff_tok	 had;
319 
320 	for (t = h->tag; t != NULL; t = t->next)
321 		if (t->tag == TAG_PRE)
322 			break;
323 
324 	had = t == NULL ? ROFF_fi : ROFF_nf;
325 
326 	if (want != had) {
327 		switch (want) {
328 		case ROFF_fi:
329 			print_tagq(h, t);
330 			break;
331 		case ROFF_nf:
332 			html_close_paragraph(h);
333 			print_otag(h, TAG_PRE, "");
334 			break;
335 		case TOKEN_NONE:
336 			break;
337 		default:
338 			abort();
339 		}
340 	}
341 	return had;
342 }
343 
344 /*
345  * Allocate a string to be used for the "id=" attribute of an HTML
346  * element and/or as a segment identifier for a URI in an <a> element.
347  * The function may fail and return NULL if the node lacks text data
348  * to create the attribute from.
349  * The caller is responsible for free(3)ing the returned string.
350  *
351  * If the "unique" argument is non-zero, the "id_unique" ohash table
352  * is used for de-duplication.  If the "unique" argument is 1,
353  * it is the first time the function is called for this tag and
354  * location, so if an ordinal suffix is needed, it is incremented.
355  * If the "unique" argument is 2, it is the second time the function
356  * is called for this tag and location, so the ordinal suffix
357  * remains unchanged.
358  */
359 char *
360 html_make_id(const struct roff_node *n, int unique)
361 {
362 	const struct roff_node	*nch;
363 	struct id_entry		*entry;
364 	char			*buf, *cp;
365 	size_t			 len;
366 	unsigned int		 slot;
367 
368 	if (n->tag != NULL)
369 		buf = mandoc_strdup(n->tag);
370 	else {
371 		switch (n->tok) {
372 		case MDOC_Sh:
373 		case MDOC_Ss:
374 		case MDOC_Sx:
375 		case MAN_SH:
376 		case MAN_SS:
377 			for (nch = n->child; nch != NULL; nch = nch->next)
378 				if (nch->type != ROFFT_TEXT)
379 					return NULL;
380 			buf = NULL;
381 			deroff(&buf, n);
382 			if (buf == NULL)
383 				return NULL;
384 			break;
385 		default:
386 			if (n->child == NULL || n->child->type != ROFFT_TEXT)
387 				return NULL;
388 			buf = mandoc_strdup(n->child->string);
389 			break;
390 		}
391 	}
392 
393 	/*
394 	 * In ID attributes, only use ASCII characters that are
395 	 * permitted in URL-fragment strings according to the
396 	 * explicit list at:
397 	 * https://url.spec.whatwg.org/#url-fragment-string
398 	 * In addition, reserve '~' for ordinal suffixes.
399 	 */
400 
401 	for (cp = buf; *cp != '\0'; cp++)
402 		if (isalnum((unsigned char)*cp) == 0 &&
403 		    strchr("!$&'()*+,-./:;=?@_", *cp) == NULL)
404 			*cp = '_';
405 
406 	if (unique == 0)
407 		return buf;
408 
409 	/* Avoid duplicate HTML id= attributes. */
410 
411 	slot = ohash_qlookup(&id_unique, buf);
412 	if ((entry = ohash_find(&id_unique, slot)) == NULL) {
413 		len = strlen(buf) + 1;
414 		entry = mandoc_malloc(sizeof(*entry) + len);
415 		entry->ord = 1;
416 		memcpy(entry->id, buf, len);
417 		ohash_insert(&id_unique, slot, entry);
418 	} else if (unique == 1)
419 		entry->ord++;
420 
421 	if (entry->ord > 1) {
422 		cp = buf;
423 		mandoc_asprintf(&buf, "%s~%d", cp, entry->ord);
424 		free(cp);
425 	}
426 	return buf;
427 }
428 
429 static int
430 print_escape(struct html *h, char c)
431 {
432 
433 	switch (c) {
434 	case '<':
435 		print_word(h, "&lt;");
436 		break;
437 	case '>':
438 		print_word(h, "&gt;");
439 		break;
440 	case '&':
441 		print_word(h, "&amp;");
442 		break;
443 	case '"':
444 		print_word(h, "&quot;");
445 		break;
446 	case ASCII_NBRSP:
447 		print_word(h, "&nbsp;");
448 		break;
449 	case ASCII_HYPH:
450 		print_byte(h, '-');
451 		break;
452 	case ASCII_BREAK:
453 		break;
454 	default:
455 		return 0;
456 	}
457 	return 1;
458 }
459 
460 static int
461 print_encode(struct html *h, const char *p, const char *pend, int norecurse)
462 {
463 	char		 numbuf[16];
464 	const char	*seq;
465 	size_t		 sz;
466 	int		 c, len, breakline, nospace;
467 	enum mandoc_esc	 esc;
468 	static const char rejs[10] = { ' ', '\\', '<', '>', '&', '"',
469 		ASCII_NBRSP, ASCII_HYPH, ASCII_BREAK, '\0' };
470 
471 	if (pend == NULL)
472 		pend = strchr(p, '\0');
473 
474 	breakline = 0;
475 	nospace = 0;
476 
477 	while (p < pend) {
478 		if (HTML_SKIPCHAR & h->flags && '\\' != *p) {
479 			h->flags &= ~HTML_SKIPCHAR;
480 			p++;
481 			continue;
482 		}
483 
484 		for (sz = strcspn(p, rejs); sz-- && p < pend; p++)
485 			print_byte(h, *p);
486 
487 		if (breakline &&
488 		    (p >= pend || *p == ' ' || *p == ASCII_NBRSP)) {
489 			print_otag(h, TAG_BR, "");
490 			breakline = 0;
491 			while (p < pend && (*p == ' ' || *p == ASCII_NBRSP))
492 				p++;
493 			continue;
494 		}
495 
496 		if (p >= pend)
497 			break;
498 
499 		if (*p == ' ') {
500 			print_endword(h);
501 			p++;
502 			continue;
503 		}
504 
505 		if (print_escape(h, *p++))
506 			continue;
507 
508 		esc = mandoc_escape(&p, &seq, &len);
509 		switch (esc) {
510 		case ESCAPE_FONT:
511 		case ESCAPE_FONTPREV:
512 		case ESCAPE_FONTBOLD:
513 		case ESCAPE_FONTITALIC:
514 		case ESCAPE_FONTBI:
515 		case ESCAPE_FONTROMAN:
516 		case ESCAPE_FONTCR:
517 		case ESCAPE_FONTCB:
518 		case ESCAPE_FONTCI:
519 			if (0 == norecurse) {
520 				h->flags |= HTML_NOSPACE;
521 				if (html_setfont(h, esc))
522 					print_metaf(h);
523 				h->flags &= ~HTML_NOSPACE;
524 			}
525 			continue;
526 		case ESCAPE_SKIPCHAR:
527 			h->flags |= HTML_SKIPCHAR;
528 			continue;
529 		case ESCAPE_ERROR:
530 			continue;
531 		default:
532 			break;
533 		}
534 
535 		if (h->flags & HTML_SKIPCHAR) {
536 			h->flags &= ~HTML_SKIPCHAR;
537 			continue;
538 		}
539 
540 		switch (esc) {
541 		case ESCAPE_UNICODE:
542 			/* Skip past "u" header. */
543 			c = mchars_num2uc(seq + 1, len - 1);
544 			break;
545 		case ESCAPE_NUMBERED:
546 			c = mchars_num2char(seq, len);
547 			if (c < 0)
548 				continue;
549 			break;
550 		case ESCAPE_SPECIAL:
551 			c = mchars_spec2cp(seq, len);
552 			if (c <= 0)
553 				continue;
554 			break;
555 		case ESCAPE_UNDEF:
556 			c = *seq;
557 			break;
558 		case ESCAPE_DEVICE:
559 			print_word(h, "html");
560 			continue;
561 		case ESCAPE_BREAK:
562 			breakline = 1;
563 			continue;
564 		case ESCAPE_NOSPACE:
565 			if ('\0' == *p)
566 				nospace = 1;
567 			continue;
568 		case ESCAPE_OVERSTRIKE:
569 			if (len == 0)
570 				continue;
571 			c = seq[len - 1];
572 			break;
573 		default:
574 			continue;
575 		}
576 		if ((c < 0x20 && c != 0x09) ||
577 		    (c > 0x7E && c < 0xA0))
578 			c = 0xFFFD;
579 		if (c > 0x7E) {
580 			(void)snprintf(numbuf, sizeof(numbuf), "&#x%.4X;", c);
581 			print_word(h, numbuf);
582 		} else if (print_escape(h, c) == 0)
583 			print_byte(h, c);
584 	}
585 
586 	return nospace;
587 }
588 
589 static void
590 print_href(struct html *h, const char *name, const char *sec, int man)
591 {
592 	struct stat	 sb;
593 	const char	*p, *pp;
594 	char		*filename;
595 
596 	if (man) {
597 		pp = h->base_man1;
598 		if (h->base_man2 != NULL) {
599 			mandoc_asprintf(&filename, "%s.%s", name, sec);
600 			if (stat(filename, &sb) == -1)
601 				pp = h->base_man2;
602 			free(filename);
603 		}
604 	} else
605 		pp = h->base_includes;
606 
607 	while ((p = strchr(pp, '%')) != NULL) {
608 		print_encode(h, pp, p, 1);
609 		if (man && p[1] == 'S') {
610 			if (sec == NULL)
611 				print_byte(h, '1');
612 			else
613 				print_encode(h, sec, NULL, 1);
614 		} else if ((man && p[1] == 'N') ||
615 		    (man == 0 && p[1] == 'I'))
616 			print_encode(h, name, NULL, 1);
617 		else
618 			print_encode(h, p, p + 2, 1);
619 		pp = p + 2;
620 	}
621 	if (*pp != '\0')
622 		print_encode(h, pp, NULL, 1);
623 }
624 
625 struct tag *
626 print_otag(struct html *h, enum htmltag tag, const char *fmt, ...)
627 {
628 	va_list		 ap;
629 	struct tag	*t;
630 	const char	*attr;
631 	char		*arg1, *arg2;
632 	int		 style_written, tflags;
633 
634 	tflags = htmltags[tag].flags;
635 
636 	/* Flow content is not allowed in phrasing context. */
637 
638 	if ((tflags & HTML_INPHRASE) == 0) {
639 		for (t = h->tag; t != NULL; t = t->next) {
640 			if (t->closed)
641 				continue;
642 			assert((htmltags[t->tag].flags & HTML_TOPHRASE) == 0);
643 			break;
644 		}
645 
646 	/*
647 	 * Always wrap phrasing elements in a paragraph
648 	 * unless already contained in some flow container;
649 	 * never put them directly into a section.
650 	 */
651 
652 	} else if (tflags & HTML_TOPHRASE && h->tag->tag == TAG_SECTION)
653 		print_otag(h, TAG_P, "c", "Pp");
654 
655 	/* Push this tag onto the stack of open scopes. */
656 
657 	if ((tflags & HTML_NOSTACK) == 0) {
658 		t = mandoc_malloc(sizeof(struct tag));
659 		t->tag = tag;
660 		t->next = h->tag;
661 		t->refcnt = 0;
662 		t->closed = 0;
663 		h->tag = t;
664 	} else
665 		t = NULL;
666 
667 	if (tflags & HTML_NLBEFORE)
668 		print_endline(h);
669 	if (h->col == 0)
670 		print_indent(h);
671 	else if ((h->flags & HTML_NOSPACE) == 0) {
672 		if (h->flags & HTML_KEEP)
673 			print_word(h, "&#x00A0;");
674 		else {
675 			if (h->flags & HTML_PREKEEP)
676 				h->flags |= HTML_KEEP;
677 			print_endword(h);
678 		}
679 	}
680 
681 	if ( ! (h->flags & HTML_NONOSPACE))
682 		h->flags &= ~HTML_NOSPACE;
683 	else
684 		h->flags |= HTML_NOSPACE;
685 
686 	/* Print out the tag name and attributes. */
687 
688 	print_byte(h, '<');
689 	print_word(h, htmltags[tag].name);
690 
691 	va_start(ap, fmt);
692 
693 	while (*fmt != '\0' && *fmt != 's') {
694 
695 		/* Parse attributes and arguments. */
696 
697 		arg1 = va_arg(ap, char *);
698 		arg2 = NULL;
699 		switch (*fmt++) {
700 		case 'c':
701 			attr = "class";
702 			break;
703 		case 'h':
704 			attr = "href";
705 			break;
706 		case 'i':
707 			attr = "id";
708 			break;
709 		case '?':
710 			attr = arg1;
711 			arg1 = va_arg(ap, char *);
712 			break;
713 		default:
714 			abort();
715 		}
716 		if (*fmt == 'M')
717 			arg2 = va_arg(ap, char *);
718 		if (arg1 == NULL)
719 			continue;
720 
721 		/* Print the attributes. */
722 
723 		print_byte(h, ' ');
724 		print_word(h, attr);
725 		print_byte(h, '=');
726 		print_byte(h, '"');
727 		switch (*fmt) {
728 		case 'I':
729 			print_href(h, arg1, NULL, 0);
730 			fmt++;
731 			break;
732 		case 'M':
733 			print_href(h, arg1, arg2, 1);
734 			fmt++;
735 			break;
736 		case 'R':
737 			print_byte(h, '#');
738 			print_encode(h, arg1, NULL, 1);
739 			fmt++;
740 			break;
741 		default:
742 			print_encode(h, arg1, NULL, 1);
743 			break;
744 		}
745 		print_byte(h, '"');
746 	}
747 
748 	style_written = 0;
749 	while (*fmt++ == 's') {
750 		arg1 = va_arg(ap, char *);
751 		arg2 = va_arg(ap, char *);
752 		if (arg2 == NULL)
753 			continue;
754 		print_byte(h, ' ');
755 		if (style_written == 0) {
756 			print_word(h, "style=\"");
757 			style_written = 1;
758 		}
759 		print_word(h, arg1);
760 		print_byte(h, ':');
761 		print_byte(h, ' ');
762 		print_word(h, arg2);
763 		print_byte(h, ';');
764 	}
765 	if (style_written)
766 		print_byte(h, '"');
767 
768 	va_end(ap);
769 
770 	/* Accommodate for "well-formed" singleton escaping. */
771 
772 	if (htmltags[tag].flags & HTML_NOSTACK)
773 		print_byte(h, '/');
774 
775 	print_byte(h, '>');
776 
777 	if (tflags & HTML_NLBEGIN)
778 		print_endline(h);
779 	else
780 		h->flags |= HTML_NOSPACE;
781 
782 	if (tflags & HTML_INDENT)
783 		h->indent++;
784 	if (tflags & HTML_NOINDENT)
785 		h->noindent++;
786 
787 	return t;
788 }
789 
790 /*
791  * Print an element with an optional "id=" attribute.
792  * If the element has phrasing content and an "id=" attribute,
793  * also add a permalink: outside if it can be in phrasing context,
794  * inside otherwise.
795  */
796 struct tag *
797 print_otag_id(struct html *h, enum htmltag elemtype, const char *cattr,
798     struct roff_node *n)
799 {
800 	struct roff_node *nch;
801 	struct tag	*ret, *t;
802 	char		*id, *href;
803 
804 	ret = NULL;
805 	id = href = NULL;
806 	if (n->flags & NODE_ID)
807 		id = html_make_id(n, 1);
808 	if (n->flags & NODE_HREF)
809 		href = id == NULL ? html_make_id(n, 2) : id;
810 	if (href != NULL && htmltags[elemtype].flags & HTML_INPHRASE)
811 		ret = print_otag(h, TAG_A, "chR", "permalink", href);
812 	t = print_otag(h, elemtype, "ci", cattr, id);
813 	if (ret == NULL) {
814 		ret = t;
815 		if (href != NULL && (nch = n->child) != NULL) {
816 			/* man(7) is safe, it tags phrasing content only. */
817 			if (n->tok > MDOC_MAX ||
818 			    htmltags[elemtype].flags & HTML_TOPHRASE)
819 				nch = NULL;
820 			else  /* For mdoc(7), beware of nested blocks. */
821 				while (nch != NULL && nch->type == ROFFT_TEXT)
822 					nch = nch->next;
823 			if (nch == NULL)
824 				print_otag(h, TAG_A, "chR", "permalink", href);
825 		}
826 	}
827 	free(id);
828 	if (id == NULL)
829 		free(href);
830 	return ret;
831 }
832 
833 static void
834 print_ctag(struct html *h, struct tag *tag)
835 {
836 	int	 tflags;
837 
838 	if (tag->closed == 0) {
839 		tag->closed = 1;
840 		if (tag == h->metaf)
841 			h->metaf = NULL;
842 		if (tag == h->tblt)
843 			h->tblt = NULL;
844 
845 		tflags = htmltags[tag->tag].flags;
846 		if (tflags & HTML_INDENT)
847 			h->indent--;
848 		if (tflags & HTML_NOINDENT)
849 			h->noindent--;
850 		if (tflags & HTML_NLEND)
851 			print_endline(h);
852 		print_indent(h);
853 		print_byte(h, '<');
854 		print_byte(h, '/');
855 		print_word(h, htmltags[tag->tag].name);
856 		print_byte(h, '>');
857 		if (tflags & HTML_NLAFTER)
858 			print_endline(h);
859 	}
860 	if (tag->refcnt == 0) {
861 		h->tag = tag->next;
862 		free(tag);
863 	}
864 }
865 
/*
 * Print the document type declaration on a line of its own.
 */
void
print_gen_decls(struct html *h)
{
	static const char	 doctype[] = "<!DOCTYPE html>";

	print_word(h, doctype);
	print_endline(h);
}
872 
873 void
874 print_gen_comment(struct html *h, struct roff_node *n)
875 {
876 	int	 wantblank;
877 
878 	print_word(h, "<!-- This is an automatically generated file."
879 	    "  Do not edit.");
880 	h->indent = 1;
881 	wantblank = 0;
882 	while (n != NULL && n->type == ROFFT_COMMENT) {
883 		if (strstr(n->string, "-->") == NULL &&
884 		    (wantblank || *n->string != '\0')) {
885 			print_endline(h);
886 			print_indent(h);
887 			print_word(h, n->string);
888 			wantblank = *n->string != '\0';
889 		}
890 		n = n->next;
891 	}
892 	if (wantblank)
893 		print_endline(h);
894 	print_word(h, " -->");
895 	print_endline(h);
896 	h->indent = 0;
897 }
898 
/*
 * Print text that is not associated with any node,
 * such that no permalink is written for it.
 */
void
print_text(struct html *h, const char *word)
{
	struct roff_node *const	 no_node = NULL;

	print_tagged_text(h, word, no_node);
}
904 
905 void
906 print_tagged_text(struct html *h, const char *word, struct roff_node *n)
907 {
908 	struct tag	*t;
909 	char		*href;
910 
911 	/*
912 	 * Always wrap text in a paragraph unless already contained in
913 	 * some flow container; never put it directly into a section.
914 	 */
915 
916 	if (h->tag->tag == TAG_SECTION)
917 		print_otag(h, TAG_P, "c", "Pp");
918 
919 	/* Output whitespace before this text? */
920 
921 	if (h->col && (h->flags & HTML_NOSPACE) == 0) {
922 		if ( ! (HTML_KEEP & h->flags)) {
923 			if (HTML_PREKEEP & h->flags)
924 				h->flags |= HTML_KEEP;
925 			print_endword(h);
926 		} else
927 			print_word(h, "&#x00A0;");
928 	}
929 
930 	/*
931 	 * Optionally switch fonts, optionally write a permalink, then
932 	 * print the text, optionally surrounded by HTML whitespace.
933 	 */
934 
935 	assert(h->metaf == NULL);
936 	print_metaf(h);
937 	print_indent(h);
938 
939 	if (n != NULL && (href = html_make_id(n, 2)) != NULL) {
940 		t = print_otag(h, TAG_A, "chR", "permalink", href);
941 		free(href);
942 	} else
943 		t = NULL;
944 
945 	if ( ! print_encode(h, word, NULL, 0)) {
946 		if ( ! (h->flags & HTML_NONOSPACE))
947 			h->flags &= ~HTML_NOSPACE;
948 		h->flags &= ~HTML_NONEWLINE;
949 	} else
950 		h->flags |= HTML_NOSPACE | HTML_NONEWLINE;
951 
952 	if (h->metaf != NULL) {
953 		print_tagq(h, h->metaf);
954 		h->metaf = NULL;
955 	} else if (t != NULL)
956 		print_tagq(h, t);
957 
958 	h->flags &= ~HTML_IGNDELIM;
959 }
960 
961 void
962 print_tagq(struct html *h, const struct tag *until)
963 {
964 	struct tag	*this, *next;
965 
966 	for (this = h->tag; this != NULL; this = next) {
967 		next = this == until ? NULL : this->next;
968 		print_ctag(h, this);
969 	}
970 }
971 
972 /*
973  * Close out all open elements up to but excluding suntil.
974  * Note that a paragraph just inside stays open together with it
975  * because paragraphs include subsequent phrasing content.
976  */
977 void
978 print_stagq(struct html *h, const struct tag *suntil)
979 {
980 	struct tag	*this, *next;
981 
982 	for (this = h->tag; this != NULL; this = next) {
983 		next = this->next;
984 		if (this == suntil || (next == suntil &&
985 		    (this->tag == TAG_P || this->tag == TAG_PRE)))
986 			break;
987 		print_ctag(h, this);
988 	}
989 }
990 
991 
992 /***********************************************************************
993  * Low level output functions.
994  * They implement line breaking using a short static buffer.
995  ***********************************************************************/
996 
997 /*
998  * Buffer one HTML output byte.
999  * If the buffer is full, flush and deactivate it and start a new line.
1000  * If the buffer is inactive, print directly.
1001  */
1002 static void
1003 print_byte(struct html *h, char c)
1004 {
1005 	if ((h->flags & HTML_BUFFER) == 0) {
1006 		putchar(c);
1007 		h->col++;
1008 		return;
1009 	}
1010 
1011 	if (h->col + h->bufcol < sizeof(h->buf)) {
1012 		h->buf[h->bufcol++] = c;
1013 		return;
1014 	}
1015 
1016 	putchar('\n');
1017 	h->col = 0;
1018 	print_indent(h);
1019 	putchar(' ');
1020 	putchar(' ');
1021 	fwrite(h->buf, h->bufcol, 1, stdout);
1022 	putchar(c);
1023 	h->col = (h->indent + 1) * 2 + h->bufcol + 1;
1024 	h->bufcol = 0;
1025 	h->flags &= ~HTML_BUFFER;
1026 }
1027 
1028 /*
1029  * If something was printed on the current output line, end it.
1030  * Not to be called right after print_indent().
1031  */
1032 void
1033 print_endline(struct html *h)
1034 {
1035 	if (h->col == 0)
1036 		return;
1037 
1038 	if (h->bufcol) {
1039 		putchar(' ');
1040 		fwrite(h->buf, h->bufcol, 1, stdout);
1041 		h->bufcol = 0;
1042 	}
1043 	putchar('\n');
1044 	h->col = 0;
1045 	h->flags |= HTML_NOSPACE;
1046 	h->flags &= ~HTML_BUFFER;
1047 }
1048 
1049 /*
1050  * Flush the HTML output buffer.
1051  * If it is inactive, activate it.
1052  */
1053 static void
1054 print_endword(struct html *h)
1055 {
1056 	if (h->noindent) {
1057 		print_byte(h, ' ');
1058 		return;
1059 	}
1060 
1061 	if ((h->flags & HTML_BUFFER) == 0) {
1062 		h->col++;
1063 		h->flags |= HTML_BUFFER;
1064 	} else if (h->bufcol) {
1065 		putchar(' ');
1066 		fwrite(h->buf, h->bufcol, 1, stdout);
1067 		h->col += h->bufcol + 1;
1068 	}
1069 	h->bufcol = 0;
1070 }
1071 
1072 /*
1073  * If at the beginning of a new output line,
1074  * perform indentation and mark the line as containing output.
1075  * Make sure to really produce some output right afterwards,
1076  * but do not use print_otag() for producing it.
1077  */
1078 static void
1079 print_indent(struct html *h)
1080 {
1081 	size_t	 i;
1082 
1083 	if (h->col || h->noindent)
1084 		return;
1085 
1086 	h->col = h->indent * 2;
1087 	for (i = 0; i < h->col; i++)
1088 		putchar(' ');
1089 }
1090 
/*
 * Pass a NUL-terminated string to print_byte() one byte at a
 * time, such that it is printed or buffered depending on the
 * current state of the HTML output buffer.
 */
static void
print_word(struct html *h, const char *cp)
{
	for (; *cp != '\0'; cp++)
		print_byte(h, *cp);
}
1101