1 /* $Id: html.c,v 1.159 2014/07/23 15:00:08 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <sys/types.h> 23 24 #include <assert.h> 25 #include <ctype.h> 26 #include <stdarg.h> 27 #include <stdio.h> 28 #include <stdint.h> 29 #include <stdlib.h> 30 #include <string.h> 31 #include <unistd.h> 32 33 #include "mandoc.h" 34 #include "mandoc_aux.h" 35 #include "libmandoc.h" 36 #include "out.h" 37 #include "html.h" 38 #include "main.h" 39 40 struct htmldata { 41 const char *name; 42 int flags; 43 #define HTML_CLRLINE (1 << 0) 44 #define HTML_NOSTACK (1 << 1) 45 #define HTML_AUTOCLOSE (1 << 2) /* Tag has auto-closure. */ 46 }; 47 48 static const struct htmldata htmltags[TAG_MAX] = { 49 {"html", HTML_CLRLINE}, /* TAG_HTML */ 50 {"head", HTML_CLRLINE}, /* TAG_HEAD */ 51 {"body", HTML_CLRLINE}, /* TAG_BODY */ 52 {"meta", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */ 53 {"title", HTML_CLRLINE}, /* TAG_TITLE */ 54 {"div", HTML_CLRLINE}, /* TAG_DIV */ 55 {"h1", 0}, /* TAG_H1 */ 56 {"h2", 0}, /* TAG_H2 */ 57 {"span", 0}, /* TAG_SPAN */ 58 {"link", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */ 59 {"br", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */ 60 {"a", 0}, /* TAG_A */ 61 {"table", HTML_CLRLINE}, /* TAG_TABLE */ 62 {"tbody", HTML_CLRLINE}, /* TAG_TBODY */ 63 {"col", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */ 64 {"tr", HTML_CLRLINE}, /* TAG_TR */ 65 {"td", HTML_CLRLINE}, /* TAG_TD */ 66 {"li", HTML_CLRLINE}, /* TAG_LI */ 67 {"ul", HTML_CLRLINE}, /* TAG_UL */ 68 {"ol", HTML_CLRLINE}, /* TAG_OL */ 69 {"dl", HTML_CLRLINE}, /* TAG_DL */ 70 {"dt", HTML_CLRLINE}, /* TAG_DT */ 71 {"dd", HTML_CLRLINE}, /* TAG_DD */ 72 {"blockquote", HTML_CLRLINE}, /* TAG_BLOCKQUOTE */ 73 {"p", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */ 74 {"pre", HTML_CLRLINE }, /* TAG_PRE */ 75 {"b", 0 }, /* TAG_B */ 76 {"i", 0 }, /* TAG_I */ 77 {"code", 0 }, /* TAG_CODE */ 78 {"small", 0 }, /* TAG_SMALL */ 79 }; 80 81 static const char *const htmlattrs[ATTR_MAX] = { 82 "http-equiv", /* ATTR_HTTPEQUIV */ 83 "content", /* ATTR_CONTENT */ 84 "name", /* ATTR_NAME */ 85 "rel", /* ATTR_REL */ 86 "href", /* ATTR_HREF */ 87 "type", /* ATTR_TYPE */ 88 "media", /* ATTR_MEDIA */ 89 "class", /* ATTR_CLASS */ 90 "style", /* ATTR_STYLE */ 91 "width", /* ATTR_WIDTH */ 92 "id", /* ATTR_ID */ 93 "summary", /* ATTR_SUMMARY */ 94 "align", /* ATTR_ALIGN */ 95 "colspan", /* ATTR_COLSPAN */ 96 }; 97 98 static const char *const roffscales[SCALE_MAX] = { 99 "cm", /* SCALE_CM */ 100 "in", /* SCALE_IN */ 101 "pc", /* SCALE_PC */ 102 "pt", /* SCALE_PT */ 103 "em", /* SCALE_EM */ 104 "em", /* SCALE_MM */ 105 "ex", /* SCALE_EN */ 106 "ex", /* SCALE_BU */ 107 "em", /* SCALE_VS */ 108 "ex", /* SCALE_FS */ 109 }; 110 111 static void bufncat(struct html *, const char *, size_t); 112 static void print_ctag(struct html *, enum htmltag); 113 static int print_escape(char); 114 static int print_encode(struct html *, const char *, int); 115 static void print_metaf(struct html *, enum mandoc_esc); 116 static void print_attr(struct html *, const char *, const char *); 117 static void *ml_alloc(char *, enum htmltype); 118 119 120 static void * 121 ml_alloc(char *outopts, enum htmltype type) 122 { 123 struct html *h; 124 const char *toks[5]; 125 char *v; 126 127 toks[0] = "style"; 128 toks[1] = "man"; 129 toks[2] = "includes"; 130 toks[3] = "fragment"; 131 toks[4] = NULL; 132 133 h = mandoc_calloc(1, sizeof(struct html)); 134 135 h->type = type; 136 h->tags.head = NULL; 137 h->symtab = mchars_alloc(); 138 139 while (outopts && *outopts) 140 switch (getsubopt(&outopts, UNCONST(toks), &v)) { 141 case 0: 142 h->style = v; 143 break; 144 case 1: 145 h->base_man = v; 146 break; 147 case 2: 148 h->base_includes = v; 149 break; 150 case 3: 151 h->oflags |= HTML_FRAGMENT; 152 break; 153 default: 154 break; 155 } 156 157 return(h); 158 } 159 160 void * 161 html_alloc(char *outopts) 162 { 163 164 return(ml_alloc(outopts, HTML_HTML_4_01_STRICT)); 165 } 166 167 void * 168 xhtml_alloc(char *outopts) 169 { 170 171 return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT)); 172 } 173 174 void 175 html_free(void *p) 176 { 177 struct tag *tag; 178 struct html *h; 179 180 h = (struct html *)p; 181 182 while ((tag = h->tags.head) != NULL) { 183 h->tags.head = tag->next; 184 free(tag); 185 } 186 187 if (h->symtab) 188 mchars_free(h->symtab); 189 190 free(h); 191 } 192 193 void 194 print_gen_head(struct html *h) 195 { 196 struct htmlpair tag[4]; 197 198 tag[0].key = ATTR_HTTPEQUIV; 199 tag[0].val = "Content-Type"; 200 tag[1].key = ATTR_CONTENT; 201 tag[1].val = "text/html; charset=utf-8"; 202 print_otag(h, TAG_META, 2, tag); 203 204 tag[0].key = ATTR_NAME; 205 tag[0].val = "resource-type"; 206 tag[1].key = ATTR_CONTENT; 207 tag[1].val = "document"; 208 print_otag(h, TAG_META, 2, tag); 209 210 if (h->style) { 211 tag[0].key = ATTR_REL; 212 tag[0].val = "stylesheet"; 213 tag[1].key = ATTR_HREF; 214 tag[1].val = h->style; 215 tag[2].key = ATTR_TYPE; 216 tag[2].val = "text/css"; 217 tag[3].key = ATTR_MEDIA; 218 tag[3].val = "all"; 219 print_otag(h, TAG_LINK, 4, tag); 220 } 221 } 222 223 static void 224 print_metaf(struct html *h, enum mandoc_esc deco) 225 { 226 enum htmlfont font; 227 228 switch (deco) { 229 case ESCAPE_FONTPREV: 230 font = h->metal; 231 break; 232 case ESCAPE_FONTITALIC: 233 font = HTMLFONT_ITALIC; 234 break; 235 case ESCAPE_FONTBOLD: 236 font = HTMLFONT_BOLD; 237 break; 238 case ESCAPE_FONTBI: 239 font = HTMLFONT_BI; 240 break; 241 case ESCAPE_FONT: 242 /* FALLTHROUGH */ 243 case ESCAPE_FONTROMAN: 244 font = HTMLFONT_NONE; 245 break; 246 default: 247 abort(); 248 /* NOTREACHED */ 249 } 250 251 if (h->metaf) { 252 print_tagq(h, h->metaf); 253 h->metaf = NULL; 254 } 255 256 h->metal = h->metac; 257 h->metac = font; 258 259 switch (font) { 260 case HTMLFONT_ITALIC: 261 h->metaf = print_otag(h, TAG_I, 0, NULL); 262 break; 263 case HTMLFONT_BOLD: 264 h->metaf = print_otag(h, TAG_B, 0, NULL); 265 break; 266 case HTMLFONT_BI: 267 h->metaf = print_otag(h, TAG_B, 0, NULL); 268 print_otag(h, TAG_I, 0, NULL); 269 break; 270 default: 271 break; 272 } 273 } 274 275 int 276 html_strlen(const char *cp) 277 { 278 size_t rsz; 279 int skip, sz; 280 281 /* 282 * Account for escaped sequences within string length 283 * calculations. This follows the logic in term_strlen() as we 284 * must calculate the width of produced strings. 285 * Assume that characters are always width of "1". This is 286 * hacky, but it gets the job done for approximation of widths. 287 */ 288 289 sz = 0; 290 skip = 0; 291 while (1) { 292 rsz = strcspn(cp, "\\"); 293 if (rsz) { 294 cp += rsz; 295 if (skip) { 296 skip = 0; 297 rsz--; 298 } 299 sz += rsz; 300 } 301 if ('\0' == *cp) 302 break; 303 cp++; 304 switch (mandoc_escape(&cp, NULL, NULL)) { 305 case ESCAPE_ERROR: 306 return(sz); 307 case ESCAPE_UNICODE: 308 /* FALLTHROUGH */ 309 case ESCAPE_NUMBERED: 310 /* FALLTHROUGH */ 311 case ESCAPE_SPECIAL: 312 if (skip) 313 skip = 0; 314 else 315 sz++; 316 break; 317 case ESCAPE_SKIPCHAR: 318 skip = 1; 319 break; 320 default: 321 break; 322 } 323 } 324 return(sz); 325 } 326 327 static int 328 print_escape(char c) 329 { 330 331 switch (c) { 332 case '<': 333 printf("<"); 334 break; 335 case '>': 336 printf(">"); 337 break; 338 case '&': 339 printf("&"); 340 break; 341 case '"': 342 printf("""); 343 break; 344 case ASCII_NBRSP: 345 putchar('-'); 346 break; 347 case ASCII_HYPH: 348 putchar('-'); 349 /* FALLTHROUGH */ 350 case ASCII_BREAK: 351 break; 352 default: 353 return(0); 354 } 355 return(1); 356 } 357 358 static int 359 print_encode(struct html *h, const char *p, int norecurse) 360 { 361 size_t sz; 362 int c, len, nospace; 363 const char *seq; 364 enum mandoc_esc esc; 365 static const char rejs[9] = { '\\', '<', '>', '&', '"', 366 ASCII_NBRSP, ASCII_HYPH, ASCII_BREAK, '\0' }; 367 368 nospace = 0; 369 370 while ('\0' != *p) { 371 if (HTML_SKIPCHAR & h->flags && '\\' != *p) { 372 h->flags &= ~HTML_SKIPCHAR; 373 p++; 374 continue; 375 } 376 377 sz = strcspn(p, rejs); 378 379 fwrite(p, 1, sz, stdout); 380 p += (int)sz; 381 382 if ('\0' == *p) 383 break; 384 385 if (print_escape(*p++)) 386 continue; 387 388 esc = mandoc_escape(&p, &seq, &len); 389 if (ESCAPE_ERROR == esc) 390 break; 391 392 switch (esc) { 393 case ESCAPE_FONT: 394 /* FALLTHROUGH */ 395 case ESCAPE_FONTPREV: 396 /* FALLTHROUGH */ 397 case ESCAPE_FONTBOLD: 398 /* FALLTHROUGH */ 399 case ESCAPE_FONTITALIC: 400 /* FALLTHROUGH */ 401 case ESCAPE_FONTBI: 402 /* FALLTHROUGH */ 403 case ESCAPE_FONTROMAN: 404 if (0 == norecurse) 405 print_metaf(h, esc); 406 continue; 407 case ESCAPE_SKIPCHAR: 408 h->flags |= HTML_SKIPCHAR; 409 continue; 410 default: 411 break; 412 } 413 414 if (h->flags & HTML_SKIPCHAR) { 415 h->flags &= ~HTML_SKIPCHAR; 416 continue; 417 } 418 419 switch (esc) { 420 case ESCAPE_UNICODE: 421 /* Skip past "u" header. */ 422 c = mchars_num2uc(seq + 1, len - 1); 423 if ('\0' != c) 424 printf("&#x%x;", c); 425 break; 426 case ESCAPE_NUMBERED: 427 c = mchars_num2char(seq, len); 428 if ( ! ('\0' == c || print_escape(c))) 429 putchar(c); 430 break; 431 case ESCAPE_SPECIAL: 432 c = mchars_spec2cp(h->symtab, seq, len); 433 if (c > 0) 434 printf("&#%d;", c); 435 else if (-1 == c && 1 == len && 436 !print_escape(*seq)) 437 putchar((int)*seq); 438 break; 439 case ESCAPE_NOSPACE: 440 if ('\0' == *p) 441 nospace = 1; 442 break; 443 default: 444 break; 445 } 446 } 447 448 return(nospace); 449 } 450 451 static void 452 print_attr(struct html *h, const char *key, const char *val) 453 { 454 printf(" %s=\"", key); 455 (void)print_encode(h, val, 1); 456 putchar('\"'); 457 } 458 459 struct tag * 460 print_otag(struct html *h, enum htmltag tag, 461 int sz, const struct htmlpair *p) 462 { 463 int i; 464 struct tag *t; 465 466 /* Push this tags onto the stack of open scopes. */ 467 468 if ( ! (HTML_NOSTACK & htmltags[tag].flags)) { 469 t = mandoc_malloc(sizeof(struct tag)); 470 t->tag = tag; 471 t->next = h->tags.head; 472 h->tags.head = t; 473 } else 474 t = NULL; 475 476 if ( ! (HTML_NOSPACE & h->flags)) 477 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) { 478 /* Manage keeps! */ 479 if ( ! (HTML_KEEP & h->flags)) { 480 if (HTML_PREKEEP & h->flags) 481 h->flags |= HTML_KEEP; 482 putchar(' '); 483 } else 484 printf(" "); 485 } 486 487 if ( ! (h->flags & HTML_NONOSPACE)) 488 h->flags &= ~HTML_NOSPACE; 489 else 490 h->flags |= HTML_NOSPACE; 491 492 /* Print out the tag name and attributes. */ 493 494 printf("<%s", htmltags[tag].name); 495 for (i = 0; i < sz; i++) 496 print_attr(h, htmlattrs[p[i].key], p[i].val); 497 498 /* Add non-overridable attributes. */ 499 500 if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) { 501 print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml"); 502 print_attr(h, "xml:lang", "en"); 503 print_attr(h, "lang", "en"); 504 } 505 506 /* Accommodate for XML "well-formed" singleton escaping. */ 507 508 if (HTML_AUTOCLOSE & htmltags[tag].flags) 509 switch (h->type) { 510 case HTML_XHTML_1_0_STRICT: 511 putchar('/'); 512 break; 513 default: 514 break; 515 } 516 517 putchar('>'); 518 519 h->flags |= HTML_NOSPACE; 520 521 if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags) 522 putchar('\n'); 523 524 return(t); 525 } 526 527 static void 528 print_ctag(struct html *h, enum htmltag tag) 529 { 530 531 printf("</%s>", htmltags[tag].name); 532 if (HTML_CLRLINE & htmltags[tag].flags) { 533 h->flags |= HTML_NOSPACE; 534 putchar('\n'); 535 } 536 } 537 538 void 539 print_gen_decls(struct html *h) 540 { 541 const char *doctype; 542 const char *dtd; 543 const char *name; 544 545 switch (h->type) { 546 case HTML_HTML_4_01_STRICT: 547 name = "HTML"; 548 doctype = "-//W3C//DTD HTML 4.01//EN"; 549 dtd = "http://www.w3.org/TR/html4/strict.dtd"; 550 break; 551 default: 552 puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); 553 name = "html"; 554 doctype = "-//W3C//DTD XHTML 1.0 Strict//EN"; 555 dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; 556 break; 557 } 558 559 printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n", 560 name, doctype, dtd); 561 } 562 563 void 564 print_text(struct html *h, const char *word) 565 { 566 567 if ( ! (HTML_NOSPACE & h->flags)) { 568 /* Manage keeps! */ 569 if ( ! (HTML_KEEP & h->flags)) { 570 if (HTML_PREKEEP & h->flags) 571 h->flags |= HTML_KEEP; 572 putchar(' '); 573 } else 574 printf(" "); 575 } 576 577 assert(NULL == h->metaf); 578 switch (h->metac) { 579 case HTMLFONT_ITALIC: 580 h->metaf = print_otag(h, TAG_I, 0, NULL); 581 break; 582 case HTMLFONT_BOLD: 583 h->metaf = print_otag(h, TAG_B, 0, NULL); 584 break; 585 case HTMLFONT_BI: 586 h->metaf = print_otag(h, TAG_B, 0, NULL); 587 print_otag(h, TAG_I, 0, NULL); 588 break; 589 default: 590 break; 591 } 592 593 assert(word); 594 if ( ! print_encode(h, word, 0)) { 595 if ( ! (h->flags & HTML_NONOSPACE)) 596 h->flags &= ~HTML_NOSPACE; 597 } else 598 h->flags |= HTML_NOSPACE; 599 600 if (h->metaf) { 601 print_tagq(h, h->metaf); 602 h->metaf = NULL; 603 } 604 605 h->flags &= ~HTML_IGNDELIM; 606 } 607 608 void 609 print_tagq(struct html *h, const struct tag *until) 610 { 611 struct tag *tag; 612 613 while ((tag = h->tags.head) != NULL) { 614 /* 615 * Remember to close out and nullify the current 616 * meta-font and table, if applicable. 617 */ 618 if (tag == h->metaf) 619 h->metaf = NULL; 620 if (tag == h->tblt) 621 h->tblt = NULL; 622 print_ctag(h, tag->tag); 623 h->tags.head = tag->next; 624 free(tag); 625 if (until && tag == until) 626 return; 627 } 628 } 629 630 void 631 print_stagq(struct html *h, const struct tag *suntil) 632 { 633 struct tag *tag; 634 635 while ((tag = h->tags.head) != NULL) { 636 if (suntil && tag == suntil) 637 return; 638 /* 639 * Remember to close out and nullify the current 640 * meta-font and table, if applicable. 641 */ 642 if (tag == h->metaf) 643 h->metaf = NULL; 644 if (tag == h->tblt) 645 h->tblt = NULL; 646 print_ctag(h, tag->tag); 647 h->tags.head = tag->next; 648 free(tag); 649 } 650 } 651 652 void 653 bufinit(struct html *h) 654 { 655 656 h->buf[0] = '\0'; 657 h->buflen = 0; 658 } 659 660 void 661 bufcat_style(struct html *h, const char *key, const char *val) 662 { 663 664 bufcat(h, key); 665 bufcat(h, ":"); 666 bufcat(h, val); 667 bufcat(h, ";"); 668 } 669 670 void 671 bufcat(struct html *h, const char *p) 672 { 673 674 /* 675 * XXX This is broken and not easy to fix. 676 * When using the -Oincludes option, buffmt_includes() 677 * may pass in strings overrunning BUFSIZ, causing a crash. 678 */ 679 680 h->buflen = strlcat(h->buf, p, BUFSIZ); 681 assert(h->buflen < BUFSIZ); 682 } 683 684 void 685 bufcat_fmt(struct html *h, const char *fmt, ...) 686 { 687 va_list ap; 688 689 va_start(ap, fmt); 690 (void)vsnprintf(h->buf + (int)h->buflen, 691 BUFSIZ - h->buflen - 1, fmt, ap); 692 va_end(ap); 693 h->buflen = strlen(h->buf); 694 } 695 696 static void 697 bufncat(struct html *h, const char *p, size_t sz) 698 { 699 700 assert(h->buflen + sz + 1 < BUFSIZ); 701 strncat(h->buf, p, sz); 702 h->buflen += sz; 703 } 704 705 void 706 buffmt_includes(struct html *h, const char *name) 707 { 708 const char *p, *pp; 709 710 pp = h->base_includes; 711 712 bufinit(h); 713 while (NULL != (p = strchr(pp, '%'))) { 714 bufncat(h, pp, (size_t)(p - pp)); 715 switch (*(p + 1)) { 716 case'I': 717 bufcat(h, name); 718 break; 719 default: 720 bufncat(h, p, 2); 721 break; 722 } 723 pp = p + 2; 724 } 725 if (pp) 726 bufcat(h, pp); 727 } 728 729 void 730 buffmt_man(struct html *h, const char *name, const char *sec) 731 { 732 const char *p, *pp; 733 734 pp = h->base_man; 735 736 bufinit(h); 737 while (NULL != (p = strchr(pp, '%'))) { 738 bufncat(h, pp, (size_t)(p - pp)); 739 switch (*(p + 1)) { 740 case 'S': 741 bufcat(h, sec ? sec : "1"); 742 break; 743 case 'N': 744 bufcat_fmt(h, "%s", name); 745 break; 746 default: 747 bufncat(h, p, 2); 748 break; 749 } 750 pp = p + 2; 751 } 752 if (pp) 753 bufcat(h, pp); 754 } 755 756 void 757 bufcat_su(struct html *h, const char *p, const struct roffsu *su) 758 { 759 double v; 760 761 v = su->scale; 762 if (SCALE_MM == su->unit && 0.0 == (v /= 100.0)) 763 v = 1.0; 764 765 bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]); 766 } 767 768 void 769 bufcat_id(struct html *h, const char *src) 770 { 771 772 /* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */ 773 774 while ('\0' != *src) 775 bufcat_fmt(h, "%.2x", *src++); 776 } 777