1 /* $Vendor-Id: html.c,v 1.124 2010/12/27 21:41:05 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 #ifdef HAVE_CONFIG_H 18 #include "config.h" 19 #endif 20 21 #include <sys/types.h> 22 23 #include <assert.h> 24 #include <ctype.h> 25 #include <stdarg.h> 26 #include <stdio.h> 27 #include <stdint.h> 28 #include <stdlib.h> 29 #include <string.h> 30 #include <unistd.h> 31 32 #include "mandoc.h" 33 #include "out.h" 34 #include "chars.h" 35 #include "html.h" 36 #include "main.h" 37 38 struct htmldata { 39 const char *name; 40 int flags; 41 #define HTML_CLRLINE (1 << 0) 42 #define HTML_NOSTACK (1 << 1) 43 #define HTML_AUTOCLOSE (1 << 2) /* Tag has auto-closure. */ 44 }; 45 46 static const struct htmldata htmltags[TAG_MAX] = { 47 {"html", HTML_CLRLINE}, /* TAG_HTML */ 48 {"head", HTML_CLRLINE}, /* TAG_HEAD */ 49 {"body", HTML_CLRLINE}, /* TAG_BODY */ 50 {"meta", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */ 51 {"title", HTML_CLRLINE}, /* TAG_TITLE */ 52 {"div", HTML_CLRLINE}, /* TAG_DIV */ 53 {"h1", 0}, /* TAG_H1 */ 54 {"h2", 0}, /* TAG_H2 */ 55 {"span", 0}, /* TAG_SPAN */ 56 {"link", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */ 57 {"br", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */ 58 {"a", 0}, /* TAG_A */ 59 {"table", HTML_CLRLINE}, /* TAG_TABLE */ 60 {"tbody", HTML_CLRLINE}, /* TAG_TBODY */ 61 {"col", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */ 62 {"tr", HTML_CLRLINE}, /* TAG_TR */ 63 {"td", HTML_CLRLINE}, /* TAG_TD */ 64 {"li", HTML_CLRLINE}, /* TAG_LI */ 65 {"ul", HTML_CLRLINE}, /* TAG_UL */ 66 {"ol", HTML_CLRLINE}, /* TAG_OL */ 67 {"dl", HTML_CLRLINE}, /* TAG_DL */ 68 {"dt", HTML_CLRLINE}, /* TAG_DT */ 69 {"dd", HTML_CLRLINE}, /* TAG_DD */ 70 {"blockquote", HTML_CLRLINE}, /* TAG_BLOCKQUOTE */ 71 {"p", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */ 72 {"pre", HTML_CLRLINE }, /* TAG_PRE */ 73 {"b", 0 }, /* TAG_B */ 74 {"i", 0 }, /* TAG_I */ 75 {"code", 0 }, /* TAG_CODE */ 76 {"small", 0 }, /* TAG_SMALL */ 77 }; 78 79 static const char *const htmlattrs[ATTR_MAX] = { 80 "http-equiv", /* ATTR_HTTPEQUIV */ 81 "content", /* ATTR_CONTENT */ 82 "name", /* ATTR_NAME */ 83 "rel", /* ATTR_REL */ 84 "href", /* ATTR_HREF */ 85 "type", /* ATTR_TYPE */ 86 "media", /* ATTR_MEDIA */ 87 "class", /* ATTR_CLASS */ 88 "style", /* ATTR_STYLE */ 89 "width", /* ATTR_WIDTH */ 90 "id", /* ATTR_ID */ 91 "summary", /* ATTR_SUMMARY */ 92 "align", /* ATTR_ALIGN */ 93 }; 94 95 static void print_spec(struct html *, enum roffdeco, 96 const char *, size_t); 97 static void print_res(struct html *, const char *, size_t); 98 static void print_ctag(struct html *, enum htmltag); 99 static void print_doctype(struct html *); 100 static void print_xmltype(struct html *); 101 static int print_encode(struct html *, const char *, int); 102 static void print_metaf(struct html *, enum roffdeco); 103 static void print_attr(struct html *, 104 const char *, const char *); 105 static void *ml_alloc(char *, enum htmltype); 106 107 108 static void * 109 ml_alloc(char *outopts, enum htmltype type) 110 { 111 struct html *h; 112 const char *toks[4]; 113 char *v; 114 115 toks[0] = "style"; 116 toks[1] = "man"; 117 toks[2] = "includes"; 118 toks[3] = NULL; 119 120 h = calloc(1, sizeof(struct html)); 121 if (NULL == h) { 122 perror(NULL); 123 exit((int)MANDOCLEVEL_SYSERR); 124 } 125 126 h->type = type; 127 h->tags.head = NULL; 128 h->symtab = chars_init(CHARS_HTML); 129 130 while (outopts && *outopts) 131 switch (getsubopt(&outopts, UNCONST(toks), &v)) { 132 case (0): 133 h->style = v; 134 break; 135 case (1): 136 h->base_man = v; 137 break; 138 case (2): 139 h->base_includes = v; 140 break; 141 default: 142 break; 143 } 144 145 return(h); 146 } 147 148 void * 149 html_alloc(char *outopts) 150 { 151 152 return(ml_alloc(outopts, HTML_HTML_4_01_STRICT)); 153 } 154 155 156 void * 157 xhtml_alloc(char *outopts) 158 { 159 160 return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT)); 161 } 162 163 164 void 165 html_free(void *p) 166 { 167 struct tag *tag; 168 struct html *h; 169 170 h = (struct html *)p; 171 172 while ((tag = h->tags.head) != NULL) { 173 h->tags.head = tag->next; 174 free(tag); 175 } 176 177 if (h->symtab) 178 chars_free(h->symtab); 179 180 free(h); 181 } 182 183 184 void 185 print_gen_head(struct html *h) 186 { 187 struct htmlpair tag[4]; 188 189 tag[0].key = ATTR_HTTPEQUIV; 190 tag[0].val = "Content-Type"; 191 tag[1].key = ATTR_CONTENT; 192 tag[1].val = "text/html; charset=utf-8"; 193 print_otag(h, TAG_META, 2, tag); 194 195 tag[0].key = ATTR_NAME; 196 tag[0].val = "resource-type"; 197 tag[1].key = ATTR_CONTENT; 198 tag[1].val = "document"; 199 print_otag(h, TAG_META, 2, tag); 200 201 if (h->style) { 202 tag[0].key = ATTR_REL; 203 tag[0].val = "stylesheet"; 204 tag[1].key = ATTR_HREF; 205 tag[1].val = h->style; 206 tag[2].key = ATTR_TYPE; 207 tag[2].val = "text/css"; 208 tag[3].key = ATTR_MEDIA; 209 tag[3].val = "all"; 210 print_otag(h, TAG_LINK, 4, tag); 211 } 212 } 213 214 215 static void 216 print_spec(struct html *h, enum roffdeco d, const char *p, size_t len) 217 { 218 int cp; 219 const char *rhs; 220 size_t sz; 221 222 if ((cp = chars_spec2cp(h->symtab, p, len)) > 0) { 223 printf("&#%d;", cp); 224 return; 225 } else if (-1 == cp && DECO_SSPECIAL == d) { 226 fwrite(p, 1, len, stdout); 227 return; 228 } else if (-1 == cp) 229 return; 230 231 if (NULL != (rhs = chars_spec2str(h->symtab, p, len, &sz))) 232 fwrite(rhs, 1, sz, stdout); 233 } 234 235 236 static void 237 print_res(struct html *h, const char *p, size_t len) 238 { 239 int cp; 240 const char *rhs; 241 size_t sz; 242 243 if ((cp = chars_res2cp(h->symtab, p, len)) > 0) { 244 printf("&#%d;", cp); 245 return; 246 } else if (-1 == cp) 247 return; 248 249 if (NULL != (rhs = chars_res2str(h->symtab, p, len, &sz))) 250 fwrite(rhs, 1, sz, stdout); 251 } 252 253 254 static void 255 print_metaf(struct html *h, enum roffdeco deco) 256 { 257 enum htmlfont font; 258 259 switch (deco) { 260 case (DECO_PREVIOUS): 261 font = h->metal; 262 break; 263 case (DECO_ITALIC): 264 font = HTMLFONT_ITALIC; 265 break; 266 case (DECO_BOLD): 267 font = HTMLFONT_BOLD; 268 break; 269 case (DECO_ROMAN): 270 font = HTMLFONT_NONE; 271 break; 272 default: 273 abort(); 274 /* NOTREACHED */ 275 } 276 277 if (h->metaf) { 278 print_tagq(h, h->metaf); 279 h->metaf = NULL; 280 } 281 282 h->metal = h->metac; 283 h->metac = font; 284 285 if (HTMLFONT_NONE != font) 286 h->metaf = HTMLFONT_BOLD == font ? 287 print_otag(h, TAG_B, 0, NULL) : 288 print_otag(h, TAG_I, 0, NULL); 289 } 290 291 292 static int 293 print_encode(struct html *h, const char *p, int norecurse) 294 { 295 size_t sz; 296 int len, nospace; 297 const char *seq; 298 enum roffdeco deco; 299 static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' }; 300 301 nospace = 0; 302 303 for (; *p; p++) { 304 sz = strcspn(p, rejs); 305 306 fwrite(p, 1, sz, stdout); 307 p += /* LINTED */ 308 sz; 309 310 if ('<' == *p) { 311 printf("<"); 312 continue; 313 } else if ('>' == *p) { 314 printf(">"); 315 continue; 316 } else if ('&' == *p) { 317 printf("&"); 318 continue; 319 } else if (ASCII_HYPH == *p) { 320 /* 321 * Note: "soft hyphens" aren't graphically 322 * displayed when not breaking the text; we want 323 * them to be displayed. 324 */ 325 /*printf("­");*/ 326 putchar('-'); 327 continue; 328 } else if ('\0' == *p) 329 break; 330 331 seq = ++p; 332 len = a2roffdeco(&deco, &seq, &sz); 333 334 switch (deco) { 335 case (DECO_RESERVED): 336 print_res(h, seq, sz); 337 break; 338 case (DECO_SSPECIAL): 339 /* FALLTHROUGH */ 340 case (DECO_SPECIAL): 341 print_spec(h, deco, seq, sz); 342 break; 343 case (DECO_PREVIOUS): 344 /* FALLTHROUGH */ 345 case (DECO_BOLD): 346 /* FALLTHROUGH */ 347 case (DECO_ITALIC): 348 /* FALLTHROUGH */ 349 case (DECO_ROMAN): 350 if (norecurse) 351 break; 352 print_metaf(h, deco); 353 break; 354 default: 355 break; 356 } 357 358 p += len - 1; 359 360 if (DECO_NOSPACE == deco && '\0' == *(p + 1)) 361 nospace = 1; 362 } 363 364 return(nospace); 365 } 366 367 368 static void 369 print_attr(struct html *h, const char *key, const char *val) 370 { 371 printf(" %s=\"", key); 372 (void)print_encode(h, val, 1); 373 putchar('\"'); 374 } 375 376 377 struct tag * 378 print_otag(struct html *h, enum htmltag tag, 379 int sz, const struct htmlpair *p) 380 { 381 int i; 382 struct tag *t; 383 384 /* Push this tags onto the stack of open scopes. */ 385 386 if ( ! (HTML_NOSTACK & htmltags[tag].flags)) { 387 t = malloc(sizeof(struct tag)); 388 if (NULL == t) { 389 perror(NULL); 390 exit((int)MANDOCLEVEL_SYSERR); 391 } 392 t->tag = tag; 393 t->next = h->tags.head; 394 h->tags.head = t; 395 } else 396 t = NULL; 397 398 if ( ! (HTML_NOSPACE & h->flags)) 399 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) { 400 /* Manage keeps! */ 401 if ( ! (HTML_KEEP & h->flags)) { 402 if (HTML_PREKEEP & h->flags) 403 h->flags |= HTML_KEEP; 404 putchar(' '); 405 } else 406 printf(" "); 407 } 408 409 if ( ! (h->flags & HTML_NONOSPACE)) 410 h->flags &= ~HTML_NOSPACE; 411 else 412 h->flags |= HTML_NOSPACE; 413 414 /* Print out the tag name and attributes. */ 415 416 printf("<%s", htmltags[tag].name); 417 for (i = 0; i < sz; i++) 418 print_attr(h, htmlattrs[p[i].key], p[i].val); 419 420 /* Add non-overridable attributes. */ 421 422 if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) { 423 print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml"); 424 print_attr(h, "xml:lang", "en"); 425 print_attr(h, "lang", "en"); 426 } 427 428 /* Accomodate for XML "well-formed" singleton escaping. */ 429 430 if (HTML_AUTOCLOSE & htmltags[tag].flags) 431 switch (h->type) { 432 case (HTML_XHTML_1_0_STRICT): 433 putchar('/'); 434 break; 435 default: 436 break; 437 } 438 439 putchar('>'); 440 441 h->flags |= HTML_NOSPACE; 442 443 if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags) 444 putchar('\n'); 445 446 return(t); 447 } 448 449 450 static void 451 print_ctag(struct html *h, enum htmltag tag) 452 { 453 454 printf("</%s>", htmltags[tag].name); 455 if (HTML_CLRLINE & htmltags[tag].flags) { 456 h->flags |= HTML_NOSPACE; 457 putchar('\n'); 458 } 459 } 460 461 462 void 463 print_gen_decls(struct html *h) 464 { 465 466 print_xmltype(h); 467 print_doctype(h); 468 } 469 470 471 static void 472 print_xmltype(struct html *h) 473 { 474 475 if (HTML_XHTML_1_0_STRICT == h->type) 476 puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); 477 } 478 479 480 static void 481 print_doctype(struct html *h) 482 { 483 const char *doctype; 484 const char *dtd; 485 const char *name; 486 487 switch (h->type) { 488 case (HTML_HTML_4_01_STRICT): 489 name = "HTML"; 490 doctype = "-//W3C//DTD HTML 4.01//EN"; 491 dtd = "http://www.w3.org/TR/html4/strict.dtd"; 492 break; 493 default: 494 name = "html"; 495 doctype = "-//W3C//DTD XHTML 1.0 Strict//EN"; 496 dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; 497 break; 498 } 499 500 printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n", 501 name, doctype, dtd); 502 } 503 504 505 void 506 print_text(struct html *h, const char *word) 507 { 508 509 if (word[0] && '\0' == word[1]) 510 switch (word[0]) { 511 case('.'): 512 /* FALLTHROUGH */ 513 case(','): 514 /* FALLTHROUGH */ 515 case(';'): 516 /* FALLTHROUGH */ 517 case(':'): 518 /* FALLTHROUGH */ 519 case('?'): 520 /* FALLTHROUGH */ 521 case('!'): 522 /* FALLTHROUGH */ 523 case(')'): 524 /* FALLTHROUGH */ 525 case(']'): 526 if ( ! (HTML_IGNDELIM & h->flags)) 527 h->flags |= HTML_NOSPACE; 528 break; 529 default: 530 break; 531 } 532 533 if ( ! (HTML_NOSPACE & h->flags)) { 534 /* Manage keeps! */ 535 if ( ! (HTML_KEEP & h->flags)) { 536 if (HTML_PREKEEP & h->flags) 537 h->flags |= HTML_KEEP; 538 putchar(' '); 539 } else 540 printf(" "); 541 } 542 543 assert(NULL == h->metaf); 544 if (HTMLFONT_NONE != h->metac) 545 h->metaf = HTMLFONT_BOLD == h->metac ? 546 print_otag(h, TAG_B, 0, NULL) : 547 print_otag(h, TAG_I, 0, NULL); 548 549 assert(word); 550 if ( ! print_encode(h, word, 0)) 551 if ( ! (h->flags & HTML_NONOSPACE)) 552 h->flags &= ~HTML_NOSPACE; 553 554 if (h->metaf) { 555 print_tagq(h, h->metaf); 556 h->metaf = NULL; 557 } 558 559 h->flags &= ~HTML_IGNDELIM; 560 561 /* 562 * Note that we don't process the pipe: the parser sees it as 563 * punctuation, but we don't in terms of typography. 564 */ 565 if (word[0] && '\0' == word[1]) 566 switch (word[0]) { 567 case('('): 568 /* FALLTHROUGH */ 569 case('['): 570 h->flags |= HTML_NOSPACE; 571 break; 572 default: 573 break; 574 } 575 } 576 577 578 void 579 print_tagq(struct html *h, const struct tag *until) 580 { 581 struct tag *tag; 582 583 while ((tag = h->tags.head) != NULL) { 584 if (tag == h->metaf) 585 h->metaf = NULL; 586 print_ctag(h, tag->tag); 587 h->tags.head = tag->next; 588 free(tag); 589 if (until && tag == until) 590 return; 591 } 592 } 593 594 595 void 596 print_stagq(struct html *h, const struct tag *suntil) 597 { 598 struct tag *tag; 599 600 while ((tag = h->tags.head) != NULL) { 601 if (suntil && tag == suntil) 602 return; 603 if (tag == h->metaf) 604 h->metaf = NULL; 605 print_ctag(h, tag->tag); 606 h->tags.head = tag->next; 607 free(tag); 608 } 609 } 610 611 612 void 613 bufinit(struct html *h) 614 { 615 616 h->buf[0] = '\0'; 617 h->buflen = 0; 618 } 619 620 621 void 622 bufcat_style(struct html *h, const char *key, const char *val) 623 { 624 625 bufcat(h, key); 626 bufncat(h, ":", 1); 627 bufcat(h, val); 628 bufncat(h, ";", 1); 629 } 630 631 632 void 633 bufcat(struct html *h, const char *p) 634 { 635 636 bufncat(h, p, strlen(p)); 637 } 638 639 640 void 641 buffmt(struct html *h, const char *fmt, ...) 642 { 643 va_list ap; 644 645 va_start(ap, fmt); 646 (void)vsnprintf(h->buf + (int)h->buflen, 647 BUFSIZ - h->buflen - 1, fmt, ap); 648 va_end(ap); 649 h->buflen = strlen(h->buf); 650 } 651 652 653 void 654 bufncat(struct html *h, const char *p, size_t sz) 655 { 656 657 if (h->buflen + sz > BUFSIZ - 1) 658 sz = BUFSIZ - 1 - h->buflen; 659 660 (void)strncat(h->buf, p, sz); 661 h->buflen += sz; 662 } 663 664 665 void 666 buffmt_includes(struct html *h, const char *name) 667 { 668 const char *p, *pp; 669 670 pp = h->base_includes; 671 672 while (NULL != (p = strchr(pp, '%'))) { 673 bufncat(h, pp, (size_t)(p - pp)); 674 switch (*(p + 1)) { 675 case('I'): 676 bufcat(h, name); 677 break; 678 default: 679 bufncat(h, p, 2); 680 break; 681 } 682 pp = p + 2; 683 } 684 if (pp) 685 bufcat(h, pp); 686 } 687 688 689 void 690 buffmt_man(struct html *h, 691 const char *name, const char *sec) 692 { 693 const char *p, *pp; 694 695 pp = h->base_man; 696 697 /* LINTED */ 698 while (NULL != (p = strchr(pp, '%'))) { 699 bufncat(h, pp, (size_t)(p - pp)); 700 switch (*(p + 1)) { 701 case('S'): 702 bufcat(h, sec ? sec : "1"); 703 break; 704 case('N'): 705 buffmt(h, name); 706 break; 707 default: 708 bufncat(h, p, 2); 709 break; 710 } 711 pp = p + 2; 712 } 713 if (pp) 714 bufcat(h, pp); 715 } 716 717 718 void 719 bufcat_su(struct html *h, const char *p, const struct roffsu *su) 720 { 721 double v; 722 const char *u; 723 724 v = su->scale; 725 726 switch (su->unit) { 727 case (SCALE_CM): 728 u = "cm"; 729 break; 730 case (SCALE_IN): 731 u = "in"; 732 break; 733 case (SCALE_PC): 734 u = "pc"; 735 break; 736 case (SCALE_PT): 737 u = "pt"; 738 break; 739 case (SCALE_EM): 740 u = "em"; 741 break; 742 case (SCALE_MM): 743 if (0 == (v /= 100)) 744 v = 1; 745 u = "em"; 746 break; 747 case (SCALE_EN): 748 u = "ex"; 749 break; 750 case (SCALE_BU): 751 u = "ex"; 752 break; 753 case (SCALE_VS): 754 u = "em"; 755 break; 756 default: 757 u = "ex"; 758 break; 759 } 760 761 /* 762 * XXX: the CSS spec isn't clear as to which types accept 763 * integer or real numbers, so we just make them all decimals. 764 */ 765 buffmt(h, "%s: %.2f%s;", p, v, u); 766 } 767 768 769 void 770 html_idcat(char *dst, const char *src, int sz) 771 { 772 int ssz; 773 774 assert(sz > 2); 775 776 /* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */ 777 778 /* We can't start with a number (bah). */ 779 780 if ('#' == *dst) { 781 dst++; 782 sz--; 783 } 784 if ('\0' == *dst) { 785 *dst++ = 'x'; 786 *dst = '\0'; 787 sz--; 788 } 789 790 for ( ; *dst != '\0' && sz; dst++, sz--) 791 /* Jump to end. */ ; 792 793 for ( ; *src != '\0' && sz > 1; src++) { 794 ssz = snprintf(dst, (size_t)sz, "%.2x", *src); 795 sz -= ssz; 796 dst += ssz; 797 } 798 } 799