1 /* $Id: html.c,v 1.147 2011/05/24 21:40:14 kristaps Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <sys/types.h> 23 24 #include <assert.h> 25 #include <ctype.h> 26 #include <stdarg.h> 27 #include <stdio.h> 28 #include <stdint.h> 29 #include <stdlib.h> 30 #include <string.h> 31 #include <unistd.h> 32 33 #include "mandoc.h" 34 #include "libmandoc.h" 35 #include "out.h" 36 #include "html.h" 37 #include "main.h" 38 39 struct htmldata { 40 const char *name; 41 int flags; 42 #define HTML_CLRLINE (1 << 0) 43 #define HTML_NOSTACK (1 << 1) 44 #define HTML_AUTOCLOSE (1 << 2) /* Tag has auto-closure. */ 45 }; 46 47 static const struct htmldata htmltags[TAG_MAX] = { 48 {"html", HTML_CLRLINE}, /* TAG_HTML */ 49 {"head", HTML_CLRLINE}, /* TAG_HEAD */ 50 {"body", HTML_CLRLINE}, /* TAG_BODY */ 51 {"meta", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */ 52 {"title", HTML_CLRLINE}, /* TAG_TITLE */ 53 {"div", HTML_CLRLINE}, /* TAG_DIV */ 54 {"h1", 0}, /* TAG_H1 */ 55 {"h2", 0}, /* TAG_H2 */ 56 {"span", 0}, /* TAG_SPAN */ 57 {"link", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */ 58 {"br", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */ 59 {"a", 0}, /* TAG_A */ 60 {"table", HTML_CLRLINE}, /* TAG_TABLE */ 61 {"tbody", HTML_CLRLINE}, /* TAG_TBODY */ 62 {"col", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */ 63 {"tr", HTML_CLRLINE}, /* TAG_TR */ 64 {"td", HTML_CLRLINE}, /* TAG_TD */ 65 {"li", HTML_CLRLINE}, /* TAG_LI */ 66 {"ul", HTML_CLRLINE}, /* TAG_UL */ 67 {"ol", HTML_CLRLINE}, /* TAG_OL */ 68 {"dl", HTML_CLRLINE}, /* TAG_DL */ 69 {"dt", HTML_CLRLINE}, /* TAG_DT */ 70 {"dd", HTML_CLRLINE}, /* TAG_DD */ 71 {"blockquote", HTML_CLRLINE}, /* TAG_BLOCKQUOTE */ 72 {"p", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */ 73 {"pre", HTML_CLRLINE }, /* TAG_PRE */ 74 {"b", 0 }, /* TAG_B */ 75 {"i", 0 }, /* TAG_I */ 76 {"code", 0 }, /* TAG_CODE */ 77 {"small", 0 }, /* TAG_SMALL */ 78 }; 79 80 static const char *const htmlattrs[ATTR_MAX] = { 81 "http-equiv", /* ATTR_HTTPEQUIV */ 82 "content", /* ATTR_CONTENT */ 83 "name", /* ATTR_NAME */ 84 "rel", /* ATTR_REL */ 85 "href", /* ATTR_HREF */ 86 "type", /* ATTR_TYPE */ 87 "media", /* ATTR_MEDIA */ 88 "class", /* ATTR_CLASS */ 89 "style", /* ATTR_STYLE */ 90 "width", /* ATTR_WIDTH */ 91 "id", /* ATTR_ID */ 92 "summary", /* ATTR_SUMMARY */ 93 "align", /* ATTR_ALIGN */ 94 "colspan", /* ATTR_COLSPAN */ 95 }; 96 97 static const char *const roffscales[SCALE_MAX] = { 98 "cm", /* SCALE_CM */ 99 "in", /* SCALE_IN */ 100 "pc", /* SCALE_PC */ 101 "pt", /* SCALE_PT */ 102 "em", /* SCALE_EM */ 103 "em", /* SCALE_MM */ 104 "ex", /* SCALE_EN */ 105 "ex", /* SCALE_BU */ 106 "em", /* SCALE_VS */ 107 "ex", /* SCALE_FS */ 108 }; 109 110 static void bufncat(struct html *, const char *, size_t); 111 static void print_ctag(struct html *, enum htmltag); 112 static int print_encode(struct html *, const char *, int); 113 static void print_metaf(struct html *, enum mandoc_esc); 114 static void print_attr(struct html *, const char *, const char *); 115 static void *ml_alloc(char *, enum htmltype); 116 117 static void * 118 ml_alloc(char *outopts, enum htmltype type) 119 { 120 struct html *h; 121 const char *toks[4]; 122 char *v; 123 124 toks[0] = "style"; 125 toks[1] = "man"; 126 toks[2] = "includes"; 127 toks[3] = NULL; 128 129 h = mandoc_calloc(1, sizeof(struct html)); 130 131 h->type = type; 132 h->tags.head = NULL; 133 h->symtab = mchars_alloc(); 134 135 while (outopts && *outopts) 136 switch (getsubopt(&outopts, UNCONST(toks), &v)) { 137 case (0): 138 h->style = v; 139 break; 140 case (1): 141 h->base_man = v; 142 break; 143 case (2): 144 h->base_includes = v; 145 break; 146 default: 147 break; 148 } 149 150 return(h); 151 } 152 153 void * 154 html_alloc(char *outopts) 155 { 156 157 return(ml_alloc(outopts, HTML_HTML_4_01_STRICT)); 158 } 159 160 161 void * 162 xhtml_alloc(char *outopts) 163 { 164 165 return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT)); 166 } 167 168 169 void 170 html_free(void *p) 171 { 172 struct tag *tag; 173 struct html *h; 174 175 h = (struct html *)p; 176 177 while ((tag = h->tags.head) != NULL) { 178 h->tags.head = tag->next; 179 free(tag); 180 } 181 182 if (h->symtab) 183 mchars_free(h->symtab); 184 185 free(h); 186 } 187 188 189 void 190 print_gen_head(struct html *h) 191 { 192 struct htmlpair tag[4]; 193 194 tag[0].key = ATTR_HTTPEQUIV; 195 tag[0].val = "Content-Type"; 196 tag[1].key = ATTR_CONTENT; 197 tag[1].val = "text/html; charset=utf-8"; 198 print_otag(h, TAG_META, 2, tag); 199 200 tag[0].key = ATTR_NAME; 201 tag[0].val = "resource-type"; 202 tag[1].key = ATTR_CONTENT; 203 tag[1].val = "document"; 204 print_otag(h, TAG_META, 2, tag); 205 206 if (h->style) { 207 tag[0].key = ATTR_REL; 208 tag[0].val = "stylesheet"; 209 tag[1].key = ATTR_HREF; 210 tag[1].val = h->style; 211 tag[2].key = ATTR_TYPE; 212 tag[2].val = "text/css"; 213 tag[3].key = ATTR_MEDIA; 214 tag[3].val = "all"; 215 print_otag(h, TAG_LINK, 4, tag); 216 } 217 } 218 219 static void 220 print_metaf(struct html *h, enum mandoc_esc deco) 221 { 222 enum htmlfont font; 223 224 switch (deco) { 225 case (ESCAPE_FONTPREV): 226 font = h->metal; 227 break; 228 case (ESCAPE_FONTITALIC): 229 font = HTMLFONT_ITALIC; 230 break; 231 case (ESCAPE_FONTBOLD): 232 font = HTMLFONT_BOLD; 233 break; 234 case (ESCAPE_FONT): 235 /* FALLTHROUGH */ 236 case (ESCAPE_FONTROMAN): 237 font = HTMLFONT_NONE; 238 break; 239 default: 240 abort(); 241 /* NOTREACHED */ 242 } 243 244 if (h->metaf) { 245 print_tagq(h, h->metaf); 246 h->metaf = NULL; 247 } 248 249 h->metal = h->metac; 250 h->metac = font; 251 252 if (HTMLFONT_NONE != font) 253 h->metaf = HTMLFONT_BOLD == font ? 254 print_otag(h, TAG_B, 0, NULL) : 255 print_otag(h, TAG_I, 0, NULL); 256 } 257 258 int 259 html_strlen(const char *cp) 260 { 261 int ssz, sz; 262 const char *seq, *p; 263 264 /* 265 * Account for escaped sequences within string length 266 * calculations. This follows the logic in term_strlen() as we 267 * must calculate the width of produced strings. 268 * Assume that characters are always width of "1". This is 269 * hacky, but it gets the job done for approximation of widths. 270 */ 271 272 sz = 0; 273 while (NULL != (p = strchr(cp, '\\'))) { 274 sz += (int)(p - cp); 275 ++cp; 276 switch (mandoc_escape(&cp, &seq, &ssz)) { 277 case (ESCAPE_ERROR): 278 return(sz); 279 case (ESCAPE_UNICODE): 280 /* FALLTHROUGH */ 281 case (ESCAPE_NUMBERED): 282 /* FALLTHROUGH */ 283 case (ESCAPE_SPECIAL): 284 sz++; 285 break; 286 default: 287 break; 288 } 289 } 290 291 assert(sz >= 0); 292 return(sz + strlen(cp)); 293 } 294 295 static int 296 print_encode(struct html *h, const char *p, int norecurse) 297 { 298 size_t sz; 299 int c, len, nospace; 300 const char *seq; 301 enum mandoc_esc esc; 302 static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' }; 303 304 nospace = 0; 305 306 while ('\0' != *p) { 307 sz = strcspn(p, rejs); 308 309 fwrite(p, 1, sz, stdout); 310 p += (int)sz; 311 312 if ('\0' == *p) 313 break; 314 315 switch (*p++) { 316 case ('<'): 317 printf("<"); 318 continue; 319 case ('>'): 320 printf(">"); 321 continue; 322 case ('&'): 323 printf("&"); 324 continue; 325 case (ASCII_HYPH): 326 putchar('-'); 327 continue; 328 default: 329 break; 330 } 331 332 esc = mandoc_escape(&p, &seq, &len); 333 if (ESCAPE_ERROR == esc) 334 break; 335 336 switch (esc) { 337 case (ESCAPE_UNICODE): 338 /* Skip passed "u" header. */ 339 c = mchars_num2uc(seq + 1, len - 1); 340 if ('\0' != c) 341 printf("&#x%x;", c); 342 break; 343 case (ESCAPE_NUMBERED): 344 c = mchars_num2char(seq, len); 345 if ('\0' != c) 346 putchar(c); 347 break; 348 case (ESCAPE_SPECIAL): 349 c = mchars_spec2cp(h->symtab, seq, len); 350 if (c > 0) 351 printf("&#%d;", c); 352 else if (-1 == c && 1 == len) 353 putchar((int)*seq); 354 break; 355 case (ESCAPE_FONT): 356 /* FALLTHROUGH */ 357 case (ESCAPE_FONTPREV): 358 /* FALLTHROUGH */ 359 case (ESCAPE_FONTBOLD): 360 /* FALLTHROUGH */ 361 case (ESCAPE_FONTITALIC): 362 /* FALLTHROUGH */ 363 case (ESCAPE_FONTROMAN): 364 if (norecurse) 365 break; 366 print_metaf(h, esc); 367 break; 368 case (ESCAPE_NOSPACE): 369 if ('\0' == *p) 370 nospace = 1; 371 break; 372 default: 373 break; 374 } 375 } 376 377 return(nospace); 378 } 379 380 381 static void 382 print_attr(struct html *h, const char *key, const char *val) 383 { 384 printf(" %s=\"", key); 385 (void)print_encode(h, val, 1); 386 putchar('\"'); 387 } 388 389 390 struct tag * 391 print_otag(struct html *h, enum htmltag tag, 392 int sz, const struct htmlpair *p) 393 { 394 int i; 395 struct tag *t; 396 397 /* Push this tags onto the stack of open scopes. */ 398 399 if ( ! (HTML_NOSTACK & htmltags[tag].flags)) { 400 t = mandoc_malloc(sizeof(struct tag)); 401 t->tag = tag; 402 t->next = h->tags.head; 403 h->tags.head = t; 404 } else 405 t = NULL; 406 407 if ( ! (HTML_NOSPACE & h->flags)) 408 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) { 409 /* Manage keeps! */ 410 if ( ! (HTML_KEEP & h->flags)) { 411 if (HTML_PREKEEP & h->flags) 412 h->flags |= HTML_KEEP; 413 putchar(' '); 414 } else 415 printf(" "); 416 } 417 418 if ( ! (h->flags & HTML_NONOSPACE)) 419 h->flags &= ~HTML_NOSPACE; 420 else 421 h->flags |= HTML_NOSPACE; 422 423 /* Print out the tag name and attributes. */ 424 425 printf("<%s", htmltags[tag].name); 426 for (i = 0; i < sz; i++) 427 print_attr(h, htmlattrs[p[i].key], p[i].val); 428 429 /* Add non-overridable attributes. */ 430 431 if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) { 432 print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml"); 433 print_attr(h, "xml:lang", "en"); 434 print_attr(h, "lang", "en"); 435 } 436 437 /* Accommodate for XML "well-formed" singleton escaping. */ 438 439 if (HTML_AUTOCLOSE & htmltags[tag].flags) 440 switch (h->type) { 441 case (HTML_XHTML_1_0_STRICT): 442 putchar('/'); 443 break; 444 default: 445 break; 446 } 447 448 putchar('>'); 449 450 h->flags |= HTML_NOSPACE; 451 452 if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags) 453 putchar('\n'); 454 455 return(t); 456 } 457 458 459 static void 460 print_ctag(struct html *h, enum htmltag tag) 461 { 462 463 printf("</%s>", htmltags[tag].name); 464 if (HTML_CLRLINE & htmltags[tag].flags) { 465 h->flags |= HTML_NOSPACE; 466 putchar('\n'); 467 } 468 } 469 470 void 471 print_gen_decls(struct html *h) 472 { 473 const char *doctype; 474 const char *dtd; 475 const char *name; 476 477 switch (h->type) { 478 case (HTML_HTML_4_01_STRICT): 479 name = "HTML"; 480 doctype = "-//W3C//DTD HTML 4.01//EN"; 481 dtd = "http://www.w3.org/TR/html4/strict.dtd"; 482 break; 483 default: 484 puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); 485 name = "html"; 486 doctype = "-//W3C//DTD XHTML 1.0 Strict//EN"; 487 dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; 488 break; 489 } 490 491 printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n", 492 name, doctype, dtd); 493 } 494 495 void 496 print_text(struct html *h, const char *word) 497 { 498 499 if ( ! (HTML_NOSPACE & h->flags)) { 500 /* Manage keeps! */ 501 if ( ! (HTML_KEEP & h->flags)) { 502 if (HTML_PREKEEP & h->flags) 503 h->flags |= HTML_KEEP; 504 putchar(' '); 505 } else 506 printf(" "); 507 } 508 509 assert(NULL == h->metaf); 510 if (HTMLFONT_NONE != h->metac) 511 h->metaf = HTMLFONT_BOLD == h->metac ? 512 print_otag(h, TAG_B, 0, NULL) : 513 print_otag(h, TAG_I, 0, NULL); 514 515 assert(word); 516 if ( ! print_encode(h, word, 0)) 517 if ( ! (h->flags & HTML_NONOSPACE)) 518 h->flags &= ~HTML_NOSPACE; 519 520 if (h->metaf) { 521 print_tagq(h, h->metaf); 522 h->metaf = NULL; 523 } 524 525 h->flags &= ~HTML_IGNDELIM; 526 } 527 528 529 void 530 print_tagq(struct html *h, const struct tag *until) 531 { 532 struct tag *tag; 533 534 while ((tag = h->tags.head) != NULL) { 535 /* 536 * Remember to close out and nullify the current 537 * meta-font and table, if applicable. 538 */ 539 if (tag == h->metaf) 540 h->metaf = NULL; 541 if (tag == h->tblt) 542 h->tblt = NULL; 543 print_ctag(h, tag->tag); 544 h->tags.head = tag->next; 545 free(tag); 546 if (until && tag == until) 547 return; 548 } 549 } 550 551 552 void 553 print_stagq(struct html *h, const struct tag *suntil) 554 { 555 struct tag *tag; 556 557 while ((tag = h->tags.head) != NULL) { 558 if (suntil && tag == suntil) 559 return; 560 /* 561 * Remember to close out and nullify the current 562 * meta-font and table, if applicable. 563 */ 564 if (tag == h->metaf) 565 h->metaf = NULL; 566 if (tag == h->tblt) 567 h->tblt = NULL; 568 print_ctag(h, tag->tag); 569 h->tags.head = tag->next; 570 free(tag); 571 } 572 } 573 574 void 575 bufinit(struct html *h) 576 { 577 578 h->buf[0] = '\0'; 579 h->buflen = 0; 580 } 581 582 void 583 bufcat_style(struct html *h, const char *key, const char *val) 584 { 585 586 bufcat(h, key); 587 bufcat(h, ":"); 588 bufcat(h, val); 589 bufcat(h, ";"); 590 } 591 592 void 593 bufcat(struct html *h, const char *p) 594 { 595 596 h->buflen = strlcat(h->buf, p, BUFSIZ); 597 assert(h->buflen < BUFSIZ); 598 h->buflen--; 599 } 600 601 void 602 bufcat_fmt(struct html *h, const char *fmt, ...) 603 { 604 va_list ap; 605 606 va_start(ap, fmt); 607 (void)vsnprintf(h->buf + (int)h->buflen, 608 BUFSIZ - h->buflen - 1, fmt, ap); 609 va_end(ap); 610 h->buflen = strlen(h->buf); 611 } 612 613 static void 614 bufncat(struct html *h, const char *p, size_t sz) 615 { 616 617 assert(h->buflen + sz + 1 < BUFSIZ); 618 strncat(h->buf, p, sz); 619 h->buflen += sz; 620 } 621 622 void 623 buffmt_includes(struct html *h, const char *name) 624 { 625 const char *p, *pp; 626 627 pp = h->base_includes; 628 629 bufinit(h); 630 while (NULL != (p = strchr(pp, '%'))) { 631 bufncat(h, pp, (size_t)(p - pp)); 632 switch (*(p + 1)) { 633 case('I'): 634 bufcat(h, name); 635 break; 636 default: 637 bufncat(h, p, 2); 638 break; 639 } 640 pp = p + 2; 641 } 642 if (pp) 643 bufcat(h, pp); 644 } 645 646 void 647 buffmt_man(struct html *h, 648 const char *name, const char *sec) 649 { 650 const char *p, *pp; 651 652 pp = h->base_man; 653 654 bufinit(h); 655 while (NULL != (p = strchr(pp, '%'))) { 656 bufncat(h, pp, (size_t)(p - pp)); 657 switch (*(p + 1)) { 658 case('S'): 659 bufcat(h, sec ? sec : "1"); 660 break; 661 case('N'): 662 bufcat_fmt(h, name); 663 break; 664 default: 665 bufncat(h, p, 2); 666 break; 667 } 668 pp = p + 2; 669 } 670 if (pp) 671 bufcat(h, pp); 672 } 673 674 void 675 bufcat_su(struct html *h, const char *p, const struct roffsu *su) 676 { 677 double v; 678 679 v = su->scale; 680 if (SCALE_MM == su->unit && 0.0 == (v /= 100.0)) 681 v = 1.0; 682 683 bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]); 684 } 685 686 void 687 bufcat_id(struct html *h, const char *src) 688 { 689 690 /* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */ 691 692 while ('\0' != *src) 693 bufcat_fmt(h, "%.2x", *src++); 694 } 695