#!/usr/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      @(#) man2html 1.2 97/08/12 12:57:30 @(#)
##  Author:
##      Earl Hood, ehood@medusa.acs.uci.edu
##  Description:
##      man2html is a Perl program to convert formatted nroff output
##      to HTML.
##
##      Recommend command-line options based on platform:
##
##      Platform                Options
##      ---------------------------------------------------------------------
##      c2mp                    <None, the defaults should be okay>
##      hp9000s700/800          -leftm 1 -topm 8
##      sun4                    -sun
##      ---------------------------------------------------------------------
##
##---------------------------------------------------------------------------##
##  Copyright (C) 1995-1997     Earl Hood, ehood@medusa.acs.uci.edu
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 2 of the License, or
##  (at your option) any later version.
##
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##
##  You should have received a copy of the GNU General Public License
##  along with this program; if not, write to the Free Software
##  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
##  02111-1307, USA
##---------------------------------------------------------------------------##

package Man2Html;

use Getopt::Long;

## Program name (basename of the invoking path) and release number.
$PROG = $0;
$PROG =~ s{.*/}{};
$VERSION = "3.0.1";

## Input and output filehandles.  Either may already be set before this
## point; the standard streams are only the fallback.
$InFH  ||= \*STDIN;
$OutFH ||= \*STDOUT;

## Backspace character: used in overstrike detection.  The glob
## assignment makes $bs an alias for the constant "\b".
*bs = \"\b";

## Hash of section titles and their HTML tag wrapper.
## This list allows customization of what HTML tag is used for
## a given section head.
##
## The section title can be a regular expression.  Therefore, one must
## be careful about quoting special characters.
##
%SectionHead = (

    '\S.*OPTIONS.*'             => '<H2>',
    'AUTHORS?'                  => '<H2>',
    'BUGS'                      => '<H2>',
    'COMPATIBILITY'             => '<H2>',
    'DEPENDENCIES'              => '<H2>',
    'DESCRIPTION'               => '<H2>',
    'DIAGNOSTICS'               => '<H2>',
    'ENVIRONMENT'               => '<H2>',
    'ERRORS'                    => '<H2>',
    'EXAMPLES'                  => '<H2>',
    'EXTERNAL INFLUENCES'       => '<H2>',
    'FILES'                     => '<H2>',
    'LIMITATIONS'               => '<H2>',
    'NAME'                      => '<H2>',
    'NOTES?'                    => '<H2>',
    'OPTIONS'                   => '<H2>',
    'REFERENCES'                => '<H2>',
    'RETURN VALUE'              => '<H2>',
    'SECTION.*:'                => '<H2>',
    'SEE ALSO'                  => '<H2>',
    'STANDARDS CONFORMANCE'     => '<H2>',
    'STYLE CONVENTION'          => '<H2>',
    'SYNOPSIS'                  => '<H2>',
    'SYNTAX'                    => '<H2>',
    'WARNINGS'                  => '<H2>',
    '\s+Section.*:'             => '<H3>',

);

## Fallback tag if above is not found
$HeadFallback = '<H2>';

## Other globals

$Bare      = 0;         # Skip printing HTML head/foot flag
$BTag      = 'B';       # Overstrike tag
$CgiUrl    = '';        # CGI URL expression
$Compress  = 0;         # Do blank line compression flag
$K         = 0;         # Do keyword search processing flag
$NoDepage  = 0;         # Do not strip page information
$NoHeads   = 0;         # Do no header detection flag
$SeeAlso   = 0;         # Do only SEE ALSO xrefs flag
$Solaris   = 0;         # Solaris keyword search processing flag
$Sun       = 0;         # Headers not overstruck flag
$Title     = '';        # Title
$UTag      = 'I';       # Underline tag
$ftsz      = 7;         # Bottom margin size
$hdsz      = 7;         # Top margin size
$leftm     = '';        # Left margin pad
$leftmsz   = 0;         # Left margin size
$pgsz      = 66;        # Size of page size
$txsz      = 52;        # Text body length size

#############################################################################
##      Main Block
#############################################################################
{
    if (get_cli_opts()) {
        if ($K) {
            man_k();
        } else {
            do_it();
        }
    } else {
        usage();
    }
}

#############################################################################
##      Subroutines
#############################################################################

##---------------------------------------------------------------------------
##      do_it() converts a formatted manpage read from $InFH into HTML
##      written to $OutFH.  The per-line loop is assembled as source
##      text and run through eval so that the section-head patterns of
##      %SectionHead become literal regular expressions in the code.
##
sub do_it {

    ## Define while loop and then eval it when used.  The reason
    ## is to avoid the regular expression reevaluation in the
    ## section head detection code.

    $doitcode =<<'EndOfDoItCode';

    my($line, $tmp, $i, $head, $preindent, $see_also, $do);

    $see_also = !$SeeAlso;
    print $OutFH "<!-- Manpage converted by man2html $VERSION -->\n";
    LOOP: while(!eof($InFH)) {
        $blank = 0;

        ## Skip the top margin of the page
        for ($i=0; $i < $hdsz; $i++) {
            last LOOP  unless defined($_ = <$InFH>);
        }

        ## Process the text body of the page
        for ($i=0; $i < $txsz; $i++) {
            last LOOP  unless defined($_ = <$InFH>);

            ## Check if compress consecutive blank lines
            if ($Compress and !/\S/) {
                if ($blank) { next; } else { $blank = 1; }
            } else {
                $blank = 0;
            }

            ## Try to check if line space is needed at page boundaries ##
            if (!$NoDepage && ($i==0 || $i==($txsz-1)) && !/^\s*$/) {
                /^(\s*)/;  $tmp = length($1);
                if ($do) {
                    if ($tmp < $preindent) { print $OutFH "\n"; }
                } else {
                    $do = 1;
                }
                $preindent = $tmp;
            } else {
                $do = 0;  $preindent = 0;
            }

            ## Interpret line
            $line = $_;         # Raw copy, used for head detection
            entitize(\$_);      # Convert [&<>] to entity references

            ## Check for 'SEE ALSO' link only
            if (!$see_also && $CgiUrl && $SeeAlso) {
                ($tmp = $line) =~ s/.\010//go;
                if ($tmp =~ /^\s*SEE\s+ALSO\s*$/o) { $see_also = 1; }
                else { $see_also = 0; }
            }

            ## Create anchor links for manpage references
            s/((((.\010)+)?[\+_\.\w-])+\(((.\010)+)?
              \d((.\010)+)?\w*\))
             /make_xref($1)
             /geox  if $see_also;

            ## Emphasize underlined words
            # s/((_\010[^_])+[\.\(\)_]?(_\010[^_])+\)?)/emphasize($1)/oge;
            # s/((_\010[^_])+([\.\(\)_]?(_\010[^_])+)?)/emphasize($1)/oge;
            #
            # The previous expressions were trying to be clever about
            # detecting underlined text which contain non-alphanumeric
            # characters.  nroff will not underline non-alphanumeric
            # characters in an underlined phrase, and the above was trying
            # to detect that.  It does not work all the time, and it
            # screws up other text, so a simplified expression is used.

            s/((_\010[^_])+)/emphasize($1)/oge;

            $secth = 0;
            ## Check for strong text and headings
            if ($Sun || /.\010./o) {
                if (!$NoHeads) {
                    $line =~ s/.\010//go;
                    $tmp = $HeadFallback;
EndOfDoItCode

    ## Create switch statement for detecting a heading.  The keys are
    ## sorted so that the generated code, and therefore the tag chosen
    ## when several patterns match, is the same on every run.
    ##
    $doitcode .= "HEADSW: {\n";
    foreach my $head (sort keys %SectionHead) {
        $doitcode .= join("", "\$tmp = '$SectionHead{$head}', ",
                              "\$secth = 1, last HEADSW  ",
                              "if \$line =~ /^$leftm$head/o;\n");
    }
    $doitcode .= "}\n";

    ## Rest of routine
    ##
    $doitcode .=<<'EndOfDoItCode';
                    if ($secth || $line =~ /^$leftm\S/o) {
                        ## chomp, not chop: a final line without a
                        ## newline must not lose its last character.
                        chomp $line;
                        ## The head is built from the raw line, so it
                        ## has to be escaped here.
                        htmlize(\$line);
                        $_ = $tmp . $line . $tmp;
                        s%<([^>]*)>$%</$1>%;    # Trailing tag -> end tag
                        $_ = "\n</PRE>\n" . $_ . "<PRE>\n";
                    } else {
                        s/(((.\010)+.)+)/strongize($1)/oge;
                    }
                } else {
                    s/(((.\010)+.)+)/strongize($1)/oge;
                }
            }
            print $OutFH $_;
        }

        ## Skip the bottom margin of the page
        for ($i=0; $i < $ftsz; $i++) {
            last LOOP  unless defined($_ = <$InFH>);
        }
    }
EndOfDoItCode


    ## Perform processing.

    printhead()  unless $Bare;
    print $OutFH "<PRE>\n";
    eval $doitcode;                     # $doitcode defined above
    ## A section-head pattern that does not compile would otherwise
    ## produce an empty document without any diagnostic.
    warn "$PROG: conversion aborted: $@"  if $@;
    print $OutFH "</PRE>\n";
    printtail()  unless $Bare;
}

##---------------------------------------------------------------------------
##      get_cli_opts() parses the command line and sets the global
##      conversion settings.  Returns 1 on success; returns 0 if the
##      options could not be parsed or -help was given.
##
sub get_cli_opts {
    return 0  unless
    GetOptions(
        "bare",         # Leave out HTML, HEAD, BODY tags.
        "belem=s",      # HTML Element for overstriked text (def: "B")
        "botm=i",       # Number of lines for bottom margin (def: 7)
        "cgiurl=s",     # CGI URL for linking to other manpages
        "cgiurlexp=s",  # CGI URL Perl expr for linking to other manpages
        "compress",     # Compress consecutive blank lines
        "headmap=s",    # Filename of user section head map file
        "k",            # Process input from 'man -k' output.
        "leftm=i",      # Character width of left margin (def: 0)
        "nodepage",     # Do not remove pagination lines
        "noheads",      # Do not detect for section heads
        "pgsize=i",     # Number of lines in a page (def: 66)
        "seealso",      # Link to other manpages only in the SEE ALSO section
        "solaris",      # Parse 'man -k' output from a solaris system
        "sun",          # Section heads are not overstriked in input
        "title=s",      # Title of manpage (def: Not defined)
        "topm=i",       # Number of lines for top margin (def: 7)
        "uelem=s",      # HTML Element for underlined text (def: "I")

        "help"          # Short usage message
    );
    return 0  if defined($opt_help);

    $pgsz = $opt_pgsize || $pgsz;
    if (defined($opt_nodepage)) {
        $hdsz   = 0;
        $ftsz   = 0;
    } else {
        $hdsz   = $opt_topm  if defined($opt_topm);
        $ftsz   = $opt_botm  if defined($opt_botm);
    }
    $txsz       = $pgsz - ($hdsz + $ftsz);
    $leftmsz    = $opt_leftm  if defined($opt_leftm);
    $leftm      = ' ' x $leftmsz;

    $Bare       = defined($opt_bare);
    $Compress   = defined($opt_compress);
    $K          = defined($opt_k);
    $NoDepage   = defined($opt_nodepage);
    $NoHeads    = defined($opt_noheads);
    $SeeAlso    = defined($opt_seealso);
    $Solaris    = defined($opt_solaris);
    $Sun        = defined($opt_sun);

    $Title      = $opt_title || $Title;
    ## -cgiurl is wrapped into an expression so that both forms can be
    ## evaluated the same way when a link is built.
    $CgiUrl     = $opt_cgiurlexp ||
                        ($opt_cgiurl ? qq{return "$opt_cgiurl"} : '');

    $BTag       = $opt_belem || $BTag;
    $UTag       = $opt_uelem || $UTag;
    $BTag       =~ s/[<>]//g;
    $UTag       =~ s/[<>]//g;

    if (defined($opt_headmap)) {
        ## require dies when the file cannot be loaded, so it has to be
        ## trapped for the warning to be reachable.
        eval { require $opt_headmap; 1 }
            or warn "Unable to read $opt_headmap\n", ($@ ? "  ($@)\n" : ());
    }
    1;
}

##---------------------------------------------------------------------------
##      printhead() writes the opening HTML.  The TITLE and H1 elements
##      are only written when a title has been set.
##
sub printhead {
    print $OutFH "<HTML>\n";
    print $OutFH "<HEAD>\n",
                 "<TITLE>$Title</TITLE>\n",
                 "</HEAD>\n"  if $Title;
    print $OutFH "<BODY>\n";
    print $OutFH "<H1>$Title</H1>\n",
                 "<HR>\n"  if $Title;
}

##---------------------------------------------------------------------------
##      printtail() writes the closing attribution and HTML end tags.
##
sub printtail {
    print $OutFH <<EndOfRef;
<HR>
<ADDRESS>
Man(1) output converted with
<a href="http://www.oac.uci.edu/indiv/ehood/man2html.html">man2html</a>
</ADDRESS>
</BODY>
</HTML>
EndOfRef
}

##---------------------------------------------------------------------------
##      emphasize() strips the overstrike sequences from underlined text
##      and wraps the result in the underline element ($UTag).
##
sub emphasize {
    my($txt) = shift;
    $txt =~ s/.\010//go;
    $txt = "<$UTag>$txt</$UTag>";
    $txt;
}

##---------------------------------------------------------------------------
##      strongize() strips the overstrike sequences from overstruck text
##      and wraps the result in the overstrike element ($BTag).
##
sub strongize {
    my($txt) = shift;
    $txt =~ s/.\010//go;
    $txt = "<$BTag>$txt</$BTag>";
    $txt;
}

##---------------------------------------------------------------------------
##      entitize() converts the HTML special characters of a line into
##      entity references.  The argument is a reference to the line,
##      which is modified in place.
##
sub entitize {
    my($txt) = shift;

    ## Check for special characters in overstrike text ##
    $$txt =~ s/_\010\&/strike('_', '&')/ge;
    $$txt =~ s/_\010</strike('_', '<')/ge;
    $$txt =~ s/_\010>/strike('_', '>')/ge;

    $$txt =~ s/(\&\010)+\&/strike('&', '&')/ge;
    $$txt =~ s/(<\010)+</strike('<', '<')/ge;
    $$txt =~ s/(>\010)+>/strike('>', '>')/ge;

    ## Check for special characters in regular text.  A character that
    ## touches a backspace belongs to an overstrike sequence and is left
    ## alone.  Look-around is used so that the neighbouring characters
    ## are not consumed: special characters at the start of the line, or
    ## separated by a single character ("a<b<c"), are all converted.
    $$txt =~ s/(?<!\010)([&<>])(?!\010)/htmlize2($1)/ge;
}

##---------------------------------------------------------------------------
##      escape special characters in a string, in-place.  The argument
##      is a reference to the string; the escaped string is returned.
##
sub htmlize {
    my($str) = shift;
    $$str =~ s/&/\&amp;/g;      # Must be first: the others add '&'
    $$str =~ s/</\&lt;/g;
    $$str =~ s/>/\&gt;/g;
    $$str;
}

##---------------------------------------------------------------------------
##      htmlize2() is used by entitize.  It takes the string itself
##      and returns an escaped copy.
##
sub htmlize2 {
    my($str) = shift;
    $str =~ s/&/\&amp;/g;       # Must be first: the others add '&'
    $str =~ s/</\&lt;/g;
    $str =~ s/>/\&gt;/g;
    $str;
}

##---------------------------------------------------------------------------
##      strike converts HTML special characters in overstriked text
##      into entity references.  The entities are overstriked so
##      strongize() and emphasize() will recognize the entity to be
##      wrapped in tags.
##
##      $w is the overstriking character ('_' for underlined text,
##      anything else for overstruck text) and $char the special
##      character.  Returns undef, after a warning, for any other $char.
##
sub strike {
    my($w, $char) = @_;
    my($ret);
    if ($w eq '_') {
        if ($char eq '&') {
            $ret = "_$bs\&_${bs}a_${bs}m_${bs}p_${bs};";
        } elsif ($char eq '<') {
            $ret = "_$bs\&_${bs}l_${bs}t_${bs};";
        } elsif ($char eq '>') {
            $ret = "_$bs\&_${bs}g_${bs}t_${bs};";
        } else {
            warn qq|Unrecognized character, "$char", passed to strike()\n|;
        }
    } else {
        if ($char eq '&') {
            $ret = "\&$bs\&a${bs}am${bs}mp${bs}p;${bs};";
        } elsif ($char eq '<') {
            $ret = "\&$bs\&l${bs}lt${bs}t;${bs};";
        } elsif ($char eq '>') {
            $ret = "\&$bs\&g${bs}gt${bs}t;${bs};";
        } else {
            warn qq|Unrecognized character, "$char", passed to strike()\n|;
        }
    }
    $ret;
}

##---------------------------------------------------------------------------
##      make_xref() converts a manpage crossreference into a hyperlink.
##      The argument is the reference as found in the text, e.g.
##      "ls(1)", possibly still containing overstrike sequences.
##      Returns the reference in bold, linked when $CgiUrl is set.
##
sub make_xref {
    my $str = shift;
    $str =~ s/.\010//go;                # Remove overstriking

    return qq|<B>$str</B>|  unless $CgiUrl;

    ## $title, $section and $subsection are not used below; they exist
    ## for the $CgiUrl expression, which is evaluated in this scope
    ## (presumably it refers to them by these names -- do not rename).
    my($title, $section, $subsection) =
        ($str =~ /([\+_\.\w-]+)\((\d)(\w*)\)/);
    $subsection = lc($subsection);

    $title =~ s/\+/%2B/g;               # '+' must be encoded in the URL
    my($href) = (eval $CgiUrl);
    qq|<B><A HREF="$href">$str</A></B>|;
}

##---------------------------------------------------------------------------
##      man_k() process a keyword search.  The problem we have is there
##      is no standard for keyword search results from man.  Solaris
##      systems have a different enough format to warrant dealing
##      with it as a special case.  For other cases, we try our best.
##      Unfortunately, there are some lines of results that may be
##      skipped.
##
##      Input is read from $InFH.  The entries are grouped by section
##      and printed with print_mank_sec(): sections 1 to 9 in order,
##      followed by everything else as section 'N'.
##
sub man_k {
    my($line, $refs, $section, $subsection, $desc);
    my(%Desc, %Sub);                    # {section 1-9}{refs}
    my(%SecN, %SecNsub, %SecNsec);      # Entries of any other section

    printhead()  unless $Bare;
    print $OutFH "<!-- Man keyword results converted by ",
                      "man2html $VERSION -->\n";

    while ($line = <$InFH>) {
        next if $line !~ /\(\d\w?\)\s+-\s/; # check if line can be handled
        ($refs,$section,$subsection,$desc) =
            $line =~ /^\s*(.*)\((\d)(\w*)\)\s*-\s*(.*)$/;

        if ($Solaris) {
            $refs =~ s/^\s*([\+_\.\w-]+)\s+([\+_\.\w-]+)\s*$/$1/;
                                        #  <topic> <manpage>
        } else {
            $refs =~ s/\s(and|or)\s/,/gi; # Convert and/or to commas
            ## Remove prefixed whatis path.  The whole prefix up to the
            ## colon is removed, but a double colon is left alone so
            ## that names like "File::Spec" survive.
            $refs =~ s/^[^:\s]+:(?!:)\s*//;
        }
        $refs =~ s/\s//g;               # Remove all whitespace
        $refs =~ s/,/, /g;              # Put space after comma
        htmlize(\$desc);                # Check for special chars in desc
        $desc =~ s/^(.)/\U$1/;          # Uppercase first letter in desc

        if ($section =~ /^[1-9]$/) {
            $Desc{$section}{$refs} = $desc;
            $Sub{$section}{$refs}  = $subsection;
        } else {                        # Catch all
            $SecN{$refs}    = $desc;
            $SecNsec{$refs} = $section;
            $SecNsub{$refs} = $subsection;
        }
    }

    ## A section without entries is passed as an empty hash, for which
    ## print_mank_sec() prints nothing.
    foreach $section (1 .. 9) {
        print_mank_sec($Desc{$section} || {}, $section,
                       $Sub{$section}  || {});
    }
    print_mank_sec(\%SecN, 'N', \%SecNsub, \%SecNsec);

    printtail()  unless $Bare;
}
##---------------------------------------------------------------------------
##      print_mank_sec() prints out manpage cross-refs of a specific section.
##
##      $sec    : hash ref, comma separated reference list => description
##      $sect   : section label for the heading; 'N' marks the catch-all
##                group, whose real section numbers are taken from $secsec
##      $secsub : hash ref, reference list => subsection
##      $secsec : hash ref, reference list => section (only for 'N')
##
##      Nothing is printed when $sec is empty.
##
sub print_mank_sec {
    my($sec, $sect, $secsub, $secsec) = @_;
    my(@array, @refs, $item, $title, $subsection, $section, $xref);
    $section = $sect;

    @array = sort keys %$sec;
    return  unless @array;

    print $OutFH "<H2>Section $section</H2>\n",
                 "<DL COMPACT>\n";
    foreach $item (@array) {
        ## The list is stored as "a, b"; splitting on the blank as well
        ## keeps it out of the link text.
        @refs = split(/,\s*/, $item);
        $section = $secsec->{$item}  if $sect eq 'N';
        $subsection = $secsub->{$item};
        if ($CgiUrl) {
            ## $title, $section and $subsection are in scope for the
            ## $CgiUrl expression evaluated here (presumably it refers
            ## to them by these names -- do not rename).  All references
            ## of an entry link to the first one.
            ($title = $refs[0]) =~ s/\(\)//g;   # watch out for extra ()'s
            $title =~ s/\+/%2B/g;       # Same encoding as in make_xref()
            $xref = eval $CgiUrl;
        }
        print $OutFH "<DT>\n";
        print $OutFH join(", ",
                          map { $CgiUrl ? qq|<B><A HREF="$xref">$_</A></B>|
                                        : $_ } @refs);
        print $OutFH " ($section$subsection)\n",
                     "</DT><DD>\n",
                     $sec->{$item}, "</DD>\n";
    }
    print $OutFH "</DL>\n";
}

##---------------------------------------------------------------------------
##      usage() prints the usage message to $OutFH and exits the
##      program with status 0.
##
sub usage {
    print $OutFH <<EndOfUsage;
Usage: $PROG [ options ] < infile > outfile
Options:
  -bare            : Do not put in HTML, HEAD, BODY tags
  -belem <elem>    : HTML Element for overstriked text (def: "B")
  -botm <#>        : Number of lines for bottom margin (def: 7)
  -cgiurl <url>    : URL for linking to other manpages
  -cgiurlexp <url> : Perl expression URL for linking to other manpages
  -compress        : Compress consecutive blank lines
  -headmap <file>  : Filename of user section head map file
  -help            : This message
  -k               : Process a keyword search result
  -leftm <#>       : Character width of left margin (def: 0)
  -nodepage        : Do not remove pagination lines
  -noheads         : Turn off section head detection
  -pgsize <#>      : Number of lines in a page (def: 66)
  -seealso         : Link to other manpages only in the SEE ALSO section
  -solaris         : Process keyword search result in Solaris format
  -sun             : Section heads are not overstriked in input
  -title <string>  : Title of manpage (def: Not defined)
  -topm <#>        : Number of lines for top margin (def: 7)
  -uelem <elem>    : HTML Element for underlined text (def: "I")

Description:
  $PROG takes formatted manpages from STDIN and converts it to HTML sent
  to STDOUT.  The -topm and -botm arguments are the number of lines to the
  main body text and NOT to the running headers/footers.

Version:
  $VERSION
  Copyright (C) 1995-1997  Earl Hood, ehood\@medusa.acs.uci.edu
  $PROG comes with ABSOLUTELY NO WARRANTY and $PROG may be copied only
  under the terms of the GNU General Public License, which may be found in
  the $PROG distribution.

EndOfUsage
    exit 0;
}