1#!/usr/bin/perl
2##---------------------------------------------------------------------------##
3##  File:
4##      @(#) man2html 1.2 97/08/12 12:57:30 @(#)
5##  Author:
6##      Earl Hood, ehood@medusa.acs.uci.edu
7##  Description:
8##      man2html is a Perl program to convert formatted nroff output
9##	to HTML.
10##
##	Recommended command-line options based on platform:
12##
13##	Platform		Options
14##	---------------------------------------------------------------------
15##	c2mp			<None, the defaults should be okay>
16##	hp9000s700/800		-leftm 1 -topm 8
17##	sun4			-sun
18##	---------------------------------------------------------------------
19##
20##---------------------------------------------------------------------------##
21##  Copyright (C) 1995-1997	Earl Hood, ehood@medusa.acs.uci.edu
22##
23##  This program is free software; you can redistribute it and/or modify
24##  it under the terms of the GNU General Public License as published by
25##  the Free Software Foundation; either version 2 of the License, or
26##  (at your option) any later version.
27##
28##  This program is distributed in the hope that it will be useful,
29##  but WITHOUT ANY WARRANTY; without even the implied warranty of
30##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
31##  GNU General Public License for more details.
32##
33##  You should have received a copy of the GNU General Public License
34##  along with this program; if not, write to the Free Software
35##  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
36##  02111-1307, USA
37##---------------------------------------------------------------------------##
38
39package Man2Html;
40
41use Getopt::Long;
42
## Program name: the invoking path with everything up to the last "/"
## removed.  It is interpolated into the usage message.
$PROG = $0;
$PROG =~ s{.*/}{};
$VERSION = '3.0.1';

## Input and output filehandles.  Values that are already set are kept;
## STDIN and STDOUT are only the defaults.
$InFH  ||= \*STDIN;
$OutFH ||= \*STDOUT;

## Backspace character, used in overstrike detection.  The glob
## assignment aliases $bs to the "\b" literal.
*bs = \"\b";
52
53##	Hash of section titles and their HTML tag wrapper.
54##	This list allows customization of what HTML tag is used for
55##	a given section head.
56##
57##	The section title can be a regular expression.  Therefore, one must
58##	be careful about quoting special characters.
59##
%SectionHead = (

    ## Top-level section titles.  Each key is a regular expression and
    ## every one of them is wrapped in <H2>.
    (map { ($_, '<H2>') } (
        '\S.*OPTIONS.*',
        'AUTHORS?',
        'BUGS',
        'COMPATIBILITY',
        'DEPENDENCIES',
        'DESCRIPTION',
        'DIAGNOSTICS',
        'ENVIRONMENT',
        'ERRORS',
        'EXAMPLES',
        'EXTERNAL INFLUENCES',
        'FILES',
        'LIMITATIONS',
        'NAME',
        'NOTES?',
        'OPTIONS',
        'REFERENCES',
        'RETURN VALUE',
        'SECTION.*:',
        'SEE ALSO',
        'STANDARDS CONFORMANCE',
        'STYLE CONVENTION',
        'SYNOPSIS',
        'SYNTAX',
        'WARNINGS',
    )),

    ## Indented "Section ...:" lines get the next heading level down.
    '\s+Section.*:' => '<H3>',

);
90
## Tag used for a detected heading that matches no %SectionHead pattern
$HeadFallback = '<H2>';

## Other globals

## Boolean flags; all of them start out off and are set from the
## command line by get_cli_opts().
(
    $Bare,          # Skip printing HTML head/foot
    $Compress,      # Compress consecutive blank lines
    $K,             # Process keyword search output
    $NoDepage,      # Do not strip page information
    $NoHeads,       # Do no section head detection
    $SeeAlso,       # Create xrefs only in the SEE ALSO section
    $Solaris,       # Keyword search output is in Solaris format
    $Sun,           # Section heads are not overstruck in the input
) = (0) x 8;

## Text settings
$Title   = '';      # Title
$CgiUrl  = '';      # CGI URL expression
$BTag    = 'B';     # Overstrike tag
$UTag    = 'I';     # Underline tag

## Page geometry, in lines
$pgsz    = 66;                          # Lines per page
$hdsz    = 7;                           # Top margin size
$ftsz    = 7;                           # Bottom margin size
$txsz    = $pgsz - ($hdsz + $ftsz);     # Text body length (52)

## Left margin
$leftmsz = 0;                           # Left margin size
$leftm   = ' ' x $leftmsz;              # Left margin pad (empty)
114
115#############################################################################
116##	Main Block
117#############################################################################
{
    ## Parse the command line first.  A false result (bad option or
    ## -help) shows the usage text; otherwise -k selects keyword search
    ## conversion and everything else is a normal manpage conversion.
    my $opts_ok = get_cli_opts();

    if    (!$opts_ok) { usage(); }
    elsif ($K)        { man_k(); }
    else              { do_it(); }
}
129
130#############################################################################
131##	Subroutines
132#############################################################################
133
sub do_it {

    ##  Convert one formatted manpage read from $InFH into HTML written
    ##  to $OutFH.
    ##
    ##  The per-line conversion loop is assembled as a string and then
    ##  run with eval.  Every %SectionHead pattern is spliced into that
    ##  string as a literal regular expression; the reason is to avoid
    ##  regular expression reevaluation in the section head detection
    ##  code.
    ##
    ##  The first chunk of the loop strips the top margin of each page,
    ##  optionally compresses blank lines, escapes HTML special
    ##  characters, creates manpage cross-reference links and marks up
    ##  underlined text.

    $doitcode =<<'EndOfDoItCode';

    my($line, $tmp, $i, $preindent, $see_also, $do, $blank, $secth);

    $see_also = !$SeeAlso;
    print $OutFH "<!-- Manpage converted by man2html $VERSION -->\n";
    LOOP: while(!eof($InFH)) {
        $blank = 0;

        ## Skip the top margin of the page
        for ($i=0; $i < $hdsz; $i++) {
            last LOOP  unless defined($_ = <$InFH>);
        }

        ## Process the text body of the page
        for ($i=0; $i < $txsz; $i++) {
            last LOOP  unless defined($_ = <$InFH>);

            ## Check if compress consecutive blank lines
            if ($Compress and !/\S/) {
                if ($blank) { next; } else { $blank = 1; }
            } else {
                $blank = 0;
            }

            ## Try to check if line space is needed at page boundaries ##
            if (!$NoDepage && ($i==0 || $i==($txsz-1)) && !/^\s*$/) {
                /^(\s*)/;  $tmp = length($1);
                if ($do) {
                    if ($tmp < $preindent) { print $OutFH "\n"; }
                } else {
                    $do = 1;
                }
                $preindent = $tmp;
            } else {
                $do = 0;  $preindent = 0;
            }

            ## Interpret line.  $line keeps the raw text for heading
            ## detection; $_ is the copy that gets marked up.
            $line = $_;
            entitize(\$_);              # Convert [&<>] to entity references

            ## Check for 'SEE ALSO' link only
            if (!$see_also && $CgiUrl && $SeeAlso) {
                ($tmp = $line) =~ s/.\010//go;
                if ($tmp =~ /^\s*SEE\s+ALSO\s*$/o) { $see_also = 1; }
                else { $see_also = 0; }
            }

            ## Create anchor links for manpage references
            s/((((.\010)+)?[\+_\.\w-])+\(((.\010)+)?
              \d((.\010)+)?\w*\))
             /make_xref($1)
             /geox  if $see_also;

            ## Emphasize underlined words
            # s/((_\010[^_])+[\.\(\)_]?(_\010[^_])+\)?)/emphasize($1)/oge;
            # s/((_\010[^_])+([\.\(\)_]?(_\010[^_])+)?)/emphasize($1)/oge;
            #
            # The previous expressions were trying to be clever about
            # detecting underlined text which contain non-alphanumeric
            # characters.  nroff will not underline non-alphanumeric
            # characters in an underlined phrase, and the above was trying
            # to detect that.  It does not work all the time, and it
            # screws up other text, so a simplified expression is used.

            s/((_\010[^_])+)/emphasize($1)/oge;

            $secth = 0;
            ## Check for strong text and headings
            if ($Sun || /.\010./o) {
                if (!$NoHeads) {
                    $line =~ s/.\010//go;
                    $tmp = $HeadFallback;
EndOfDoItCode

    ##  Create switch statement for detecting a heading.  The patterns
    ##  are visited in sorted order so that the generated code (and the
    ##  winner among overlapping patterns) is the same on every run.
    ##
    $doitcode .= "HEADSW: {\n";
    foreach my $head (sort keys %SectionHead) {
        $doitcode .= join("", "\$tmp = '$SectionHead{$head}', ",
                              "\$secth = 1, last HEADSW  ",
                              "if \$line =~ /^$leftm$head/o;\n");
    }
    $doitcode .= "}\n";

    ##  Rest of routine
    ##
    $doitcode .=<<'EndOfDoItCode';
                    if ($secth || $line =~ /^$leftm\S/o) {
                        ## chomp, not chop: a final line without a
                        ## newline must not lose its last character.
                        chomp $line;
                        ## $line is the raw input text, so it has to be
                        ## escaped before it is placed into the HTML.
                        htmlize(\$line);
                        $_ = $tmp . $line . $tmp;
                        ## Turn the trailing copy of the tag into an end tag
                        s%<([^>]*)>$%</$1>%;
                        $_ = "\n</PRE>\n" . $_ . "<PRE>\n";
                    } else {
                        s/(((.\010)+.)+)/strongize($1)/oge;
                    }
                } else {
                    s/(((.\010)+.)+)/strongize($1)/oge;
                }
            }
            print $OutFH $_;
        }

        ## Skip the bottom margin of the page
        for ($i=0; $i < $ftsz; $i++) {
            last LOOP  unless defined($_ = <$InFH>);
        }
    }
EndOfDoItCode


    ##  Perform processing.

    printhead()  unless $Bare;
    print $OutFH "<PRE>\n";
    eval $doitcode;                     # $doitcode defined above
    ##  A compile or run-time failure of the generated code (for example
    ##  a malformed heading pattern) used to pass unnoticed; report it,
    ##  but still close the document below.
    warn "$PROG: manpage conversion failed: $@"  if $@;
    print $OutFH "</PRE>\n";
    printtail()  unless $Bare;
}
255
256##---------------------------------------------------------------------------
257##
sub get_cli_opts {
    ## Parse the command line with Getopt::Long and copy the results
    ## into the program's global settings.
    ##
    ## Returns 1 when processing should go ahead.  Returns 0 when the
    ## options are invalid, when -help was given, or when the margins
    ## leave no room for text on a page; the caller then prints the
    ## usage message.
    return 0  unless
    GetOptions(
        "bare",         # Leave out HTML, HEAD, BODY tags.
        "belem=s",      # HTML Element for overstriked text (def: "B")
        "botm=i",       # Number of lines for bottom margin (def: 7)
        "cgiurl=s",     # CGI URL for linking to other manpages
        "cgiurlexp=s",  # CGI URL Perl expr for linking to other manpages
        "compress",     # Compress consecutive blank lines
        "headmap=s",    # Filename of user section head map file
        "k",            # Process input from 'man -k' output.
        "leftm=i",      # Character width of left margin (def: 0)
        "nodepage",     # Do not remove pagination lines
        "noheads",      # Do not detect for section heads
        "pgsize=i",     # Number of lines in a page (def: 66)
        "seealso",      # Link to other manpages only in the SEE ALSO section
        "solaris",      # Parse 'man -k' output from a solaris system
        "sun",          # Section heads are not overstriked in input
        "title=s",      # Title of manpage (def: Not defined)
        "topm=i",       # Number of lines for top margin (def: 7)
        "uelem=s",      # HTML Element for underlined text (def: "I")

        "help"          # Short usage message
    );
    return 0  if defined($opt_help);

    ## Page geometry.  With -nodepage nothing is stripped, so both
    ## margins collapse to zero and the whole page is text.
    $pgsz = $opt_pgsize || $pgsz;
    if (defined($opt_nodepage)) {
        $hdsz   = 0;
        $ftsz   = 0;
    } else {
        $hdsz   = $opt_topm  if defined($opt_topm);
        $ftsz   = $opt_botm  if defined($opt_botm);
    }
    $txsz       = $pgsz - ($hdsz + $ftsz);
    if ($txsz < 1) {
        ## Without at least one body line per page the converter would
        ## read the whole input and print none of it.
        warn "Margins ($hdsz top, $ftsz bottom) leave no text lines ",
             "on a page of $pgsz lines\n";
        return 0;
    }
    $leftmsz    = $opt_leftm  if defined($opt_leftm);
    $leftm      = ' ' x $leftmsz;

    ## Simple on/off switches
    $Bare       = defined($opt_bare);
    $Compress   = defined($opt_compress);
    $K          = defined($opt_k);
    $NoDepage   = defined($opt_nodepage);
    $NoHeads    = defined($opt_noheads);
    $SeeAlso    = defined($opt_seealso);
    $Solaris    = defined($opt_solaris);
    $Sun        = defined($opt_sun);

    $Title      = $opt_title || $Title;

    ## $CgiUrl always holds Perl code that is later run with eval.  A
    ## plain -cgiurl value is wrapped in a double-quoted string so the
    ## variables it mentions are interpolated at link time.
    ## NOTE(review): both options end up in a string eval; they must
    ## only ever come from a trusted command line.
    $CgiUrl     = $opt_cgiurlexp ||
                        ($opt_cgiurl ? qq{return "$opt_cgiurl"} : '');

    ## Element names are stored without angle brackets
    $BTag       = $opt_belem || $BTag;
    $UTag       = $opt_uelem || $UTag;
    $BTag       =~ s/[<>]//g;
    $UTag       =~ s/[<>]//g;

    if (defined($opt_headmap)) {
        my $mapfile = $opt_headmap;

        ## require looks up a relative name through @INC, which may not
        ## contain the current directory.  Name an existing local file
        ## explicitly so that it is found.
        $mapfile = "./$mapfile"
            if $mapfile !~ m{^\.{0,2}/} && -f $mapfile;

        ## require dies on failure instead of returning false, so it
        ## has to be trapped for the warning to be reachable.
        eval { require $mapfile; 1 }
            or warn "Unable to read $opt_headmap: $@";
    }
    1;
}
319
320##---------------------------------------------------------------------------
sub printhead {
    ## Write the opening of the HTML document to $OutFH.  The HEAD
    ## section and the H1 heading are only written when $Title is set.
    my $titled = $Title ? 1 : 0;

    print $OutFH "<HTML>\n";
    if ($titled) {
        print $OutFH "<HEAD>\n<TITLE>$Title</TITLE>\n</HEAD>\n";
    }
    print $OutFH "<BODY>\n";
    if ($titled) {
        print $OutFH "<H1>$Title</H1>\n<HR>\n";
    }
}
330
331##---------------------------------------------------------------------------
sub printtail {
    ## Write the closing of the HTML document to $OutFH: a rule, the
    ## man2html credit line and the BODY/HTML end tags.
    my @footer = (
        '<HR>',
        '<ADDRESS>',
        'Man(1) output converted with',
        '<a href="http://www.oac.uci.edu/indiv/ehood/man2html.html">man2html</a>',
        '</ADDRESS>',
        '</BODY>',
        '</HTML>',
    );
    my $text = join("\n", @footer) . "\n";
    print $OutFH $text;
}
343
344##---------------------------------------------------------------------------
sub emphasize {
    ## Return the given underlined text with every "character +
    ## backspace" pair removed and the remainder wrapped in the $UTag
    ## element.
    my $plain = $_[0];
    $plain =~ s/.\010//g;
    return join('', "<$UTag>", $plain, "</$UTag>");
}
351
352##---------------------------------------------------------------------------
sub strongize {
    ## Return the given overstruck text with every "character +
    ## backspace" pair removed and the remainder wrapped in the $BTag
    ## element.
    my $plain = $_[0];
    $plain =~ s/.\010//g;
    return join('', "<$BTag>", $plain, "</$BTag>");
}
359
360##---------------------------------------------------------------------------
sub entitize {
    ## Replace the HTML special characters &, < and > in the referenced
    ## string with entity references, in place.
    ##
    ##   $txt : reference to the line to convert
    my($txt) = shift;

    ## Check for special characters in overstrike text ##
    ## Underlined specials first ...
    $$txt =~ s/_\010\&/strike('_', '&')/ge;
    $$txt =~ s/_\010</strike('_', '<')/ge;
    $$txt =~ s/_\010>/strike('_', '>')/ge;

    ## ... then specials overstruck with themselves
    $$txt =~ s/(\&\010)+\&/strike('&', '&')/ge;
    $$txt =~ s/(<\010)+</strike('<', '<')/ge;
    $$txt =~ s/(>\010)+>/strike('>', '>')/ge;

    ## Check for special characters in regular text.  A special
    ## character that touches a backspace belongs to an overstrike
    ## sequence (including the ones strike() just produced) and must be
    ## left alone.  Lookaround assertions test the neighbours without
    ## consuming them, so a special character at the very start of the
    ## line, or one separated from the previous special character by a
    ## single character ("a<b<c"), is converted as well.
    $$txt =~ s/(?<!\010)([&<>])(?!\010)/htmlize2($1)/ge;
}
380
381##---------------------------------------------------------------------------
382##	escape special characters in a string, in-place
383##
sub htmlize {
    ## $_[0] is a reference to the string.  The referenced string is
    ## modified in place and its new value is also returned.
    my $ref = $_[0];
    for ($$ref) {
        s/&/\&amp;/g;       # ampersands first, so the entities added
        s/</\&lt;/g;        # by the next two steps are not re-escaped
        s/>/\&gt;/g;
    }
    return $$ref;
}
391
392##---------------------------------------------------------------------------
393##	htmlize2() is used by entitize.
394##
sub htmlize2 {
    ## Takes the string by value and returns an escaped copy; the
    ## caller's string is not touched.
    my($text) = @_;
    my %entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;');
    $text =~ s/([&<>])/$entity_for{$1}/g;
    return $text;
}
402
403##---------------------------------------------------------------------------
404##	strike converts HTML special characters in overstriked text
405##	into entity references.  The entities are overstriked so
406##	strongize() and emphasize() will recognize the entity to be
407##	wrapped in tags.
408##
sub strike {
    ##   $w    : '_' for underlined text; anything else is treated as
    ##           text overstruck with itself
    ##   $char : the special character, one of & < >
    ##
    ## Returns the entity reference for $char with every character of
    ## the entity carrying the same kind of overstrike.  For any other
    ## $char a warning is issued and undef is returned.
    my($w, $char) = @_;
    my %entity_of = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;');
    my $entity = $entity_of{$char};
    my($ret);

    if (!defined($entity)) {
        warn qq|Unrecognized character, "$char", passed to strike()\n|;
    } elsif ($w eq '_') {
        ## underscore, backspace, character
        $ret = join('', map { '_' . $bs . $_ } split(//, $entity));
    } else {
        ## character, backspace, same character
        $ret = join('', map { $_ . $bs . $_ } split(//, $entity));
    }
    $ret;
}
435
436##---------------------------------------------------------------------------
437##	make_xref() converts a manpage crossreference into a hyperlink.
438##
sub make_xref {
    ##   $str : manpage reference such as "ls(1)"; it may still contain
    ##          overstrike sequences
    ##
    ## Returns the reference in bold.  When $CgiUrl is set, the
    ## reference is parsed and $CgiUrl is evaluated to produce the link
    ## target, and the text is additionally wrapped in an anchor.
    my $str = shift;
    $str =~ s/.\010//g;                 # Remove overstriking

    return qq|<B>$str</B>|  unless $CgiUrl;

    ## NOTE: the names $title, $section and $subsection must not be
    ## changed.  The code in $CgiUrl is run by string eval and refers
    ## to these lexical variables by name.
    my($title, $section, $subsection) =
        ($str =~ /([\+_\.\w-]+)\((\d)(\w*)\)/);

    ## Nothing to link to if the text does not parse as a reference
    return qq|<B>$str</B>|  unless defined($title);

    $subsection = lc($subsection);
    $title =~ s/\+/%2B/g;               # "+" would be read as a space

    ## NOTE(review): $CgiUrl comes from the command line and is
    ## executed as Perl code; it must come from a trusted source.
    my($href) = (eval $CgiUrl);
    if ($@) {
        ## A broken URL expression used to produce an empty HREF
        ## silently.  Report it and fall back to unlinked text.
        warn "Unable to evaluate CGI URL expression: $@";
        return qq|<B>$str</B>|;
    }
    $href = ''  unless defined($href);
    qq|<B><A HREF="$href">$str</A></B>|;
}
455
456##---------------------------------------------------------------------------
##	man_k() processes a keyword search.  The problem we have is there
##	is no standard for keyword search results from man.  Solaris
##	systems have a different enough format to warrant dealing
##	with it as a special case.  For other cases, we try our best.
##	Unfortunately, there are some lines of results that may be
##	skipped.
463##
sub man_k {
    ## Read keyword search results from $InFH, group the entries by
    ## manual section and print each group with print_mank_sec().
    ## Sections 1 to 9 each get their own group; any other section
    ## number goes into the catch-all group "N".
    my($line, $refs, $section, $subsection, $desc);
    my @groups = (1 .. 9, 'N');
    my(%desc_in, %subsec_in);   # group => { refs => value }
    my %section_of;             # refs => real section (group N only)

    foreach my $group (@groups) {
        $desc_in{$group}   = {};
        $subsec_in{$group} = {};
    }

    printhead()  unless $Bare;
    print $OutFH "<!-- Man keyword results converted by ",
                      "man2html $VERSION -->\n";

    while ($line = <$InFH>) {
        next if $line !~ /\(\d\w?\)\s+-\s/; # check if line can be handled
        ($refs,$section,$subsection,$desc) =
            $line =~ /^\s*(.*)\((\d)(\w*)\)\s*-\s*(.*)$/;

        if ($Solaris) {
            $refs =~ s/^\s*([\+_\.\w-]+)\s+([\+_\.\w-]+)\s*$/$1/;
                                        #  <topic> <manpage>
        } else {
            $refs =~ s/\s(and|or)\s/,/gi; # Convert and/or to commas
            $refs =~ s/^[^:\s]:\s*//;   # Remove prefixed whatis path
        }
        $refs =~ s/\s//g;               # Remove all whitespace
        $refs =~ s/,/, /g;              # Put space after comma
        htmlize(\$desc);                # Check for special chars in desc
        $desc =~ s/^(.)/\U$1/;          # Uppercase first letter in desc

        ## File the entry under its section, or under N (catch all)
        my $group = ($section =~ /\A[1-9]\z/) ? $section : 'N';
        $desc_in{$group}{$refs}   = $desc;
        $section_of{$refs}        = $section  if $group eq 'N';
        $subsec_in{$group}{$refs} = $subsection;
    }

    foreach my $group (@groups) {
        if ($group eq 'N') {
            print_mank_sec($desc_in{$group}, 'N', $subsec_in{$group},
                           \%section_of);
        } else {
            print_mank_sec($desc_in{$group}, $group, $subsec_in{$group});
        }
    }

    printtail()  unless $Bare;
}
528##---------------------------------------------------------------------------
529##	print_mank_sec() prints out manpage cross-refs of a specific section.
530##
sub print_mank_sec {
    ##   $sec    : hash ref, reference list => description
    ##   $sect   : section label for the heading; 'N' marks the
    ##             catch-all group
    ##   $secsub : hash ref, reference list => subsection
    ##   $secsec : hash ref, reference list => section; only consulted
    ##             when $sect is 'N'
    ##
    ## Prints a heading and a definition list for the section to
    ## $OutFH.  Nothing is printed when $sec has no entries.
    my($sec, $sect, $secsub, $secsec) = @_;

    ## NOTE: the names $title, $section and $subsection must not be
    ## changed.  The code in $CgiUrl is run by string eval and refers
    ## to these lexical variables by name.
    my(@array, @refs, $item, $title, $subsection, $section, $xref);
    $section = $sect;

    @array = sort keys %$sec;
    return  unless @array;

    print $OutFH "<H2>Section $section</H2>\n",
                 "<DL COMPACT>\n";
    foreach $item (@array) {
        ## The key is a ", "-separated list.  Trim each name so that
        ## the blank after the comma ends up neither in the link text
        ## nor doubled in the separator.
        @refs = split(/,/, $item);
        foreach (@refs) { s/^\s+//;  s/\s+$//; }

        $section = $secsec->{$item}  if $sect eq 'N';
        $subsection = $secsub->{$item};

        my $linked = 0;
        if ($CgiUrl) {
            ($title = $refs[0]) =~ s/\(\)//g;  # watch out for extra ()'s
            $title =~ s/\+/%2B/g;       # same encoding as make_xref()
            $xref = eval $CgiUrl;
            if ($@) {
                ## Report a broken URL expression and print the names
                ## unlinked instead of with an empty HREF.
                warn "Unable to evaluate CGI URL expression: $@";
            } else {
                $xref = ''  unless defined($xref);
                $linked = 1;
            }
        }

        ## Every name of the entry links to the same page
        my @names = $linked
            ? (map { qq|<B><A HREF="$xref">$_</A></B>| } @refs)
            : @refs;

        print $OutFH "<DT>\n",
                     join(", ", @names),
                     " ($section$subsection)\n",
                     "</DT><DD>\n",
                     $sec->{$item}, "</DD>\n";
    }
    print $OutFH "</DL>\n";
}
567
568##---------------------------------------------------------------------------
569##
sub usage {
    ## Write the option summary, a short description and the version
    ## and copyright notice to $OutFH, then end the program with exit
    ## status 0.  This routine does not return.
    my $help_text = <<EndOfHelpText;
Usage: $PROG [ options ] < infile > outfile
Options:
  -bare            : Do not put in HTML, HEAD, BODY tags
  -belem <elem>    : HTML Element for overstriked text (def: "B")
  -botm <#>        : Number of lines for bottom margin (def: 7)
  -cgiurl <url>    : URL for linking to other manpages
  -cgiurlexp <url> : Perl expression URL for linking to other manpages
  -compress        : Compress consective blank lines
  -headmap <file>  : Filename of user section head map file
  -help            : This message
  -k               : Process a keyword search result
  -leftm <#>       : Character width of left margin (def: 0)
  -nodepage        : Do not remove pagination lines
  -noheads         : Turn off section head detection
  -pgsize <#>      : Number of lines in a page (def: 66)
  -seealso         : Link to other manpages only in the SEE ALSO section
  -solaris         : Process keyword search result in Solaris format
  -sun             : Section heads are not overstriked in input
  -title <string>  : Title of manpage (def: Not defined)
  -topm <#>        : Number of lines for top margin (def: 7)
  -uelem <elem>    : HTML Element for underlined text (def: "I")

Description:
  $PROG takes formatted manpages from STDIN and converts it to HTML sent
  to STDOUT.  The -topm and -botm arguments are the number of lines to the
  main body text and NOT to the running headers/footers.

Version:
  $VERSION
  Copyright (C) 1995-1997  Earl Hood, ehood\@medusa.acs.uci.edu
  $PROG comes with ABSOLUTELY NO WARRANTY and $PROG may be copied only
  under the terms of the GNU General Public License, which may be found in
  the $PROG distribution.

EndOfHelpText
    print $OutFH $help_text;
    exit 0;
}
609