# Copyright 2008, 2009, 2010, 2011, 2012, 2013, 2015 Kevin Ryde

# HTML-FormatExternal is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation; either version 3, or (at your option) any
# later version.
#
# HTML-FormatExternal is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with HTML-FormatExternal. If not, see <http://www.gnu.org/licenses/>.



# Maybe:
#     capture error output
#     errors_to => \$var
#     combine error messages
#


package HTML::FormatExternal;
use 5.006;
use strict;
use warnings;
use Carp;
use File::Spec 0.80; # version 0.80 of perl 5.6.0 or thereabouts for devnull()
use IPC::Run;

# uncomment this to run the ### lines
# use Smart::Comments;

our $VERSION = 26;

# $formatter = CLASS->new (key => value, ...)
#
# Constructor.  The object is nothing more than a blessed hash holding the
# key/value options given, kept for later use by format().
#
sub new {
  my $class = shift;
  my %self = @_;
  return bless \%self, $class;
}

# $text = $formatter->format ($tree_or_string)
#
# Format either an HTML string or an object.  Any reference argument is
# turned into a string by calling its as_HTML() method.  The string and the
# options stored in $formatter are then handed to format_string().
#
sub format {
  my ($self, $html) = @_;
  my $html_str = (ref $html ? $html->as_HTML : $html);
  return $self->format_string ($html_str, %$self);
}

# Charsets used for wide-character strings when the caller doesn't specify
# one: the input default is applied in format_string(), the output default
# in format_file().
use constant _WIDE_INPUT_CHARSET => 'UTF-8';
use constant _WIDE_OUTPUT_CHARSET => 'UTF-8';

# format_string() takes the easy approach of putting the string in a temp
# file and letting format_file() do the real work. The formatter programs
# can generally read stdin and write stdout, so might do that with select()
# to simultaneously write and read back.
#
# $text = CLASS->format_string ($html_str, key => value, ...)
#
# Write $html_str to a temporary file and format that with format_file().
#
# A "base" option is inserted as a <base> element before anything else is
# done to the string.  If $html_str is a wide-char string (utf8::is_utf8())
# then it's written through an ":encoding()" layer of the "input_charset"
# option, defaulting to _WIDE_INPUT_CHARSET.  The special input_charset
# value "entitize" instead converts the string with _entitize() and removes
# the input_charset option.
#
# Dies if the encoding layer cannot be added or the temp file not written.
#
sub format_string {
  my ($class, $html_str, %options) = @_;

  my $fh = _tempfile();
  my $input_wide = eval { utf8::is_utf8($html_str) };
  _output_wide(\%options, $input_wide);

  # insert <base> while in wide chars
  if (defined $options{'base'}) {
    $html_str = _base_prefix(\%options, $html_str, $input_wide);
  }

  if ($input_wide) {
    if (! $options{'input_charset'}) {
      $options{'input_charset'} = $class->_WIDE_INPUT_CHARSET;
    }
    ### input_charset for wide: $options{'input_charset'}
    if ($options{'input_charset'} eq 'entitize') {
      $html_str = _entitize($html_str);
      delete $options{'input_charset'};
    } else {
      my $layer = ":encoding($options{'input_charset'})";
      binmode ($fh, $layer) or die 'Cannot add layer ',$layer;
    }
  }

  do {
    print $fh $html_str
      and close($fh)
    } || die 'Cannot write temp file: ',$!;

  return $class->format_file ($fh->filename, %options);
}

# Left margin is synthesized by adding spaces afterwards because the various
# programs have pretty variable support for a specified margin.
#   * w3m doesn't seem to have a left margin option at all
#   * lynx has one but it's too well hidden in its style sheet or something
#   * elinks has document.browse.margin_width but it's limited to 8 or so
#   * netrik doesn't seem to have one at all
#   * vilistextum has a "spaces" internally for lists etc but no apparent
#     way to initialize from the command line
#
# $text = CLASS->format_file ($filename, key => value, ...)
#
# Run the formatter program on $filename and return its output as a string.
# The command to run comes from $class->_make_run(), which is not defined in
# this file (presumably each formatter subclass supplies it, likewise
# DEFAULT_LEFTMARGIN and DEFAULT_RIGHTMARGIN).
#
# With a "base" option the file is first copied to a temp file with a <base>
# element inserted.  With "output_wide" the program output is decoded from
# "output_charset" (default _WIDE_OUTPUT_CHARSET).  Croaks if the input file
# cannot be read or copied.  Errors running the program are ignored, except
# an "Insecure ..." taint error which is re-thrown.
#
sub format_file {
  my ($class, $filename, %options) = @_;

  # If neither leftmargin nor rightmargin are specified then '_width' is
  # unset and the _make_run() funcs leave it to the program defaults.
  #
  # If either leftmargin or rightmargin are set then '_width' is established
  # and the _make_run() funcs use it and a zero left margin, then the
  # actual left margin is applied below.
  #
  # The DEFAULT_LEFTMARGIN and DEFAULT_RIGHTMARGIN establish the defaults
  # when just one of the two is set. Not good hard coding those values,
  # but the programs don't have anything to set one but not the other.
  #
  my $leftmargin = $options{'leftmargin'};
  my $rightmargin = $options{'rightmargin'};
  if (defined $leftmargin || defined $rightmargin) {
    if (! defined $leftmargin) { $leftmargin = $class->DEFAULT_LEFTMARGIN; }
    if (! defined $rightmargin) { $rightmargin = $class->DEFAULT_RIGHTMARGIN; }
    $options{'_width'} = $rightmargin - $leftmargin;
  }

  _output_wide(\%options, 0); # file input is reckoned as not wide
  if ($options{'output_wide'}) {
    $options{'output_charset'} ||= $class->_WIDE_OUTPUT_CHARSET;
  }

  my $tempfh;
  if (defined $options{'base'}) {
    # insert <base> by copying to a temp file

    # File::Copy rudely calls eq() to compare $from and $to. Need either
    # File::Temp 0.18 to have that work on $tempfh, or File::Copy 2.??? for
    # it to check an overload method exists first. Newer File::Temp is
    # available from cpan, where File::Copy may not be, so ask for
    # File::Temp 0.18.
    require File::Temp;
    File::Temp->VERSION(0.18);

    # must sysread()/syswrite() because that's what File::Copy does (as of
    # its version 2.30) so anything held in the perl buffering by the normal
    # read() is lost.

    # The first 4 bytes are enough for _base_prefix() to see any BOM.
    my $initial;
    my $fh;
    do {
      open $fh, '<', $filename
        and binmode $fh
        and defined (sysread $fh, $initial, 4)
      } || croak "Cannot open $filename: $!";
    ### $initial

    $initial = _base_prefix(\%options, $initial, 0);

    $tempfh = _tempfile();
    $tempfh->autoflush(1);
    require File::Copy;
    do {
      defined(syswrite($tempfh, $initial))
        and File::Copy::copy($fh, $tempfh)
        and close $tempfh
        and close $fh
      } || croak "Cannot copy $filename to temp file: $!";


    $filename = $tempfh->filename;
  }

  # # dump the file being crunched
  # print "Bytes passed to program:\n";
  # IPC::Run::run(['hd'], '<',$filename, '|',['cat']);

  # _make_run() can set $options{'ENV'} too
  my ($command_aref, @run) = $class->_make_run($filename, \%options);
  my $env = $options{'ENV'} || {};

  ### $command_aref
  ### @run
  ### $env

  if (! @run) {
    push @run, '<', File::Spec->devnull;
  }

  my $str;
  {
    local %ENV = (%ENV, %$env); # overrides from _make_run()
    eval { IPC::Run::run($command_aref,
                         @run,
                         '>', \$str,
                         # FIXME: what to do with stderr ?
                         # '2>', File::Spec->devnull,
                        ) };
  }
  _die_on_insecure();
  ### $str

  ### final output_wide: $options{'output_wide'}
  if ($options{'output_wide'}) {
    require Encode;
    $str = Encode::decode ($options{'output_charset'}, $str);
  }

  if ($leftmargin) {
    my $fill = ' ' x $leftmargin;
    $str =~ s/^(.)/$fill$1/mg; # non-empty lines only
  }
  return $str;
}

# most program running errors are quietly ignored for now, but re-throw
# "Insecure $ENV{PATH}" when cannot run due to taintedness.
#
# This inspects $@, so it must be called straight after the eval{} whose
# error is to be checked.
sub _die_on_insecure {
  if ($@ =~ /^Insecure/) {
    die $@;
  }
}

# $str = $self_or_class->_run_version ($command_aref, @ipc_options)
#
# Run the command in $command_aref with stdin from the null device and
# return what it prints to stdout, with trailing newlines collapsed to one.
# @ipc_options are extra IPC::Run::run() arguments; if none are given then
# stderr is sent to the null device.  Errors from the run are trapped and
# the return is then whatever was captured, possibly undef.
#
sub _run_version {
  my ($self_or_class, $command_aref, @ipc_options) = @_;
  ### _run_version() ...
  ### $command_aref
  ### @ipc_options

  if (! @ipc_options) {
    @ipc_options = ('2>', File::Spec->devnull);
  }

  my $version; # left undef if any exec/slurp problem
  eval { IPC::Run::run($command_aref,
                       '<', File::Spec->devnull,
                       '>', \$version,
                       @ipc_options) };

  # strip blank lines at end of lynx, maybe others
  if (defined $version) { $version =~ s/\n+$/\n/s; }
  return $version;
}

# return a File::Temp filehandle object
#
# The file is created in the system temp directory with a ".html" suffix
# and the handle is put in binmode().  Dies if binmode() fails.
sub _tempfile {
  require File::Temp;
  my $fh = File::Temp->new (TEMPLATE => 'HTML-FormatExternal-XXXXXX',
                            SUFFIX => '.html',
                            TMPDIR => 1);
  binmode($fh) or die 'Oops, cannot set binmode() on temp file';

  ### tempfile: $fh->filename
  # $fh->unlink_on_destroy(0);  # to preserve for debugging ...

  return $fh;
}

# _output_wide ($options_hashref, $input_wide)
#
# Resolve the "output_wide" option in place: when it's undef or "as_input"
# it is replaced by $input_wide, otherwise it's left as given.
sub _output_wide {
  my ($options, $input_wide) = @_;
  if (! defined $options->{'output_wide'}
      || $options->{'output_wide'} eq 'as_input') {
    $options->{'output_wide'} = $input_wide;
  }
}

# $str is HTML or some initial bytes.
# Return a new string with <base> at the start.
#
# $options is a hashref.  Its "base" entry is the URL to use and is deleted
# from the hash.  Its "input_charset" entry, if any, is read but not changed.
# $input_wide true means $str is wide chars and <base> is inserted at the
# very start as-is.  Otherwise $str is bytes and <base> is encoded to match
# and placed after any BOM recognised.
#
sub _base_prefix {
  my ($options, $str, $input_wide) = @_;
  my $base = delete $options->{'base'};
  ### _base_prefix: $base

  $base = "$base";           # stringize possible URI object
  $base = _entitize($base);  # probably shouldn't be any non-ascii in a url
  $base = "<base href=\"$base\">\n";

  my $pos = 0;
  unless ($input_wide) {
    # encode $base in the input_charset, and possibly after a BOM.
    #
    # Lynx recognises a BOM, if it doesn't have other -assume_charset. It
    # recognises it only at the start of the file, so must insert <base>
    # after it here to preserve that feature of Lynx.
    #
    # If input_charset is utf-32 or utf-16 then it seems reasonable to step
    # over any BOM. But Lynx for some reason doesn't like a BOM together
    # with utf-32 or utf-16 specified. Dunno if that's a bug or a feature
    # on its part.

    my $input_charset = $options->{'input_charset'};
    if (! defined $input_charset || lc($input_charset) eq 'utf-32') {
      # UTF-32 BOM is 4 bytes
      if ($str =~ /^\000\000\376\377/) {
        $input_charset = 'utf-32be';
        $pos = 4;
      } elsif ($str =~ /^\377\376\000\000/) {
        $input_charset = 'utf-32le';
        $pos = 4;
      }
    }
    if (! defined $input_charset || lc($input_charset) eq 'utf-16') {
      # UTF-16 BOM is 2 bytes, in either byte order.  Stepping any further
      # would put <base> after the first character of the document.
      if ($str =~ /^\376\377/) {
        $input_charset = 'utf-16be';
        $pos = 2;
      } elsif ($str =~ /^\377\376/) {
        $input_charset = 'utf-16le';
        $pos = 2;
      }
    }
    if (defined $input_charset) {
      # encode() errors out if unknown charset, and doesn't exist for older
      # Perl, in which case leave $base as ascii. May not be right, but
      # ought to work with the various ASCII superset encodings.
      eval {
        require Encode;
        $base = Encode::encode ($input_charset, $base);
      };
    }
  }
  substr($str, $pos,0, $base);  # insert $base at $pos
  return $str;
}

# return $str with every character outside printable ASCII (\x20 to \x7E)
# replaced by a decimal numeric entity like &#123;
sub _entitize {
  my ($str) = @_;
  $str =~ s{([^\x20-\x7E])}{'&#'.ord($1).';'}eg;
  ### $str
  return $str;
}

1;
__END__

=for stopwords HTML-FormatExternal formatter formatters charset charsets TreeBuilder ie latin-1 config Elinks absolutized tty Ryde filename recognise BOM UTF entitized unrepresentable untaint superset onwards overstriking

=head1 NAME

HTML::FormatExternal - HTML to text formatting using external programs

=head1 DESCRIPTION

This is a collection of formatter modules which turn HTML into plain text by
dumping it through the respective external programs.

    HTML::FormatText::Elinks
    HTML::FormatText::Html2text
    HTML::FormatText::Links
    HTML::FormatText::Lynx
    HTML::FormatText::Netrik
    HTML::FormatText::Vilistextum
    HTML::FormatText::W3m
    HTML::FormatText::Zen

The module interfaces are compatible with C<HTML::Formatter> modules such as
C<HTML::FormatText>, but the external programs do all the work.

Common formatting options are used where possible, such as C<leftmargin> and
C<rightmargin>. So just by switching the class you can use a different
program (or the plain C<HTML::FormatText>) according to personal preference,
or strengths and weaknesses, or what you've got.

There's nothing particularly difficult about piping through these programs,
but a unified interface hides details like how to set margins and how to
force input or output charsets.

=head1 FUNCTIONS

Each of the classes above provides the following functions. The C<XXX> in
the class names here is a placeholder for any of C<Elinks>, C<Lynx>, etc as
above.

See F<examples/demo.pl> in the HTML-FormatExternal sources for a complete
sample program.

=head2 Formatter Compatible Functions

=over 4

=item C<< $text = HTML::FormatText::XXX->format_file ($filename, key=>value,...) >>

=item C<< $text = HTML::FormatText::XXX->format_string ($html_string, key=>value,...) >>

Run the formatter program over a file or string with the given options and
return the formatted result as a string. See L</OPTIONS> below for possible
key/value options. For example,

    $text = HTML::FormatText::Lynx->format_file ('/my/file.html');

    $text = HTML::FormatText::W3m->format_string
      ('<html><body> <p> Hello world! </p> </body></html>');

C<format_file()> ensures any C<$filename> is interpreted as a filename (by
escaping as necessary against however the programs interpret command line
arguments).

=item C<< $formatter = HTML::FormatText::XXX->new (key=>value, ...) >>

Create a formatter object with the given options. In the current
implementation an object doesn't do much more than remember the options for
future use.

    $formatter = HTML::FormatText::Elinks->new(rightmargin => 60);

=item C<< $text = $formatter->format ($tree_or_string) >>

Run the C<$formatter> program on a C<HTML::TreeBuilder> tree or a string,
using the options in C<$formatter>, and return the result as a string.

A TreeBuilder argument (ie. a C<HTML::Element>) is accepted for
compatibility with C<HTML::Formatter>. The tree is simply turned into a
string with C<< $tree->as_HTML >> to pass to the program, so if you've got a
string already then give that instead of a tree.

C<HTML::Element> itself has a C<format()> method (see
L<HTML::Element/format>) which runs a given C<$formatter>.
A C<HTML::FormatExternal> object can be used for C<$formatter>.

    $text = $tree->format($formatter);

    # which dispatches to
    $text = $formatter->format($tree);

=back

=head2 Extra Functions

The following are extra methods not available in the plain
C<HTML::FormatText>.

=over 4

=item C<< HTML::FormatText::XXX->program_version () >>

=item C<< HTML::FormatText::XXX->program_full_version () >>

=item C<< $formatter->program_version () >>

=item C<< $formatter->program_full_version () >>

Return the version number of the formatter program as reported by its
C<--version> or similar option. If the formatter program is not available
then return C<undef>.

C<program_version()> is the bare version number, perhaps with "beta" or
similar indication. C<program_full_version()> is the entire version output,
which may include build options, copyright notice, etc.

    $str = HTML::FormatText::Lynx->program_version();
    # eg. "2.8.7dev.10"

    $str = HTML::FormatText::W3m->program_full_version();
    # eg. "w3m version w3m/0.5.2, options lang=en,m17n,image,..."

The version number of the respective Perl module itself is available in the
usual way (see L<UNIVERSAL/VERSION>).

    $modulever = HTML::FormatText::Netrik->VERSION;
    $modulever = $formatter->VERSION;

=back

=head1 CHARSETS

File or byte string input is by default interpreted by the programs in their
usual ways. This should mean HTML Latin-1 but user configurations might
override that and some programs recognise a C<< <meta> >> charset
declaration or a Unicode BOM. The C<input_charset> option below can force
the input charset.

Perl wide-character input string is encoded and passed to the program in
whatever way it best understands. Usually this is UTF-8 but in some cases
it is entitized instead. The C<input_charset> option can force the input
charset to use if for some reason UTF-8 is not best.

The output string is either bytes or wide chars. By default output is the
same as input, so wide char string input gives wide output and byte input
string or file input gives byte output. The C<output_wide> option can force
the output type (and is the way to get wide chars back from
C<format_file()>).

Byte output is whatever the program produces. Its default might be the
locale charset or other user configuration which suits direct display to the
user's terminal. The C<output_charset> option can force the output to be
certain or to be ready for further processing.

Wide char output is done by choosing the best output charset the program can
do and decoding its output. Usually this means UTF-8 but some of the
programs may only have less. The C<output_charset> option can force the
charset used and decoded.
If it's something less than UTF-8 then some
programs might for example give ASCII art approximations of otherwise
unrepresentable characters.

Byte input is usual for HTML downloaded from a HTTP server or from a MIME
email and the headers have the C<input_charset> which applies. Byte output
is good to go straight out to a tty or back to more MIME etc. The input and
output charsets could differ if a server gives something other than what you
want for final output.

Wide chars are most convenient for crunching text within Perl. The default
wide input giving wide output is designed to be transparent for this.

For reference, if a C<HTML::Element> tree contains wide char strings then
its usual C<as_HTML()> method, which is used by C<format()> above, produces
wide char HTML so the formatters here give wide char text. Actually
C<as_HTML()> produces all ASCII because its default behaviour is to entitize
anything "unsafe", but it's still a wide char string so the formatted output
text is wide.

=head1 OPTIONS

The following options can be given to the constructor or to the formatting
methods. The defaults are whatever the respective programs do. The
programs generally read their config files when dumping so the defaults and
formatting details may follow the user's personal preferences. Usually this
is a good thing.

=over 4

=item C<< leftmargin => INTEGER >>

=item C<< rightmargin => INTEGER >>

The column numbers for the left and right hand ends of the text.
C<leftmargin> 0 means no padding on the left. C<rightmargin> is the text
width, so for instance 60 would mean the longest line is 60 characters
(inclusive of any C<leftmargin>). These options are compatible with
C<HTML::FormatText>.

C<rightmargin> is not necessarily a hard limit. Some of the programs will
exceed it in a HTML literal C<< <pre> >>, or a run of C<&nbsp;> or similar.

=item C<< input_charset => STRING >>

Force the HTML input to be interpreted as bytes of the given charset,
irrespective of locale, user configuration, C<< <meta> >> in the HTML, etc.

=item C<< output_charset => STRING >>

Force the text output to be encoded as the given charset. The default
varies among the programs, but usually defaults to the locale.

=item C<< output_wide => 0,1,"as_input" >>

Select output string as wide characters rather than bytes. The default is
C<"as_input"> which means a wide char input string results in a wide char
output string and a byte input or file input is byte output. See
L</CHARSETS> above for how wide characters work.

Bytes or wide chars output can be forced by 0 or 1 respectively. For
example to get wide char output when formatting a file,

    $wide_char_text = HTML::FormatText::W3m->format_file
      ('/my/file.html', output_wide => 1);

=item C<< base => STRING >>

Set the base URL for any relative links within the HTML (similar to
C<HTML::FormatText::WithLinks>). Usually this should be the location the
HTML was downloaded from.

If the document contains its own C<< <base> >> setting then currently the
document takes precedence. Only Lynx and Elinks display absolutized link
targets and the option has no effect on the other programs.

=back

=head1 TAINT MODE

The formatter modules can be used under C<perl -T> taint mode. They run
external programs so it's necessary to untaint C<$ENV{PATH}> in the usual
way per L<perlsec/Cleaning Up Your Path>.

The formatted text strings returned are always tainted, on the basis that
they use or include data from outside the Perl program. The
C<program_version()> and C<program_full_version()> strings are tainted too.

=head1 BUGS

C<leftmargin> is implemented by adding spaces to the program output. For
byte output this is ASCII spaces and that will be badly wrong for unusual
output like UTF-16 which is not a byte superset of ASCII. For wide char
output the margin is applied after decoding to wide chars so is correct.
It'd be better to ask the programs to do the margin but their options for
that are poor.

There's nothing done with errors or warning messages from the programs.
Generally they make a best effort on doubtful HTML, but fatal errors like
bad options or missing libraries ought to be somehow trapped.

=head1 OTHER POSSIBILITIES

C<elinks> (from Aug 2008 onwards) and C<netrik> can produce ANSI escapes for
colours, underline, etc, and C<html2text> and C<lynx> can produce tty style
backspace overstriking. This might be good for text destined for a tty or
further crunching. Perhaps an C<ansi> or C<tty> option could enable this,
where possible, but for now it's deliberately turned off in those programs
to keep the default as plain text.

=head1 SEE ALSO

L<HTML::FormatText::Elinks>,
L<HTML::FormatText::Html2text>,
L<HTML::FormatText::Links>,
L<HTML::FormatText::Netrik>,
L<HTML::FormatText::Lynx>,
L<HTML::FormatText::Vilistextum>,
L<HTML::FormatText::W3m>,
L<HTML::FormatText::Zen>

L<HTML::FormatText>,
L<HTML::FormatText::WithLinks>,
L<HTML::FormatText::WithLinks::AndTables>

=head1 HOME PAGE

L<http://user42.tuxfamily.org/html-formatexternal/index.html>

=head1 LICENSE

Copyright 2008, 2009, 2010, 2011, 2012, 2013, 2015 Kevin Ryde

HTML-FormatExternal is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any later
version.

HTML-FormatExternal is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public License along with
HTML-FormatExternal. If not, see L<http://www.gnu.org/licenses/>.

=cut