1package HTML::FormatRTF; 2 3# ABSTRACT: Format HTML as RTF 4 5 6use 5.006_001; 7use strict; 8use warnings; 9 10# We now use Smart::Comments in place of the old DEBUG framework. 11# this should be commented out in release versions.... 12##use Smart::Comments; 13 14use base 'HTML::Formatter'; 15 16our $VERSION = '2.12'; # VERSION 17our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY 18 19# ------------------------------------------------------------------------ 20my %Escape = ( 21 map( ( chr($_), chr($_) ), # things not apparently needing escaping 22 0x20 .. 0x7E ), 23 map( ( chr($_), sprintf( "\\'%02x", $_ ) ), # apparently escapeworthy things 24 0x00 .. 0x1F, 0x5c, 0x7b, 0x7d, 0x7f .. 0xFF, 0x46 ), 25 26 # We get to escape out 'F' so that we can send RTF files thru the mail 27 # without the slightest worry that paragraphs beginning with "From" 28 # will get munged. 29 30 # And some refinements: 31 #"\n" => "\n\\line ", 32 #"\cm" => "\n\\line ", 33 #"\cj" => "\n\\line ", 34 35 "\t" => "\\tab ", # Tabs (altho theoretically raw \t's are okay) 36 37 # "\f" => "\n\\page\n", # Formfeed 38 "-" => "\\_", # Turn plaintext '-' into a non-breaking hyphen 39 "\xA0" => "\\~", # Latin-1 non-breaking space 40 "\xAD" => "\\-", # Latin-1 soft (optional) hyphen 41 42 # CRAZY HACKS: 43 "\n" => "\\line\n", 44 "\r" => "\n", 45 46 # "\cb" => "{\n\\cs21\\lang1024\\noproof ", # \\cf1 47 # "\cc" => "}", 48); 49 50# ------------------------------------------------------------------------ 51sub default_values { 52 ( shift->SUPER::default_values(), 53 'lm' => 0, # left margin 54 'rm' => 0, # right margin (actually, maximum text width) 55 56 'head1_halfpoint_size' => 32, 57 'head2_halfpoint_size' => 28, 58 'head3_halfpoint_size' => 25, 59 'head4_halfpoint_size' => 22, 60 'head5_halfpoint_size' => 20, 61 'head6_halfpoint_size' => 18, 62 'codeblock_halfpoint_size' => 18, 63 'header_halfpoint_size' => 17, 64 'normal_halfpoint_size' => 22, 65 ); 66} 67 68# ------------------------------------------------------------------------ 69sub configure { 70 my ( $self, $hash ) = shift; 71 72 $self->{lm} = 0; 73 $self->{rm} = 0; 74 75 # include the hash parameters into self - as RT#56278 76 map { $self->{$_} = $hash->{$_} } keys %$hash if ( ref($hash) ); 77 $self; 78} 79 80# ------------------------------------------------------------------------ 81sub begin { 82 my $self = shift; 83 84 ### Start document... 85 $self->SUPER::begin; 86 87 $self->collect( $self->doc_init, $self->font_table, $self->stylesheet, $self->color_table, $self->doc_info, 88 $self->doc_really_start, "\n" ) 89 unless $self->{'no_prolog'}; 90 91 $self->{'Para'} = ''; 92 $self->{'quotelevel'} = 0; 93 94 return; 95} 96 97# ------------------------------------------------------------------------ 98sub end { 99 my $self = shift; 100 101 $self->vspace(0); 102 $self->out('THIS IS NEVER SEEN'); 103 104 # just to force the previous para to be written out. 105 $self->collect("}") unless $self->{'no_trailer'}; # ends the document 106 107 ### End document... 108 return; 109} 110 111# ------------------------------------------------------------------------ 112sub vspace { 113 my $self = shift; 114 115 #$self->emit_para if defined $self->{'vspace'}; 116 my $rv = $self->SUPER::vspace(@_); 117 $self->emit_para if defined $self->{'vspace'}; 118 $rv; 119} 120 121# ------------------------------------------------------------------------ 122sub stylesheet { 123 124 # TODO: maybe actually /use/ the character styles? 125 126 return sprintf <<'END', # snazzy styles 127{\stylesheet 128{\snext0 Normal;} 129{\*\cs1 \additive Default Paragraph Font;} 130{\*\cs2 \additive \i\sbasedon1 html-ital;} 131{\*\cs3 \additive \b\sbasedon1 html-bold;} 132{\*\cs4 \additive \f1\sbasedon1 html-code;} 133 134{\s20\ql \f1\fs%s\lang1024\noproof\sbasedon0 \snext0 html-pre;} 135 136{\s31\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head1;} 137{\s32\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head2;} 138{\s33\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head3;} 139{\s34\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head4;} 140{\s35\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head5;} 141{\s36\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head6;} 142} 143 144END 145 146 @{ $_[0] }{ 147 qw< 148 codeblock_halfpoint_size 149 head1_halfpoint_size 150 head2_halfpoint_size 151 head3_halfpoint_size 152 head4_halfpoint_size 153 head5_halfpoint_size 154 head6_halfpoint_size 155 > 156 }; 157} 158 159# ------------------------------------------------------------------------ 160# Override these as necessary for further customization 161 162sub font_table { 163 my $self = shift; 164 165 return sprintf <<'END' , # text font, code font, heading font 166{\fonttbl 167{\f0\froman %s;} 168{\f1\fmodern %s;} 169{\f2\fswiss %s;} 170} 171 172END 173 174 map { 175 ; # custom-dumb escaper: 176 my $x = $_; 177 $x =~ s/([\x00-\x1F\\\{\}\x7F-\xFF])/sprintf("\\'%02x", $1)/g; 178 $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; 179 $x; 180 } 181 $self->{'fontname_body'} || 'Times', 182 $self->{'fontname_code'} || 'Courier New', 183 $self->{'fontname_headings'} || 'Arial', 184 ; 185} 186 187# ------------------------------------------------------------------------ 188sub doc_init { 189 return <<'END'; 190{\rtf1\ansi\deff0 191 192END 193} 194 195# ------------------------------------------------------------------------ 196sub color_table { 197 return <<'END'; 198{\colortbl;\red255\green0\blue0;\red0\green0\blue255;} 199END 200} 201 202# ------------------------------------------------------------------------ 203sub doc_info { 204 my $self = $_[0]; 205 206 return sprintf <<'END', $self->version_tag; 207{\info{\doccomm generated by %s} 208{\author [see doc]}{\company [see doc]}{\operator [see doc]} 209} 210 211END 212 213} 214 215# ------------------------------------------------------------------------ 216sub doc_really_start { 217 my $self = $_[0]; 218 219 return sprintf <<'END', 220\deflang%s\widowctrl 221{\header\pard\qr\plain\f2\fs%s 222p.\chpgn\par} 223\fs%s 224 225END 226 $self->{'document_language'} || 1033, $self->{"header_halfpoint_size"}, $self->{"normal_halfpoint_size"},; 227} 228 229# ------------------------------------------------------------------------ 230sub emit_para { # rather like showline in FormatPS 231 my $self = shift; 232 233 my $para = $self->{'Para'}; 234 $self->{'Para'} = undef; 235 236 #### emit_para called by: (caller(1) )[3]; 237 238 unless ( defined $para ) { 239 #### emit_para with empty buffer... 240 return; 241 } 242 243 $para =~ s/^ +//s; 244 $para =~ s/ +$//s; 245 246 # And now: a not terribly clever algorithm for inserting newlines 247 # at a guaranteed harmless place: after a block of whitespace 248 # after the 65th column. This was copied from RTF::Writer. 249 $para =~ s/( 250 [^\cm\cj\n]{65} # Snare 65 characters from a line 251 [^\cm\cj\n\x20]{0,50} # and finish any current word 252 ) 253 (\x20{1,10})(?![\cm\cj\n]) # capture some spaces not at line-end 254 /$1$2\n/gx # and put a NL before those spaces 255 ; 256 257 $self->collect( 258 sprintf( 259 '{\pard\sa%d\li%d\ri%d%s\plain' . "\n", 260 261 #100 + 262 10 * $self->{'normal_halfpoint_size'} * ( $self->{'vspace'} || 0 ), 263 264 $self->{'lm'}, 265 $self->{'rm'}, 266 267 $self->{'center'} ? '\qc' : '\ql', 268 ), 269 270 defined( $self->{'next_bullet'} ) 271 ? do { 272 my $bullet = $self->{'next_bullet'}; 273 $self->{'next_bullet'} = undef; 274 sprintf "\\fi-%d\n%s", 275 4.5 * $self->{'normal_halfpoint_size'}, 276 ( $bullet eq '*' ) ? "\\'95 " : ( rtf_esc($bullet) . ". " ); 277 } 278 : (), 279 280 $para, 281 "\n\\par}\n\n", 282 ); 283 284 $self->{'vspace'} = undef; # we finally get to clear it here! 285 286 return; 287} 288 289# ------------------------------------------------------------------------ 290sub new_font_size { 291 my $self = $_[0]; 292 293 $self->out( \sprintf "{\\fs%u\n", $self->scale_font_for( $self->{'normal_halfpoint_size'} ) ); 294} 295 296# ------------------------------------------------------------------------ 297sub restore_font_size { shift->out( \'}' ) } 298 299# ------------------------------------------------------------------------ 300sub hr_start { 301 my $self = shift; 302 303 # A bit of a hack: 304 305 $self->vspace(.3); 306 $self->out( \( '\qc\ul\f1\fs20\nocheck\lang1024 ' . ( '\~' x ( $self->{'hr_width'} || 50 ) ) ) ); 307 $self->vspace(.7); 308 1; 309} 310 311# ------------------------------------------------------------------------ 312 313sub br_start { 314 $_[0]->out( \"\\line\n" ); 315} 316 317# ------------------------------------------------------------------------ 318sub header_start { 319 my ( $self, $level ) = @_; 320 321 # for h1 ... h6's 322 # This really should have been called heading_start, but it's too late 323 # to change now. 324 325 ### Heading of level: $level 326 #$self->adjust_lm(0); # assert new paragraph 327 $self->vspace(1.5); 328 329 $self->out( 330 \( sprintf '\s3%s\ql\keepn\f2\fs%s\ul' . "\n", $level, $self->{ 'head' . $level . '_halfpoint_size' }, $level, 331 ) 332 ); 333 334 return 1; 335} 336 337# ------------------------------------------------------------------------ 338sub header_end { 339 340 # This really should have been called heading_end but it's too late 341 # to change now. 342 343 $_[0]->vspace(1); 344 1; 345} 346 347# ------------------------------------------------------------------------ 348sub bullet { 349 my ( $self, $bullet ) = @_; 350 351 $self->{'next_bullet'} = $bullet; 352 return; 353} 354 355# ------------------------------------------------------------------------ 356sub adjust_lm { 357 $_[0]->emit_para(); 358 $_[0]->{'lm'} += $_[1] * $_[0]->{'normal_halfpoint_size'} * 5; 359 1; 360} 361 362# ------------------------------------------------------------------------ 363sub adjust_rm { 364 $_[0]->emit_para(); 365 $_[0]->{'rm'} -= $_[1] * $_[0]->{'normal_halfpoint_size'} * 5; 366 1; 367} # Yes, flip the sign on the right margin! 368 369# BTW, halfpoints * 10 = twips 370 371# ------------------------------------------------------------------------ 372sub pre_start { 373 my $self = shift; 374 375 $self->SUPER::pre_start(@_); 376 $self->out( \sprintf '\s20\f1\fs%s\noproof\lang1024\lang1076 ', $self->{'codeblock_halfpoint_size'}, ); 377 return 1; 378} 379 380# ------------------------------------------------------------------------ 381sub b_start { shift->out( \'{\b ' ) } 382sub b_end { shift->out( \'}' ) } 383sub i_start { shift->out( \'{\i ' ) } 384sub i_end { shift->out( \'}' ) } 385sub tt_start { shift->out( \'{\f1\noproof\lang1024\lang1076 ' ) } 386sub tt_end { shift->out( \'}' ) } 387sub sub_start { shift->out( \'{\sub ' ) } 388sub sub_end { shift->out( \'}' ) } 389sub sup_start { shift->out( \'{\super ' ) } 390sub sup_end { shift->out( \'}' ) } 391sub strike_start { shift->out( \'{\strike ' ) } 392sub strike_end { shift->out( \'}' ) } 393 394# ------------------------------------------------------------------------ 395sub q_start { 396 my $self = $_[0]; 397 398 $self->out( ( ( ++$self->{'quotelevel'} ) % 2 ) ? \'\ldblquote ' : \'\lquote ' ); 399} 400 401# ------------------------------------------------------------------------ 402sub q_end { 403 my $self = $_[0]; 404 405 $self->out( ( ( --$self->{'quotelevel'} ) % 2 ) ? \'\rquote ' : \'\rdblquote ' ); 406} 407 408# ------------------------------------------------------------------------ 409sub pre_out { $_[0]->out( ref( $_[1] ) ? $_[1] : \rtf_esc_codely( $_[1] ) ) } 410 411# ------------------------------------------------------------------------ 412sub out { # output a word (or, if escaped, chunk of RTF) 413 my $self = shift; 414 415 #return $self->pre_out(@_) if $self->{pre}; 416 417 #### out called by: $_[0], (caller(1) )[3] 418 419 return unless defined $_[0]; # and length $_[0]; 420 421 $self->{'Para'} = '' unless defined $self->{'Para'}; 422 $self->{'Para'} .= ref( $_[0] ) ? ${ $_[0] } : rtf_esc( $_[0] ); 423 424 return 1; 425} 426 427# ------------------------------------------------------------------------ 428use integer; 429 430sub rtf_esc { 431 my $x; # scratch 432 if ( !defined wantarray ) { # void context: alter in-place! 433 for (@_) { 434 s/([F\x00-\x1F\-\\\{\}\x7F-\xFF])/$Escape{$1}/g; # ESCAPER 435 s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; 436 } 437 return; 438 } 439 elsif (wantarray) { # return an array 440 return map { 441 ; 442 ( $x = $_ ) =~ s/([F\x00-\x1F\-\\\{\}\x7F-\xFF])/$Escape{$1}/g; # ESCAPER 443 $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; 444 445 # Hyper-escape all Unicode characters. 446 $x; 447 } @_; 448 } 449 else { # return a single scalar 450 ( $x = ( ( @_ == 1 ) ? $_[0] : join '', @_ ) ) =~ s/([F\x00-\x1F\-\\\{\}\x7F-\xFF])/$Escape{$1}/g; # ESCAPER 451 # Escape \, {, }, -, control chars, and 7f-ff. 452 $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; 453 454 # Hyper-escape all Unicode characters. 455 return $x; 456 } 457} 458 459# ------------------------------------------------------------------------ 460sub rtf_esc_codely { 461 462 # Doesn't change "-" to hard-hyphen, nor apply computerese style 463 464 my $x; # scratch 465 if ( !defined wantarray ) { # void context: alter in-place! 466 for (@_) { 467 s/([F\x00-\x1F\\\{\}\x7F-\xFF])/$Escape{$1}/g; 468 s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; 469 470 # Hyper-escape all Unicode characters. 471 } 472 return; 473 } 474 elsif (wantarray) { # return an array 475 return map { 476 ; 477 ( $x = $_ ) =~ s/([F\x00-\x1F\\\{\}\x7F-\xFF])/$Escape{$1}/g; 478 $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; 479 480 # Hyper-escape all Unicode characters. 481 $x; 482 } @_; 483 } 484 else { # return a single scalar 485 ( $x = ( ( @_ == 1 ) ? $_[0] : join '', @_ ) ) =~ s/([F\x00-\x1F\\\{\}\x7F-\xFF])/$Escape{$1}/g; 486 487 # Escape \, {, }, -, control chars, and 7f-ff. 488 $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; 489 490 # Hyper-escape all Unicode characters. 491 return $x; 492 } 493} 494 4951; 496 497__END__ 498 499=pod 500 501=for test_synopsis 1; 502__END__ 503 504=for stopwords arial bookman lm pagenumber prolog rtf tahoma verdana CPAN 505 homepage rm sans serif twentieths 506 507=head1 NAME 508 509HTML::FormatRTF - Format HTML as RTF 510 511=head1 VERSION 512 513version 2.12 514 515=head1 SYNOPSIS 516 517 use HTML::FormatRTF; 518 519 my $out_file = "test.rtf"; 520 open(RTF, ">$out_file") 521 or die "Can't write-open $out_file: $!\nAborting"; 522 523 print RTF HTML::FormatRTF->format_file( 524 'test.html', 525 'fontname_headings' => "Verdana", 526 ); 527 close(RTF); 528 529=head1 DESCRIPTION 530 531HTML::FormatRTF is a class for objects that you use to convert HTML to RTF. 532There is currently no proper support for tables or forms. 533 534This is a subclass of L<HTML::Formatter>, whose documentation you should 535consult for more information on underlying methods such as C<new>, C<format>, 536C<format_file> etc 537 538You can specify any of the following parameters in the call to C<new>, 539C<format_file>, or C<format_string>: 540 541=over 542 543=item lm 544 545Amount of I<extra> indenting to apply to the left margin, in twips 546(I<tw>entI<i>eths of a I<p>oint). Default is 0. 547 548So if you wanted the left margin to be an additional half inch larger, you'd 549set C<< lm => 720 >> (since there's 1440 twips in an inch). If you wanted it to 550be about 1.5cm larger, you'd set C<< lw => 850 >> (since there's about 567 551twips in a centimeter). 552 553=item rm 554 555Amount of I<extra> indenting to apply to the left margin, in twips 556(I<tw>entI<i>eths of a I<p>oint). Default is 0. 557 558=item normal_halfpoint_size 559 560This is the size of normal text in the document, in I<half>-points. The default 561value is 22, meaning that normal text is in 11 point. 562 563=item header_halfpoint_size 564 565This is the size of text used in the document's page-header, in I<half>-points. 566The default value is 17, meaning that normal text is in 7.5 point. Currently, 567the header consists just of "p. I<pagenumber>" in the upper-right-hand corner, 568and cannot be disabled. 569 570=item head1_halfpoint_size ... head6_halfpoint_size 571 572These control the font size of each heading level, in half-twips. For example, 573the default for head3_halfpoint_size is 25, meaning that HTML C<< <h3>...</h3> 574>> text will be in 12.5 point text (in addition to being underlined and in the 575heading font). 576 577=item codeblock_halfpoint_size 578 579This controls the font size (in half-points) of the text used for C<< 580<pre>...</pre> >> text. By default, it is 18, meaning 9 point. 581 582=item fontname_body 583 584This option controls what font is to be used for the body of the text -- that 585is, everything other than heading text and text in pre/code/tt elements. The 586default value is currently "Times". Other handy values I can suggest using are 587"Georgia" or "Bookman Old Style". 588 589=item fontname_code 590 591This option controls what font is to be used for text in pre/code/tt elements. 592The default value is currently "Courier New". 593 594=item fontname_headings 595 596This option controls what font name is to be used for headings. You can use 597the same font as fontname_body, but I prefer a sans-serif font, so the default 598value is currently "Arial". Also consider "Tahoma" and "Verdana". 599 600=item document_language 601 602This option controls what Microsoft language number will be specified as the 603language for this document. The current default value is 1033, for US English. 604Consult an RTF reference for other language numbers. 605 606=item hr_width 607 608This option controls how many underline characters will be used for rendering a 609"<hr>" tag. Its default value is currently 50. You can usually leave this 610alone, but under some circumstances you might want to use a smaller or larger 611number. 612 613=item no_prolog 614 615If this option is set to a true value, HTML::FormatRTF will make a point of 616I<not> emitting the RTF prolog before the document. By default, this is off, 617meaning that HTML::FormatRTF I<will> emit the prolog. This option is of 618interest only to advanced users. 619 620=item no_trailer 621 622If this option is set to a true value, HTML::FormatRTF will make a point of 623I<not> emitting the RTF trailer at the end of the document. By default, this 624is off, meaning that HTML::FormatRTF I<will> emit the bit of RTF that ends the 625document. This option is of interest only to advanced users. 626 627=back 628 629=head1 SEE ALSO 630 631L<HTML::Formatter>, L<RTF::Writer> 632 633=head1 INSTALLATION 634 635See perlmodinstall for information and options on installing Perl modules. 636 637=head1 BUGS AND LIMITATIONS 638 639You can make new bug reports, and view existing ones, through the 640web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Format>. 641 642=head1 AVAILABILITY 643 644The project homepage is L<https://metacpan.org/release/HTML-Format>. 645 646The latest version of this module is available from the Comprehensive Perl 647Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN 648site near you, or see L<https://metacpan.org/module/HTML::Format/>. 649 650=head1 AUTHORS 651 652=over 4 653 654=item * 655 656Nigel Metheringham <nigelm@cpan.org> 657 658=item * 659 660Sean M Burke <sburke@cpan.org> 661 662=item * 663 664Gisle Aas <gisle@ActiveState.com> 665 666=back 667 668=head1 COPYRIGHT AND LICENSE 669 670This software is copyright (c) 2015 by Nigel Metheringham, 2002-2005 Sean M Burke, 1999-2002 Gisle Aas. 671 672This is free software; you can redistribute it and/or modify it under 673the same terms as the Perl 5 programming language system itself. 674 675=cut 676