1package HTML::FormatRTF;
2
3# ABSTRACT: Format HTML as RTF
4
5
6use 5.006_001;
7use strict;
8use warnings;
9
10# We now use Smart::Comments in place of the old DEBUG framework.
11# this should be commented out in release versions....
12##use Smart::Comments;
13
14use base 'HTML::Formatter';
15
16our $VERSION = '2.12'; # VERSION
17our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY
18
19# ------------------------------------------------------------------------
20my %Escape = (
21    map( ( chr($_), chr($_) ),    # things not apparently needing escaping
22        0x20 .. 0x7E ),
23    map( ( chr($_), sprintf( "\\'%02x", $_ ) ),    # apparently escapeworthy things
24        0x00 .. 0x1F, 0x5c, 0x7b, 0x7d, 0x7f .. 0xFF, 0x46 ),
25
26    # We get to escape out 'F' so that we can send RTF files thru the mail
27    # without the slightest worry that paragraphs beginning with "From"
28    # will get munged.
29
30    # And some refinements:
31    #"\n"   => "\n\\line ",
32    #"\cm"  => "\n\\line ",
33    #"\cj"  => "\n\\line ",
34
35    "\t" => "\\tab ",    # Tabs (altho theoretically raw \t's are okay)
36
37    # "\f"   => "\n\\page\n", # Formfeed
38    "-"    => "\\_",     # Turn plaintext '-' into a non-breaking hyphen
39    "\xA0" => "\\~",     # Latin-1 non-breaking space
40    "\xAD" => "\\-",     # Latin-1 soft (optional) hyphen
41
42    # CRAZY HACKS:
43    "\n" => "\\line\n",
44    "\r" => "\n",
45
46    # "\cb" => "{\n\\cs21\\lang1024\\noproof ",  # \\cf1
47    # "\cc" => "}",
48);
49
50# ------------------------------------------------------------------------
51sub default_values {
52    (   shift->SUPER::default_values(),
53        'lm' => 0,    # left margin
54        'rm' => 0,    # right margin (actually, maximum text width)
55
56        'head1_halfpoint_size'     => 32,
57        'head2_halfpoint_size'     => 28,
58        'head3_halfpoint_size'     => 25,
59        'head4_halfpoint_size'     => 22,
60        'head5_halfpoint_size'     => 20,
61        'head6_halfpoint_size'     => 18,
62        'codeblock_halfpoint_size' => 18,
63        'header_halfpoint_size'    => 17,
64        'normal_halfpoint_size'    => 22,
65    );
66}
67
68# ------------------------------------------------------------------------
69sub configure {
70    my ( $self, $hash ) = shift;
71
72    $self->{lm} = 0;
73    $self->{rm} = 0;
74
75    # include the hash parameters into self - as RT#56278
76    map { $self->{$_} = $hash->{$_} } keys %$hash if ( ref($hash) );
77    $self;
78}
79
80# ------------------------------------------------------------------------
81sub begin {
82    my $self = shift;
83
84    ### Start document...
85    $self->SUPER::begin;
86
87    $self->collect( $self->doc_init, $self->font_table, $self->stylesheet, $self->color_table, $self->doc_info,
88        $self->doc_really_start, "\n" )
89        unless $self->{'no_prolog'};
90
91    $self->{'Para'}       = '';
92    $self->{'quotelevel'} = 0;
93
94    return;
95}
96
97# ------------------------------------------------------------------------
98sub end {
99    my $self = shift;
100
101    $self->vspace(0);
102    $self->out('THIS IS NEVER SEEN');
103
104    # just to force the previous para to be written out.
105    $self->collect("}") unless $self->{'no_trailer'};    # ends the document
106
107    ### End document...
108    return;
109}
110
111# ------------------------------------------------------------------------
112sub vspace {
113    my $self = shift;
114
115    #$self->emit_para if defined $self->{'vspace'};
116    my $rv = $self->SUPER::vspace(@_);
117    $self->emit_para if defined $self->{'vspace'};
118    $rv;
119}
120
121# ------------------------------------------------------------------------
122sub stylesheet {
123
124    # TODO: maybe actually /use/ the character styles?
125
126    return sprintf <<'END',    # snazzy styles
127{\stylesheet
128{\snext0 Normal;}
129{\*\cs1 \additive Default Paragraph Font;}
130{\*\cs2 \additive \i\sbasedon1 html-ital;}
131{\*\cs3 \additive \b\sbasedon1 html-bold;}
132{\*\cs4 \additive \f1\sbasedon1 html-code;}
133
134{\s20\ql \f1\fs%s\lang1024\noproof\sbasedon0 \snext0 html-pre;}
135
136{\s31\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head1;}
137{\s32\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head2;}
138{\s33\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head3;}
139{\s34\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head4;}
140{\s35\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head5;}
141{\s36\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head6;}
142}
143
144END
145
146        @{ $_[0] }{
147        qw<
148            codeblock_halfpoint_size
149            head1_halfpoint_size
150            head2_halfpoint_size
151            head3_halfpoint_size
152            head4_halfpoint_size
153            head5_halfpoint_size
154            head6_halfpoint_size
155            >
156        };
157}
158
159# ------------------------------------------------------------------------
160# Override these as necessary for further customization
161
162sub font_table {
163    my $self = shift;
164
165    return sprintf <<'END' ,    # text font, code font, heading font
166{\fonttbl
167{\f0\froman %s;}
168{\f1\fmodern %s;}
169{\f2\fswiss %s;}
170}
171
172END
173
174        map {
175        ;                       # custom-dumb escaper:
176        my $x = $_;
177        $x =~ s/([\x00-\x1F\\\{\}\x7F-\xFF])/sprintf("\\'%02x", $1)/g;
178        $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg;
179        $x;
180        }
181        $self->{'fontname_body'}     || 'Times',
182        $self->{'fontname_code'}     || 'Courier New',
183        $self->{'fontname_headings'} || 'Arial',
184        ;
185}
186
187# ------------------------------------------------------------------------
188sub doc_init {
189    return <<'END';
190{\rtf1\ansi\deff0
191
192END
193}
194
195# ------------------------------------------------------------------------
196sub color_table {
197    return <<'END';
198{\colortbl;\red255\green0\blue0;\red0\green0\blue255;}
199END
200}
201
202# ------------------------------------------------------------------------
203sub doc_info {
204    my $self = $_[0];
205
206    return sprintf <<'END', $self->version_tag;
207{\info{\doccomm generated by %s}
208{\author [see doc]}{\company [see doc]}{\operator [see doc]}
209}
210
211END
212
213}
214
215# ------------------------------------------------------------------------
216sub doc_really_start {
217    my $self = $_[0];
218
219    return sprintf <<'END',
220\deflang%s\widowctrl
221{\header\pard\qr\plain\f2\fs%s
222p.\chpgn\par}
223\fs%s
224
225END
226        $self->{'document_language'} || 1033, $self->{"header_halfpoint_size"}, $self->{"normal_halfpoint_size"},;
227}
228
229# ------------------------------------------------------------------------
230sub emit_para {    # rather like showline in FormatPS
231    my $self = shift;
232
233    my $para = $self->{'Para'};
234    $self->{'Para'} = undef;
235
236    #### emit_para called by: (caller(1) )[3];
237
238    unless ( defined $para ) {
239        #### emit_para with empty buffer...
240        return;
241    }
242
243    $para =~ s/^ +//s;
244    $para =~ s/ +$//s;
245
246    # And now: a not terribly clever algorithm for inserting newlines
247    # at a guaranteed harmless place: after a block of whitespace
248    # after the 65th column.  This was copied from RTF::Writer.
249    $para =~ s/(
250       [^\cm\cj\n]{65}        # Snare 65 characters from a line
251       [^\cm\cj\n\x20]{0,50}  #  and finish any current word
252      )
253      (\x20{1,10})(?![\cm\cj\n]) # capture some spaces not at line-end
254     /$1$2\n/gx    # and put a NL before those spaces
255        ;
256
257    $self->collect(
258        sprintf(
259            '{\pard\sa%d\li%d\ri%d%s\plain' . "\n",
260
261            #100 +
262            10 * $self->{'normal_halfpoint_size'} * ( $self->{'vspace'} || 0 ),
263
264            $self->{'lm'},
265            $self->{'rm'},
266
267            $self->{'center'} ? '\qc' : '\ql',
268        ),
269
270        defined( $self->{'next_bullet'} )
271        ? do {
272            my $bullet = $self->{'next_bullet'};
273            $self->{'next_bullet'} = undef;
274            sprintf "\\fi-%d\n%s",
275                4.5 * $self->{'normal_halfpoint_size'},
276                ( $bullet eq '*' ) ? "\\'95 " : ( rtf_esc($bullet) . ". " );
277            }
278        : (),
279
280        $para,
281        "\n\\par}\n\n",
282    );
283
284    $self->{'vspace'} = undef;    # we finally get to clear it here!
285
286    return;
287}
288
289# ------------------------------------------------------------------------
290sub new_font_size {
291    my $self = $_[0];
292
293    $self->out( \sprintf "{\\fs%u\n", $self->scale_font_for( $self->{'normal_halfpoint_size'} ) );
294}
295
296# ------------------------------------------------------------------------
297sub restore_font_size { shift->out( \'}' ) }
298
299# ------------------------------------------------------------------------
300sub hr_start {
301    my $self = shift;
302
303    # A bit of a hack:
304
305    $self->vspace(.3);
306    $self->out( \( '\qc\ul\f1\fs20\nocheck\lang1024 ' . ( '\~' x ( $self->{'hr_width'} || 50 ) ) ) );
307    $self->vspace(.7);
308    1;
309}
310
311# ------------------------------------------------------------------------
312
313sub br_start {
314    $_[0]->out( \"\\line\n" );
315}
316
317# ------------------------------------------------------------------------
318sub header_start {
319    my ( $self, $level ) = @_;
320
321    # for h1 ... h6's
322    # This really should have been called heading_start, but it's too late
323    #  to change now.
324
325    ### Heading of level: $level
326    #$self->adjust_lm(0); # assert new paragraph
327    $self->vspace(1.5);
328
329    $self->out(
330        \(  sprintf '\s3%s\ql\keepn\f2\fs%s\ul' . "\n", $level, $self->{ 'head' . $level . '_halfpoint_size' }, $level,
331        )
332    );
333
334    return 1;
335}
336
337# ------------------------------------------------------------------------
338sub header_end {
339
340    # This really should have been called heading_end but it's too late
341    #  to change now.
342
343    $_[0]->vspace(1);
344    1;
345}
346
347# ------------------------------------------------------------------------
348sub bullet {
349    my ( $self, $bullet ) = @_;
350
351    $self->{'next_bullet'} = $bullet;
352    return;
353}
354
355# ------------------------------------------------------------------------
356sub adjust_lm {
357    $_[0]->emit_para();
358    $_[0]->{'lm'} += $_[1] * $_[0]->{'normal_halfpoint_size'} * 5;
359    1;
360}
361
362# ------------------------------------------------------------------------
363sub adjust_rm {
364    $_[0]->emit_para();
365    $_[0]->{'rm'} -= $_[1] * $_[0]->{'normal_halfpoint_size'} * 5;
366    1;
367}    # Yes, flip the sign on the right margin!
368
369# BTW, halfpoints * 10 = twips
370
371# ------------------------------------------------------------------------
372sub pre_start {
373    my $self = shift;
374
375    $self->SUPER::pre_start(@_);
376    $self->out( \sprintf '\s20\f1\fs%s\noproof\lang1024\lang1076 ', $self->{'codeblock_halfpoint_size'}, );
377    return 1;
378}
379
380# ------------------------------------------------------------------------
381sub b_start      { shift->out( \'{\b ' ) }
382sub b_end        { shift->out( \'}' ) }
383sub i_start      { shift->out( \'{\i ' ) }
384sub i_end        { shift->out( \'}' ) }
385sub tt_start     { shift->out( \'{\f1\noproof\lang1024\lang1076 ' ) }
386sub tt_end       { shift->out( \'}' ) }
387sub sub_start    { shift->out( \'{\sub ' ) }
388sub sub_end      { shift->out( \'}' ) }
389sub sup_start    { shift->out( \'{\super ' ) }
390sub sup_end      { shift->out( \'}' ) }
391sub strike_start { shift->out( \'{\strike ' ) }
392sub strike_end   { shift->out( \'}' ) }
393
394# ------------------------------------------------------------------------
395sub q_start {
396    my $self = $_[0];
397
398    $self->out( ( ( ++$self->{'quotelevel'} ) % 2 ) ? \'\ldblquote ' : \'\lquote ' );
399}
400
401# ------------------------------------------------------------------------
402sub q_end {
403    my $self = $_[0];
404
405    $self->out( ( ( --$self->{'quotelevel'} ) % 2 ) ? \'\rquote ' : \'\rdblquote ' );
406}
407
408# ------------------------------------------------------------------------
409sub pre_out { $_[0]->out( ref( $_[1] ) ? $_[1] : \rtf_esc_codely( $_[1] ) ) }
410
411# ------------------------------------------------------------------------
412sub out {    # output a word (or, if escaped, chunk of RTF)
413    my $self = shift;
414
415    #return $self->pre_out(@_) if $self->{pre};
416
417    #### out called by: $_[0], (caller(1) )[3]
418
419    return unless defined $_[0];    # and length $_[0];
420
421    $self->{'Para'} = '' unless defined $self->{'Para'};
422    $self->{'Para'} .= ref( $_[0] ) ? ${ $_[0] } : rtf_esc( $_[0] );
423
424    return 1;
425}
426
427# ------------------------------------------------------------------------
428use integer;
429
430sub rtf_esc {
431    my $x;                          # scratch
432    if ( !defined wantarray ) {     # void context: alter in-place!
433        for (@_) {
434            s/([F\x00-\x1F\-\\\{\}\x7F-\xFF])/$Escape{$1}/g;    # ESCAPER
435            s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg;
436        }
437        return;
438    }
439    elsif (wantarray) {                                         # return an array
440        return map {
441            ;
442            ( $x = $_ ) =~ s/([F\x00-\x1F\-\\\{\}\x7F-\xFF])/$Escape{$1}/g;    # ESCAPER
443            $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg;
444
445            # Hyper-escape all Unicode characters.
446            $x;
447        } @_;
448    }
449    else {                                                                     # return a single scalar
450        ( $x = ( ( @_ == 1 ) ? $_[0] : join '', @_ ) ) =~ s/([F\x00-\x1F\-\\\{\}\x7F-\xFF])/$Escape{$1}/g;    # ESCAPER
451                 # Escape \, {, }, -, control chars, and 7f-ff.
452        $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg;
453
454        # Hyper-escape all Unicode characters.
455        return $x;
456    }
457}
458
459# ------------------------------------------------------------------------
460sub rtf_esc_codely {
461
462    # Doesn't change "-" to hard-hyphen, nor apply computerese style
463
464    my $x;    # scratch
465    if ( !defined wantarray ) {    # void context: alter in-place!
466        for (@_) {
467            s/([F\x00-\x1F\\\{\}\x7F-\xFF])/$Escape{$1}/g;
468            s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg;
469
470            # Hyper-escape all Unicode characters.
471        }
472        return;
473    }
474    elsif (wantarray) {            # return an array
475        return map {
476            ;
477            ( $x = $_ ) =~ s/([F\x00-\x1F\\\{\}\x7F-\xFF])/$Escape{$1}/g;
478            $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg;
479
480            # Hyper-escape all Unicode characters.
481            $x;
482        } @_;
483    }
484    else {                         # return a single scalar
485        ( $x = ( ( @_ == 1 ) ? $_[0] : join '', @_ ) ) =~ s/([F\x00-\x1F\\\{\}\x7F-\xFF])/$Escape{$1}/g;
486
487        # Escape \, {, }, -, control chars, and 7f-ff.
488        $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg;
489
490        # Hyper-escape all Unicode characters.
491        return $x;
492    }
493}
494
4951;
496
497__END__
498
499=pod
500
501=for test_synopsis 1;
502__END__
503
504=for stopwords arial bookman lm pagenumber prolog rtf tahoma verdana CPAN
505    homepage rm sans serif twentieths
506
507=head1 NAME
508
509HTML::FormatRTF - Format HTML as RTF
510
511=head1 VERSION
512
513version 2.12
514
515=head1 SYNOPSIS
516
517  use HTML::FormatRTF;
518
519  my $out_file = "test.rtf";
520  open(RTF, ">$out_file")
521   or die "Can't write-open $out_file: $!\nAborting";
522
523  print RTF HTML::FormatRTF->format_file(
524    'test.html',
525      'fontname_headings' => "Verdana",
526  );
527  close(RTF);
528
529=head1 DESCRIPTION
530
531HTML::FormatRTF is a class for objects that you use to convert HTML to RTF.
532There is currently no proper support for tables or forms.
533
534This is a subclass of L<HTML::Formatter>, whose documentation you should
535consult for more information on underlying methods such as C<new>, C<format>,
536C<format_file> etc
537
538You can specify any of the following parameters in the call to C<new>,
539C<format_file>, or C<format_string>:
540
541=over
542
543=item lm
544
545Amount of I<extra> indenting to apply to the left margin, in twips
546(I<tw>entI<i>eths of a I<p>oint). Default is 0.
547
548So if you wanted the left margin to be an additional half inch larger, you'd
549set C<< lm => 720 >> (since there's 1440 twips in an inch). If you wanted it to
550be about 1.5cm larger, you'd set C<< lw => 850 >> (since there's about 567
551twips in a centimeter).
552
553=item rm
554
555Amount of I<extra> indenting to apply to the left margin, in twips
556(I<tw>entI<i>eths of a I<p>oint).  Default is 0.
557
558=item normal_halfpoint_size
559
560This is the size of normal text in the document, in I<half>-points. The default
561value is 22, meaning that normal text is in 11 point.
562
563=item header_halfpoint_size
564
565This is the size of text used in the document's page-header, in I<half>-points.
566The default value is 17, meaning that normal text is in 7.5 point.  Currently,
567the header consists just of "p. I<pagenumber>" in the upper-right-hand corner,
568and cannot be disabled.
569
570=item head1_halfpoint_size ... head6_halfpoint_size
571
572These control the font size of each heading level, in half-twips.  For example,
573the default for head3_halfpoint_size is 25, meaning that HTML C<< <h3>...</h3>
574>> text will be in 12.5 point text (in addition to being underlined and in the
575heading font).
576
577=item codeblock_halfpoint_size
578
579This controls the font size (in half-points) of the text used for C<<
580<pre>...</pre> >> text.  By default, it is 18, meaning 9 point.
581
582=item fontname_body
583
584This option controls what font is to be used for the body of the text -- that
585is, everything other than heading text and text in pre/code/tt elements. The
586default value is currently "Times".  Other handy values I can suggest using are
587"Georgia" or "Bookman Old Style".
588
589=item fontname_code
590
591This option controls what font is to be used for text in pre/code/tt elements.
592The default value is currently "Courier New".
593
594=item fontname_headings
595
596This option controls what font name is to be used for headings.  You can use
597the same font as fontname_body, but I prefer a sans-serif font, so the default
598value is currently "Arial".  Also consider "Tahoma" and "Verdana".
599
600=item document_language
601
602This option controls what Microsoft language number will be specified as the
603language for this document. The current default value is 1033, for US English.
604Consult an RTF reference for other language numbers.
605
606=item hr_width
607
608This option controls how many underline characters will be used for rendering a
609"<hr>" tag. Its default value is currently 50. You can usually leave this
610alone, but under some circumstances you might want to use a smaller or larger
611number.
612
613=item no_prolog
614
615If this option is set to a true value, HTML::FormatRTF will make a point of
616I<not> emitting the RTF prolog before the document.  By default, this is off,
617meaning that HTML::FormatRTF I<will> emit the prolog.  This option is of
618interest only to advanced users.
619
620=item no_trailer
621
622If this option is set to a true value, HTML::FormatRTF will make a point of
623I<not> emitting the RTF trailer at the end of the document.  By default, this
624is off, meaning that HTML::FormatRTF I<will> emit the bit of RTF that ends the
625document.  This option is of interest only to advanced users.
626
627=back
628
629=head1 SEE ALSO
630
631L<HTML::Formatter>, L<RTF::Writer>
632
633=head1 INSTALLATION
634
635See perlmodinstall for information and options on installing Perl modules.
636
637=head1 BUGS AND LIMITATIONS
638
639You can make new bug reports, and view existing ones, through the
640web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Format>.
641
642=head1 AVAILABILITY
643
644The project homepage is L<https://metacpan.org/release/HTML-Format>.
645
646The latest version of this module is available from the Comprehensive Perl
647Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN
648site near you, or see L<https://metacpan.org/module/HTML::Format/>.
649
650=head1 AUTHORS
651
652=over 4
653
654=item *
655
656Nigel Metheringham <nigelm@cpan.org>
657
658=item *
659
660Sean M Burke <sburke@cpan.org>
661
662=item *
663
664Gisle Aas <gisle@ActiveState.com>
665
666=back
667
668=head1 COPYRIGHT AND LICENSE
669
670This software is copyright (c) 2015 by Nigel Metheringham, 2002-2005 Sean M Burke, 1999-2002 Gisle Aas.
671
672This is free software; you can redistribute it and/or modify it under
673the same terms as the Perl 5 programming language system itself.
674
675=cut
676