1package Text::ParagraphDiff;
2
3use strict;
4use warnings 'all';
5
6use Algorithm::Diff qw(diff);
7use Carp qw(croak);
8use HTML::Entities ();
9use POSIX qw(strftime);
10
# Exporter configuration: text_diff is exported by default; the building
# blocks it is made of are available on request.  Declared with "our"
# instead of the obsolete "use vars" pragma.
require Exporter;
our @ISA       = qw(Exporter);
our @EXPORT    = qw(text_diff);
our @EXPORT_OK = qw(create_diff html_header html_footer);
our $VERSION   = "2.70";
17
18
19
20# XXX: Can't use pod here because it messes up the doc on CPAN. :(
21
# text_diff( old, new, [options hashref] )
#
# Convenience wrapper that returns one complete document: the output of
# C<html_header>, then C<create_diff>, then C<html_footer>, glued together.
# Each of the three is called with exactly the argument list text_diff
# received, in that order.

sub text_diff {
    my $head = html_header(@_);
    my $body = create_diff(@_);
    my $tail = html_footer(@_);

    return $head . $body . $tail;
}
31
32
33
# create_diff ( old, new, [options hashref] )
#
# C<create_diff> creates the actual paragraph diff and returns it as a
# string of marked-up paragraphs (no document header or footer).
#
#   old, new - whatever _get_lines() accepts: a filename, a reference to
#              an array of lines, or (with the "string" option) a string.
#   options  - optional hashref.  Keys read here: "plain" (use <font>
#              markup instead of <span class=...>), "escape" (when the
#              key exists, input is NOT HTML-escaped), "sep" (two-element
#              arrayref of paragraph open/close markup) and "minus_first".
#
# Neither the caller's option hash nor the caller's line arrays are
# modified.  An empty string is returned when there is nothing to show.

sub create_diff {

    # Plain unpacking: "my $opt = shift if @_" has undefined behaviour
    # (the variable can keep its value from a previous call).
    my ($old, $new, $opt) = @_;

    # Private shallow copy, so the defaults filled in below never leak
    # into the hash the caller handed us.
    my %opt = $opt ? %$opt : ();

    my $old_orig = _get_lines($old, \%opt);
    my $new_orig = _get_lines($new, \%opt);
    $new_orig = [''] unless @$new_orig;

    my %highlight;
    if ($opt{plain}) {
        $highlight{minus} = qq(<b><font color="#FF0000" size="+1"> );
        $highlight{plus}  = qq(<b><font color="#005500" size="+1"> );
        $highlight{end} = "</font></b>";
    }
    else {
        $highlight{minus} = qq(<span class="minus"> );
        $highlight{plus}  = qq(<span class="plus"> );
        $highlight{end}   = qq(</span>);
    }

    # Escaping is switched off by the mere presence of the "escape" key.
    my $escape = !exists $opt{escape};

    # Old side: flat word list plus the number of words on each line.
    # Each line is copied before escaping; assigning to the loop alias
    # would rewrite the caller's array (and double-escape on a second call).
    my (@old, @old_count);
    foreach my $line (@$old_orig) {
        my $text = defined $line ? $line : '';
        $text = HTML::Entities::encode($text) if $escape;
        my @words = ($text =~ /\S+/g);
        push @old, @words;
        push @old_count, scalar(@words);
    }

    # New side: same, plus the leading blanks of every line and the
    # initial "everything unchanged" diff list.  $total_diff starts as a
    # real (possibly empty) array so the merge helpers can dereference it
    # even when the new text contains no words at all.
    my $total_diff = [];
    my (@new, @leading_space, @count);
    foreach my $line (@$new_orig) {
        my $text = defined $line ? $line : '';
        my ($leading_white) = ($text =~ /^( *)/);
        push @leading_space, $leading_white;

        $text = HTML::Entities::encode($text) if $escape;
        my @words = ($text =~ /\S+/g);

        push @$total_diff, map { [' ',$_] } @words;
        push @new, @words;
        push @count, scalar(@words);
    }

    $opt{sep} = ['<p>','</p>'] unless exists $opt{sep};
    my ($plus,$minus) = _get_diffs(\@old, \@new, \@old_count, $opt{sep});

    _merge_plus  ($total_diff, $plus) if @$plus;
    _merge_minus ($total_diff, $minus, $opt{minus_first}) if @$minus;
    _merge_white ($total_diff, \@leading_space);

    $total_diff = _merge_lines ($total_diff, \@old_count, \@count);

    _fold ($total_diff);

    my $output = _format ($total_diff, \%highlight, $opt{sep});
    return defined $output ? $output : '';
}
100
101#########
102# Utility
103
# Turns a "potential file" into a reference to an array of lines.
#
#   $file - a reference (returned untouched), a string to be split into
#           lines (when $opt->{string} is true), or otherwise a filename.
#   $opt  - options hashref; only the "string" key is read.
#
# Returns an array reference.  Lines read from a file keep their line
# endings; lines split from a string do not.  Croaks when the file cannot
# be opened.
sub _get_lines {
    my ($file, $opt) = @_;

    # Already a record set: hand it back as it is.
    return $file if ref $file;

    if ($opt && $opt->{string}) {
        return [split /\r\n|\r|\n/,$file];
    }

    # Three-argument open with a lexical handle: the filename is taken
    # literally, so surrounding whitespace or characters such as ">" or
    # "|" in it can neither be stripped nor be read as an open mode.
    open(my $fh, '<', $file) or croak "Can't open file $file: $!";
    my @lines = <$fh>;
    close($fh);

    return \@lines;
}
123
# Collapses every paragraph of $diff in place: neighbouring entries that
# carry the same marker are merged into the first entry of the run, whose
# text becomes the words of the run joined by single blanks.
sub _fold {
    my ($diff) = @_;

    for my $paragraph (@$diff) {
        my @folded;

        for my $entry (@$paragraph) {
            if (@folded && $folded[-1][0] eq $entry->[0]) {
                # same marker as the run being built: absorb the text
                $folded[-1][1] .= " " . $entry->[1];
            }
            else {
                push @folded, $entry;
            }
        }

        @$paragraph = @folded;
    }
}
139
# Diffs the two word lists and sorts the changes into "plusses" and
# "minuses".  The minuses are run through _fix_minus() before both lists
# are returned, plusses first, as array references.
sub _get_diffs {
    my ($old, $new, $count, $sep) = @_;

    my (@added, @removed);
    for my $hunk (diff($old, $new)) {
        for my $change (@$hunk) {
            my $sign = $change->[0];
            if    ($sign eq '+') { push @added,   $change }
            elsif ($sign eq '-') { push @removed, $change }
        }
    }

    _fix_minus (\@removed, $count, $sep);
    return (\@added, \@removed);
}
154
# Walks the deletions alongside the paragraph lengths of the old text and
# appends a paragraph break ($sep->[1] followed by $sep->[0]) to the text
# of a deletion that sits on the last word of a paragraph.
#
#   $d     - deletions as [sign, old-word-index, text] entries
#   $count - number of words in each old paragraph
#   $sep   - [opening markup, closing markup]
#
# The entries of $d are changed in place.  The walk stops as soon as every
# deletion lies before the paragraph end being examined.
sub _fix_minus {
    my ($d, $count, $sep) = @_;
    my $para_end = 0;    # index just past the last word of the paragraph
    my $cursor   = 0;    # next deletion to look at

    for my $para_len (@$count) {
        $para_end += $para_len;

        # skip the deletions that lie inside this paragraph
        $cursor++ while $cursor < @$d && $d->[$cursor][1] < $para_end;
        last if $cursor >= @$d;

        # with $cursor at 0 this deliberately reads the last entry,
        # exactly like a negative subscript always does
        my $candidate = $d->[$cursor - 1];
        $candidate->[2] .= $sep->[1] . $sep->[0]
            if $candidate->[1] == $para_end - 1;

        $cursor++;
    }
}
170
171#########
172# Merging
173
# Integrates the "plus" hunks into the main document: the entry of
# $total_diff found at each hunk's position gets the '+' marker.
# The hunk list is used up in the process.
sub _merge_plus {
    my ($total_diff, $plus_diff) = @_;

    for (;;) {
        my $hunk = shift @$plus_diff or last;
        $total_diff->[ $hunk->[1] ][0] = '+';
    }
}
182
# Integrates the deletions into the main document, making sure not to
# split up any run of plusses.
#
#   $total_diff  - the document so far: [marker, text] entries
#   $min_diff    - deletions as [sign, old-word-index, text]; used up here
#   $minus_first - true: a deletion goes in front of the run of plusses it
#                  touches; false: it goes behind that run
#
# Entries that are not '+' stand for the words of the old text in order,
# so the slot for old index N is found by walking forward until
# (position - plusses passed) reaches N.
sub _merge_minus {
    my ($total_diff, $min_diff, $minus_first) = @_;
    my ($pos,$offset) = (0,0);
    my $run = 0;    # length of the run of plusses ending right before $pos

    while ( my $cur = shift @$min_diff ) {

        # Walk to the slot of this deletion.  The bound on the list length
        # keeps the lookup from creating entries past the end.
        while ($pos < @$total_diff && $pos < ($cur->[1]+$offset)) {
            ++$offset if $total_diff->[$pos][0] eq '+';
            ++$pos;
            $run = 0;
        }

        # Step over the plusses that start here so the run stays whole.
        while ($pos < @$total_diff && $total_diff->[$pos][0] eq '+') {
            ++$offset;
            ++$pos;
            ++$run;
        }

        if ($minus_first) {
            # In front of the adjacent run only (not of every plus seen so
            # far); then step past the shifted run so that $pos - $offset
            # keeps matching the old word index.
            splice @$total_diff, $pos-$run, 0, ['-',$cur->[2]];
            ++$pos;
        }
        else {
            # Also covers a deletion at the very end: splicing at the
            # list length appends.
            splice @$total_diff, $pos, 0, ['-',$cur->[2]];
        }
    }
}
209
# Merges in whitespace.
#
#   $total_diff - the document so far: [marker, text] entries
#   $whitespace - one string of leading blanks per line of the new text;
#                 used up here
#
# For each whitespace string the scan moves forward to the next entry
# marked '-' and prefixes that entry's text with the string, then steps
# past it.  Once the scan has run off the end of the document nothing is
# attached any more.
# NOTE(review): attaching the blanks to '-' entries (rather than to the
# first word of each new line) looks surprising -- confirm the intent.
sub _merge_white {
    my ($total_diff, $whitespace) = @_;
    my $pos = 0;

    while ( @$whitespace ) {
        my $cur = shift @$whitespace;

        ++$pos while ($pos < @$total_diff)
                  && ($total_diff->[$pos][0] ne '-');

        # Past the end: looking at $total_diff->[$pos][1] here would
        # create an empty junk entry in the document, so just drain the
        # list (callers see it emptied either way) and stop.
        if ($pos >= @$total_diff) {
            @$whitespace = ();
            last;
        }

        # defined/length rather than plain truth, so the word "0" counts
        my $text = $total_diff->[$pos][1];
        $total_diff->[$pos][1] = $cur . $text
            if defined $text && length $text;
        ++$pos;
    }
}
225
# Cuts the flat list of entries into paragraphs.
#
#   $total_diff - flat list of [marker, text] entries; consumed from the front
#   $old_count  - words per line of the old text; counted down and shifted
#                 in place while entries are handed out
#   $new_count  - words per line of the new text
#
# Every new line that has words becomes one paragraph holding as many
# entries as it takes to cover that many entries not marked '-'.  Entries
# not marked '+' are charged against the current old line.  Afterwards
# the rest of a partly used old line is added to the last paragraph and
# every old line still outstanding becomes a paragraph of its own.
# Returns a reference to the list of paragraphs.
sub _merge_lines {
    my ($total_diff, $old_count, $new_count) = @_;
    my @paragraphs;
    my @old_count_orig = @$old_count;

    for my $words_in_line (@$new_count) {
        next unless $words_in_line > 0;

        my ($new_words, $taken) = (0, 0);
        while ($new_words < $words_in_line) {
            # drop old lines that are used up (or were empty)
            while (@$old_count && !$old_count->[0]) {
                shift @$old_count;
                shift @old_count_orig;
            }

            my $marker = $total_diff->[$taken][0];
            ++$new_words if $marker ne '-';
            $old_count->[0] = $old_count->[0] - 1 if $marker ne '+';
            ++$taken;
        }

        push @paragraphs, [ splice @$total_diff, 0, $taken ];
    }

    # an old line that was started but not finished ends the last paragraph
    if (@$old_count && $old_count->[0] < $old_count_orig[0]) {
        push @{ $paragraphs[-1] }, splice(@$total_diff, 0, $old_count->[0]);
        shift @old_count_orig;
    }

    # whatever old lines remain get a paragraph each
    while (@old_count_orig) {
        push @paragraphs, [ splice @$total_diff, 0, shift(@old_count_orig) ];
    }

    return \@paragraphs;
}
259
260#########
261# Output
262
# Renders the folded paragraphs as markup.
#
#   $diff      - list of paragraphs, each a list of [marker, text] entries
#   $highlight - hashref with the keys "plus", "minus" and "end": the
#                markup put around added and removed text
#   $sep       - [paragraph opening markup, paragraph closing markup]
#
# Unchanged text (marker ' ') is emitted as it is, '+' entries are wrapped
# in the plus markup, every other entry in the minus markup.
# Returns the markup as one string; the empty string (not undef) when
# there are no paragraphs, so callers can concatenate it without warnings.
sub _format {
    my ($diff,$highlight,$sep) = @_;
    my $output = '';

    foreach my $hunk (@$diff) {
        $output .= "\n$sep->[0]\n";
        foreach my $sect (@$hunk) {
            if ($sect->[0] eq ' ') {
                $output .= "$sect->[1] ";
            }
            elsif ($sect->[0] eq '+') {
                $output .= " $highlight->{plus}$sect->[1]$highlight->{end} ";
            }
            else {
                $output .= " $highlight->{minus}$sect->[1]$highlight->{end} ";
            }
        }
        $output .= "\n$sep->[1]\n";
    }
    return $output;
}
285
# html_header ( old, new, [options hashref] )
#
# Returns the opening part of the document: doctype, <head> with title,
# script and stylesheet, the "Difference of" banner, the toggle buttons
# and the <div> that will hold the diff.
#
#   old, new - the same values given to create_diff.  A value that is a
#              plain scalar (and not flagged as text by the "string"
#              option) is taken to be a filename: it is shown as the label
#              and its modification time is shown next to it.  Anything
#              else is labelled "old" / "new" and stamped with the
#              current time.
#   options  - optional hashref.  Keys read here: "plain" (return a bare
#              minimal header), "string", "header" and "style" (replace
#              the default banner / stylesheet), "functionality" (true:
#              leave out the script and the toggle buttons).
sub html_header {
    my ($old,$new,$opt) = @_;
    $opt ||= {};

    my $time_format = '%A, %B %d, %Y @ %H:%M:%S';

    # Label and timestamp for one side.  The time is broken down with
    # localtime before it goes to strftime; a file that cannot be stat'ed
    # falls back to the current time instead of producing a bogus date.
    my $describe = sub {
        my ($source, $fallback_label) = @_;
        my $is_file = defined $source && !ref $source && !$opt->{string};

        my $mtime;
        $mtime = (stat $source)[9] if $is_file;
        $mtime = time unless defined $mtime;

        return ( $is_file ? $source : $fallback_label,
                 strftime($time_format, localtime $mtime) );
    };

    my ($old_label, $old_time) = $describe->($old, "old");
    my ($new_label, $new_time) = $describe->($new, "new");
    ($old, $new) = ($old_label, $new_label);

    if ($opt->{plain}) {
        return "<html><head><title>Difference of $old, $new</title></head><body>"
    }

    my $header = (exists $opt->{header}) ? $opt->{header} : qq(
        <div>
        <font size="+2"><b>Difference of:</b></font>
        <table border="0" cellspacing="5">
        <tr><td class="minus">---</td><td class="minus"><b>$old</b></td><td>$old_time</td></tr>
        <tr><td class="plus" >+++</td><td class="plus" ><b>$new</b></td><td>$new_time</td></tr>
        </table></div>
    );

    my $script = ($opt->{functionality}) ? "" : qq(
        <script>
        toggle_plus_status = 1;
        toggle_minus_status = 1;
        function dis_plus() {
            for(i=0; (a = document.getElementsByTagName("span")[i]); i++) {
                if(a.className == "plus") {
                    a.style.display="none";
                }
            }
        }
        function dis_minus() {
            for(i=0; (a = document.getElementsByTagName("span")[i]); i++) {
                if(a.className == "minus") {
                    a.style.display="none";
                }
            }
        }
        function view_plus() {
            for(i=0; (a = document.getElementsByTagName("span")[i]); i++) {
                if(a.className == "plus") {
                    a.style.display="inline";
                }
            }
        }
        function view_minus() {
            for(i=0; (a = document.getElementsByTagName("span")[i]); i++) {
                if(a.className == "minus") {
                    a.style.display="inline";
                }
            }
        }

        function toggle_plus() {
            if (toggle_plus_status == 1) {
                dis_plus();
                toggle_plus_status = 0;
            }
            else {
                view_plus();
                toggle_plus_status = 1;
            }
        }

        function toggle_minus() {
            if (toggle_minus_status == 1) {
                dis_minus();
                toggle_minus_status = 0;
            }
            else {
                view_minus();
                toggle_minus_status = 1;
            }
        }
        </script>
    );

    # "visibility:visible" -- the declaration needs a colon to be valid CSS.
    my $style = (exists $opt->{style}) ? $opt->{style} : qq(
        <style>
            .plus{background-color:#00BBBB; visibility:visible}
            .minus{background-color:#FF9999; visibility:visible}
            DIV{ margin:50px; border:solid; background-color:#F2F2F2; padding:5px; }
            BODY{line-height:1.7; background-color:#888888}
            B{font-size:bigger;}
            .togglep {
                font-size : 12px;
                font-family : geneva, arial, sans-serif;
                color : #ffc;
                background-color : #00BBBB;
            }
            .togglem {
                font-size : 12px;
                font-family : geneva, arial, sans-serif;
                color : #ffc;
                background-color : #ff9999;
            }
        </style>
    );

    my $functionality = ($opt->{functionality}) ? "" : qq(
        <div>
        <form>
        <table border="0" cellspacing="5">
        <td><input type="button" class="togglep" value="Toggle Plus" onclick="toggle_plus(); return false;" /></td><td width="10">&nbsp;</td>
        <td><input type="button" class="togglem" value="Toggle Minus" onclick="toggle_minus(); return false;" /></td><td width="10">&nbsp;</td>
        </table>
        </form>
        </div>
    );

    # The public identifier must read "XHTML" to name the DTD it points to.
    return qq(
        <!DOCTYPE html
            PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
            "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html><head>
        <title>Difference of $old, $new</title>
        $script
        $style
        </head><body>
        $header
        $functionality
        <div>
    );
}
416
# html_footer ( old, new, [options hashref] )
#
# Returns the closing part of the document.  Only the third argument is
# looked at:
#   - when it has a "footer" key, that value is returned as it is;
#   - when its "plain" key is true, only </body></html> is returned;
#   - otherwise (including when no options are given at all) the <div>
#     that html_header leaves open in non-plain mode is closed first.
sub html_footer {
    my $opt = $_[2];

    return $opt->{footer} if $opt && exists $opt->{footer};

    my $div = ($opt && $opt->{plain}) ? "" : "</div>";

    return $div."</body></html>"
}
427
4281;
429
430__END__
431
432=pod
433
434=head1 NAME
435
436Text::ParagraphDiff - Visual Difference for paragraphed text.
437
438=head1 ABSTRACT
439
440C<Text::ParagraphDiff> finds the difference between two paragraphed text files
441by word rather than by line, reflows the text together, and then outputs result
442as xhtml.
443
444=head1 SYNOPSIS
445
446    use Text::ParagraphDiff;
447
448    # old.txt and new.txt are filenames
449    print text_diff('old.txt', 'new.txt');
450
451    # Or pass array references
452    print text_diff(\@old, \@new);
453
454    # T-Diff 2 plain strings (a FAQ)
455    print text_diff("old", "new", {string=>1});
456
457    # Pass options (see below)
458    print text_diff($old, $new, {plain=>1});
459
460    # or use the premade script in bin/:
461    # ./tdiff oldfile newfile
462
463=head1 DESCRIPTION
464
465C<Text::ParagraphDiff> is a reimplementation of C<diff> that is meant for
466paragraphed text rather than for code.  Instead of "diffing" a document by
467line, C<Text::ParagraphDiff> expands a document to one word per line, uses
468C<Algorithm::Diff> to find the difference, and then reflows the text back
469together, highlighting the "add" and "subtract" sections.  Writers and editors
470might find this useful for sending revisions to each other across the internet,
471or a single user might use it to keep track of personal work.  For example
472output, please see diff.html in the distribution, as well as the sources for
473the difference, old.txt and new.txt.
474
475The output is in xhtml, for ease of generation, ease of access, and ease of
476viewing.  C<Text::ParagraphDiff> also takes advantage of two advanced features
of the medium: CSS and JavaScript.
478
479CSS is used to cut down on output size and to make the output very pleasing to
480the eye.  JavaScript is used to implement additional functionality: two buttons
481that can toggle the display of the difference.  CSS and JavaScript can be
482turned off; see the C<plain> option below. (Note: CSS & Javascript tested with
483Mozilla 1.0, Camino 0.7, and IE 5.x)
484
485=head1 EXPORT
486
487C<text_diff> is exported by default.
488
Additionally, C<create_diff>, C<html_header>, and C<html_footer> are optionally
exported by request (e.g. C<< use Text::ParagraphDiff qw(create_diff) >>).
491C<create_diff> is the actual diff itself; C<html_header> and C<html_footer>
492should be obvious.
493
494=head1 OPTIONS
495
496C<text_diff> is the suggested interface, and it can be configured with a number
497of different options.
498
499Options are stored in a hashref, C<$opt>.  C<$opt> is an optional last argument
500to C<text_diff>, passed like this:
501
502    text_diff($old, $new, { plain => 1,
503                            escape => 1,
504                            string => 1,
505                            minus_first => 1,
506                            functionality => 1,
507                            style => 'stylesheet_code_here',
508                            header => 'header_markup_here',
509                            sep => ['<p>','</p>']
510                          });
511
512All options are, uh, optional.
513
514Options are:
515
516=over 3
517
518=item B<plain>
519
520When set to a true value, C<plain> will cause a document to be rendered
521plainly, with very sparse html that should be valid even through Netscape
522Navigator 2.0.
523
524=item B<string>
525
When set to a true value, C<string> will cause the first 2 arguments to
be treated as strings, and not files.  These strings will be split into
lines on any newline sequence (CRLF, CR, or LF).
529
530=item B<escape>
531
When the C<escape> key is present in the options (whatever its value),
input will not be HTML-escaped.  Useful if you want to include your own
markup.
534
535=item B<minus_first>
536
By default, when there is a +/- pair, the + items appear first.
However, if C<minus_first> is set to a true value, then the order will
be reversed.
540
541=item B<functionality>
542
543When set to a true value, C<functionality> will cause the JavaScript toggle
544buttons to not be shown.
545
546=item B<style>
547
When C<style> is set, its value will override the default stylesheet.  Please
see C<html_header> in the module source for the default stylesheet.
550
551=item B<header>
552
When C<header> is set, its value will override the default difference header.
Please see C<html_header> in the module source for the default markup.
555
556=item B<sep>
557
When C<sep> is set, its value will override the default paragraph
separator.  C<sep> should be a reference to an array of 2 elements:
the starting paragraph separator, and the ending separator.  The default
value is C<< ['<p>','</p>'] >>.
562
563=back
564
565=head1 BUGS
566
567In old versions, some situations of deletion of entire paragraphs in special
568places might make the surrounding line-breaks become whacky.  Although this
569bug is theoretically fixed, if you do encounter it, let me know.  If you can
isolate the case, please send me a bug report; I might be able to fix it.  In
the meantime, if this does happen to you, just fix the output's markup by
hand, as it shouldn't be too complicated.
573
574=head1 AUTHOR
575
576Joseph F. Ryan (ryan.311@osu.edu)
577Tests done by Jonas Liljegren  (jonas@liljegren.org)
578
579=head1 SEE ALSO
580
581C<Algorithm::Diff>.
582
583=cut
584