package Text::ParagraphDiff;

use strict;
use warnings 'all';

use Algorithm::Diff qw(diff);
use Carp qw(croak);
use HTML::Entities ();
use POSIX qw(strftime);

use vars qw(@EXPORT @EXPORT_OK @ISA $VERSION);
require Exporter;
@EXPORT    = qw(text_diff);
@EXPORT_OK = qw(create_diff html_header html_footer);
@ISA       = qw(Exporter);
$VERSION   = "2.70";



# XXX: Can't use pod here because it messes up the doc on CPAN. :(

# text_diff( old, new, [options hashref] )
#
# Binds together html_header, create_diff, and html_footer to create a
# single document that is the "paragraph diff" of the 2 records.  All
# three helpers receive the same argument list.

sub text_diff {
    return html_header(@_) . create_diff(@_) . html_footer(@_);
}



# create_diff( old, new, [options hashref] )
#
# Creates the actual paragraph diff and returns it as a string of markup.
# "old" and "new" are each a filename, an array ref of lines, or (with the
# "string" option) a plain string.  Options read here: plain, escape, sep,
# minus_first, string.  Neither the caller's arrays nor the caller's
# options hash are modified.

sub create_diff {
    my ($old, $new, $opt) = @_;

    # The original "my $opt = shift if (@_)" left $opt in an undefined
    # state when no options were passed; always start from a real hashref.
    $opt ||= {};

    my $old_orig = _get_lines($old, $opt);
    my $new_orig = _get_lines($new, $opt);
    $new_orig = [''] unless @$new_orig;

    my %highlight;
    if ($opt->{plain}) {
        $highlight{minus} = qq(<b><font color="#FF0000" size="+1"> );
        $highlight{plus}  = qq(<b><font color="#005500" size="+1"> );
        $highlight{end}   = "</font></b>";
    }
    else {
        $highlight{minus} = qq(<span class="minus"> );
        $highlight{plus}  = qq(<span class="plus"> );
        $highlight{end}   = qq(</span>);
    }

    # Escaping is switched off by the mere presence of the "escape" key.
    my $escape = !exists $opt->{escape};
    my $sep    = exists $opt->{sep} ? $opt->{sep} : ['<p>', '</p>'];

    # Flatten the old text into words, remembering how many words each
    # line (paragraph) contributed.  Work on copies so that array refs
    # handed in by the caller are not HTML-escaped in place.
    my (@old, @old_count);
    foreach my $line (@$old_orig) {
        my $text  = $escape ? HTML::Entities::encode($line) : $line;
        my @words = ($text =~ /\S+/g);
        push @old, @words;
        push @old_count, scalar(@words);
    }

    # Same for the new text; every new word starts out marked as unchanged.
    # $total_diff must be a real array ref even when the new text has no
    # words, otherwise the merge helpers below work on throw-away copies.
    my $total_diff = [];
    my (@new, @leading_space, @count);
    foreach my $line (@$new_orig) {
        my ($leading_white) = ($line =~ /^( *)/);
        push @leading_space, $leading_white;

        my $text  = $escape ? HTML::Entities::encode($line) : $line;
        my @words = ($text =~ /\S+/g);

        push @$total_diff, map { [' ', $_] } @words;
        push @new, @words;
        push @count, scalar(@words);
    }

    my ($plus, $minus) = _get_diffs(\@old, \@new, \@old_count, $sep);

    _merge_plus ($total_diff, $plus) if @$plus;
    _merge_minus($total_diff, $minus, $opt->{minus_first}) if @$minus;
    _merge_white($total_diff, \@leading_space);

    $total_diff = _merge_lines($total_diff, \@old_count, \@count);

    _fold($total_diff);

    return _format($total_diff, \%highlight, $sep);
}

#########
# Utility

# Turns potential files into recordsets.  A reference is returned as-is;
# a plain scalar is split on line endings when the "string" option is
# true, and otherwise treated as the name of a file to read.  Croaks if
# the file cannot be opened.
sub _get_lines {
    my ($file, $opt) = @_;

    return $file if ref $file;
    return [ split /\r\n|\r|\n/, $file ] if $opt->{string};

    # Three-argument open with a lexical handle: the filename can no
    # longer be interpreted as an open mode or a pipe.
    open(my $fh, '<', $file) or croak "Can't open file $file: $!";
    my @lines = <$fh>;
    close($fh);
    return \@lines;
}

# Joins neighbouring entries that carry the same marker within each
# paragraph, separating their text with a single space.  Works in place.
sub _fold {
    my ($diff) = @_;

    foreach my $para (@$diff) {
        my $i = 0;
        while ($i + 1 < @$para) {
            if ($para->[$i][0] eq $para->[$i + 1][0]) {
                my $item = splice @$para, $i + 1, 1;
                $para->[$i][1] .= " " . $item->[1];
                next;    # re-test the same $i against its new neighbour
            }
            $i++;
        }
    }
}

# Diffs the two word lists and splits the result into "plusses" and
# "minuses".  Each entry is an Algorithm::Diff change: [sign, index, word].
sub _get_diffs {
    my ($old, $new, $count, $sep) = @_;
    my @diffs = diff($old, $new);
    my ($plus, $minus) = ([], []);
    foreach my $hunk (@diffs) {
        foreach my $change (@$hunk) {
            push @$plus,  $change if $change->[0] eq '+';
            push @$minus, $change if $change->[0] eq '-';
        }
    }
    _fix_minus($minus, $count, $sep);
    return ($plus, $minus);
}

# Re-adjusts the minus's position to correspond with the positive,
# and adds paragraph markers where necessary: when a removed word was
# the last word of an old line, the closing and opening separators are
# appended to that word's text.
sub _fix_minus {
    my ($d, $count, $sep) = @_;
    my ($i, $x) = (0, 0);
    foreach my $break (@$count) {
        $i += $break;    # running word count up to the end of this old line
        while ( ($x < @$d) && ($i > $d->[$x][1]) ) {
            ++$x
        }
        last unless @$d > $x;
        $d->[$x-1][2] .= $sep->[1].$sep->[0] if ($i-1) == $d->[$x-1][1];
        ++$x
    }
}

#########
# Merging

# Integrate the "plus" into the main document: every added word is
# already present in $total_diff, so only its marker is changed.
# Consumes $plus_diff.
sub _merge_plus {
    my ($total_diff, $plus_diff) = @_;

    while ( my $cur = shift @$plus_diff ) {
        $total_diff->[$cur->[1]][0] = '+';
    }
}

# Integrate the minus into the main document, making sure not
# to split up any plusses.  $offset counts the '+' entries passed so
# far; it is added to the old-text index of each removed word to find
# its place in the merged list.  Consumes $min_diff.
sub _merge_minus {
    my ($total_diff, $min_diff, $minus_first) = @_;
    my ($pos, $offset) = (0, 0);

    while ( my $cur = shift @$min_diff ) {
        while ($pos < ($cur->[1]+$offset)) {
            ++$offset if $total_diff->[$pos][0] eq '+';
            ++$pos;
        }
        if ($pos >= $#{$total_diff}) {
            push @$total_diff, ['-',$cur->[2]];
            last;    # whatever is left is appended after the loop
        }
        while ($pos < @$total_diff && $total_diff->[$pos][0] eq '+') {
            ++$offset;
            ++$pos;
        }
        my $current = 0;
        $current = $offset if $minus_first;
        splice @$total_diff, $pos-$current, 0, ['-',$cur->[2]];
    }

    push @$total_diff, map { ['-',$_->[2]] } @$min_diff if @$min_diff;
}

# Merge in whitespace: each recorded leading-space string is prepended
# to the text of the entry the scan below stops on, provided that entry
# has non-empty text.  Consumes $whitespace.
sub _merge_white {
    my ($total_diff, $whitespace) = @_;
    my $pos = 0;

    while ( @$whitespace ) {
        my $cur = shift @$whitespace;
        while ( ($pos < @$total_diff)
              && ($total_diff->[$pos][0] ne '-')
              ) { $pos++ }
        $total_diff->[$pos][1] = $cur . $total_diff->[$pos][1]
            if $total_diff->[$pos][1];
        ++$pos;
    }
}

# Regroups the flat list of [marker, text] entries into one array ref per
# paragraph, driven by the per-line word counts of the new text and then
# of whatever remains of the old text.  $old_count is modified in place.
sub _merge_lines {
    my ($total_diff, $old_count, $new_count) = @_;
    my $new = [];
    my @old_count_orig = @$old_count;

    foreach my $words_in_line ( @$new_count ) {
        if ($words_in_line > 0) {
            push @$new, [];
            my ($pos,$total) = (0,0);
            while ($pos < $words_in_line ) {
                # drop old lines whose words have all been accounted for
                until ($old_count->[0]) {
                    last unless @$old_count;
                    shift @$old_count;
                    shift @old_count_orig;
                }
                ++$pos if $total_diff->[$total][0] ne '-';
                $old_count->[0] = $old_count->[0] - 1 if $total_diff->[$total][0] ne '+';
                ++$total;
            }
            $new->[-1] = [splice @$total_diff,0,$total];
        }
    }

    # A partly consumed old line: its remaining entries join the last paragraph.
    if (@$old_count && $old_count->[0] < $old_count_orig[0]) {
        push @{$new->[-1]}, splice(@$total_diff, 0, $old_count->[0]);
        shift @old_count_orig;
    }
    # Each old line that is still left over becomes a paragraph of its own.
    while (@old_count_orig) {
        push @$new, [splice @$total_diff, 0, shift(@old_count_orig)]
    }

    return $new;
}

#########
# Output

# Renders the paragraph list as markup: each paragraph is wrapped in the
# separator pair, unchanged text is emitted as-is, and added or removed
# text is wrapped in the matching highlight.  Returns '' for an empty diff.
sub _format {
    my ($diff, $highlight, $sep) = @_;
    my $output = '';

    foreach my $hunk (@$diff) {
        $output .= "\n$sep->[0]\n";
        foreach my $sect (@$hunk) {
            if ($sect->[0] eq ' ') {
                $output .= "$sect->[1] ";
            }
            elsif ($sect->[0] eq '+') {
                $output .= " $highlight->{plus}$sect->[1]$highlight->{end} ";
            }
            else {
                $output .= " $highlight->{minus}$sect->[1]$highlight->{end} ";
            }
        }
        $output .= "\n$sep->[1]\n";
    }
    return $output;
}

# Formats the time stamp shown in the default header.  The modification
# time of $source is used when $source is a filename that stat() can
# read; array refs, raw strings and unreadable names fall back to the
# current time.
sub _timestamp_for {
    my ($source, $is_filename) = @_;

    my $epoch;
    if ($is_filename && defined $source && !ref $source) {
        $epoch = (stat $source)[9];
    }
    $epoch = time unless defined $epoch;

    # strftime expects broken-down time fields, not an epoch value.
    return strftime('%A, %B %d, %Y @ %H:%M:%S', localtime($epoch));
}

# html_header( old, new, [options hashref] )
#
# Returns the opening markup of the document.  Options read here: plain,
# string, header, style, functionality.  Unless "plain" is set, the
# result ends with an open <div> that html_footer closes.
sub html_header {
    my ($old, $new, $opt) = @_;
    $opt ||= {};

    my $old_time = _timestamp_for($old, !$opt->{string});
    my $new_time = _timestamp_for($new, !$opt->{string});

    $old = (!ref $old) ? $old : "old";
    $new = (!ref $new) ? $new : "new";

    if ($opt->{plain}) {
        return "<html><head><title>Difference of $old, $new</title></head><body>"
    }

    my $header = (exists $opt->{header}) ? $opt->{header} : qq(
        <div>
        <font size="+2"><b>Difference of:</b></font>
        <table border="0" cellspacing="5">
        <tr><td class="minus">---</td><td class="minus"><b>$old</b></td><td>$old_time</td></tr>
        <tr><td class="plus" >+++</td><td class="plus" ><b>$new</b></td><td>$new_time</td></tr>
        </table></div>
    );

    my $script = ($opt->{functionality}) ? "" : qq(
        <script>
            toggle_plus_status = 1;
            toggle_minus_status = 1;
            function dis_plus() {
                for(i=0; (a = document.getElementsByTagName("span")[i]); i++) {
                    if(a.className == "plus") {
                        a.style.display="none";
                    }
                }
            }
            function dis_minus() {
                for(i=0; (a = document.getElementsByTagName("span")[i]); i++) {
                    if(a.className == "minus") {
                        a.style.display="none";
                    }
                }
            }
            function view_plus() {
                for(i=0; (a = document.getElementsByTagName("span")[i]); i++) {
                    if(a.className == "plus") {
                        a.style.display="inline";
                    }
                }
            }
            function view_minus() {
                for(i=0; (a = document.getElementsByTagName("span")[i]); i++) {
                    if(a.className == "minus") {
                        a.style.display="inline";
                    }
                }
            }

            function toggle_plus() {
                if (toggle_plus_status == 1) {
                    dis_plus();
                    toggle_plus_status = 0;
                }
                else {
                    view_plus();
                    toggle_plus_status = 1;
                }
            }

            function toggle_minus() {
                if (toggle_minus_status == 1) {
                    dis_minus();
                    toggle_minus_status = 0;
                }
                else {
                    view_minus();
                    toggle_minus_status = 1;
                }
            }
        </script>
    );

    my $style = (exists $opt->{style}) ? $opt->{style} : qq(
        <style>
            .plus{background-color:#00BBBB; visibility="visible"}
            .minus{background-color:#FF9999; visibility="visible"}
            DIV{ margin:50px; border:solid; background-color:#F2F2F2; padding:5px; }
            BODY{line-height:1.7; background-color:#888888}
            B{font-size:bigger;}
            .togglep {
                font-size : 12px;
                font-family : geneva, arial, sans-serif;
                color : #ffc;
                background-color : #00BBBB;
            }
            .togglem {
                font-size : 12px;
                font-family : geneva, arial, sans-serif;
                color : #ffc;
                background-color : #ff9999;
            }
        </style>
    );

    my $functionality = ($opt->{functionality}) ? "" : qq(
        <div>
            <form>
                <table border="0" cellspacing="5">
                    <td><input type="button" class="togglep" value="Toggle Plus" onclick="toggle_plus(); return false;" /></td><td width="10"> </td>
                    <td><input type="button" class="togglem" value="Toggle Minus" onclick="toggle_minus(); return false;" /></td><td width="10"> </td>
                </table>
            </form>
        </div>
    );

    return qq(
        <!DOCTYPE html
            PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
            "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html><head>
        <title>Difference of $old, $new</title>
        $script
        $style
        </head><body>
        $header
        $functionality
        <div>
    );
}

# html_footer( old, new, [options hashref] )
#
# Returns the closing markup.  A "footer" option, when present, replaces
# it entirely.  Otherwise the <div> opened by html_header is closed unless
# "plain" is set -- including when no options hash is passed at all, since
# html_header opens the <div> in that case too.
sub html_footer {
    my $opt = $_[2] || {};

    return $opt->{footer} if exists $opt->{footer};

    my $div = $opt->{plain} ? "" : "</div>";
    return $div . "</body></html>";
}

1;

__END__

=pod

=head1 NAME

Text::ParagraphDiff - Visual Difference for paragraphed text.

=head1 ABSTRACT

C<Text::ParagraphDiff> finds the difference between two paragraphed text files
by word rather than by line, reflows the text together, and then outputs the
result as xhtml.

=head1 SYNOPSIS

    use Text::ParagraphDiff;

    # old.txt and new.txt are filenames
    print text_diff('old.txt', 'new.txt');

    # Or pass array references
    print text_diff(\@old, \@new);

    # T-Diff 2 plain strings (a FAQ)
    print text_diff("old", "new", {string=>1});

    # Pass options (see below)
    print text_diff($old, $new, {plain=>1});

    # or use the premade script in bin/:
    # ./tdiff oldfile newfile

=head1 DESCRIPTION

C<Text::ParagraphDiff> is a reimplementation of C<diff> that is meant for
paragraphed text rather than for code.  Instead of "diffing" a document by
line, C<Text::ParagraphDiff> expands a document to one word per line, uses
C<Algorithm::Diff> to find the difference, and then reflows the text back
together, highlighting the "add" and "subtract" sections.  Writers and editors
might find this useful for sending revisions to each other across the internet,
or a single user might use it to keep track of personal work.  For example
output, please see diff.html in the distribution, as well as the sources for
the difference, old.txt and new.txt.

The output is in xhtml, for ease of generation, ease of access, and ease of
viewing.  C<Text::ParagraphDiff> also takes advantage of two advanced features
of the medium: CSS and JavaScript.

CSS is used to cut down on output size and to make the output very pleasing to
the eye.  JavaScript is used to implement additional functionality: two buttons
that can toggle the display of the difference.  CSS and JavaScript can be
turned off; see the C<plain> option below.  (Note: CSS & JavaScript tested with
Mozilla 1.0, Camino 0.7, and IE 5.x)

=head1 EXPORT

C<text_diff> is exported by default.

Additionally, C<create_diff>, C<html_header>, and C<html_footer> are optionally
exported by request (e.g. C<< use Text::ParagraphDiff qw(create_diff) >>).
C<create_diff> is the actual diff itself; C<html_header> and C<html_footer>
should be obvious.

=head1 OPTIONS

C<text_diff> is the suggested interface, and it can be configured with a number
of different options.

Options are stored in a hashref, C<$opt>.  C<$opt> is an optional last argument
to C<text_diff>, passed like this:

    text_diff($old, $new, { plain         => 1,
                            escape        => 1,
                            string        => 1,
                            minus_first   => 1,
                            functionality => 1,
                            style         => 'stylesheet_code_here',
                            header        => 'header_markup_here',
                            sep           => ['<p>','</p>']
                          });

All options are, uh, optional.

Options are:

=over 3

=item B<plain>

When set to a true value, C<plain> will cause a document to be rendered
plainly, with very sparse html that should be valid even through Netscape
Navigator 2.0.

=item B<string>

When set to a true value, C<string> will cause the first 2 arguments to
be treated as strings, and not files.  These strings will be split on
the newline character.

=item B<escape>

When the C<escape> key is present in the options (whatever its value), the
input will not be HTML-escaped.  Useful if you want to include your own
markup.

=item B<minus_first>

By default, when there is a +/- pair, + items appear first.
However, if C<minus_first> is set to a true value, then the order will
be reversed.

=item B<functionality>

When set to a true value, C<functionality> will cause the JavaScript toggle
buttons to not be shown.

=item B<style>

When C<style> is set, its value will override the default stylesheet.  Please
see the C<html_header> function in the module source for the default
stylesheet specifications.

=item B<header>

When C<header> is set, its value will override the default difference header.
Please see the C<html_header> function in the module source for more details.

=item B<sep>

When C<sep> is set, its value will override the default paragraph
separator.
C<sep> should be a reference to an array of 2 elements;
the starting paragraph separator, and the ending separator.  The default
value is C<< ['<p>','</p>'] >>.

=back

=head1 BUGS

In old versions, some situations of deletion of entire paragraphs in special
places might make the surrounding line-breaks become wacky.  Although this
bug is theoretically fixed, if you do encounter it, let me know.  If you can
isolate the case, please send me a bug report; I might be able to fix it.  In
the meantime, if this does happen to you, just fix the output's markup by
hand, as it shouldn't be too complicated.

=head1 AUTHOR

Joseph F. Ryan (ryan.311@osu.edu)
Tests done by Jonas Liljegren (jonas@liljegren.org)

=head1 SEE ALSO

C<Algorithm::Diff>.

=cut