1package Algorithm::Diff; 2use strict; 3use vars qw($VERSION @EXPORT_OK @ISA @EXPORT); 4use integer; # see below in _replaceNextLargerWith() for mod to make 5 # if you don't use this 6require Exporter; 7@ISA = qw(Exporter); 8@EXPORT = qw(); 9@EXPORT_OK = qw(LCS diff traverse_sequences); 10$VERSION = sprintf('%d.%02d', (q$Revision: 1.1.1.1 $ =~ /\d+/g)); 11 12# McIlroy-Hunt diff algorithm 13# Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com> 14# by Ned Konz, perl@bike-nomad.com 15 16=head1 NAME 17 18Algorithm::Diff - Compute `intelligent' differences between two files / lists 19 20=head1 SYNOPSIS 21 22 use Algorithm::Diff qw(diff LCS traverse_sequences); 23 24 @lcs = LCS( \@seq1, \@seq2 ); 25 26 @lcs = LCS( \@seq1, \@seq2, $key_generation_function ); 27 28 $lcsref = LCS( \@seq1, \@seq2 ); 29 30 $lcsref = LCS( \@seq1, \@seq2, $key_generation_function ); 31 32 @diffs = diff( \@seq1, \@seq2 ); 33 34 @diffs = diff( \@seq1, \@seq2, $key_generation_function ); 35 36 traverse_sequences( \@seq1, \@seq2, 37 { MATCH => $callback, 38 DISCARD_A => $callback, 39 DISCARD_B => $callback, 40 } ); 41 42 traverse_sequences( \@seq1, \@seq2, 43 { MATCH => $callback, 44 DISCARD_A => $callback, 45 DISCARD_B => $callback, 46 }, 47 $key_generation_function ); 48 49=head1 INTRODUCTION 50 51(by Mark-Jason Dominus) 52 53I once read an article written by the authors of C<diff>; they said 54that they hard worked very hard on the algorithm until they found the 55right one. 56 57I think what they ended up using (and I hope someone will correct me, 58because I am not very confident about this) was the `longest common 59subsequence' method. in the LCS problem, you have two sequences of 60items: 61 62 a b c d f g h j q z 63 64 a b c d e f g i j k r x y z 65 66and you want to find the longest sequence of items that is present in 67both original sequences in the same order. That is, you want to find 68a new sequence I<S> which can be obtained from the first sequence by 69deleting some items, and from the secend sequence by deleting other 70items. You also want I<S> to be as long as possible. In this case 71I<S> is 72 73 a b c d f g j z 74 75From there it's only a small step to get diff-like output: 76 77 e h i k q r x y 78 + - + + - + + + 79 80This module solves the LCS problem. It also includes a canned 81function to generate C<diff>-like output. 82 83It might seem from the example above that the LCS of two sequences is 84always pretty obvious, but that's not always the case, especially when 85the two sequences have many repeated elements. For example, consider 86 87 a x b y c z p d q 88 a b c a x b y c z 89 90A naive approach might start by matching up the C<a> and C<b> that 91appear at the beginning of each sequence, like this: 92 93 a x b y c z p d q 94 a b c a b y c z 95 96This finds the common subsequence C<a b c z>. But actually, the LCS 97is C<a x b y c z>: 98 99 a x b y c z p d q 100 a b c a x b y c z 101 102=head1 USAGE 103 104This module provides three exportable functions, which we'll deal with in 105ascending order of difficulty: C<LCS>, C<diff>, and 106C<traverse_sequences>. 107 108=head2 C<LCS> 109 110Given references to two lists of items, LCS returns an array containing their 111longest common subsequence. In scalar context, it returns a reference to 112such a list. 113 114 @lcs = LCS( \@seq1, \@seq2 ); 115 $lcsref = LCS( \@seq1, \@seq2 ); 116 117C<LCS> may be passed an optional third parameter; this is a CODE 118reference to a key generation function. See L</KEY GENERATION 119FUNCTIONS>. 120 121 @lcs = LCS( \@seq1, \@seq2, $keyGen ); 122 $lcsref = LCS( \@seq1, \@seq2, $keyGen ); 123 124Additional parameters, if any, will be passed to the key generation 125routine. 126 127=head2 C<diff> 128 129 @diffs = diff( \@seq1, \@seq2 ); 130 $diffs_ref = diff( \@seq1, \@seq2 ); 131 132C<diff> computes the smallest set of additions and deletions necessary 133to turn the first sequence into the second, and returns a description 134of these changes. The description is a list of I<hunks>; each hunk 135represents a contiguous section of items which should be added, 136deleted, or replaced. The return value of C<diff> is a list of 137hunks, or, in scalar context, a reference to such a list. 138 139Here is an example: The diff of the following two sequences: 140 141 a b c e h j l m n p 142 b c d e f j k l m r s t 143 144Result: 145 146 [ 147 [ [ '-', 0, 'a' ] ], 148 149 [ [ '+', 2, 'd' ] ], 150 151 [ [ '-', 4, 'h' ] , 152 [ '+', 4, 'f' ] ], 153 154 [ [ '+', 6, 'k' ] ], 155 156 [ [ '-', 8, 'n' ], 157 [ '-', 9, 'p' ], 158 [ '+', 9, 'r' ], 159 [ '+', 10, 's' ], 160 [ '+', 11, 't' ], 161 ] 162 ] 163 164There are five hunks here. The first hunk says that the C<a> at 165position 0 of the first sequence should be deleted (C<->). The second 166hunk says that the C<d> at position 2 of the second sequence should 167be inserted (C<+>). The third hunk says that the C<h> at position 4 168of the first sequence should be removed and replaced with the C<f> 169from position 4 of the second sequence. The other two hunks similarly. 170 171C<diff> may be passed an optional third parameter; this is a CODE 172reference to a key generation function. See L</KEY GENERATION 173FUNCTIONS>. 174 175Additional parameters, if any, will be passed to the key generation 176routine. 177 178=head2 C<traverse_sequences> 179 180C<traverse_sequences> is the most general facility provided by this 181module; C<diff> and C<LCS> are implemented as calls to it. 182 183Imagine that there are two arrows. Arrow A points to an element of 184sequence A, and arrow B points to an element of the sequence B. 185Initially, the arrows point to the first elements of the respective 186sequences. C<traverse_sequences> will advance the arrows through the 187sequences one element at a time, calling an appropriate user-specified 188callback function before each advance. It willadvance the arrows in 189such a way that if there are equal elements C<$A[$i]> and C<$B[$j]> 190which are equal and which are part of the LCS, there will be some 191moment during the execution of C<traverse_sequences> when arrow A is 192pointing to C<$A[$i]> and arrow B is pointing to C<$B[$j]>. When this 193happens, C<traverse_sequences> will call the C<MATCH> callback 194function and then it will advance both arrows. 195 196Otherwise, one of the arrows is pointing to an element of its sequence 197that is not part of the LCS. C<traverse_sequences> will advance that 198arrow and will call the C<DISCARD_A> or the C<DISCARD_B> callback, 199depending on which arrow it advanced. If both arrows point to 200elements that are not part of the LCS, then C<traverse_sequences> will 201advance one of them and call the appropriate callback, but it is not 202specified which it will call. 203 204The arguments to C<traverse_sequences> are the two sequences to 205traverse, and a callback which specifies the callback functions, like 206this: 207 208 traverse_sequences( \@seq1, \@seq2, 209 { MATCH => $callback_1, 210 DISCARD_A => $callback_2, 211 DISCARD_B => $callback_3, 212 } ); 213 214Callbacks are invoked with at least the indices of the two arrows as 215their arguments. They are not expected to return any values. If a 216callback is omitted from the table, it is not called. 217 218If arrow A reaches the end of its sequence, before arrow B does, 219C<traverse_sequences> will call the C<A_FINISHED> callback when it 220advances arrow B, if there is such a function; if not it will call 221C<DISCARD_B> instead. Similarly if arrow B finishes first. 222C<traverse_sequences> returns when both arrows are at the ends of 223their respective sequences. It returns true on success and false on 224failure. At present there is no way to fail. 225 226C<traverse_sequences> may be passed an optional fourth parameter; this 227is a CODE reference to a key generation function. See L</KEY 228GENERATION FUNCTIONS>. 229 230Additional parameters, if any, will be passed to the key generation 231function. 232 233=head1 KEY GENERATION FUNCTIONS 234 235C<diff>, C<LCS>, and C<traverse_sequences> accept an optional last parameter. 236This is a CODE reference to a key generating (hashing) function that should 237return a string that uniquely identifies a given element. 238It should be the case that if two elements are to be considered equal, 239their keys should be the same (and the other way around). 240If no key generation function is provided, the key will be the 241element as a string. 242 243By default, comparisons will use "eq" and elements will be turned into keys 244using the default stringizing operator '""'. 245 246Where this is important is when you're comparing something other than 247strings. If it is the case that you have multiple different objects 248that should be considered to be equal, you should supply a key 249generation function. Otherwise, you have to make sure that your arrays 250contain unique references. 251 252For instance, consider this example: 253 254 package Person; 255 256 sub new 257 { 258 my $package = shift; 259 return bless { name => '', ssn => '', @_ }, $package; 260 } 261 262 sub clone 263 { 264 my $old = shift; 265 my $new = bless { %$old }, ref($old); 266 } 267 268 sub hash 269 { 270 return shift()->{'ssn'}; 271 } 272 273 my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' ); 274 my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' ); 275 my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' ); 276 my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' ); 277 my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' ); 278 279If you did this: 280 281 my $array1 = [ $person1, $person2, $person4 ]; 282 my $array2 = [ $person1, $person3, $person4, $person5 ]; 283 Algorithm::Diff::diff( $array1, $array2 ); 284 285everything would work out OK (each of the objects would be converted 286into a string like "Person=HASH(0x82425b0)" for comparison). 287 288But if you did this: 289 290 my $array1 = [ $person1, $person2, $person4 ]; 291 my $array2 = [ $person1, $person3, $person4->clone(), $person5 ]; 292 Algorithm::Diff::diff( $array1, $array2 ); 293 294$person4 and $person4->clone() (which have the same name and SSN) 295would be seen as different objects. If you wanted them to be considered 296equivalent, you would have to pass in a key generation function: 297 298 my $array1 = [ $person1, $person2, $person4 ]; 299 my $array2 = [ $person1, $person3, $person4->clone(), $person5 ]; 300 Algorithm::Diff::diff( $array1, $array2, \&Person::hash ); 301 302This would use the 'ssn' field in each Person as a comparison key, and 303so would consider $person4 and $person4->clone() as equal. 304 305You may also pass additional parameters to the key generation function 306if you wish. 307 308=head1 AUTHOR 309 310This version by Ned Konz, perl@bike-nomad.com 311 312=head1 CREDITS 313 314Versions through 0.59 (and much of this documentation) were written by: 315 316Mark-Jason Dominus, mjd-perl-diff@plover.com 317 318This version borrows the documentation and names of the routines 319from Mark-Jason's, but has all new code in Diff.pm. 320 321This code was adapted from the Smalltalk code of 322Mario Wolczko <mario@wolczko.com>, which is available at 323ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st 324 325The algorithm is that described in 326I<A Fast Algorithm for Computing Longest Common Subsequences>, 327CACM, vol.20, no.5, pp.350-353, May 1977, with a few 328minor improvements to improve the speed. 329 330=cut 331 332# Create a hash that maps each element of $aCollection to the set of positions 333# it occupies in $aCollection, restricted to the elements within the range of 334# indexes specified by $start and $end. 335# The fourth parameter is a subroutine reference that will be called to 336# generate a string to use as a key. 337# Additional parameters, if any, will be passed to this subroutine. 338# 339# my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen ); 340 341sub _withPositionsOfInInterval 342{ 343 my $aCollection = shift; # array ref 344 my $start = shift; 345 my $end = shift; 346 my $keyGen = shift; 347 my %d; 348 my $index; 349 for ( $index = $start; $index <= $end; $index++ ) 350 { 351 my $element = $aCollection->[ $index ]; 352 my $key = &$keyGen( $element, @_ ); 353 if ( exists( $d{ $key } ) ) 354 { 355 push( @{ $d{ $key } }, $index ); 356 } 357 else 358 { 359 $d{ $key } = [ $index ]; 360 } 361 } 362 return wantarray ? %d: \%d; 363} 364 365# Find the place at which aValue would normally be inserted into the array. If 366# that place is already occupied by aValue, do nothing, and return undef. If 367# the place does not exist (i.e., it is off the end of the array), add it to 368# the end, otherwise replace the element at that point with aValue. 369# It is assumed that the array's values are numeric. 370# This is where the bulk (75%) of the time is spent in this module, so try to 371# make it fast! 372 373sub _replaceNextLargerWith 374{ 375 my ( $array, $aValue, $high ) = @_; 376 $high ||= $#$array; 377 378 # off the end? 379 if ( $high == -1 || $aValue > $array->[ -1 ] ) 380 { 381 push( @$array, $aValue ); 382 return $high + 1; 383 } 384 385 # binary search for insertion point... 386 my $low = 0; 387 my $index; 388 my $found; 389 while ( $low <= $high ) 390 { 391 $index = ( $high + $low ) / 2; 392# $index = int(( $high + $low ) / 2); # without 'use integer' 393 $found = $array->[ $index ]; 394 395 if ( $aValue == $found ) 396 { 397 return undef; 398 } 399 elsif ( $aValue > $found ) 400 { 401 $low = $index + 1; 402 } 403 else 404 { 405 $high = $index - 1; 406 } 407 } 408 409 # now insertion point is in $low. 410 $array->[ $low ] = $aValue; # overwrite next larger 411 return $low; 412} 413 414# This method computes the longest common subsequence in $a and $b. 415 416# Result is array or ref, whose contents is such that 417# $a->[ $i ] = $b->[ $result[ $i ] ] 418# foreach $i in ( 0..scalar( @result ) if $result[ $i ] is defined. 419 420# An additional argument may be passed; this is a hash or key generating 421# function that should return a string that uniquely identifies the given 422# element. It should be the case that if the key is the same, the elements 423# will compare the same. If this parameter is undef or missing, the key 424# will be the element as a string. 425 426# By default, comparisons will use "eq" and elements will be turned into keys 427# using the default stringizing operator '""'. 428 429# Additional parameters, if any, will be passed to the key generation routine. 430 431sub _longestCommonSubsequence 432{ 433 my $a = shift; # array ref 434 my $b = shift; # array ref 435 my $keyGen = shift; # code ref 436 my $compare; # code ref 437 438 # set up code refs 439 # Note that these are optimized. 440 if ( !defined( $keyGen ) ) # optimize for strings 441 { 442 $keyGen = sub { $_[0] }; 443 $compare = sub { my ($a, $b) = @_; $a eq $b }; 444 } 445 else 446 { 447 $compare = sub { 448 my $a = shift; my $b = shift; 449 &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ ) 450 }; 451 } 452 453 my ($aStart, $aFinish, $bStart, $bFinish, $matchVector) = (0, $#$a, 0, $#$b, []); 454 455 # First we prune off any common elements at the beginning 456 while ( $aStart <= $aFinish 457 and $bStart <= $bFinish 458 and &$compare( $a->[ $aStart ], $b->[ $bStart ], @_ ) ) 459 { 460 $matchVector->[ $aStart++ ] = $bStart++; 461 } 462 463 # now the end 464 while ( $aStart <= $aFinish 465 and $bStart <= $bFinish 466 and &$compare( $a->[ $aFinish ], $b->[ $bFinish ], @_ ) ) 467 { 468 $matchVector->[ $aFinish-- ] = $bFinish--; 469 } 470 471 # Now compute the equivalence classes of positions of elements 472 my $bMatches = _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ ); 473 my $thresh = []; 474 my $links = []; 475 476 my ( $i, $ai, $j, $k ); 477 for ( $i = $aStart; $i <= $aFinish; $i++ ) 478 { 479 $ai = &$keyGen( $a->[ $i ] ); 480 if ( exists( $bMatches->{ $ai } ) ) 481 { 482 $k = 0; 483 for $j ( reverse( @{ $bMatches->{ $ai } } ) ) 484 { 485 # optimization: most of the time this will be true 486 if ( $k 487 and $thresh->[ $k ] > $j 488 and $thresh->[ $k - 1 ] < $j ) 489 { 490 $thresh->[ $k ] = $j; 491 } 492 else 493 { 494 $k = _replaceNextLargerWith( $thresh, $j, $k ); 495 } 496 497 # oddly, it's faster to always test this (CPU cache?). 498 if ( defined( $k ) ) 499 { 500 $links->[ $k ] = 501 [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ]; 502 } 503 } 504 } 505 } 506 507 if ( @$thresh ) 508 { 509 for ( my $link = $links->[ $#$thresh ]; $link; $link = $link->[ 0 ] ) 510 { 511 $matchVector->[ $link->[ 1 ] ] = $link->[ 2 ]; 512 } 513 } 514 515 return wantarray ? @$matchVector : $matchVector; 516} 517 518sub traverse_sequences 519{ 520 my $a = shift; # array ref 521 my $b = shift; # array ref 522 my $callbacks = shift || { }; 523 my $keyGen = shift; 524 my $matchCallback = $callbacks->{'MATCH'} || sub { }; 525 my $discardACallback = $callbacks->{'DISCARD_A'} || sub { }; 526 my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { }; 527 my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ ); 528 # Process all the lines in match vector 529 my $lastA = $#$a; 530 my $lastB = $#$b; 531 my $bi = 0; 532 my $ai; 533 for ( $ai = 0; $ai <= $#$matchVector; $ai++ ) 534 { 535 my $bLine = $matchVector->[ $ai ]; 536 if ( defined( $bLine ) ) 537 { 538 &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine; 539 &$matchCallback( $ai, $bi++, @_ ); 540 } 541 else 542 { 543 &$discardACallback( $ai, $bi, @_ ); 544 } 545 } 546 547 &$discardACallback( $ai++, $bi, @_ ) while ( $ai <= $lastA ); 548 &$discardBCallback( $ai, $bi++, @_ ) while ( $bi <= $lastB ); 549 return 1; 550} 551 552sub LCS 553{ 554 my $a = shift; # array ref 555 my $matchVector = _longestCommonSubsequence( $a, @_ ); 556 my @retval; 557 my $i; 558 for ( $i = 0; $i <= $#$matchVector; $i++ ) 559 { 560 if ( defined( $matchVector->[ $i ] ) ) 561 { 562 push( @retval, $a->[ $i ] ); 563 } 564 } 565 return wantarray ? @retval : \@retval; 566} 567 568sub diff 569{ 570 my $a = shift; # array ref 571 my $b = shift; # array ref 572 my $retval = []; 573 my $hunk = []; 574 my $discard = sub { push( @$hunk, [ '-', $_[ 0 ], $a->[ $_[ 0 ] ] ] ) }; 575 my $add = sub { push( @$hunk, [ '+', $_[ 1 ], $b->[ $_[ 1 ] ] ] ) }; 576 my $match = sub { push( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] }; 577 traverse_sequences( $a, $b, 578 { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, 579 @_ ); 580 &$match(); 581 return wantarray ? @$retval : $retval; 582} 583 5841; 585