1<?php 2 3/** 4 * FINE granularity DIFF 5 * 6 * Computes a set of instructions to convert the content of 7 * one string into another. 8 * 9 * Originally created by Raymond Hill (https://github.com/gorhill/PHP-FineDiff), brought up 10 * to date by Cog Powered (https://github.com/cogpowered/FineDiff). 11 * 12 * @copyright Copyright 2011 (c) Raymond Hill (http://raymondhill.net/blog/?p=441) 13 * @copyright Copyright 2013 (c) Robert Crowe (http://cogpowered.com) 14 * @link https://github.com/cogpowered/FineDiff 15 * @version 0.0.1 16 * @license MIT License (http://www.opensource.org/licenses/mit-license.php) 17 */ 18 19namespace cogpowered\FineDiff\Parser; 20 21use cogpowered\FineDiff\Granularity\GranularityInterface; 22use cogpowered\FineDiff\Exceptions\GranularityCountException; 23use cogpowered\FineDiff\Parser\Operations\Copy; 24use cogpowered\FineDiff\Parser\Operations\Delete; 25use cogpowered\FineDiff\Parser\Operations\Insert; 26use cogpowered\FineDiff\Parser\Operations\Replace; 27 28/** 29 * Generates a set of instructions to convert one string to another. 30 */ 31class Parser implements ParserInterface 32{ 33 /** 34 * @var cogpowered\FineDiff\GranularityInterface 35 */ 36 protected $granularity; 37 38 /** 39 * @var cogpowered\FineDiff\Parser\OpcodesInterface 40 */ 41 protected $opcodes; 42 43 /** 44 * @var string Text we are comparing against. 45 */ 46 protected $from_text; 47 48 /** 49 * @var int Position of the $from_text we are at. 50 */ 51 protected $from_offset = 0; 52 53 /** 54 * @var cogpowered\FineDiff\Operations\OperationInterface 55 */ 56 protected $last_edit; 57 58 /** 59 * @var int Current position in the granularity array. 60 */ 61 protected $stackpointer = 0; 62 63 /** 64 * @var array Holds the individual opcodes as the diff takes place. 65 */ 66 protected $edits = array(); 67 68 /** 69 * @inheritdoc 70 */ 71 public function __construct(GranularityInterface $granularity) 72 { 73 $this->granularity = $granularity; 74 75 // Set default opcodes generator 76 $this->opcodes = new Opcodes; 77 } 78 79 /** 80 * @inheritdoc 81 */ 82 public function getGranularity() 83 { 84 return $this->granularity; 85 } 86 87 /** 88 * @inheritdoc 89 */ 90 public function setGranularity(GranularityInterface $granularity) 91 { 92 $this->granularity = $granularity; 93 } 94 95 /** 96 * @inheritdoc 97 */ 98 public function getOpcodes() 99 { 100 return $this->opcodes; 101 } 102 103 /** 104 * @inheritdoc 105 */ 106 public function setOpcodes(OpcodesInterface $opcodes) 107 { 108 $this->opcodes = $opcodes; 109 } 110 111 /** 112 * @inheritdoc 113 */ 114 public function parse($from_text, $to_text) 115 { 116 // Ensure the granularity contains some delimiters 117 if (count($this->granularity) === 0) { 118 throw new GranularityCountException('Granularity contains no delimiters'); 119 } 120 121 // Reset internal parser properties 122 $this->from_text = $from_text; 123 $this->from_offset = 0; 124 $this->last_edit = null; 125 $this->stackpointer = 0; 126 $this->edits = array(); 127 128 // Parse the two string 129 $this->process($from_text, $to_text); 130 131 // Return processed diff 132 $this->opcodes->setOpcodes($this->edits); 133 return $this->opcodes; 134 } 135 136 /** 137 * Actually kicks off the processing. Recursive function. 138 * 139 * @param string $from_text 140 * @param string $to_text 141 * @return void 142 */ 143 protected function process($from_text, $to_text) 144 { 145 // Lets get parsing 146 $delimiters = $this->granularity[$this->stackpointer++]; 147 $has_next_stage = $this->stackpointer < count($this->granularity); 148 149 // Actually perform diff 150 $diff = $this->diff($from_text, $to_text, $delimiters); 151 $diff = (is_array($diff)) ? $diff : array(); 152 153 foreach ($diff as $fragment) { 154 155 // increase granularity 156 if ($fragment instanceof Replace && $has_next_stage) { 157 $this->process( 158 substr($this->from_text, $this->from_offset, $fragment->getFromLen()), 159 $fragment->getText() 160 ); 161 } 162 // fuse copy ops whenever possible 163 elseif ($fragment instanceof Copy && $this->last_edit instanceof Copy) { 164 $this->edits[count($this->edits)-1]->increase($fragment->getFromLen()); 165 $this->from_offset += $fragment->getFromLen(); 166 } 167 else { 168 /* $fragment instanceof Copy */ 169 /* $fragment instanceof Delete */ 170 /* $fragment instanceof Insert */ 171 $this->edits[] = $this->last_edit = $fragment; 172 $this->from_offset += $fragment->getFromLen(); 173 } 174 } 175 176 $this->stackpointer--; 177 } 178 179 /** 180 * Core parsing function. 181 * 182 * @param string $from_text 183 * @param string $to_text 184 * @param string $delimiters Delimiter to use for this parse. 185 * @return array 186 */ 187 protected function diff($from_text, $to_text, $delimiters) 188 { 189 // Empty delimiter means character-level diffing. 190 // In such case, use code path optimized for character-level diffing. 191 if (empty($delimiters)) { 192 return $this->charDiff($from_text, $to_text); 193 } 194 195 $result = array(); 196 197 // fragment-level diffing 198 $from_text_len = strlen($from_text); 199 $to_text_len = strlen($to_text); 200 $from_fragments = $this->extractFragments($from_text, $delimiters); 201 $to_fragments = $this->extractFragments($to_text, $delimiters); 202 203 $jobs = array(array(0, $from_text_len, 0, $to_text_len)); 204 $cached_array_keys = array(); 205 206 207 while ($job = array_pop($jobs)) { 208 209 // get the segments which must be diff'ed 210 list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job; 211 212 // catch easy cases first 213 $from_segment_length = $from_segment_end - $from_segment_start; 214 $to_segment_length = $to_segment_end - $to_segment_start; 215 216 if (!$from_segment_length || !$to_segment_length ) { 217 218 if ( $from_segment_length ) { 219 $result[$from_segment_start * 4] = new Delete($from_segment_length); 220 } else if ( $to_segment_length ) { 221 $result[$from_segment_start * 4 + 1] = new Insert(substr($to_text, $to_segment_start, $to_segment_length)); 222 } 223 224 continue; 225 } 226 227 // find longest copy operation for the current segments 228 $best_copy_length = 0; 229 230 $from_base_fragment_index = $from_segment_start; 231 $cached_array_keys_for_current_segment = array(); 232 233 while ( $from_base_fragment_index < $from_segment_end ) { 234 235 $from_base_fragment = $from_fragments[$from_base_fragment_index]; 236 $from_base_fragment_length = strlen($from_base_fragment); 237 238 // performance boost: cache array keys 239 if (!isset($cached_array_keys_for_current_segment[$from_base_fragment])) { 240 241 if ( !isset($cached_array_keys[$from_base_fragment]) ) { 242 $to_all_fragment_indices = $cached_array_keys[$from_base_fragment] = array_keys($to_fragments, $from_base_fragment, true); 243 } 244 else { 245 $to_all_fragment_indices = $cached_array_keys[$from_base_fragment]; 246 } 247 248 // get only indices which falls within current segment 249 if ($to_segment_start > 0 || $to_segment_end < $to_text_len) { 250 251 $to_fragment_indices = array(); 252 253 foreach ($to_all_fragment_indices as $to_fragment_index) { 254 255 if ($to_fragment_index < $to_segment_start) { 256 continue; 257 } 258 259 if ($to_fragment_index >= $to_segment_end) { 260 break; 261 } 262 263 $to_fragment_indices[] = $to_fragment_index; 264 } 265 266 $cached_array_keys_for_current_segment[$from_base_fragment] = $to_fragment_indices; 267 268 } else { 269 $to_fragment_indices = $to_all_fragment_indices; 270 } 271 272 } else { 273 $to_fragment_indices = $cached_array_keys_for_current_segment[$from_base_fragment]; 274 } 275 276 // iterate through collected indices 277 foreach ($to_fragment_indices as $to_base_fragment_index) { 278 279 $fragment_index_offset = $from_base_fragment_length; 280 281 // iterate until no more match 282 for (;;) { 283 284 $fragment_from_index = $from_base_fragment_index + $fragment_index_offset; 285 286 if ($fragment_from_index >= $from_segment_end) { 287 break; 288 } 289 290 $fragment_to_index = $to_base_fragment_index + $fragment_index_offset; 291 292 if ($fragment_to_index >= $to_segment_end) { 293 break; 294 } 295 296 if ($from_fragments[$fragment_from_index] !== $to_fragments[$fragment_to_index]) { 297 break; 298 } 299 300 $fragment_length = strlen($from_fragments[$fragment_from_index]); 301 $fragment_index_offset += $fragment_length; 302 } 303 304 if ($fragment_index_offset > $best_copy_length) { 305 $best_copy_length = $fragment_index_offset; 306 $best_from_start = $from_base_fragment_index; 307 $best_to_start = $to_base_fragment_index; 308 } 309 } 310 311 $from_base_fragment_index += strlen($from_base_fragment); 312 313 // If match is larger than half segment size, no point trying to find better 314 // TODO: Really? 315 if ($best_copy_length >= $from_segment_length / 2) { 316 break; 317 } 318 319 // no point to keep looking if what is left is less than 320 // current best match 321 if ( $from_base_fragment_index + $best_copy_length >= $from_segment_end ) { 322 break; 323 } 324 } 325 326 if ($best_copy_length) { 327 $jobs[] = array($from_segment_start, $best_from_start, $to_segment_start, $best_to_start); 328 $result[$best_from_start * 4 + 2] = new Copy($best_copy_length); 329 $jobs[] = array($best_from_start + $best_copy_length, $from_segment_end, $best_to_start + $best_copy_length, $to_segment_end); 330 } else { 331 $result[$from_segment_start * 4 ] = new Replace($from_segment_length, substr($to_text, $to_segment_start, $to_segment_length)); 332 } 333 } 334 335 ksort($result, SORT_NUMERIC); 336 return array_values($result); 337 } 338 339 /** 340 * Same as Parser::diff but tuned for character level granularity. 341 * 342 * @param string $from_text 343 * @param string $to_text 344 * @return array 345 */ 346 protected function charDiff($from_text, $to_text) 347 { 348 $result = array(); 349 $jobs = array(array(0, strlen($from_text), 0, strlen($to_text))); 350 351 while ($job = array_pop($jobs)) { 352 353 // get the segments which must be diff'ed 354 list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job; 355 356 $from_segment_len = $from_segment_end - $from_segment_start; 357 $to_segment_len = $to_segment_end - $to_segment_start; 358 359 // catch easy cases first 360 if (!$from_segment_len || !$to_segment_len) { 361 362 if ($from_segment_len) { 363 $result[$from_segment_start * 4 + 0] = new Delete($from_segment_len); 364 } else if ( $to_segment_len ) { 365 $result[$from_segment_start * 4 + 1] = new Insert(substr($to_text, $to_segment_start, $to_segment_len)); 366 } 367 368 continue; 369 } 370 371 if ($from_segment_len >= $to_segment_len) { 372 373 $copy_len = $to_segment_len; 374 375 while ($copy_len) { 376 377 $to_copy_start = $to_segment_start; 378 $to_copy_start_max = $to_segment_end - $copy_len; 379 380 while ($to_copy_start <= $to_copy_start_max) { 381 382 $from_copy_start = strpos(substr($from_text, $from_segment_start, $from_segment_len), substr($to_text, $to_copy_start, $copy_len)); 383 384 if ($from_copy_start !== false) { 385 $from_copy_start += $from_segment_start; 386 break 2; 387 } 388 389 $to_copy_start++; 390 } 391 392 $copy_len--; 393 } 394 } else { 395 396 $copy_len = $from_segment_len; 397 398 while ($copy_len) { 399 400 $from_copy_start = $from_segment_start; 401 $from_copy_start_max = $from_segment_end - $copy_len; 402 403 while ($from_copy_start <= $from_copy_start_max) { 404 405 $to_copy_start = strpos(substr($to_text, $to_segment_start, $to_segment_len), substr($from_text, $from_copy_start, $copy_len)); 406 407 if ($to_copy_start !== false) { 408 $to_copy_start += $to_segment_start; 409 break 2; 410 } 411 412 $from_copy_start++; 413 } 414 415 $copy_len--; 416 } 417 } 418 419 // match found 420 if ( $copy_len ) { 421 $jobs[] = array($from_segment_start, $from_copy_start, $to_segment_start, $to_copy_start); 422 $result[$from_copy_start * 4 + 2] = new Copy($copy_len); 423 $jobs[] = array($from_copy_start + $copy_len, $from_segment_end, $to_copy_start + $copy_len, $to_segment_end); 424 } 425 // no match, so delete all, insert all 426 else { 427 $result[$from_segment_start * 4] = new Replace($from_segment_len, substr($to_text, $to_segment_start, $to_segment_len)); 428 } 429 } 430 431 ksort($result, SORT_NUMERIC); 432 return array_values($result); 433 } 434 435 /** 436 * Efficiently fragment the text into an array according to specified delimiters. 437 * 438 * No delimiters means fragment into single character. The array indices are the offset of the fragments into 439 * the input string. A sentinel empty fragment is always added at the end. 440 * Careful: No check is performed as to the validity of the delimiters. 441 * 442 * @param string $text 443 * @param string $delimiters 444 * @param array 445 */ 446 protected function extractFragments($text, $delimiters) 447 { 448 // special case: split into characters 449 if (empty($delimiters)) { 450 $chars = str_split($text, 1); 451 $chars[strlen($text)] = ''; 452 453 return $chars; 454 } 455 456 $fragments = array(); 457 $start = 0; 458 $end = 0; 459 460 for (;;) { 461 462 $end += strcspn($text, $delimiters, $end); 463 $end += strspn($text, $delimiters, $end); 464 465 if ($end === $start) { 466 break; 467 } 468 469 $fragments[$start] = substr($text, $start, $end - $start); 470 $start = $end; 471 } 472 473 $fragments[$start] = ''; 474 475 return $fragments; 476 } 477}