1<?php 2 3declare(strict_types=1); 4 5/* 6 * FINE granularity DIFF 7 * 8 * (c) 2011 Raymond Hill (http://raymondhill.net/blog/?p=441) 9 * (c) 2013 Robert Crowe (http://cogpowered.com) 10 * (c) 2021 Christian Kuhn 11 * 12 * For the full copyright and license information, please view 13 * the LICENSE file that was distributed with this source code. 14 */ 15 16namespace cogpowered\FineDiff\Parser; 17 18use cogpowered\FineDiff\Exceptions\GranularityCountException; 19use cogpowered\FineDiff\Exceptions\OperationException; 20use cogpowered\FineDiff\Granularity\GranularityInterface; 21use cogpowered\FineDiff\Parser\Operations\Copy; 22use cogpowered\FineDiff\Parser\Operations\Delete; 23use cogpowered\FineDiff\Parser\Operations\Insert; 24use cogpowered\FineDiff\Parser\Operations\OperationInterface; 25use cogpowered\FineDiff\Parser\Operations\Replace; 26 27/** 28 * Generates a set of instructions to convert one string to another. 29 */ 30class Parser implements ParserInterface 31{ 32 /** 33 * @var GranularityInterface 34 */ 35 protected $granularity; 36 37 /** 38 * @var OpcodesInterface 39 */ 40 protected $opcodes; 41 42 /** 43 * @var string Text we are comparing against. 44 */ 45 protected $from_text; 46 47 /** 48 * @var int Position of the $from_text we are at. 49 */ 50 protected $from_offset = 0; 51 52 /** 53 * @var OperationInterface|null 54 */ 55 protected $last_edit; 56 57 /** 58 * @var int Current position in the granularity array. 59 */ 60 protected $stackpointer = 0; 61 62 /** 63 * @var array<int, OperationInterface> Holds the individual opcodes as the diff takes place. 64 */ 65 protected $edits = []; 66 67 /** 68 * @inheritdoc 69 */ 70 public function __construct(GranularityInterface $granularity) 71 { 72 $this->granularity = $granularity; 73 74 // Set default opcodes generator 75 $this->opcodes = new Opcodes(); 76 } 77 78 /** 79 * @inheritdoc 80 */ 81 public function getGranularity() 82 { 83 return $this->granularity; 84 } 85 86 /** 87 * @inheritdoc 88 */ 89 public function setGranularity(GranularityInterface $granularity): void 90 { 91 $this->granularity = $granularity; 92 } 93 94 /** 95 * @inheritdoc 96 */ 97 public function getOpcodes(): OpcodesInterface 98 { 99 return $this->opcodes; 100 } 101 102 /** 103 * @inheritdoc 104 */ 105 public function setOpcodes(OpcodesInterface $opcodes): void 106 { 107 $this->opcodes = $opcodes; 108 } 109 110 /** 111 * @inheritdoc 112 * @throws GranularityCountException|OperationException 113 */ 114 public function parse(string $from_text, string $to_text): OpcodesInterface 115 { 116 // Ensure the granularity contains some delimiters 117 if (count($this->granularity) === 0) { 118 throw new GranularityCountException('Granularity contains no delimiters'); 119 } 120 121 // Reset internal parser properties 122 $this->from_text = $from_text; 123 $this->from_offset = 0; 124 $this->last_edit = null; 125 $this->stackpointer = 0; 126 $this->edits = []; 127 128 // Parse the two string 129 $this->process($from_text, $to_text); 130 131 // Return processed diff 132 $this->opcodes->setOpcodes($this->edits); 133 return $this->opcodes; 134 } 135 136 /** 137 * Actually kicks off the processing. Recursive function. 138 */ 139 protected function process(string $from_text, string $to_text): void 140 { 141 // Lets get parsing 142 $delimiters = $this->granularity[$this->stackpointer++] ?? ''; 143 $has_next_stage = $this->stackpointer < count($this->granularity); 144 145 // Actually perform diff 146 $diff = $this->diff($from_text, $to_text, $delimiters); 147 148 foreach ($diff as $fragment) { 149 // increase granularity 150 if ($fragment instanceof Replace && $has_next_stage) { 151 $this->process( 152 mb_substr($this->from_text, $this->from_offset, $fragment->getFromLen()), 153 $fragment->getText() 154 ); 155 } elseif ($fragment instanceof Copy && $this->last_edit instanceof Copy) { 156 // fuse copy ops whenever possible 157 /** @var Copy $copyOperation */ 158 $copyOperation = $this->edits[count($this->edits)-1]; 159 $copyOperation->increase($fragment->getFromLen()); 160 $this->from_offset += $fragment->getFromLen(); 161 } else { 162 /* $fragment instanceof Copy */ 163 /* $fragment instanceof Delete */ 164 /* $fragment instanceof Insert */ 165 $this->edits[] = $this->last_edit = $fragment; 166 $this->from_offset += $fragment->getFromLen(); 167 } 168 } 169 170 $this->stackpointer--; 171 } 172 173 /** 174 * Core parsing function. 175 * 176 * @return array<int, OperationInterface> 177 */ 178 protected function diff(string $from_text, string $to_text, string $delimiters): array 179 { 180 // Empty delimiter means character-level diffing. 181 // In such case, use code path optimized for character-level diffing. 182 if (empty($delimiters)) { 183 return $this->charDiff($from_text, $to_text); 184 } 185 186 $result = []; 187 188 // fragment-level diffing 189 $from_text_len = mb_strlen($from_text); 190 $to_text_len = mb_strlen($to_text); 191 $from_fragments = $this->extractFragments($from_text, $delimiters); 192 $to_fragments = $this->extractFragments($to_text, $delimiters); 193 194 $jobs = [[0, $from_text_len, 0, $to_text_len]]; 195 $cached_array_keys = []; 196 $best_from_start = 0; 197 $best_to_start = 0; 198 199 while ($job = array_pop($jobs)) { 200 201 // get the segments which must be diff'ed 202 list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job; 203 204 // catch easy cases first 205 $from_segment_length = $from_segment_end - $from_segment_start; 206 $to_segment_length = $to_segment_end - $to_segment_start; 207 208 if (!$from_segment_length || !$to_segment_length) { 209 if ($from_segment_length) { 210 $result[$from_segment_start * 4] = new Delete($from_segment_length); 211 } elseif ($to_segment_length) { 212 $result[$from_segment_start * 4 + 1] = new Insert(mb_substr($to_text, $to_segment_start, $to_segment_length)); 213 } 214 215 continue; 216 } 217 218 // find longest copy operation for the current segments 219 $best_copy_length = 0; 220 221 $from_base_fragment_index = $from_segment_start; 222 $cached_array_keys_for_current_segment = []; 223 224 while ($from_base_fragment_index < $from_segment_end) { 225 $from_base_fragment = $from_fragments[$from_base_fragment_index]; 226 $from_base_fragment_length = mb_strlen($from_base_fragment); 227 228 // performance boost: cache array keys 229 if (!isset($cached_array_keys_for_current_segment[$from_base_fragment])) { 230 if (!isset($cached_array_keys[$from_base_fragment])) { 231 $to_all_fragment_indices = $cached_array_keys[$from_base_fragment] = array_keys($to_fragments, $from_base_fragment, true); 232 } else { 233 $to_all_fragment_indices = $cached_array_keys[$from_base_fragment]; 234 } 235 236 // get only indices which falls within current segment 237 if ($to_segment_start > 0 || $to_segment_end < $to_text_len) { 238 $to_fragment_indices = []; 239 240 foreach ($to_all_fragment_indices as $to_fragment_index) { 241 if ($to_fragment_index < $to_segment_start) { 242 continue; 243 } 244 245 if ($to_fragment_index >= $to_segment_end) { 246 break; 247 } 248 249 $to_fragment_indices[] = $to_fragment_index; 250 } 251 252 $cached_array_keys_for_current_segment[$from_base_fragment] = $to_fragment_indices; 253 } else { 254 $to_fragment_indices = $to_all_fragment_indices; 255 } 256 } else { 257 $to_fragment_indices = $cached_array_keys_for_current_segment[$from_base_fragment]; 258 } 259 260 // iterate through collected indices 261 foreach ($to_fragment_indices as $to_base_fragment_index) { 262 $fragment_index_offset = $from_base_fragment_length; 263 264 // iterate until no more match 265 for (;;) { 266 $fragment_from_index = $from_base_fragment_index + $fragment_index_offset; 267 268 if ($fragment_from_index >= $from_segment_end) { 269 break; 270 } 271 272 $fragment_to_index = $to_base_fragment_index + $fragment_index_offset; 273 274 if ($fragment_to_index >= $to_segment_end) { 275 break; 276 } 277 278 if ($from_fragments[$fragment_from_index] !== $to_fragments[$fragment_to_index]) { 279 break; 280 } 281 282 $fragment_length = mb_strlen($from_fragments[$fragment_from_index]); 283 $fragment_index_offset += $fragment_length; 284 } 285 286 if ($fragment_index_offset > $best_copy_length) { 287 $best_copy_length = $fragment_index_offset; 288 $best_from_start = $from_base_fragment_index; 289 $best_to_start = $to_base_fragment_index; 290 } 291 } 292 293 $from_base_fragment_index += mb_strlen($from_base_fragment); 294 295 // If match is larger than half segment size, no point trying to find better 296 // @todo: Really? 297 if ($best_copy_length >= $from_segment_length / 2) { 298 break; 299 } 300 301 // no point to keep looking if what is left is less than 302 // current best match 303 if ($from_base_fragment_index + $best_copy_length >= $from_segment_end) { 304 break; 305 } 306 } 307 308 if ($best_copy_length) { 309 $jobs[] = [$from_segment_start, $best_from_start, $to_segment_start, $best_to_start]; 310 $result[$best_from_start * 4 + 2] = new Copy($best_copy_length); 311 $jobs[] = [$best_from_start + $best_copy_length, $from_segment_end, $best_to_start + $best_copy_length, $to_segment_end]; 312 } else { 313 $result[$from_segment_start * 4 ] = new Replace($from_segment_length, mb_substr($to_text, $to_segment_start, $to_segment_length)); 314 } 315 } 316 317 ksort($result, SORT_NUMERIC); 318 return array_values($result); 319 } 320 321 /** 322 * Same as Parser::diff but tuned for character level granularity. 323 * 324 * @return array<int, OperationInterface> 325 */ 326 protected function charDiff(string $from_text, string $to_text): array 327 { 328 $result = []; 329 $jobs = [[0, mb_strlen($from_text), 0, mb_strlen($to_text)]]; 330 $from_copy_start = 0; 331 $to_copy_start = 0; 332 333 while ($job = array_pop($jobs)) { 334 335 // get the segments which must be diff'ed 336 list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job; 337 338 $from_segment_len = $from_segment_end - $from_segment_start; 339 $to_segment_len = $to_segment_end - $to_segment_start; 340 341 // catch easy cases first 342 if (!$from_segment_len || !$to_segment_len) { 343 if ($from_segment_len) { 344 $result[$from_segment_start * 4 + 0] = new Delete($from_segment_len); 345 } elseif ($to_segment_len) { 346 $result[$from_segment_start * 4 + 1] = new Insert(mb_substr($to_text, $to_segment_start, $to_segment_len)); 347 } 348 349 continue; 350 } 351 352 if ($from_segment_len >= $to_segment_len) { 353 $copy_len = $to_segment_len; 354 355 while ($copy_len) { 356 $to_copy_start = $to_segment_start; 357 $to_copy_start_max = $to_segment_end - $copy_len; 358 359 while ($to_copy_start <= $to_copy_start_max) { 360 $from_copy_start = mb_strpos(mb_substr($from_text, $from_segment_start, $from_segment_len), mb_substr($to_text, $to_copy_start, $copy_len)); 361 362 if ($from_copy_start !== false) { 363 $from_copy_start += $from_segment_start; 364 break 2; 365 } 366 367 $to_copy_start++; 368 } 369 370 $copy_len--; 371 } 372 } else { 373 $copy_len = $from_segment_len; 374 375 while ($copy_len) { 376 $from_copy_start = $from_segment_start; 377 $from_copy_start_max = $from_segment_end - $copy_len; 378 379 while ($from_copy_start <= $from_copy_start_max) { 380 $to_copy_start = mb_strpos(mb_substr($to_text, $to_segment_start, $to_segment_len), mb_substr($from_text, $from_copy_start, $copy_len)); 381 382 if ($to_copy_start !== false) { 383 $to_copy_start += $to_segment_start; 384 break 2; 385 } 386 387 $from_copy_start++; 388 } 389 390 $copy_len--; 391 } 392 } 393 394 // match found 395 if ($copy_len) { 396 $jobs[] = [$from_segment_start, $from_copy_start, $to_segment_start, $to_copy_start]; 397 $result[$from_copy_start * 4 + 2] = new Copy($copy_len); 398 $jobs[] = [$from_copy_start + $copy_len, $from_segment_end, $to_copy_start + $copy_len, $to_segment_end]; 399 } 400 // no match, so delete all, insert all 401 else { 402 $result[$from_segment_start * 4] = new Replace($from_segment_len, mb_substr($to_text, $to_segment_start, $to_segment_len)); 403 } 404 } 405 406 ksort($result, SORT_NUMERIC); 407 return array_values($result); 408 } 409 410 /** 411 * Efficiently fragment the text into an array according to specified delimiters. 412 * 413 * The array indices are the offset of the fragments into 414 * the input string. A sentinel empty fragment is always added at the end. 415 * Careful: No check is performed as to the validity of the delimiters. 416 * 417 * @return array<int, string> 418 */ 419 protected function extractFragments(string $text, string $delimiters) 420 { 421 $fragments = []; 422 $start = 0; 423 $end = 0; 424 for (;;) { 425 $end += strcspn($text, $delimiters, $end); 426 $end += strspn($text, $delimiters, $end); 427 if ($end === $start) { 428 break; 429 } 430 $fragments[$start] = mb_substr($text, $start, $end - $start); 431 $start = $end; 432 } 433 $fragments[$start] = ''; 434 return $fragments; 435 } 436} 437