1<?php 2 3/** 4 * Compute edit distance between two scalar sequences. This class uses 5 * Levenshtein (or Damerau-Levenshtein) to compute the edit distance between 6 * two inputs. The inputs are arrays containing any scalars (not just strings) 7 * so it can be used with, e.g., utf8 sequences. 8 * 9 * $matrix = id(new PhutilEditDistanceMatrix()) 10 * ->setSequences(str_split('ran'), str_split('rat')); 11 * 12 * $cost = $matrix->getEditDistance(); 13 * 14 * Edit distance computation is slow and requires a large amount of memory; 15 * both are roughly O(N * M) in the length of the strings. 16 * 17 * You can customize the cost of insertions, deletions and replacements. By 18 * default, each has a cost of 1. 19 * 20 * $matrix->setReplaceCost(1); 21 * 22 * By default, transpositions are not evaluated (i.e., the algorithm is 23 * Levenshtein). You can cause transpositions to be evaluated by setting a 24 * transpose cost (which will change the algorithm to Damerau-Levenshtein): 25 * 26 * $matrix->setTransposeCost(1); 27 * 28 * You can also provide a cost to alter the type of operation being applied. 29 * Many strings have several equivalently expensive edit sequences, but one 30 * some sequences are more readable to humans than others. Providing a small 31 * cost to alter operation type tends to smooth out the sequence and produce 32 * long runs of a single operation, which are generally more readable. For 33 * example, these strings: 34 * 35 * (x) 36 * ((x)) 37 * 38 * ...have edit strings "issis" and "isssi", which describe edit operations with 39 * the same cost, but the latter string is more comprehensible to human viewers. 40 * 41 * If you set an alter cost, you must call @{method:setComputeString} to enable 42 * type computation. The alter cost should generally be smaller than `c / N`, 43 * where `c` is the smallest operational cost and `N` is the length of the 44 * longest string. For example, if you are using the default costs (insert = 1, 45 * delete = 1, replace = 1) and computing edit distances for strings of fewer 46 * than 1,000 characters, you might set the alter cost to 0.001. 47 */ 48final class PhutilEditDistanceMatrix extends Phobject { 49 50 private $insertCost = 1; 51 private $deleteCost = 1; 52 private $replaceCost = 1; 53 private $transposeCost = null; 54 private $alterCost = 0; 55 private $maximumLength; 56 private $computeString; 57 private $applySmoothing = self::SMOOTHING_NONE; 58 private $reachedMaximumLength; 59 60 private $x; 61 private $y; 62 private $prefix; 63 private $suffix; 64 65 private $distanceMatrix = null; 66 private $typeMatrix = null; 67 68 const SMOOTHING_NONE = 'none'; 69 const SMOOTHING_INTERNAL = 'internal'; 70 const SMOOTHING_FULL = 'full'; 71 72 public function setMaximumLength($maximum_length) { 73 $this->maximumLength = $maximum_length; 74 return $this; 75 } 76 77 public function getMaximumLength() { 78 return coalesce($this->maximumLength, $this->getInfinity()); 79 } 80 81 public function didReachMaximumLength() { 82 return $this->reachedMaximumLength; 83 } 84 85 public function setComputeString($compute_string) { 86 $this->computeString = $compute_string; 87 return $this; 88 } 89 90 public function getComputeString() { 91 return $this->computeString; 92 } 93 94 public function setTransposeCost($transpose_cost) { 95 $this->transposeCost = $transpose_cost; 96 return $this; 97 } 98 99 public function getTransposeCost() { 100 return $this->transposeCost; 101 } 102 103 public function setReplaceCost($replace_cost) { 104 $this->replaceCost = $replace_cost; 105 return $this; 106 } 107 108 public function getReplaceCost() { 109 return $this->replaceCost; 110 } 111 112 public function setDeleteCost($delete_cost) { 113 $this->deleteCost = $delete_cost; 114 return $this; 115 } 116 117 public function getDeleteCost() { 118 return $this->deleteCost; 119 } 120 121 public function setInsertCost($insert_cost) { 122 $this->insertCost = $insert_cost; 123 return $this; 124 } 125 126 public function getInsertCost() { 127 return $this->insertCost; 128 } 129 130 public function setAlterCost($alter_cost) { 131 $this->alterCost = $alter_cost; 132 return $this; 133 } 134 135 public function getAlterCost() { 136 return $this->alterCost; 137 } 138 139 public function setApplySmoothing($apply_smoothing) { 140 $this->applySmoothing = $apply_smoothing; 141 return $this; 142 } 143 144 public function getApplySmoothing() { 145 return $this->applySmoothing; 146 } 147 148 public function setSequences(array $x, array $y) { 149 150 // NOTE: We strip common prefixes and suffixes from the inputs because 151 // the runtime of the edit distance algorithm is large and it is common 152 // to diff similar strings. 153 154 $xl = count($x); 155 $yl = count($y); 156 $l = min($xl, $yl); 157 158 $prefix_l = 0; 159 $suffix_l = 0; 160 161 for ($ii = 0; $ii < $l; $ii++) { 162 if ($x[$ii] !== $y[$ii]) { 163 break; 164 } 165 $prefix_l++; 166 } 167 168 for ($ii = 1; $ii <= ($l - $prefix_l); $ii++) { 169 if ($x[$xl - $ii] !== $y[$yl - $ii]) { 170 break; 171 } 172 $suffix_l++; 173 } 174 175 $this->prefix = array_slice($x, 0, $prefix_l); 176 $this->suffix = array_slice($x, $xl - $suffix_l); 177 178 $this->x = array_slice($x, $prefix_l, $xl - ($suffix_l + $prefix_l)); 179 $this->y = array_slice($y, $prefix_l, $yl - ($suffix_l + $prefix_l)); 180 $this->distanceMatrix = null; 181 182 return $this; 183 } 184 185 private function requireSequences() { 186 if ($this->x === null) { 187 throw new PhutilInvalidStateException('setSequences'); 188 } 189 } 190 191 public function getEditDistance() { 192 $this->requireSequences(); 193 194 $x = $this->x; 195 $y = $this->y; 196 197 if (!$x) { 198 return $this->insertCost * count($y); 199 } 200 201 if (!$y) { 202 return $this->deleteCost * count($x); 203 } 204 205 $max = $this->getMaximumLength(); 206 if (count($x) > $max || count($y) > $max) { 207 $this->reachedMaximumLength = true; 208 return ($this->insertCost * count($y)) + ($this->deleteCost * count($x)); 209 } 210 211 if ($x === $y) { 212 return 0; 213 } 214 215 $matrix = $this->getDistanceMatrix(); 216 217 return $matrix[count($x)][count($y)]; 218 } 219 220 /** 221 * Return a string representing the edits between the sequences. The string 222 * has these characters: 223 * 224 * - s (same): Same character in both strings. 225 * - i (insert): Character inserted. 226 * - d (delete): Character deleted. 227 * - x (replace): Character replaced. 228 * - t (transpose): Character transposed. 229 */ 230 public function getEditString() { 231 $this->requireSequences(); 232 233 $x = $this->x; 234 $y = $this->y; 235 236 if (!$x && !$y) { 237 return $this->padEditString(''); 238 } 239 240 if (!$x) { 241 return $this->padEditString(str_repeat('i', count($y))); 242 } 243 244 if (!$y) { 245 return $this->padEditString(str_repeat('d', count($x))); 246 } 247 248 if ($x === $y) { 249 return $this->padEditString(str_repeat('s', count($x))); 250 } 251 252 $max = $this->getMaximumLength(); 253 if (count($x) > $max || count($y) > $max) { 254 $this->reachedMaximumLength = true; 255 return $this->padEditString( 256 str_repeat('d', count($x)). 257 str_repeat('i', count($y))); 258 } 259 260 $matrix = $this->getTypeMatrix(); 261 262 $xx = count($x); 263 $yy = count($y); 264 265 $transposes = array(); 266 $str = ''; 267 while (true) { 268 $type = $matrix[$xx][$yy]; 269 270 if (is_array($type)) { 271 $chr = 't'; 272 $transposes[$type[0]][$type[1]] = true; 273 $type = $type[2]; 274 } else { 275 $chr = $type; 276 } 277 278 if (isset($transposes[$xx][$yy])) { 279 $chr = 't'; 280 } 281 282 if ($type == 's' || $type == 'x') { 283 $xx -= 1; 284 $yy -= 1; 285 } else if ($type == 'i') { 286 $yy -= 1; 287 } else if ($type == 'd') { 288 $xx -= 1; 289 } else { 290 throw new Exception(pht("Unknown type '%s' in type matrix.", $type)); 291 } 292 293 $str .= $chr; 294 295 if ($xx <= 0 && $yy <= 0) { 296 break; 297 } 298 } 299 300 $str = strrev($str); 301 302 // For full smoothing, we pad the edit string before smoothing it, so 303 // ranges of similar characters at the beginning or end of the string can 304 // also be smoothed. 305 306 // For internal smoothing, we only smooth ranges within the change itself. 307 308 $smoothing = $this->getApplySmoothing(); 309 switch ($smoothing) { 310 case self::SMOOTHING_FULL: 311 $str = $this->padEditString($str); 312 $str = $this->applySmoothing($str, true); 313 break; 314 case self::SMOOTHING_INTERNAL: 315 $str = $this->applySmoothing($str, false); 316 $str = $this->padEditString($str); 317 break; 318 case self::SMOOTHING_NONE: 319 $str = $this->padEditString($str); 320 break; 321 default: 322 throw new Exception( 323 pht( 324 'Unknown smoothing type "%s".', 325 $smoothing)); 326 } 327 328 return $str; 329 } 330 331 private function padEditString($str) { 332 if ($this->prefix) { 333 $str = str_repeat('s', count($this->prefix)).$str; 334 } 335 if ($this->suffix) { 336 $str = $str.str_repeat('s', count($this->suffix)); 337 } 338 return $str; 339 } 340 341 private function getTypeMatrix() { 342 if (!$this->computeString) { 343 throw new PhutilInvalidStateException('setComputeString'); 344 } 345 if ($this->typeMatrix === null) { 346 $this->computeMatrix($this->x, $this->y); 347 } 348 return $this->typeMatrix; 349 } 350 351 private function getDistanceMatrix() { 352 if ($this->distanceMatrix === null) { 353 $this->computeMatrix($this->x, $this->y); 354 } 355 return $this->distanceMatrix; 356 } 357 358 private function computeMatrix(array $x, array $y) { 359 $xl = count($x); 360 $yl = count($y); 361 362 $m = array(); 363 364 $infinity = $this->getInfinity(); 365 366 $use_damerau = ($this->transposeCost !== null); 367 if ($use_damerau) { 368 // Initialize the default cost of a transpose. 369 $m[-1][-1] = $infinity; 370 371 // Initialize the character position dictionary for Damerau. 372 $d = array(); 373 foreach ($x as $c) { 374 $d[$c] = -1; 375 } 376 foreach ($y as $c) { 377 $d[$c] = -1; 378 } 379 380 // Initialize the transpose costs for Damerau. 381 for ($xx = 0; $xx <= $xl; $xx++) { 382 $m[$xx][-1] = $infinity; 383 } 384 385 for ($yy = 0; $yy <= $yl; $yy++) { 386 $m[-1][$yy] = $infinity; 387 } 388 } 389 390 // Initialize the top row of the matrix. 391 for ($xx = 0; $xx <= $xl; $xx++) { 392 $m[$xx][0] = $xx * $this->deleteCost; 393 } 394 395 // Initialize the left column of the matrix. 396 $cost = 0; 397 for ($yy = 0; $yy <= $yl; $yy++) { 398 $m[0][$yy] = $yy * $this->insertCost; 399 } 400 401 $use_types = ($this->computeString); 402 if ($use_types) { 403 $t = array(); 404 for ($xx = 0; $xx <= $xl; $xx++) { 405 $t[$xx][0] = 'd'; 406 } 407 for ($yy = 0; $yy <= $yl; $yy++) { 408 $t[0][$yy] = 'i'; 409 } 410 $t[0][0] = 's'; 411 } 412 413 $alt_cost = $this->getAlterCost(); 414 if ($alt_cost && !$use_types) { 415 throw new Exception( 416 pht( 417 'If you provide an alter cost with %s, you must enable '. 418 'type computation with %s.', 419 'setAlterCost()', 420 'setComputeStrings()')); 421 } 422 423 // Build the edit distance matrix. 424 for ($xx = 1; $xx <= $xl; $xx++) { 425 $last_dy = -1; 426 for ($yy = 1; $yy <= $yl; $yy++) { 427 if ($use_damerau) { 428 $dx = $d[$y[$yy - 1]]; 429 $dy = $last_dy; 430 } 431 432 if ($x[$xx - 1] === $y[$yy - 1]) { 433 $rep_cost = $m[$xx - 1][$yy - 1] + 0; 434 $rep_type = 's'; 435 } else { 436 $rep_cost = $m[$xx - 1][$yy - 1] + $this->replaceCost; 437 $rep_type = 'x'; 438 } 439 440 $del_cost = $m[$xx - 1][$yy ] + $this->deleteCost; 441 $ins_cost = $m[$xx ][$yy - 1] + $this->insertCost; 442 if ($alt_cost) { 443 $del_char = $t[$xx - 1][$yy ]; 444 $ins_char = $t[$xx ][$yy - 1]; 445 $rep_char = $t[$xx - 1][$yy - 1]; 446 447 if ($del_char !== 'd') { 448 $del_cost += $alt_cost; 449 } 450 if ($ins_char !== 'i') { 451 $ins_cost += $alt_cost; 452 } 453 if ($rep_char !== $rep_type) { 454 $rep_cost += $alt_cost; 455 } 456 } 457 458 if ($rep_cost <= $del_cost && $rep_cost <= $ins_cost) { 459 $cost = $rep_cost; 460 $type = $rep_type; 461 if ($rep_type === 's') { 462 $last_dy = $yy - 1; 463 } 464 } else if ($ins_cost <= $del_cost) { 465 $cost = $ins_cost; 466 $type = 'i'; 467 } else { 468 $cost = $del_cost; 469 $type = 'd'; 470 } 471 472 if ($use_damerau) { 473 $trn_count = (($xx - $dx - 2) + ($yy - $dy - 2) + 1); 474 $trn_cost = $m[$dx][$dy] + ($trn_count * $this->transposeCost); 475 if ($trn_cost < $cost) { 476 $cost = $trn_cost; 477 $type = array($dx + 1, $dy + 1, $type); 478 } 479 } 480 481 $m[$xx][$yy] = $cost; 482 if ($use_types) { 483 $t[$xx][$yy] = $type; 484 } 485 } 486 487 if ($use_damerau) { 488 $d[$x[$xx - 1]] = ($xx - 1); 489 } 490 } 491 492 $this->distanceMatrix = $m; 493 if ($use_types) { 494 $this->typeMatrix = $t; 495 } 496 } 497 498 private function getInfinity() { 499 return INF; 500 } 501 502 private function printMatrix(array $m) { 503 $x = $this->x; 504 $y = $this->y; 505 506 $p = '% 8s '; 507 printf($p, ' '); 508 foreach (head($m) as $yk => $yv) { 509 printf($p, idx($y, $yk - 1, '-')); 510 } 511 echo "\n"; 512 foreach ($m as $xk => $xv) { 513 printf($p, idx($x, $xk - 1, '-')); 514 foreach ($xv as $yk => $yv) { 515 printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv)); 516 } 517 echo "\n"; 518 } 519 } 520 521 private function printTypeMatrix(array $t) { 522 $x = $this->x; 523 $y = $this->y; 524 525 $p = '% 8s '; 526 printf($p, ' '); 527 foreach (head($t) as $yk => $yv) { 528 printf($p, idx($y, $yk - 1, '-')); 529 } 530 echo "\n"; 531 foreach ($t as $xk => $xv) { 532 printf($p, idx($x, $xk - 1, '-')); 533 foreach ($xv as $yk => $yv) { 534 printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv)); 535 } 536 echo "\n"; 537 } 538 } 539 540 private function applySmoothing($str, $full) { 541 if ($full) { 542 $prefix = '(^|[xdi])'; 543 $suffix = '([xdi]|\z)'; 544 } else { 545 $prefix = '([xdi])'; 546 $suffix = '([xdi])'; 547 } 548 549 // Smooth the string out, by replacing short runs of similar characters 550 // with 'x' operations. This makes the result more readable to humans, 551 // since there are fewer choppy runs of short added and removed substrings. 552 do { 553 $original = $str; 554 $str = preg_replace('/'.$prefix.'(s{3})'.$suffix.'/', '$1xxx$3', $str); 555 $str = preg_replace('/'.$prefix.'(s{2})'.$suffix.'/', '$1xx$3', $str); 556 $str = preg_replace('/'.$prefix.'(s{1})'.$suffix.'/', '$1x$3', $str); 557 } while ($str != $original); 558 559 return $str; 560 } 561 562} 563