1<?php
2
3/**
4 * Compute edit distance between two scalar sequences. This class uses
5 * Levenshtein (or Damerau-Levenshtein) to compute the edit distance between
6 * two inputs. The inputs are arrays containing any scalars (not just strings)
7 * so it can be used with, e.g., utf8 sequences.
8 *
9 *  $matrix = id(new PhutilEditDistanceMatrix())
10 *    ->setSequences(str_split('ran'), str_split('rat'));
11 *
12 *  $cost = $matrix->getEditDistance();
13 *
14 * Edit distance computation is slow and requires a large amount of memory;
15 * both are roughly O(N * M) in the length of the strings.
16 *
17 * You can customize the cost of insertions, deletions and replacements. By
18 * default, each has a cost of 1.
19 *
20 *   $matrix->setReplaceCost(1);
21 *
22 * By default, transpositions are not evaluated (i.e., the algorithm is
23 * Levenshtein). You can cause transpositions to be evaluated by setting a
24 * transpose cost (which will change the algorithm to Damerau-Levenshtein):
25 *
26 *   $matrix->setTransposeCost(1);
27 *
28 * You can also provide a cost to alter the type of operation being applied.
29 * Many strings have several equivalently expensive edit sequences, but one
30 * some sequences are more readable to humans than others. Providing a small
31 * cost to alter operation type tends to smooth out the sequence and produce
32 * long runs of a single operation, which are generally more readable. For
33 * example, these strings:
34 *
35 *   (x)
36 *   ((x))
37 *
38 * ...have edit strings "issis" and "isssi", which describe edit operations with
39 * the same cost, but the latter string is more comprehensible to human viewers.
40 *
41 * If you set an alter cost, you must call @{method:setComputeString} to enable
42 * type computation. The alter cost should generally be smaller than `c / N`,
43 * where `c` is the smallest operational cost and `N` is the length of the
44 * longest string. For example, if you are using the default costs (insert = 1,
45 * delete = 1, replace = 1) and computing edit distances for strings of fewer
46 * than 1,000 characters, you might set the alter cost to 0.001.
47 */
48final class PhutilEditDistanceMatrix extends Phobject {
49
50  private $insertCost    = 1;
51  private $deleteCost    = 1;
52  private $replaceCost   = 1;
53  private $transposeCost = null;
54  private $alterCost     = 0;
55  private $maximumLength;
56  private $computeString;
57  private $applySmoothing = self::SMOOTHING_NONE;
58  private $reachedMaximumLength;
59
60  private $x;
61  private $y;
62  private $prefix;
63  private $suffix;
64
65  private $distanceMatrix = null;
66  private $typeMatrix = null;
67
68  const SMOOTHING_NONE = 'none';
69  const SMOOTHING_INTERNAL = 'internal';
70  const SMOOTHING_FULL = 'full';
71
72  public function setMaximumLength($maximum_length) {
73    $this->maximumLength = $maximum_length;
74    return $this;
75  }
76
77  public function getMaximumLength() {
78    return coalesce($this->maximumLength, $this->getInfinity());
79  }
80
81  public function didReachMaximumLength() {
82    return $this->reachedMaximumLength;
83  }
84
85  public function setComputeString($compute_string) {
86    $this->computeString = $compute_string;
87    return $this;
88  }
89
90  public function getComputeString() {
91    return $this->computeString;
92  }
93
94  public function setTransposeCost($transpose_cost) {
95    $this->transposeCost = $transpose_cost;
96    return $this;
97  }
98
99  public function getTransposeCost() {
100    return $this->transposeCost;
101  }
102
103  public function setReplaceCost($replace_cost) {
104    $this->replaceCost = $replace_cost;
105    return $this;
106  }
107
108  public function getReplaceCost() {
109    return $this->replaceCost;
110  }
111
112  public function setDeleteCost($delete_cost) {
113    $this->deleteCost = $delete_cost;
114    return $this;
115  }
116
117  public function getDeleteCost() {
118    return $this->deleteCost;
119  }
120
121  public function setInsertCost($insert_cost) {
122    $this->insertCost = $insert_cost;
123    return $this;
124  }
125
126  public function getInsertCost() {
127    return $this->insertCost;
128  }
129
130  public function setAlterCost($alter_cost) {
131    $this->alterCost = $alter_cost;
132    return $this;
133  }
134
135  public function getAlterCost() {
136    return $this->alterCost;
137  }
138
139  public function setApplySmoothing($apply_smoothing) {
140    $this->applySmoothing = $apply_smoothing;
141    return $this;
142  }
143
144  public function getApplySmoothing() {
145    return $this->applySmoothing;
146  }
147
148  public function setSequences(array $x, array $y) {
149
150    // NOTE: We strip common prefixes and suffixes from the inputs because
151    // the runtime of the edit distance algorithm is large and it is common
152    // to diff similar strings.
153
154    $xl = count($x);
155    $yl = count($y);
156    $l = min($xl, $yl);
157
158    $prefix_l = 0;
159    $suffix_l = 0;
160
161    for ($ii = 0; $ii < $l; $ii++) {
162      if ($x[$ii] !== $y[$ii]) {
163        break;
164      }
165      $prefix_l++;
166    }
167
168    for ($ii = 1; $ii <= ($l - $prefix_l); $ii++) {
169      if ($x[$xl - $ii] !== $y[$yl - $ii]) {
170        break;
171      }
172      $suffix_l++;
173    }
174
175    $this->prefix = array_slice($x, 0, $prefix_l);
176    $this->suffix = array_slice($x, $xl - $suffix_l);
177
178    $this->x = array_slice($x, $prefix_l, $xl - ($suffix_l + $prefix_l));
179    $this->y = array_slice($y, $prefix_l, $yl - ($suffix_l + $prefix_l));
180    $this->distanceMatrix = null;
181
182    return $this;
183  }
184
185  private function requireSequences() {
186    if ($this->x === null) {
187      throw new PhutilInvalidStateException('setSequences');
188    }
189  }
190
191  public function getEditDistance() {
192    $this->requireSequences();
193
194    $x = $this->x;
195    $y = $this->y;
196
197    if (!$x) {
198      return $this->insertCost * count($y);
199    }
200
201    if (!$y) {
202      return $this->deleteCost * count($x);
203    }
204
205    $max = $this->getMaximumLength();
206    if (count($x) > $max || count($y) > $max) {
207      $this->reachedMaximumLength = true;
208      return ($this->insertCost * count($y)) + ($this->deleteCost * count($x));
209    }
210
211    if ($x === $y) {
212      return 0;
213    }
214
215    $matrix = $this->getDistanceMatrix();
216
217    return $matrix[count($x)][count($y)];
218  }
219
220  /**
221   * Return a string representing the edits between the sequences. The string
222   * has these characters:
223   *
224   *   - s (same): Same character in both strings.
225   *   - i (insert): Character inserted.
226   *   - d (delete): Character deleted.
227   *   - x (replace): Character replaced.
228   *   - t (transpose): Character transposed.
229   */
230  public function getEditString() {
231    $this->requireSequences();
232
233    $x = $this->x;
234    $y = $this->y;
235
236    if (!$x && !$y) {
237      return $this->padEditString('');
238    }
239
240    if (!$x) {
241      return $this->padEditString(str_repeat('i', count($y)));
242    }
243
244    if (!$y) {
245      return $this->padEditString(str_repeat('d', count($x)));
246    }
247
248    if ($x === $y) {
249      return $this->padEditString(str_repeat('s', count($x)));
250    }
251
252    $max = $this->getMaximumLength();
253    if (count($x) > $max || count($y) > $max) {
254      $this->reachedMaximumLength = true;
255      return $this->padEditString(
256        str_repeat('d', count($x)).
257        str_repeat('i', count($y)));
258    }
259
260    $matrix = $this->getTypeMatrix();
261
262    $xx = count($x);
263    $yy = count($y);
264
265    $transposes = array();
266    $str = '';
267    while (true) {
268      $type = $matrix[$xx][$yy];
269
270      if (is_array($type)) {
271        $chr = 't';
272        $transposes[$type[0]][$type[1]] = true;
273        $type = $type[2];
274      } else {
275        $chr = $type;
276      }
277
278      if (isset($transposes[$xx][$yy])) {
279        $chr = 't';
280      }
281
282      if ($type == 's' || $type == 'x') {
283        $xx -= 1;
284        $yy -= 1;
285      } else if ($type == 'i') {
286        $yy -= 1;
287      } else if ($type == 'd') {
288        $xx -= 1;
289      } else {
290        throw new Exception(pht("Unknown type '%s' in type matrix.", $type));
291      }
292
293      $str .= $chr;
294
295      if ($xx <= 0 && $yy <= 0) {
296        break;
297      }
298    }
299
300    $str = strrev($str);
301
302    // For full smoothing, we pad the edit string before smoothing it, so
303    // ranges of similar characters at the beginning or end of the string can
304    // also be smoothed.
305
306    // For internal smoothing, we only smooth ranges within the change itself.
307
308    $smoothing = $this->getApplySmoothing();
309    switch ($smoothing) {
310      case self::SMOOTHING_FULL:
311        $str = $this->padEditString($str);
312        $str = $this->applySmoothing($str, true);
313        break;
314      case self::SMOOTHING_INTERNAL:
315        $str = $this->applySmoothing($str, false);
316        $str = $this->padEditString($str);
317        break;
318      case self::SMOOTHING_NONE:
319        $str = $this->padEditString($str);
320        break;
321      default:
322        throw new Exception(
323          pht(
324            'Unknown smoothing type "%s".',
325            $smoothing));
326    }
327
328    return $str;
329  }
330
331  private function padEditString($str) {
332    if ($this->prefix) {
333      $str = str_repeat('s', count($this->prefix)).$str;
334    }
335    if ($this->suffix) {
336      $str = $str.str_repeat('s', count($this->suffix));
337    }
338    return $str;
339  }
340
341  private function getTypeMatrix() {
342    if (!$this->computeString) {
343      throw new PhutilInvalidStateException('setComputeString');
344    }
345    if ($this->typeMatrix === null) {
346      $this->computeMatrix($this->x, $this->y);
347    }
348    return $this->typeMatrix;
349  }
350
351  private function getDistanceMatrix() {
352    if ($this->distanceMatrix === null) {
353      $this->computeMatrix($this->x, $this->y);
354    }
355    return $this->distanceMatrix;
356  }
357
358  private function computeMatrix(array $x, array $y) {
359    $xl = count($x);
360    $yl = count($y);
361
362    $m = array();
363
364    $infinity = $this->getInfinity();
365
366    $use_damerau = ($this->transposeCost !== null);
367    if ($use_damerau) {
368      // Initialize the default cost of a transpose.
369      $m[-1][-1] = $infinity;
370
371      // Initialize the character position dictionary for Damerau.
372      $d = array();
373      foreach ($x as $c) {
374        $d[$c] = -1;
375      }
376      foreach ($y as $c) {
377        $d[$c] = -1;
378      }
379
380      // Initialize the transpose costs for Damerau.
381      for ($xx = 0; $xx <= $xl; $xx++) {
382        $m[$xx][-1] = $infinity;
383      }
384
385      for ($yy = 0; $yy <= $yl; $yy++) {
386        $m[-1][$yy] = $infinity;
387      }
388    }
389
390    // Initialize the top row of the matrix.
391    for ($xx = 0; $xx <= $xl; $xx++) {
392      $m[$xx][0] = $xx * $this->deleteCost;
393    }
394
395    // Initialize the left column of the matrix.
396    $cost = 0;
397    for ($yy = 0; $yy <= $yl; $yy++) {
398      $m[0][$yy] = $yy * $this->insertCost;
399    }
400
401    $use_types = ($this->computeString);
402    if ($use_types) {
403      $t = array();
404      for ($xx = 0; $xx <= $xl; $xx++) {
405        $t[$xx][0] = 'd';
406      }
407      for ($yy = 0; $yy <= $yl; $yy++) {
408        $t[0][$yy] = 'i';
409      }
410      $t[0][0] = 's';
411    }
412
413    $alt_cost = $this->getAlterCost();
414    if ($alt_cost && !$use_types) {
415      throw new Exception(
416        pht(
417          'If you provide an alter cost with %s, you must enable '.
418          'type computation with %s.',
419          'setAlterCost()',
420          'setComputeStrings()'));
421    }
422
423    // Build the edit distance matrix.
424    for ($xx = 1; $xx <= $xl; $xx++) {
425      $last_dy = -1;
426      for ($yy = 1; $yy <= $yl; $yy++) {
427        if ($use_damerau) {
428          $dx = $d[$y[$yy - 1]];
429          $dy = $last_dy;
430        }
431
432        if ($x[$xx - 1] === $y[$yy - 1]) {
433          $rep_cost = $m[$xx - 1][$yy - 1] + 0;
434          $rep_type = 's';
435        } else {
436          $rep_cost = $m[$xx - 1][$yy - 1] + $this->replaceCost;
437          $rep_type = 'x';
438        }
439
440        $del_cost = $m[$xx - 1][$yy    ] + $this->deleteCost;
441        $ins_cost = $m[$xx    ][$yy - 1] + $this->insertCost;
442        if ($alt_cost) {
443          $del_char = $t[$xx - 1][$yy    ];
444          $ins_char = $t[$xx    ][$yy - 1];
445          $rep_char = $t[$xx - 1][$yy - 1];
446
447          if ($del_char !== 'd') {
448            $del_cost += $alt_cost;
449          }
450          if ($ins_char !== 'i') {
451            $ins_cost += $alt_cost;
452          }
453          if ($rep_char !== $rep_type) {
454            $rep_cost += $alt_cost;
455          }
456        }
457
458        if ($rep_cost <= $del_cost && $rep_cost <= $ins_cost) {
459          $cost = $rep_cost;
460          $type = $rep_type;
461          if ($rep_type === 's') {
462            $last_dy = $yy - 1;
463          }
464        } else if ($ins_cost <= $del_cost) {
465          $cost = $ins_cost;
466          $type = 'i';
467        } else {
468          $cost = $del_cost;
469          $type = 'd';
470        }
471
472        if ($use_damerau) {
473          $trn_count = (($xx - $dx - 2) + ($yy - $dy - 2) + 1);
474          $trn_cost = $m[$dx][$dy] + ($trn_count * $this->transposeCost);
475          if ($trn_cost < $cost) {
476            $cost = $trn_cost;
477            $type = array($dx + 1, $dy + 1, $type);
478          }
479        }
480
481        $m[$xx][$yy] = $cost;
482        if ($use_types) {
483          $t[$xx][$yy] = $type;
484        }
485      }
486
487      if ($use_damerau) {
488        $d[$x[$xx - 1]] = ($xx - 1);
489      }
490    }
491
492    $this->distanceMatrix = $m;
493    if ($use_types) {
494      $this->typeMatrix = $t;
495    }
496  }
497
498  private function getInfinity() {
499    return INF;
500  }
501
502  private function printMatrix(array $m) {
503    $x = $this->x;
504    $y = $this->y;
505
506    $p = '% 8s ';
507    printf($p, ' ');
508    foreach (head($m) as $yk => $yv) {
509      printf($p, idx($y, $yk - 1, '-'));
510    }
511    echo "\n";
512    foreach ($m as $xk => $xv) {
513      printf($p, idx($x, $xk - 1, '-'));
514      foreach ($xv as $yk => $yv) {
515        printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
516      }
517      echo "\n";
518    }
519  }
520
521  private function printTypeMatrix(array $t) {
522    $x = $this->x;
523    $y = $this->y;
524
525    $p = '% 8s ';
526    printf($p, ' ');
527    foreach (head($t) as $yk => $yv) {
528      printf($p, idx($y, $yk - 1, '-'));
529    }
530    echo "\n";
531    foreach ($t as $xk => $xv) {
532      printf($p, idx($x, $xk - 1, '-'));
533      foreach ($xv as $yk => $yv) {
534        printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
535      }
536      echo "\n";
537    }
538  }
539
540  private function applySmoothing($str, $full) {
541    if ($full) {
542      $prefix = '(^|[xdi])';
543      $suffix = '([xdi]|\z)';
544    } else {
545      $prefix = '([xdi])';
546      $suffix = '([xdi])';
547    }
548
549    // Smooth the string out, by replacing short runs of similar characters
550    // with 'x' operations. This makes the result more readable to humans,
551    // since there are fewer choppy runs of short added and removed substrings.
552    do {
553      $original = $str;
554      $str = preg_replace('/'.$prefix.'(s{3})'.$suffix.'/', '$1xxx$3', $str);
555      $str = preg_replace('/'.$prefix.'(s{2})'.$suffix.'/', '$1xx$3', $str);
556      $str = preg_replace('/'.$prefix.'(s{1})'.$suffix.'/', '$1x$3', $str);
557    } while ($str != $original);
558
559    return $str;
560  }
561
562}
563