1 //===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file defines a Levenshtein distance function that works for any two 11 /// sequences, with each element of each sequence being analogous to a character 12 /// in a string. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #ifndef LLVM_ADT_EDIT_DISTANCE_H 17 #define LLVM_ADT_EDIT_DISTANCE_H 18 19 #include "llvm/ADT/ArrayRef.h" 20 #include <algorithm> 21 #include <memory> 22 23 namespace llvm { 24 25 /// Determine the edit distance between two sequences. 26 /// 27 /// \param FromArray the first sequence to compare. 28 /// 29 /// \param ToArray the second sequence to compare. 30 /// 31 /// \param Map A Functor to apply to each item of the sequences before 32 /// comparison. 33 /// 34 /// \param AllowReplacements whether to allow element replacements (change one 35 /// element into another) as a single operation, rather than as two operations 36 /// (an insertion and a removal). 37 /// 38 /// \param MaxEditDistance If non-zero, the maximum edit distance that this 39 /// routine is allowed to compute. If the edit distance will exceed that 40 /// maximum, returns \c MaxEditDistance+1. 41 /// 42 /// \returns the minimum number of element insertions, removals, or (if 43 /// \p AllowReplacements is \c true) replacements needed to transform one of 44 /// the given sequences into the other. If zero, the sequences are identical. 45 template <typename T, typename Functor> 46 unsigned ComputeMappedEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, 47 Functor Map, bool AllowReplacements = true, 48 unsigned MaxEditDistance = 0) { 49 // The algorithm implemented below is the "classic" 50 // dynamic-programming algorithm for computing the Levenshtein 51 // distance, which is described here: 52 // 53 // http://en.wikipedia.org/wiki/Levenshtein_distance 54 // 55 // Although the algorithm is typically described using an m x n 56 // array, only one row plus one element are used at a time, so this 57 // implementation just keeps one vector for the row. To update one entry, 58 // only the entries to the left, top, and top-left are needed. The left 59 // entry is in Row[x-1], the top entry is what's in Row[x] from the last 60 // iteration, and the top-left entry is stored in Previous. 61 typename ArrayRef<T>::size_type m = FromArray.size(); 62 typename ArrayRef<T>::size_type n = ToArray.size(); 63 64 if (MaxEditDistance) { 65 // If the difference in size between the 2 arrays is larger than the max 66 // distance allowed, we can bail out as we will always need at least 67 // MaxEditDistance insertions or removals. 68 typename ArrayRef<T>::size_type AbsDiff = m > n ? m - n : n - m; 69 if (AbsDiff > MaxEditDistance) 70 return MaxEditDistance + 1; 71 } 72 73 const unsigned SmallBufferSize = 64; 74 unsigned SmallBuffer[SmallBufferSize]; 75 std::unique_ptr<unsigned[]> Allocated; 76 unsigned *Row = SmallBuffer; 77 if (n + 1 > SmallBufferSize) { 78 Row = new unsigned[n + 1]; 79 Allocated.reset(Row); 80 } 81 82 for (unsigned i = 1; i <= n; ++i) 83 Row[i] = i; 84 85 for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) { 86 Row[0] = y; 87 unsigned BestThisRow = Row[0]; 88 89 unsigned Previous = y - 1; 90 const auto &CurItem = Map(FromArray[y - 1]); 91 for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) { 92 int OldRow = Row[x]; 93 if (AllowReplacements) { 94 Row[x] = std::min(Previous + (CurItem == Map(ToArray[x - 1]) ? 0u : 1u), 95 std::min(Row[x - 1], Row[x]) + 1); 96 } 97 else { 98 if (CurItem == Map(ToArray[x - 1])) 99 Row[x] = Previous; 100 else Row[x] = std::min(Row[x-1], Row[x]) + 1; 101 } 102 Previous = OldRow; 103 BestThisRow = std::min(BestThisRow, Row[x]); 104 } 105 106 if (MaxEditDistance && BestThisRow > MaxEditDistance) 107 return MaxEditDistance + 1; 108 } 109 110 unsigned Result = Row[n]; 111 return Result; 112 } 113 114 template <typename T> 115 unsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, 116 bool AllowReplacements = true, 117 unsigned MaxEditDistance = 0) { 118 return ComputeMappedEditDistance( 119 FromArray, ToArray, [](const T &X) -> const T & { return X; }, 120 AllowReplacements, MaxEditDistance); 121 } 122 123 } // End llvm namespace 124 125 #endif 126