1 // The MIT License (MIT)
2 //
3 // Copyright (c) 2020 Max Bachmann
4 // Copyright (c) 2014 Jean-Bernard Jansen
5 //
6 // Permission is hereby granted, free of charge, to any person obtaining a copy
7 // of this software and associated documentation files (the "Software"), to deal
8 // in the Software without restriction, including without limitation the rights
9 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 // copies of the Software, and to permit persons to whom the Software is
11 // furnished to do so, subject to the following conditions:
12 //
13 // The above copyright notice and this permission notice shall be included in
14 // all copies or substantial portions of the Software.
15 //
16 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 // THE SOFTWARE.
23 
24 #pragma once
25 
26 #include <rapidfuzz/details/common.hpp>
27 
28 #include <unordered_map>
29 #include <algorithm>
30 #include <tuple>
31 #include <algorithm>
32 #include <vector>
33 
34 namespace rapidfuzz {
35 namespace detail {
36 struct MatchingBlock {
37   std::size_t spos;
38   std::size_t dpos;
39   std::size_t length;
MatchingBlockrapidfuzz::detail::MatchingBlock40   MatchingBlock(std::size_t aSPos, std::size_t aDPos, std::size_t aLength)
41       : spos(aSPos), dpos(aDPos), length(aLength)
42   {}
43 };
44 
45 
46 namespace difflib {
47 
48 template <typename Sentence1, typename Sentence2>
49 class SequenceMatcher {
50  public:
51   using match_t = std::tuple<size_t, size_t, size_t>;
52 
SequenceMatcher(Sentence1 const & a,Sentence2 const & b)53   SequenceMatcher(Sentence1 const& a, Sentence2 const& b)
54   : a_(a), b_(b) {
55     j2len_.resize(b.size()+1);
56   }
57 
find_longest_match(size_t a_low,size_t a_high,size_t b_low,size_t b_high)58   match_t find_longest_match(size_t a_low, size_t a_high, size_t b_low, size_t b_high) {
59     size_t best_i = a_low;
60     size_t best_j = b_low;
61     size_t best_size = 0;
62 
63     // Find longest junk free match
64     {
65       for(size_t i = a_low; i < a_high; ++i) {
66         std::size_t last_cache = 0;
67         for(size_t j = b_low; j < b_high; ++j) {
68           if (common::mixed_sign_unequal(b_[j], a_[i])) {
69             j2len_[j] = last_cache;
70             last_cache = 0;
71             continue;
72           }
73 
74           size_t k = j2len_[j] + 1;
75           j2len_[j] = last_cache;
76           last_cache = k;
77           if (k > best_size) {
78             best_i = i - k + 1;
79             best_j = j - k + 1;
80             best_size = k;
81           }
82         }
83       }
84 
85       // we never write to the first element
86       for(size_t j = b_low+1; j < b_high; ++j) {
87         j2len_[j] = 0;
88       }
89     }
90 
91     while (best_i > a_low && best_j > b_low && common::mixed_sign_equal(a_[best_i-1], b_[best_j-1])) {
92       --best_i;
93       --best_j;
94       ++best_size;
95     }
96 
97     while ((best_i+best_size) < a_high && (best_j+best_size) < b_high
98            && common::mixed_sign_equal(a_[best_i+best_size], b_[best_j+best_size]))
99     {
100       ++best_size;
101     }
102 
103     return std::make_tuple(best_i, best_j, best_size);
104   }
105 
get_matching_blocks()106   std::vector<MatchingBlock> get_matching_blocks() {
107     // The following are tuple extracting aliases
108     std::vector<std::tuple<size_t, size_t, size_t, size_t>> queue;
109     std::vector<match_t> matching_blocks_pass1;
110 
111     std::size_t queue_head = 0;
112     queue.reserve(std::min(a_.size(), b_.size()));
113     queue.emplace_back(0, a_.size(), 0, b_.size());
114 
115     while(queue_head < queue.size()) {
116       size_t a_low, a_high, b_low, b_high;
117       std::tie(a_low, a_high, b_low, b_high) = queue[queue_head++];
118       std::size_t spos, dpos, length;
119       std::tie(spos, dpos, length) = find_longest_match(a_low, a_high, b_low, b_high);
120       if (length) {
121         if (a_low < spos && b_low < dpos) {
122           queue.emplace_back(a_low, spos, b_low, dpos);
123         }
124         if ((spos + length) < a_high && (dpos + length) < b_high) {
125           queue.emplace_back(spos + length, a_high, dpos + length, b_high);
126         }
127         matching_blocks_pass1.emplace_back(spos, dpos, length);
128       }
129     }
130     std::sort(std::begin(matching_blocks_pass1), std::end(matching_blocks_pass1));
131 
132     std::vector<MatchingBlock> matching_blocks;
133 
134     matching_blocks.reserve(matching_blocks_pass1.size());
135 
136     size_t i1, j1, k1;
137     i1 = j1 = k1 = 0;
138 
139     for(match_t const& m : matching_blocks_pass1) {
140       if (i1 + k1 == std::get<0>(m) && j1 + k1 == std::get<1>(m)) {
141         k1 += std::get<2>(m);
142       }
143       else {
144         if (k1) matching_blocks.emplace_back(i1, j1, k1);
145         std::tie(i1, j1, k1) = m;
146       }
147     }
148     if (k1) matching_blocks.emplace_back(i1, j1, k1);
149     matching_blocks.emplace_back(a_.size(), b_.size(), 0);
150 
151     return matching_blocks;
152   }
153 
154 protected:
155   Sentence1 a_;
156   Sentence2 b_;
157 
158 private:
159   // Cache to avoid reallocations
160   std::vector<size_t> j2len_;
161 };
162 
163 }  // namespace difflib
164 
165 /* TODO this implementation is broken. The LCS part works fine, but extracting the longest sequences does not work
166  * properly
167  */
168 #if 0
169 template<typename Sentence1, typename BlockPatternCharT,  typename Sentence2>
170 std::vector<MatchingBlock> longest_common_subsequence(Sentence1 s1, const common::PatternMatchVector<BlockPatternCharT>& blockmap_s1, Sentence2 s2) {
171   if (s1.size() > 64) {
172     return difflib::SequenceMatcher<Sentence1, Sentence2>(s1, s2).get_matching_blocks();
173   }
174 
175   std::vector<rapidfuzz::MatchingBlock> matching_blocks;
176 
177   // Hyyrö, Heikki. (2004). A Note on Bit-Parallel Alignment Computation. 79-87.
178   // build LCS Matrix
179   std::uint64_t S = ~0x0ull;
180   std::vector<std::uint64_t> Vs;
181   Vs.reserve(s2.size());
182 
183   for (std::size_t j = 0; j < s2.size(); ++j) {
184     uint64_t Matches = blockmap_s1.get(s2[j]);
185     uint64_t u = S & Matches;
186     S = (S + u) | (S - u);
187     Vs.push_back(S);
188   }
189 
190   std::size_t pos_s1 = s1.size() - 1;
191   std::size_t pos_s2 = s2.size() - 1;
192 
193   std::size_t length = 0;
194 
195   while(pos_s1 != (std::size_t)-1 && pos_s2 != (std::size_t)-1) {
196     if (Vs[pos_s2] & (0x1ull << pos_s1)) {
197       if (length) {
198         matching_blocks.emplace_back(pos_s1 + 1, pos_s2 + 1, length);
199         length = 0;
200       }
201       --pos_s1;
202     } else {
203       if (!(pos_s2 && ~Vs[pos_s2-1] & (0x1ull << pos_s1))) {
204         ++length;
205         --pos_s1;
206       } else {
207         if (length) {
208           matching_blocks.emplace_back(pos_s1 + 1, pos_s2 + 1, length);
209           length = 0;
210         }
211       }
212       --pos_s2;
213     }
214   }
215 
216   if (length) {
217     matching_blocks.emplace_back(pos_s1 + 1, pos_s2 + 1, length);
218   }
219 
220   matching_blocks.emplace_back(s1.size(), s2.size(), 0);
221 
222   return matching_blocks;
223 }
224 #endif
225 
226 template<typename Sentence1, /*typename BlockPatternCharT,*/ typename Sentence2>
get_matching_blocks(Sentence1 s1,Sentence2 s2)227 std::vector<MatchingBlock> get_matching_blocks(Sentence1 s1, /*const common::PatternMatchVector<BlockPatternCharT>& blockmap_s1,*/ Sentence2 s2) {
228   //return longest_common_subsequence(s1, blockmap_s1, s2);
229   return difflib::SequenceMatcher<Sentence1, Sentence2>(s1, s2).get_matching_blocks();
230 }
231 
232 } /* namespace detail */
233 } /* namespace rapidfuzz */
234