1 /*
2 SPDX-FileCopyrightText: 2003 Otto Bruggeman <bruggie@home.nl>
3 SPDX-FileCopyrightText: 2011 Dmitry Risenberg <dmitry.risenberg@gmail.com>
4
5 SPDX-License-Identifier: GPL-2.0-or-later
6 */
7
8 #ifndef LEVENSHTEIN_H
9 #define LEVENSHTEIN_H
10
11 #include <iostream>
12 // #include <QString>
13 // #include <komparediffdebug.h>
14
15 namespace Diff2 {
16
17 class Marker;
18
19
20 class Marker;
21
22 /**
23 * Computes the Levenshtein distance between two sequences.
24 * The actual sequence contents must be prepended with one virtual item each for easier index access.
25 */
26 template<class SequencePair> class LevenshteinTable
27 {
28 public:
29 LevenshteinTable();
30 LevenshteinTable(unsigned int width, unsigned int height);
31 ~LevenshteinTable();
32
33 public:
34 int getContent(unsigned int posX, unsigned int posY) const;
35 int setContent(unsigned int posX, unsigned int posY, int value);
36 bool setSize(unsigned int width, unsigned int height);
37
width()38 unsigned int width() const { return m_width; };
height()39 unsigned int height() const { return m_height; };
40
41 /** Debug method to check if the table is properly filled */
42 void dumpLevenshteinTable(void);
43
44 /**
45 * This calculates the levenshtein distance of 2 sequences.
46 * This object takes ownership of the argument
47 */
48 unsigned int createTable(SequencePair* sequences);
49
50 void createListsOfMarkers(void);
51 int chooseRoute(int c1, int c2, int c3, int current);
52
53 protected:
54 LevenshteinTable(const LevenshteinTable& table);
55 const LevenshteinTable& operator = (const LevenshteinTable& table);
56
57 private:
58 unsigned int m_width;
59 unsigned int m_height;
60 unsigned int m_size;
61 unsigned int* m_table;
62 SequencePair* m_sequences;
63 };
64
LevenshteinTable()65 template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable()
66 : m_width(256),
67 m_height(256),
68 m_size(m_height* m_width),
69 m_table(new unsigned int[ m_size ]),
70 m_sequences(nullptr)
71 {
72 }
73
LevenshteinTable(unsigned int width,unsigned int height)74 template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable(unsigned int width, unsigned int height)
75 : m_width(width),
76 m_height(height),
77 m_size(m_width* m_height),
78 m_table(new unsigned int[ m_size ]),
79 m_sequences(0)
80 {
81 }
82
~LevenshteinTable()83 template<class SequencePair> LevenshteinTable<SequencePair>::~LevenshteinTable()
84 {
85 delete[] m_table;
86 delete m_sequences;
87 }
88
getContent(unsigned int posX,unsigned int posY)89 template<class SequencePair> int LevenshteinTable<SequencePair>::getContent(unsigned int posX, unsigned int posY) const
90 {
91 // qCDebug(LIBKOMPAREDIFF2) << "Width = " << m_width << ", height = " << m_height << ", posX = " << posX << ", posY = " << posY;
92 return m_table[ posY * m_width + posX ];
93 }
94
setContent(unsigned int posX,unsigned int posY,int value)95 template<class SequencePair> int LevenshteinTable<SequencePair>::setContent(unsigned int posX, unsigned int posY, int value)
96 {
97 m_table[ posY * m_width + posX ] = value;
98
99 return 0;
100 }
101
setSize(unsigned int width,unsigned int height)102 template<class SequencePair> bool LevenshteinTable<SequencePair>::setSize(unsigned int width, unsigned int height)
103 {
104 // Set a limit of 16.7 million entries, will be about 64 MB of ram, that should be plenty
105 if (((width) * (height)) > (256 * 256 * 256))
106 return false;
107
108 if (((width) * (height)) > m_size)
109 {
110 delete[] m_table;
111
112 m_size = width * height;
113 m_table = new unsigned int[ m_size ];
114 }
115
116 m_width = width;
117 m_height = height;
118
119 return true;
120 }
121
dumpLevenshteinTable()122 template<class SequencePair> void LevenshteinTable<SequencePair>::dumpLevenshteinTable()
123 {
124 for (unsigned int i = 0; i < m_height; ++i)
125 {
126 for (unsigned int j = 0; j < m_width; ++j)
127 {
128 std::cout.width(3);
129 std::cout << getContent(j, i);
130 }
131 std::cout << std::endl;
132 }
133 }
134
createTable(SequencePair * sequences)135 template<class SequencePair> unsigned int LevenshteinTable<SequencePair>::createTable(SequencePair* sequences)
136 {
137 m_sequences = sequences;
138 unsigned int m = m_sequences->lengthFirst();
139 unsigned int n = m_sequences->lengthSecond();
140
141 if (!setSize(m, n))
142 return 0;
143
144 unsigned int i;
145 unsigned int j;
146
147 // initialize first row
148 for (i = 0; i < m; ++i)
149 setContent(i, 0, i);
150 // initialize first column
151 for (j = 0; j < n; ++j)
152 setContent(0, j, j);
153
154 int cost = 0, north = 0, west = 0, northwest = 0;
155
156 // Optimization, calculate row wise instead of column wise, wont trash the cache so much with large strings
157 for (j = 1; j < n; ++j)
158 {
159 for (i = 1; i < m; ++i)
160 {
161 if (m_sequences->equal(i, j))
162 cost = 0;
163 else
164 cost = SequencePair::allowReplace ? 1 : 2;
165
166 north = getContent(i, j - 1) + 1;
167 west = getContent(i - 1, j) + 1;
168 northwest = getContent(i - 1, j - 1) + cost;
169
170 setContent(i, j, qMin(north, qMin(west, northwest)));
171 }
172 }
173
174 return getContent(m - 1, n - 1);
175 }
176
chooseRoute(int c1,int c2,int c3,int current)177 template<class SequencePair> int LevenshteinTable<SequencePair>::chooseRoute(int c1, int c2, int c3, int current)
178 {
179 // qCDebug(LIBKOMPAREDIFF2) << "c1 = " << c1 << ", c2 = " << c2 << ", c3 = " << c3;
180 // preference order: c2, c3, c1, hopefully this will work out for me
181 if (c2 <= c1 && c2 <= c3)
182 {
183 if (SequencePair::allowReplace || (c2 == current))
184 {
185 return 1;
186 }
187 }
188
189 if (c3 <= c2 && c3 <= c1)
190 return 2;
191
192 return 0;
193 }
194
createListsOfMarkers()195 template<class SequencePair> void LevenshteinTable<SequencePair>::createListsOfMarkers()
196 {
197 // qCDebug(LIBKOMPAREDIFF2) << source;
198 // qCDebug(LIBKOMPAREDIFF2) << destination;
199 // dumpLevenshteinTable();
200
201 unsigned int x = m_width - 1;
202 unsigned int y = m_height - 1;
203
204 unsigned int difference = getContent(x, y);
205
206 // If the number of differences is more than half the length of the largest string
207 // don't bother to mark the individual changes
208 // Patch based on work by Felix Berger as put as attachment to bug 75794
209 if (!m_sequences->needFineGrainedOutput(difference))
210 {
211 m_sequences->prependFirst(new Marker(Marker::End, x));
212 m_sequences->prependFirst(new Marker(Marker::Start, 0));
213 m_sequences->prependSecond(new Marker(Marker::End, y));
214 m_sequences->prependSecond(new Marker(Marker::Start, 0));
215 return;
216 }
217
218 Marker* c = nullptr;
219
220 int n, nw, w, direction, currentValue;
221 while (x > 0 && y > 0)
222 {
223 currentValue = getContent(x, y);
224
225 n = getContent(x, y - 1);
226 w = getContent(x - 1, y);
227 nw = getContent(x - 1, y - 1);
228 direction = chooseRoute(n, nw, w, currentValue);
229
230 switch (direction)
231 {
232 case 0: // north
233 // qCDebug(LIBKOMPAREDIFF2) << "Picking north";
234 // qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
235
236 if (!m_sequences->markerListSecond().isEmpty())
237 c = m_sequences->markerListSecond().first();
238 else
239 c = nullptr;
240
241 if (c && c->type() == Marker::End)
242 {
243 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
244 if (n == currentValue)
245 m_sequences->prependSecond(new Marker(Marker::Start, y));
246 // else: the change continues, do not do anything
247 }
248 else
249 {
250 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
251 if (n < currentValue)
252 m_sequences->prependSecond(new Marker(Marker::End, y));
253 }
254
255 --y;
256 if (y == 0) {
257 m_sequences->prependSecond(new Marker(Marker::Start, 0));
258 }
259 break;
260 case 1: // northwest
261 // qCDebug(LIBKOMPAREDIFF2) << "Picking northwest";
262 // qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
263
264 if (!m_sequences->markerListSecond().isEmpty())
265 c = m_sequences->markerListSecond().first();
266 else
267 c = nullptr;
268
269 if (c && c->type() == Marker::End)
270 {
271 // qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
272 if (nw == currentValue)
273 m_sequences->prependSecond(new Marker(Marker::Start, y));
274 // else: the change continues, do not do anything
275 }
276 else
277 {
278 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
279 if (nw < currentValue)
280 m_sequences->prependSecond(new Marker(Marker::End, y));
281 }
282
283 if (!m_sequences->markerListFirst().isEmpty())
284 c = m_sequences->markerListFirst().first();
285 else
286 c = nullptr;
287
288 if (c && c->type() == Marker::End)
289 {
290 // qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
291 if (nw == currentValue)
292 m_sequences->prependFirst(new Marker(Marker::Start, x));
293 // else: the change continues, do not do anything
294 }
295 else
296 {
297 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
298 if (nw < currentValue)
299 m_sequences->prependFirst(new Marker(Marker::End, x));
300 }
301
302 --y;
303 --x;
304 break;
305 case 2: // west
306 // qCDebug(LIBKOMPAREDIFF2) << "Picking west";
307 // qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
308
309 if (!m_sequences->markerListFirst().isEmpty())
310 c = m_sequences->markerListFirst().first();
311 else
312 c = nullptr;
313
314 if (c && c->type() == Marker::End)
315 {
316 // qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
317 if (w == currentValue)
318 m_sequences->prependFirst(new Marker(Marker::Start, x));
319 // else: the change continues, do not do anything
320 }
321 else
322 {
323 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
324 if (w < currentValue)
325 m_sequences->prependFirst(new Marker(Marker::End, x));
326 }
327
328 --x;
329 if (x == 0) {
330 m_sequences->prependFirst(new Marker(Marker::Start, 0));
331 }
332 break;
333 }
334 }
335
336 // When leaving the loop it does not mean both are 0! If not there is still a change at the beginning of the line we missed so adding now.
337 if (x != 0)
338 {
339 m_sequences->prependFirst(new Marker(Marker::End, x));
340 m_sequences->prependFirst(new Marker(Marker::Start, 0));
341 }
342
343 if (y != 0)
344 {
345 m_sequences->prependSecond(new Marker(Marker::End, y));
346 m_sequences->prependSecond(new Marker(Marker::Start, 0));
347 }
348
349 // qCDebug(LIBKOMPAREDIFF2) << "Source string: " << source;
350
351 // QStringList list;
352 // int prevValue = 0;
353 // MarkerListConstIterator mit = m_sequences->markerListFirst().begin();
354 // MarkerListConstIterator end = m_sequences->markerListFirst().end();
355 // for ( ; mit != end; ++mit )
356 // {
357 // c = *mit;
358 // qCDebug(LIBKOMPAREDIFF2) << "Source Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
359 // list.append( source.mid( prevValue, c->offset() - prevValue ) );
360 // prevValue = c->offset();
361 // }
362 // if ( prevValue < source.length() - 1 )
363 // {
364 // list.append( source.mid( prevValue, source.length() - prevValue ) );
365 // }
366 // qCDebug(LIBKOMPAREDIFF2) << "Source Resulting stringlist : " << list.join("\n");
367 //
368 // list.clear();
369 // prevValue = 0;
370 //
371 // qCDebug(LIBKOMPAREDIFF2) << "Destination string: " << destination;
372 // mit = m_sequences->markerListSecond().begin();
373 // end = m_sequences->markerListSecond().end();
374 // for ( ; mit != end; ++mit )
375 // {
376 // c = *mit;
377 // qCDebug(LIBKOMPAREDIFF2) << "Destination Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
378 // list.append( destination.mid( prevValue, c->offset() - prevValue ) );
379 // prevValue = c->offset();
380 // }
381 // if ( prevValue < destination.length() - 1 )
382 // {
383 // list.append( destination.mid( prevValue, destination.length() - prevValue ) );
384 // }
385 // qCDebug(LIBKOMPAREDIFF2) << "Destination Resulting string : " << list.join("\n");
386 }
387
388 } // namespace Diff2
389
390 #endif // LEVENSHTEIN_H
391