1 /*
2 SPDX-FileCopyrightText: 2003 Otto Bruggeman <bruggie@home.nl>
3 SPDX-FileCopyrightText: 2011 Dmitry Risenberg <dmitry.risenberg@gmail.com>
4 
5 SPDX-License-Identifier: GPL-2.0-or-later
6 */
7 
8 #ifndef LEVENSHTEIN_H
9 #define LEVENSHTEIN_H
10 
11 #include <iostream>
12 // #include <QString>
13 // #include <komparediffdebug.h>
14 
15 namespace Diff2 {
16 
17 class Marker;
18 
19 
20 class Marker;
21 
22 /**
23  * Computes the Levenshtein distance between two sequences.
24  * The actual sequence contents must be prepended with one virtual item each for easier index access.
25  */
26 template<class SequencePair> class LevenshteinTable
27 {
28 public:
29     LevenshteinTable();
30     LevenshteinTable(unsigned int width, unsigned int height);
31     ~LevenshteinTable();
32 
33 public:
34     int  getContent(unsigned int posX, unsigned int posY) const;
35     int  setContent(unsigned int posX, unsigned int posY, int value);
36     bool setSize(unsigned int width, unsigned int height);
37 
width()38     unsigned int width()  const { return m_width; };
height()39     unsigned int height() const { return m_height; };
40 
41     /** Debug method  to check if the table is properly filled */
42     void dumpLevenshteinTable(void);
43 
44     /**
45      * This calculates the levenshtein distance of 2 sequences.
46      * This object takes ownership of the argument
47      */
48     unsigned int createTable(SequencePair* sequences);
49 
50     void createListsOfMarkers(void);
51     int chooseRoute(int c1, int c2, int c3, int current);
52 
53 protected:
54     LevenshteinTable(const LevenshteinTable& table);
55     const LevenshteinTable& operator = (const LevenshteinTable& table);
56 
57 private:
58     unsigned int      m_width;
59     unsigned int      m_height;
60     unsigned int      m_size;
61     unsigned int*     m_table;
62     SequencePair*     m_sequences;
63 };
64 
LevenshteinTable()65 template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable()
66     : m_width(256),
67       m_height(256),
68       m_size(m_height* m_width),
69       m_table(new unsigned int[ m_size ]),
70       m_sequences(nullptr)
71 {
72 }
73 
LevenshteinTable(unsigned int width,unsigned int height)74 template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable(unsigned int width, unsigned int height)
75     : m_width(width),
76       m_height(height),
77       m_size(m_width* m_height),
78       m_table(new unsigned int[ m_size ]),
79       m_sequences(0)
80 {
81 }
82 
~LevenshteinTable()83 template<class SequencePair> LevenshteinTable<SequencePair>::~LevenshteinTable()
84 {
85     delete[] m_table;
86     delete m_sequences;
87 }
88 
getContent(unsigned int posX,unsigned int posY)89 template<class SequencePair> int LevenshteinTable<SequencePair>::getContent(unsigned int posX, unsigned int posY) const
90 {
91 //     qCDebug(LIBKOMPAREDIFF2) << "Width = " << m_width << ", height = " << m_height << ", posX = " << posX << ", posY = " << posY;
92     return m_table[ posY * m_width + posX ];
93 }
94 
setContent(unsigned int posX,unsigned int posY,int value)95 template<class SequencePair> int LevenshteinTable<SequencePair>::setContent(unsigned int posX, unsigned int posY, int value)
96 {
97     m_table[ posY * m_width + posX ] = value;
98 
99     return 0;
100 }
101 
setSize(unsigned int width,unsigned int height)102 template<class SequencePair> bool LevenshteinTable<SequencePair>::setSize(unsigned int width, unsigned int height)
103 {
104     // Set a limit of 16.7 million entries, will be about 64 MB of ram, that should be plenty
105     if (((width) * (height)) > (256 * 256 * 256))
106         return false;
107 
108     if (((width) * (height)) > m_size)
109     {
110         delete[] m_table;
111 
112         m_size = width * height;
113         m_table = new unsigned int[ m_size ];
114     }
115 
116     m_width = width;
117     m_height = height;
118 
119     return true;
120 }
121 
dumpLevenshteinTable()122 template<class SequencePair> void LevenshteinTable<SequencePair>::dumpLevenshteinTable()
123 {
124     for (unsigned int i = 0; i < m_height; ++i)
125     {
126         for (unsigned int j = 0; j < m_width; ++j)
127         {
128             std::cout.width(3);
129             std::cout << getContent(j, i);
130         }
131         std::cout << std::endl;
132     }
133 }
134 
createTable(SequencePair * sequences)135 template<class SequencePair> unsigned int LevenshteinTable<SequencePair>::createTable(SequencePair* sequences)
136 {
137     m_sequences = sequences;
138     unsigned int m = m_sequences->lengthFirst();
139     unsigned int n = m_sequences->lengthSecond();
140 
141     if (!setSize(m, n))
142         return 0;
143 
144     unsigned int i;
145     unsigned int j;
146 
147     // initialize first row
148     for (i = 0; i < m; ++i)
149         setContent(i, 0, i);
150     // initialize first column
151     for (j = 0; j < n; ++j)
152         setContent(0, j, j);
153 
154     int cost = 0, north = 0, west = 0, northwest = 0;
155 
156     // Optimization, calculate row wise instead of column wise, wont trash the cache so much with large strings
157     for (j = 1; j < n; ++j)
158     {
159         for (i = 1; i < m; ++i)
160         {
161             if (m_sequences->equal(i, j))
162                 cost = 0;
163             else
164                 cost = SequencePair::allowReplace ? 1 : 2;
165 
166             north     = getContent(i, j - 1) + 1;
167             west      = getContent(i - 1, j) + 1;
168             northwest = getContent(i - 1, j - 1) + cost;
169 
170             setContent(i, j, qMin(north, qMin(west, northwest)));
171         }
172     }
173 
174     return getContent(m - 1, n - 1);
175 }
176 
chooseRoute(int c1,int c2,int c3,int current)177 template<class SequencePair> int LevenshteinTable<SequencePair>::chooseRoute(int c1, int c2, int c3, int current)
178 {
179 //     qCDebug(LIBKOMPAREDIFF2) << "c1 = " << c1 << ", c2 = " << c2 << ", c3 = " << c3;
180     // preference order: c2, c3, c1, hopefully this will work out for me
181     if (c2 <= c1 && c2 <= c3)
182     {
183         if (SequencePair::allowReplace || (c2 == current))
184         {
185             return 1;
186         }
187     }
188 
189     if (c3 <= c2 && c3 <= c1)
190         return 2;
191 
192     return 0;
193 }
194 
createListsOfMarkers()195 template<class SequencePair> void LevenshteinTable<SequencePair>::createListsOfMarkers()
196 {
197 //     qCDebug(LIBKOMPAREDIFF2) << source;
198 //     qCDebug(LIBKOMPAREDIFF2) << destination;
199 //     dumpLevenshteinTable();
200 
201     unsigned int x = m_width - 1;
202     unsigned int y = m_height - 1;
203 
204     unsigned int difference = getContent(x, y);
205 
206     // If the number of differences is more than half the length of the largest string
207     // don't bother to mark the individual changes
208     // Patch based on work by Felix Berger as put as attachment to bug 75794
209     if (!m_sequences->needFineGrainedOutput(difference))
210     {
211         m_sequences->prependFirst(new Marker(Marker::End, x));
212         m_sequences->prependFirst(new Marker(Marker::Start, 0));
213         m_sequences->prependSecond(new Marker(Marker::End, y));
214         m_sequences->prependSecond(new Marker(Marker::Start, 0));
215         return;
216     }
217 
218     Marker* c = nullptr;
219 
220     int n, nw, w, direction, currentValue;
221     while (x > 0 && y > 0)
222     {
223         currentValue = getContent(x, y);
224 
225         n  = getContent(x, y - 1);
226         w  = getContent(x - 1, y);
227         nw = getContent(x - 1, y - 1);
228         direction = chooseRoute(n, nw, w, currentValue);
229 
230         switch (direction)
231         {
232         case 0: // north
233 //             qCDebug(LIBKOMPAREDIFF2) << "Picking north";
234 //             qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
235 
236             if (!m_sequences->markerListSecond().isEmpty())
237                 c = m_sequences->markerListSecond().first();
238             else
239                 c = nullptr;
240 
241             if (c && c->type() == Marker::End)
242             {
243 //                 qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
244                 if (n == currentValue)
245                     m_sequences->prependSecond(new Marker(Marker::Start, y));
246 // else: the change continues, do not do anything
247             }
248             else
249             {
250 //                 qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
251                 if (n < currentValue)
252                     m_sequences->prependSecond(new Marker(Marker::End, y));
253             }
254 
255             --y;
256             if (y == 0) {
257                 m_sequences->prependSecond(new Marker(Marker::Start, 0));
258             }
259             break;
260         case 1: // northwest
261 //             qCDebug(LIBKOMPAREDIFF2) << "Picking northwest";
262 //             qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
263 
264             if (!m_sequences->markerListSecond().isEmpty())
265                 c = m_sequences->markerListSecond().first();
266             else
267                 c = nullptr;
268 
269             if (c && c->type() == Marker::End)
270             {
271 //                 qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
272                 if (nw == currentValue)
273                     m_sequences->prependSecond(new Marker(Marker::Start, y));
274                 // else: the change continues, do not do anything
275             }
276             else
277             {
278 //                 qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
279                 if (nw < currentValue)
280                     m_sequences->prependSecond(new Marker(Marker::End, y));
281             }
282 
283             if (!m_sequences->markerListFirst().isEmpty())
284                 c = m_sequences->markerListFirst().first();
285             else
286                 c = nullptr;
287 
288             if (c && c->type() == Marker::End)
289             {
290 //                 qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
291                 if (nw == currentValue)
292                     m_sequences->prependFirst(new Marker(Marker::Start, x));
293                 // else: the change continues, do not do anything
294             }
295             else
296             {
297 //                 qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
298                 if (nw < currentValue)
299                     m_sequences->prependFirst(new Marker(Marker::End, x));
300             }
301 
302             --y;
303             --x;
304             break;
305         case 2: // west
306 //             qCDebug(LIBKOMPAREDIFF2) << "Picking west";
307 //             qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
308 
309             if (!m_sequences->markerListFirst().isEmpty())
310                 c = m_sequences->markerListFirst().first();
311             else
312                 c = nullptr;
313 
314             if (c && c->type() == Marker::End)
315             {
316 //                 qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
317                 if (w == currentValue)
318                     m_sequences->prependFirst(new Marker(Marker::Start, x));
319                 // else: the change continues, do not do anything
320             }
321             else
322             {
323 //                 qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
324                 if (w < currentValue)
325                     m_sequences->prependFirst(new Marker(Marker::End, x));
326             }
327 
328             --x;
329             if (x == 0) {
330                 m_sequences->prependFirst(new Marker(Marker::Start, 0));
331             }
332             break;
333         }
334     }
335 
336     // When leaving the loop it does not mean both are 0! If not there is still a change at the beginning of the line we missed so adding now.
337     if (x != 0)
338     {
339         m_sequences->prependFirst(new Marker(Marker::End, x));
340         m_sequences->prependFirst(new Marker(Marker::Start, 0));
341     }
342 
343     if (y != 0)
344     {
345         m_sequences->prependSecond(new Marker(Marker::End, y));
346         m_sequences->prependSecond(new Marker(Marker::Start, 0));
347     }
348 
349 //     qCDebug(LIBKOMPAREDIFF2) << "Source string: " << source;
350 
351 //     QStringList list;
352 //     int prevValue = 0;
353 //     MarkerListConstIterator mit = m_sequences->markerListFirst().begin();
354 //     MarkerListConstIterator end = m_sequences->markerListFirst().end();
355 //     for ( ; mit != end; ++mit )
356 //     {
357 //         c = *mit;
358 //         qCDebug(LIBKOMPAREDIFF2) << "Source Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
359 //         list.append( source.mid( prevValue, c->offset() - prevValue ) );
360 //         prevValue = c->offset();
361 //     }
362 //     if ( prevValue < source.length() - 1 )
363 //     {
364 //         list.append( source.mid( prevValue, source.length() - prevValue ) );
365 //     }
366 //     qCDebug(LIBKOMPAREDIFF2) << "Source Resulting stringlist : " << list.join("\n");
367 //
368 //     list.clear();
369 //     prevValue = 0;
370 //
371 //     qCDebug(LIBKOMPAREDIFF2) << "Destination string: " << destination;
372 //     mit = m_sequences->markerListSecond().begin();
373 //     end = m_sequences->markerListSecond().end();
374 //     for ( ; mit != end; ++mit )
375 //     {
376 //         c = *mit;
377 //         qCDebug(LIBKOMPAREDIFF2) << "Destination Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
378 //         list.append( destination.mid( prevValue, c->offset() - prevValue ) );
379 //         prevValue = c->offset();
380 //     }
381 //     if ( prevValue < destination.length() - 1 )
382 //     {
383 //         list.append( destination.mid( prevValue, destination.length() - prevValue ) );
384 //     }
385 //     qCDebug(LIBKOMPAREDIFF2) << "Destination Resulting string : " << list.join("\n");
386 }
387 
388 } // namespace Diff2
389 
390 #endif // LEVENSHTEIN_H
391