1 /* File I/O for GNU DIFF.
2 
3     Modified for KDiff3 by Joachim Eibl <joachim.eibl at gmx.de> 2003, 2004, 2005.
4     The original file was part of GNU DIFF.
5 
6     Part of KDiff3 - Text Diff And Merge Tool
7 
8     SPDX-FileCopyrightText: 1988-2002 Free Software Foundation, Inc.
9     SPDX-FileCopyrightText: 2002-2011 Joachim Eibl, joachim.eibl at gmx.de
10     SPDX-FileCopyrightText: 2018-2020 Michael Reeves reeves.87@gmail.com
11     SPDX-License-Identifier: GPL-2.0-or-later
12 */
13 
14 #include "gnudiff_diff.h"
15 #include <stdlib.h>
16 #include <type_traits>
17 
18 /* Rotate an unsigned value to the left.  */
19 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof(v) * CHAR_BIT - (n)))
20 
21 /* Given a hash value and a new character, return a new hash value.  */
22 #define HASH(h, c) ((c) + ROL(h, 7))
23 
24 /* The type of a hash value.  */
25 typedef size_t hash_value;
26 static_assert(std::is_unsigned<hash_value>::value, "hash_value must be signed.");
27 
28 /* Lines are put into equivalence classes of lines that match in lines_differ.
29    Each equivalence class is represented by one of these structures,
30    but only while the classes are being computed.
31    Afterward, each class is represented by a number.  */
32 struct equivclass {
33     GNULineRef next;   /* Next item in this bucket.  */
34     hash_value hash;   /* Hash of lines in this class.  */
35     const QChar *line; /* A line that fits this class.  */
36     size_t length;     /* That line's length, not counting its newline.  */
37 };
38 
39 /* Hash-table: array of buckets, each being a chain of equivalence classes.
40    buckets[-1] is reserved for incomplete lines.  */
41 static GNULineRef *buckets;
42 
43 /* Number of buckets in the hash table array, not counting buckets[-1].  */
44 static size_t nbuckets;
45 
46 /* Array in which the equivalence classes are allocated.
47    The bucket-chains go through the elements in this array.
48    The number of an equivalence class is its index in this array.  */
49 static equivclass *equivs;
50 
51 /* Index of first free element in the array `equivs'.  */
52 static GNULineRef equivs_index;
53 
54 /* Number of elements allocated in the array `equivs'.  */
55 static GNULineRef equivs_alloc;
56 
57 /* Check for binary files and compare them for exact identity.  */
58 
59 /* Return 1 if BUF contains a non text character.
60    SIZE is the number of characters in BUF.  */
61 
62 #define binary_file_p(buf, size) (memchr(buf, 0, size) != 0)
63 
64 /* Compare two lines (typically one from each input file)
65    according to the command line options.
66    For efficiency, this is invoked only when the lines do not match exactly
67    but an option like -i might cause us to ignore the difference.
68    Return nonzero if the lines differ.  */
69 
lines_differ(const QChar * s1,size_t len1,const QChar * s2,size_t len2)70 bool GnuDiff::lines_differ(const QChar *s1, size_t len1, const QChar *s2, size_t len2)
71 {
72     const QChar *t1 = s1;
73     const QChar *t2 = s2;
74     const QChar *s1end = s1 + len1;
75     const QChar *s2end = s2 + len2;
76 
77     for(;; ++t1, ++t2)
78     {
79         /* Test for exact char equality first, since it's a common case.  */
80         if(t1 != s1end && t2 != s2end && *t1 == *t2)
81             continue;
82         else
83         {
84             while(t1 != s1end &&
85                   ((bIgnoreWhiteSpace && isspace(t1->unicode())) ||
86                    (bIgnoreNumbers && (t1->isDigit() || *t1 == '-' || *t1 == '.'))))
87             {
88                 ++t1;
89             }
90 
91             while(t2 != s2end &&
92                   ((bIgnoreWhiteSpace && isspace(t2->unicode())) ||
93                    (bIgnoreNumbers && (t2->isDigit() || *t2 == '-' || *t2 == '.'))))
94             {
95                 ++t2;
96             }
97 
98             if(t1 != s1end && t2 != s2end)
99             {
100                 if(ignore_case)
101                 { /* Lowercase comparison. */
102                     if(t1->toLower() == t2->toLower())
103                         continue;
104                 }
105                 else if(*t1 == *t2)
106                     continue;
107                 else
108                     return true;
109             }
110             else if(t1 == s1end && t2 == s2end)
111                 return false;
112             else
113                 return true;
114         }
115     }
116     return false;
117 }
118 
119 /* Split the file into lines, simultaneously computing the equivalence
120    class for each line.  */
121 
find_and_hash_each_line(file_data * current)122 void GnuDiff::find_and_hash_each_line(file_data *current)
123 {
124     hash_value h;
125     const QChar *p = current->prefix_end;
126     QChar c;
127     GNULineRef i, *bucket;
128     size_t length;
129 
130     /* Cache often-used quantities in local variables to help the compiler.  */
131     const QChar **linbuf = current->linbuf;
132     GNULineRef alloc_lines = current->alloc_lines;
133     GNULineRef line = 0;
134     GNULineRef linbuf_base = current->linbuf_base;
135     GNULineRef *cureqs = (GNULineRef *)xmalloc(alloc_lines * sizeof(*cureqs));
136     equivclass *eqs = equivs;
137     GNULineRef eqs_index = equivs_index;
138     GNULineRef eqs_alloc = equivs_alloc;
139     const QChar *suffix_begin = current->suffix_begin;
140     const QChar *bufend = current->buffer + current->buffered;
141     bool diff_length_compare_anyway =
142         ignore_white_space != IGNORE_NO_WHITE_SPACE || bIgnoreNumbers;
143     bool same_length_diff_contents_compare_anyway =
144         diff_length_compare_anyway | ignore_case;
145 
146     while(p < suffix_begin)
147     {
148         const QChar *ip = p;
149 
150         h = 0;
151 
152         /* Hash this line until we find a newline or bufend is reached.  */
153         if(ignore_case)
154             switch(ignore_white_space)
155             {
156                 case IGNORE_ALL_SPACE:
157                     while(p < bufend && !Utils::isEndOfLine(c = *p))
158                     {
159                         if(!(isspace(c.unicode()) || (bIgnoreNumbers && (c.isDigit() || c == '-' || c == '.'))))
160                             h = HASH(h, c.toLower().unicode());
161                         ++p;
162                     }
163                     break;
164 
165                 default:
166                     while(p < bufend && !Utils::isEndOfLine(c = *p))
167                     {
168                         h = HASH(h, c.toLower().unicode());
169                         ++p;
170                     }
171                     break;
172             }
173         else
174             switch(ignore_white_space)
175             {
176                 case IGNORE_ALL_SPACE:
177                     while(p < bufend && !Utils::isEndOfLine(c = *p))
178                     {
179                         if(!(isspace(c.unicode()) || (bIgnoreNumbers && (c.isDigit() || c == '-' || c == '.'))))
180                             h = HASH(h, c.unicode());
181                         ++p;
182                     }
183                     break;
184 
185                 default:
186                     while(p < bufend && !Utils::isEndOfLine(c = *p))
187                     {
188                         h = HASH(h, c.unicode());
189                         ++p;
190                     }
191                     break;
192             }
193 
194         bucket = &buckets[h % nbuckets];
195         length = p - ip;
196         ++p;
197 
198         for(i = *bucket;; i = eqs[i].next)
199             if(!i)
200             {
201                 /* Create a new equivalence class in this bucket.  */
202                 i = eqs_index++;
203                 if(i == eqs_alloc)
204                 {
205                     if((GNULineRef)(GNULINEREF_MAX / (2 * sizeof(*eqs))) <= eqs_alloc)
206                         xalloc_die();
207                     eqs_alloc *= 2;
208                     eqs = (equivclass *)xrealloc(eqs, eqs_alloc * sizeof(*eqs));
209                 }
210                 eqs[i].next = *bucket;
211                 eqs[i].hash = h;
212                 eqs[i].line = ip;
213                 eqs[i].length = length;
214                 *bucket = i;
215                 break;
216             }
217             else if(eqs[i].hash == h)
218             {
219                 const QChar *eqline = eqs[i].line;
220 
221                 /* Reuse existing class if lines_differ reports the lines
222                equal.  */
223                 if(eqs[i].length == length)
224                 {
225                     /* Reuse existing equivalence class if the lines are identical.
226            This detects the common case of exact identity
227            faster than lines_differ would.  */
228                     if(memcmp(eqline, ip, length * sizeof(QChar)) == 0)
229                         break;
230                     if(!same_length_diff_contents_compare_anyway)
231                         continue;
232                 }
233                 else if(!diff_length_compare_anyway)
234                     continue;
235 
236                 if(!lines_differ(eqline, eqs[i].length, ip, length))
237                     break;
238             }
239 
240         /* Maybe increase the size of the line table.  */
241         if(line == alloc_lines)
242         {
243             /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
244             if((GNULineRef)(GNULINEREF_MAX / 3) <= alloc_lines || (GNULineRef)(GNULINEREF_MAX / sizeof(*cureqs)) <= 2 * alloc_lines - linbuf_base || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines - linbuf_base)
245                 xalloc_die();
246             alloc_lines = 2 * alloc_lines - linbuf_base;
247             cureqs = (GNULineRef *)xrealloc(cureqs, alloc_lines * sizeof(*cureqs));
248             linbuf += linbuf_base;
249             linbuf = (const QChar **)xrealloc(linbuf,
250                                               (alloc_lines - linbuf_base) * sizeof(ptrdiff_t));
251             linbuf -= linbuf_base;
252         }
253         linbuf[line] = ip;
254         cureqs[line] = i;
255         ++line;
256     }
257 
258     current->buffered_lines = line;
259 
260     for(i = 0;; ++i)
261     {
262         /* Record the line start for lines in the suffix that we care about.
263      Record one more line start than lines,
264      so that we can compute the length of any buffered line.  */
265         if(line == alloc_lines)
266         {
267             /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
268             if((GNULineRef)(GNULINEREF_MAX / 3) <= alloc_lines || (GNULineRef)(GNULINEREF_MAX / sizeof(*cureqs)) <= 2 * alloc_lines - linbuf_base || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines - linbuf_base)
269                 xalloc_die();
270             alloc_lines = 2 * alloc_lines - linbuf_base;
271             linbuf += linbuf_base;
272             linbuf = (const QChar **)xrealloc(linbuf,
273                                               (alloc_lines - linbuf_base) * sizeof(ptrdiff_t));
274             linbuf -= linbuf_base;
275         }
276         linbuf[line] = p;
277 
278         if(p >= bufend)
279             break;
280 
281         if(context <= i && no_diff_means_no_output)
282             break;
283 
284         line++;
285 
286         while(p < bufend && !Utils::isEndOfLine(*p++))
287             continue;
288     }
289 
290     /* Done with cache in local variables.  */
291     current->linbuf = linbuf;
292     current->valid_lines = line;
293     current->alloc_lines = alloc_lines;
294     current->equivs = cureqs;
295     equivs = eqs;
296     equivs_alloc = eqs_alloc;
297     equivs_index = eqs_index;
298 }
299 
300 /* We have found N lines in a buffer of size S; guess the
301    proportionate number of lines that will be found in a buffer of
302    size T.  However, do not guess a number of lines so large that the
303    resulting line table might cause overflow in size calculations.  */
304 GNULineRef
guess_lines(GNULineRef n,size_t s,size_t t)305 GnuDiff::guess_lines(GNULineRef n, size_t s, size_t t)
306 {
307     size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
308     size_t guessed_lines = std::max((size_t)1, t / guessed_bytes_per_line);
309     return (GNULineRef)std::min((GNULineRef)guessed_lines, (GNULineRef)(GNULINEREF_MAX / (2 * sizeof(ptrdiff_t) + 1) - 5)) + 5;
310 }
311 
312 /* Given a vector of two file_data objects, find the identical
313    prefixes and suffixes of each object.  */
314 
find_identical_ends(file_data filevec[])315 void GnuDiff::find_identical_ends(file_data filevec[])
316 {
317     /* Find identical prefix.  */
318     const QChar *p0, *p1, *buffer0, *buffer1;
319     p0 = buffer0 = filevec[0].buffer;
320     p1 = buffer1 = filevec[1].buffer;
321     size_t n0, n1;
322     n0 = filevec[0].buffered;
323     n1 = filevec[1].buffered;
324     const QChar *const pEnd0 = p0 + n0;
325     const QChar *const pEnd1 = p1 + n1;
326 
327     if(p0 == p1)
328         /* The buffers are the same; sentinels won't work.  */
329         p0 = p1 += n1;
330     else
331     {
332         /* Loop until first mismatch, or end. */
333         while(p0 != pEnd0 && p1 != pEnd1 && *p0 == *p1)
334         {
335             p0++;
336             p1++;
337         }
338     }
339 
340     /* Now P0 and P1 point at the first nonmatching characters.  */
341 
342     /* Skip back to last line-beginning in the prefix. */
343     while(p0 != buffer0 && !Utils::isEndOfLine(p0[-1]))
344         p0--, p1--;
345 
346     /* Record the prefix.  */
347     filevec[0].prefix_end = p0;
348     filevec[1].prefix_end = p1;
349 
350     /* Find identical suffix.  */
351 
352     /* P0 and P1 point beyond the last chars not yet compared.  */
353     p0 = buffer0 + n0;
354     p1 = buffer1 + n1;
355 
356     const QChar *end0, *beg0;
357     end0 = p0; /* Addr of last char in file 0.  */
358 
359     /* Get value of P0 at which we should stop scanning backward:
360       this is when either P0 or P1 points just past the last char
361       of the identical prefix.  */
362     beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
363 
364     /* Scan back until chars don't match or we reach that point.  */
365     for(; p0 != beg0; p0--, p1--)
366     {
367         if(*p0 != *p1)
368         {
369             /* Point at the first char of the matching suffix.  */
370             beg0 = p0;
371             break;
372         }
373     }
374 
375     // Go to the next line (skip last line with a difference)
376     if(p0 != end0)
377     {
378         if(*p0 != *p1)
379             ++p0;
380         while(p0 < pEnd0 && !Utils::isEndOfLine(*p0++))
381             continue;
382     }
383 
384     p1 += p0 - beg0;
385 
386     /* Record the suffix.  */
387     filevec[0].suffix_begin = p0;
388     filevec[1].suffix_begin = p1;
389 
390     /* Calculate number of lines of prefix to save.
391 
392      prefix_count == 0 means save the whole prefix;
393      we need this for options like -D that output the whole file,
394      or for enormous contexts (to avoid worrying about arithmetic overflow).
395      We also need it for options like -F that output some preceding line;
396      at least we will need to find the last few lines,
397      but since we don't know how many, it's easiest to find them all.
398 
399      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
400      of the line buffer; they'll be moved to the proper location later.
401      Handle 1 more line than the context says (because we count 1 too many),
402      rounded up to the next power of 2 to speed index computation.  */
403 
404     const QChar **linbuf0, **linbuf1;
405     GNULineRef alloc_lines0, alloc_lines1;
406     GNULineRef buffered_prefix, prefix_count, prefix_mask;
407     GNULineRef middle_guess, suffix_guess;
408     if(no_diff_means_no_output && context < (GNULineRef)(GNULINEREF_MAX / 4) && context < (GNULineRef)(n0))
409     {
410         middle_guess = guess_lines(0, 0, p0 - filevec[0].prefix_end);
411         suffix_guess = guess_lines(0, 0, buffer0 + n0 - p0);
412         for(prefix_count = 1; prefix_count <= context; prefix_count *= 2)
413             continue;
414         alloc_lines0 = (prefix_count + middle_guess + std::min(context, suffix_guess));
415     }
416     else
417     {
418         prefix_count = 0;
419         alloc_lines0 = guess_lines(0, 0, n0);
420     }
421 
422     prefix_mask = prefix_count - 1;
423     GNULineRef lines = 0;
424     linbuf0 = (const QChar **)xmalloc(alloc_lines0 * sizeof(ptrdiff_t));
425     p0 = buffer0;
426 
427     /* If the prefix is needed, find the prefix lines.  */
428     if(!(no_diff_means_no_output && filevec[0].prefix_end == p0 && filevec[1].prefix_end == p1))
429     {
430         end0 = filevec[0].prefix_end;
431         while(p0 != end0)
432         {
433             GNULineRef l = lines++ & prefix_mask;
434             if(l == alloc_lines0)
435             {
436                 if((GNULineRef)(GNULINEREF_MAX / (2 * sizeof(ptrdiff_t))) <= alloc_lines0)
437                     xalloc_die();
438                 alloc_lines0 *= 2;
439                 linbuf0 = (const QChar **)xrealloc(linbuf0, alloc_lines0 * sizeof(ptrdiff_t));
440             }
441             linbuf0[l] = p0;
442             while(p0 < pEnd0 && !Utils::isEndOfLine(*p0++))
443                 continue;
444         }
445     }
446     buffered_prefix = prefix_count && context < lines ? context : lines;
447 
448     /* Allocate line buffer 1.  */
449 
450     middle_guess = guess_lines(lines, p0 - buffer0, p1 - filevec[1].prefix_end);
451     suffix_guess = guess_lines(lines, p0 - buffer0, buffer1 + n1 - p1);
452     alloc_lines1 = buffered_prefix + middle_guess + std::min(context, suffix_guess);
453     if(alloc_lines1 < buffered_prefix || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines1)
454         xalloc_die();
455     linbuf1 = (const QChar **)xmalloc(alloc_lines1 * sizeof(ptrdiff_t));
456 
457     GNULineRef i;
458     if(buffered_prefix != lines)
459     {
460         /* Rotate prefix lines to proper location.  */
461         for(i = 0; i < buffered_prefix; ++i)
462             linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
463         for(i = 0; i < buffered_prefix; ++i)
464             linbuf0[i] = linbuf1[i];
465     }
466 
467     /* Initialize line buffer 1 from line buffer 0.  */
468     for(i = 0; i < buffered_prefix; ++i)
469         linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
470 
471     /* Record the line buffer, adjusted so that
472      linbuf[0] points at the first differing line.  */
473     filevec[0].linbuf = linbuf0 + buffered_prefix;
474     filevec[1].linbuf = linbuf1 + buffered_prefix;
475     filevec[0].linbuf_base = filevec[1].linbuf_base = -buffered_prefix;
476     filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
477     filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
478     filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
479 }
480 
481 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
482    than 2**k.  This table is derived from Chris K. Caldwell's list
483    <https://www.utm.edu/research/primes/lists/2small/>.  */
484 
485 static unsigned char const prime_offset[] =
486     {
487         0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
488         15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
489         11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
490         55, 93, 1, 57, 25};
491 
492 /* Verify that this host's size_t is not too wide for the above table.  */
493 
494 static_assert(sizeof(size_t) * CHAR_BIT <= sizeof prime_offset, "Not enough primes in table");
495 
496 /* Given a vector of two file_data objects, read the file associated
497    with each one, and build the table of equivalence classes.
498    Return nonzero if either file appears to be a binary file.
499    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
500 
read_files(file_data filevec[],bool)501 bool GnuDiff::read_files(file_data filevec[], bool /*pretend_binary*/)
502 {
503     GNULineRef i;
504 
505     find_identical_ends(filevec);
506 
507     equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
508     if((GNULineRef)(GNULINEREF_MAX / sizeof(*equivs)) <= equivs_alloc)
509         xalloc_die();
510     equivs = (equivclass *)xmalloc(equivs_alloc * sizeof(*equivs));
511     /* Equivalence class 0 is permanently safe for lines that were not
512      hashed.  Real equivalence classes start at 1.  */
513     equivs_index = 1;
514 
515     /* Allocate (one plus) a prime number of hash buckets.  Use a prime
516      number between 1/3 and 2/3 of the value of equiv_allocs,
517      approximately.  */
518     for(i = 9; ((GNULineRef)1 << i) < equivs_alloc / 3; ++i)
519         continue;
520     nbuckets = ((GNULineRef)1 << i) - prime_offset[i];
521     if(GNULINEREF_MAX / sizeof(*buckets) <= nbuckets)
522         xalloc_die();
523     buckets = (GNULineRef *)zalloc((nbuckets + 1) * sizeof(*buckets));
524     buckets++;
525 
526     for(i = 0; i < 2; ++i)
527         find_and_hash_each_line(&filevec[i]);
528 
529     filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
530 
531     free(equivs);
532     free(buckets - 1);
533 
534     return false;
535 }
536