1 /* File I/O for GNU DIFF.
2
3 Modified for KDiff3 by Joachim Eibl <joachim.eibl at gmx.de> 2003, 2004, 2005.
4 The original file was part of GNU DIFF.
5
6 Part of KDiff3 - Text Diff And Merge Tool
7
8 SPDX-FileCopyrightText: 1988-2002 Free Software Foundation, Inc.
9 SPDX-FileCopyrightText: 2002-2011 Joachim Eibl, joachim.eibl at gmx.de
10 SPDX-FileCopyrightText: 2018-2020 Michael Reeves reeves.87@gmail.com
11 SPDX-License-Identifier: GPL-2.0-or-later
12 */
13
14 #include "gnudiff_diff.h"
15 #include <stdlib.h>
16 #include <type_traits>
17
18 /* Rotate an unsigned value to the left. */
19 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof(v) * CHAR_BIT - (n)))
20
21 /* Given a hash value and a new character, return a new hash value. */
22 #define HASH(h, c) ((c) + ROL(h, 7))
23
24 /* The type of a hash value. */
25 typedef size_t hash_value;
26 static_assert(std::is_unsigned<hash_value>::value, "hash_value must be signed.");
27
28 /* Lines are put into equivalence classes of lines that match in lines_differ.
29 Each equivalence class is represented by one of these structures,
30 but only while the classes are being computed.
31 Afterward, each class is represented by a number. */
32 struct equivclass {
33 GNULineRef next; /* Next item in this bucket. */
34 hash_value hash; /* Hash of lines in this class. */
35 const QChar *line; /* A line that fits this class. */
36 size_t length; /* That line's length, not counting its newline. */
37 };
38
39 /* Hash-table: array of buckets, each being a chain of equivalence classes.
40 buckets[-1] is reserved for incomplete lines. */
41 static GNULineRef *buckets;
42
43 /* Number of buckets in the hash table array, not counting buckets[-1]. */
44 static size_t nbuckets;
45
46 /* Array in which the equivalence classes are allocated.
47 The bucket-chains go through the elements in this array.
48 The number of an equivalence class is its index in this array. */
49 static equivclass *equivs;
50
51 /* Index of first free element in the array `equivs'. */
52 static GNULineRef equivs_index;
53
54 /* Number of elements allocated in the array `equivs'. */
55 static GNULineRef equivs_alloc;
56
57 /* Check for binary files and compare them for exact identity. */
58
59 /* Return 1 if BUF contains a non text character.
60 SIZE is the number of characters in BUF. */
61
62 #define binary_file_p(buf, size) (memchr(buf, 0, size) != 0)
63
64 /* Compare two lines (typically one from each input file)
65 according to the command line options.
66 For efficiency, this is invoked only when the lines do not match exactly
67 but an option like -i might cause us to ignore the difference.
68 Return nonzero if the lines differ. */
69
lines_differ(const QChar * s1,size_t len1,const QChar * s2,size_t len2)70 bool GnuDiff::lines_differ(const QChar *s1, size_t len1, const QChar *s2, size_t len2)
71 {
72 const QChar *t1 = s1;
73 const QChar *t2 = s2;
74 const QChar *s1end = s1 + len1;
75 const QChar *s2end = s2 + len2;
76
77 for(;; ++t1, ++t2)
78 {
79 /* Test for exact char equality first, since it's a common case. */
80 if(t1 != s1end && t2 != s2end && *t1 == *t2)
81 continue;
82 else
83 {
84 while(t1 != s1end &&
85 ((bIgnoreWhiteSpace && isspace(t1->unicode())) ||
86 (bIgnoreNumbers && (t1->isDigit() || *t1 == '-' || *t1 == '.'))))
87 {
88 ++t1;
89 }
90
91 while(t2 != s2end &&
92 ((bIgnoreWhiteSpace && isspace(t2->unicode())) ||
93 (bIgnoreNumbers && (t2->isDigit() || *t2 == '-' || *t2 == '.'))))
94 {
95 ++t2;
96 }
97
98 if(t1 != s1end && t2 != s2end)
99 {
100 if(ignore_case)
101 { /* Lowercase comparison. */
102 if(t1->toLower() == t2->toLower())
103 continue;
104 }
105 else if(*t1 == *t2)
106 continue;
107 else
108 return true;
109 }
110 else if(t1 == s1end && t2 == s2end)
111 return false;
112 else
113 return true;
114 }
115 }
116 return false;
117 }
118
119 /* Split the file into lines, simultaneously computing the equivalence
120 class for each line. */
121
find_and_hash_each_line(file_data * current)122 void GnuDiff::find_and_hash_each_line(file_data *current)
123 {
124 hash_value h;
125 const QChar *p = current->prefix_end;
126 QChar c;
127 GNULineRef i, *bucket;
128 size_t length;
129
130 /* Cache often-used quantities in local variables to help the compiler. */
131 const QChar **linbuf = current->linbuf;
132 GNULineRef alloc_lines = current->alloc_lines;
133 GNULineRef line = 0;
134 GNULineRef linbuf_base = current->linbuf_base;
135 GNULineRef *cureqs = (GNULineRef *)xmalloc(alloc_lines * sizeof(*cureqs));
136 equivclass *eqs = equivs;
137 GNULineRef eqs_index = equivs_index;
138 GNULineRef eqs_alloc = equivs_alloc;
139 const QChar *suffix_begin = current->suffix_begin;
140 const QChar *bufend = current->buffer + current->buffered;
141 bool diff_length_compare_anyway =
142 ignore_white_space != IGNORE_NO_WHITE_SPACE || bIgnoreNumbers;
143 bool same_length_diff_contents_compare_anyway =
144 diff_length_compare_anyway | ignore_case;
145
146 while(p < suffix_begin)
147 {
148 const QChar *ip = p;
149
150 h = 0;
151
152 /* Hash this line until we find a newline or bufend is reached. */
153 if(ignore_case)
154 switch(ignore_white_space)
155 {
156 case IGNORE_ALL_SPACE:
157 while(p < bufend && !Utils::isEndOfLine(c = *p))
158 {
159 if(!(isspace(c.unicode()) || (bIgnoreNumbers && (c.isDigit() || c == '-' || c == '.'))))
160 h = HASH(h, c.toLower().unicode());
161 ++p;
162 }
163 break;
164
165 default:
166 while(p < bufend && !Utils::isEndOfLine(c = *p))
167 {
168 h = HASH(h, c.toLower().unicode());
169 ++p;
170 }
171 break;
172 }
173 else
174 switch(ignore_white_space)
175 {
176 case IGNORE_ALL_SPACE:
177 while(p < bufend && !Utils::isEndOfLine(c = *p))
178 {
179 if(!(isspace(c.unicode()) || (bIgnoreNumbers && (c.isDigit() || c == '-' || c == '.'))))
180 h = HASH(h, c.unicode());
181 ++p;
182 }
183 break;
184
185 default:
186 while(p < bufend && !Utils::isEndOfLine(c = *p))
187 {
188 h = HASH(h, c.unicode());
189 ++p;
190 }
191 break;
192 }
193
194 bucket = &buckets[h % nbuckets];
195 length = p - ip;
196 ++p;
197
198 for(i = *bucket;; i = eqs[i].next)
199 if(!i)
200 {
201 /* Create a new equivalence class in this bucket. */
202 i = eqs_index++;
203 if(i == eqs_alloc)
204 {
205 if((GNULineRef)(GNULINEREF_MAX / (2 * sizeof(*eqs))) <= eqs_alloc)
206 xalloc_die();
207 eqs_alloc *= 2;
208 eqs = (equivclass *)xrealloc(eqs, eqs_alloc * sizeof(*eqs));
209 }
210 eqs[i].next = *bucket;
211 eqs[i].hash = h;
212 eqs[i].line = ip;
213 eqs[i].length = length;
214 *bucket = i;
215 break;
216 }
217 else if(eqs[i].hash == h)
218 {
219 const QChar *eqline = eqs[i].line;
220
221 /* Reuse existing class if lines_differ reports the lines
222 equal. */
223 if(eqs[i].length == length)
224 {
225 /* Reuse existing equivalence class if the lines are identical.
226 This detects the common case of exact identity
227 faster than lines_differ would. */
228 if(memcmp(eqline, ip, length * sizeof(QChar)) == 0)
229 break;
230 if(!same_length_diff_contents_compare_anyway)
231 continue;
232 }
233 else if(!diff_length_compare_anyway)
234 continue;
235
236 if(!lines_differ(eqline, eqs[i].length, ip, length))
237 break;
238 }
239
240 /* Maybe increase the size of the line table. */
241 if(line == alloc_lines)
242 {
243 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
244 if((GNULineRef)(GNULINEREF_MAX / 3) <= alloc_lines || (GNULineRef)(GNULINEREF_MAX / sizeof(*cureqs)) <= 2 * alloc_lines - linbuf_base || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines - linbuf_base)
245 xalloc_die();
246 alloc_lines = 2 * alloc_lines - linbuf_base;
247 cureqs = (GNULineRef *)xrealloc(cureqs, alloc_lines * sizeof(*cureqs));
248 linbuf += linbuf_base;
249 linbuf = (const QChar **)xrealloc(linbuf,
250 (alloc_lines - linbuf_base) * sizeof(ptrdiff_t));
251 linbuf -= linbuf_base;
252 }
253 linbuf[line] = ip;
254 cureqs[line] = i;
255 ++line;
256 }
257
258 current->buffered_lines = line;
259
260 for(i = 0;; ++i)
261 {
262 /* Record the line start for lines in the suffix that we care about.
263 Record one more line start than lines,
264 so that we can compute the length of any buffered line. */
265 if(line == alloc_lines)
266 {
267 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
268 if((GNULineRef)(GNULINEREF_MAX / 3) <= alloc_lines || (GNULineRef)(GNULINEREF_MAX / sizeof(*cureqs)) <= 2 * alloc_lines - linbuf_base || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines - linbuf_base)
269 xalloc_die();
270 alloc_lines = 2 * alloc_lines - linbuf_base;
271 linbuf += linbuf_base;
272 linbuf = (const QChar **)xrealloc(linbuf,
273 (alloc_lines - linbuf_base) * sizeof(ptrdiff_t));
274 linbuf -= linbuf_base;
275 }
276 linbuf[line] = p;
277
278 if(p >= bufend)
279 break;
280
281 if(context <= i && no_diff_means_no_output)
282 break;
283
284 line++;
285
286 while(p < bufend && !Utils::isEndOfLine(*p++))
287 continue;
288 }
289
290 /* Done with cache in local variables. */
291 current->linbuf = linbuf;
292 current->valid_lines = line;
293 current->alloc_lines = alloc_lines;
294 current->equivs = cureqs;
295 equivs = eqs;
296 equivs_alloc = eqs_alloc;
297 equivs_index = eqs_index;
298 }
299
300 /* We have found N lines in a buffer of size S; guess the
301 proportionate number of lines that will be found in a buffer of
302 size T. However, do not guess a number of lines so large that the
303 resulting line table might cause overflow in size calculations. */
304 GNULineRef
guess_lines(GNULineRef n,size_t s,size_t t)305 GnuDiff::guess_lines(GNULineRef n, size_t s, size_t t)
306 {
307 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
308 size_t guessed_lines = std::max((size_t)1, t / guessed_bytes_per_line);
309 return (GNULineRef)std::min((GNULineRef)guessed_lines, (GNULineRef)(GNULINEREF_MAX / (2 * sizeof(ptrdiff_t) + 1) - 5)) + 5;
310 }
311
312 /* Given a vector of two file_data objects, find the identical
313 prefixes and suffixes of each object. */
314
find_identical_ends(file_data filevec[])315 void GnuDiff::find_identical_ends(file_data filevec[])
316 {
317 /* Find identical prefix. */
318 const QChar *p0, *p1, *buffer0, *buffer1;
319 p0 = buffer0 = filevec[0].buffer;
320 p1 = buffer1 = filevec[1].buffer;
321 size_t n0, n1;
322 n0 = filevec[0].buffered;
323 n1 = filevec[1].buffered;
324 const QChar *const pEnd0 = p0 + n0;
325 const QChar *const pEnd1 = p1 + n1;
326
327 if(p0 == p1)
328 /* The buffers are the same; sentinels won't work. */
329 p0 = p1 += n1;
330 else
331 {
332 /* Loop until first mismatch, or end. */
333 while(p0 != pEnd0 && p1 != pEnd1 && *p0 == *p1)
334 {
335 p0++;
336 p1++;
337 }
338 }
339
340 /* Now P0 and P1 point at the first nonmatching characters. */
341
342 /* Skip back to last line-beginning in the prefix. */
343 while(p0 != buffer0 && !Utils::isEndOfLine(p0[-1]))
344 p0--, p1--;
345
346 /* Record the prefix. */
347 filevec[0].prefix_end = p0;
348 filevec[1].prefix_end = p1;
349
350 /* Find identical suffix. */
351
352 /* P0 and P1 point beyond the last chars not yet compared. */
353 p0 = buffer0 + n0;
354 p1 = buffer1 + n1;
355
356 const QChar *end0, *beg0;
357 end0 = p0; /* Addr of last char in file 0. */
358
359 /* Get value of P0 at which we should stop scanning backward:
360 this is when either P0 or P1 points just past the last char
361 of the identical prefix. */
362 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
363
364 /* Scan back until chars don't match or we reach that point. */
365 for(; p0 != beg0; p0--, p1--)
366 {
367 if(*p0 != *p1)
368 {
369 /* Point at the first char of the matching suffix. */
370 beg0 = p0;
371 break;
372 }
373 }
374
375 // Go to the next line (skip last line with a difference)
376 if(p0 != end0)
377 {
378 if(*p0 != *p1)
379 ++p0;
380 while(p0 < pEnd0 && !Utils::isEndOfLine(*p0++))
381 continue;
382 }
383
384 p1 += p0 - beg0;
385
386 /* Record the suffix. */
387 filevec[0].suffix_begin = p0;
388 filevec[1].suffix_begin = p1;
389
390 /* Calculate number of lines of prefix to save.
391
392 prefix_count == 0 means save the whole prefix;
393 we need this for options like -D that output the whole file,
394 or for enormous contexts (to avoid worrying about arithmetic overflow).
395 We also need it for options like -F that output some preceding line;
396 at least we will need to find the last few lines,
397 but since we don't know how many, it's easiest to find them all.
398
399 Otherwise, prefix_count != 0. Save just prefix_count lines at start
400 of the line buffer; they'll be moved to the proper location later.
401 Handle 1 more line than the context says (because we count 1 too many),
402 rounded up to the next power of 2 to speed index computation. */
403
404 const QChar **linbuf0, **linbuf1;
405 GNULineRef alloc_lines0, alloc_lines1;
406 GNULineRef buffered_prefix, prefix_count, prefix_mask;
407 GNULineRef middle_guess, suffix_guess;
408 if(no_diff_means_no_output && context < (GNULineRef)(GNULINEREF_MAX / 4) && context < (GNULineRef)(n0))
409 {
410 middle_guess = guess_lines(0, 0, p0 - filevec[0].prefix_end);
411 suffix_guess = guess_lines(0, 0, buffer0 + n0 - p0);
412 for(prefix_count = 1; prefix_count <= context; prefix_count *= 2)
413 continue;
414 alloc_lines0 = (prefix_count + middle_guess + std::min(context, suffix_guess));
415 }
416 else
417 {
418 prefix_count = 0;
419 alloc_lines0 = guess_lines(0, 0, n0);
420 }
421
422 prefix_mask = prefix_count - 1;
423 GNULineRef lines = 0;
424 linbuf0 = (const QChar **)xmalloc(alloc_lines0 * sizeof(ptrdiff_t));
425 p0 = buffer0;
426
427 /* If the prefix is needed, find the prefix lines. */
428 if(!(no_diff_means_no_output && filevec[0].prefix_end == p0 && filevec[1].prefix_end == p1))
429 {
430 end0 = filevec[0].prefix_end;
431 while(p0 != end0)
432 {
433 GNULineRef l = lines++ & prefix_mask;
434 if(l == alloc_lines0)
435 {
436 if((GNULineRef)(GNULINEREF_MAX / (2 * sizeof(ptrdiff_t))) <= alloc_lines0)
437 xalloc_die();
438 alloc_lines0 *= 2;
439 linbuf0 = (const QChar **)xrealloc(linbuf0, alloc_lines0 * sizeof(ptrdiff_t));
440 }
441 linbuf0[l] = p0;
442 while(p0 < pEnd0 && !Utils::isEndOfLine(*p0++))
443 continue;
444 }
445 }
446 buffered_prefix = prefix_count && context < lines ? context : lines;
447
448 /* Allocate line buffer 1. */
449
450 middle_guess = guess_lines(lines, p0 - buffer0, p1 - filevec[1].prefix_end);
451 suffix_guess = guess_lines(lines, p0 - buffer0, buffer1 + n1 - p1);
452 alloc_lines1 = buffered_prefix + middle_guess + std::min(context, suffix_guess);
453 if(alloc_lines1 < buffered_prefix || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines1)
454 xalloc_die();
455 linbuf1 = (const QChar **)xmalloc(alloc_lines1 * sizeof(ptrdiff_t));
456
457 GNULineRef i;
458 if(buffered_prefix != lines)
459 {
460 /* Rotate prefix lines to proper location. */
461 for(i = 0; i < buffered_prefix; ++i)
462 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
463 for(i = 0; i < buffered_prefix; ++i)
464 linbuf0[i] = linbuf1[i];
465 }
466
467 /* Initialize line buffer 1 from line buffer 0. */
468 for(i = 0; i < buffered_prefix; ++i)
469 linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
470
471 /* Record the line buffer, adjusted so that
472 linbuf[0] points at the first differing line. */
473 filevec[0].linbuf = linbuf0 + buffered_prefix;
474 filevec[1].linbuf = linbuf1 + buffered_prefix;
475 filevec[0].linbuf_base = filevec[1].linbuf_base = -buffered_prefix;
476 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
477 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
478 filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
479 }
480
481 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
482 than 2**k. This table is derived from Chris K. Caldwell's list
483 <https://www.utm.edu/research/primes/lists/2small/>. */
484
485 static unsigned char const prime_offset[] =
486 {
487 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
488 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
489 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
490 55, 93, 1, 57, 25};
491
492 /* Verify that this host's size_t is not too wide for the above table. */
493
494 static_assert(sizeof(size_t) * CHAR_BIT <= sizeof prime_offset, "Not enough primes in table");
495
496 /* Given a vector of two file_data objects, read the file associated
497 with each one, and build the table of equivalence classes.
498 Return nonzero if either file appears to be a binary file.
499 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
500
read_files(file_data filevec[],bool)501 bool GnuDiff::read_files(file_data filevec[], bool /*pretend_binary*/)
502 {
503 GNULineRef i;
504
505 find_identical_ends(filevec);
506
507 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
508 if((GNULineRef)(GNULINEREF_MAX / sizeof(*equivs)) <= equivs_alloc)
509 xalloc_die();
510 equivs = (equivclass *)xmalloc(equivs_alloc * sizeof(*equivs));
511 /* Equivalence class 0 is permanently safe for lines that were not
512 hashed. Real equivalence classes start at 1. */
513 equivs_index = 1;
514
515 /* Allocate (one plus) a prime number of hash buckets. Use a prime
516 number between 1/3 and 2/3 of the value of equiv_allocs,
517 approximately. */
518 for(i = 9; ((GNULineRef)1 << i) < equivs_alloc / 3; ++i)
519 continue;
520 nbuckets = ((GNULineRef)1 << i) - prime_offset[i];
521 if(GNULINEREF_MAX / sizeof(*buckets) <= nbuckets)
522 xalloc_die();
523 buckets = (GNULineRef *)zalloc((nbuckets + 1) * sizeof(*buckets));
524 buckets++;
525
526 for(i = 0; i < 2; ++i)
527 find_and_hash_each_line(&filevec[i]);
528
529 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
530
531 free(equivs);
532 free(buckets - 1);
533
534 return false;
535 }
536