1*63eb84d1Schristos /* Functions to make fuzzy comparisons between strings
2*63eb84d1Schristos Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc.
3*63eb84d1Schristos
4*63eb84d1Schristos This program is free software; you can redistribute it and/or modify
5*63eb84d1Schristos it under the terms of the GNU General Public License as published by
6*63eb84d1Schristos the Free Software Foundation; either version 2 of the License, or (at
7*63eb84d1Schristos your option) any later version.
8*63eb84d1Schristos
9*63eb84d1Schristos This program is distributed in the hope that it will be useful, but
10*63eb84d1Schristos WITHOUT ANY WARRANTY; without even the implied warranty of
11*63eb84d1Schristos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12*63eb84d1Schristos General Public License for more details.
13*63eb84d1Schristos
14*63eb84d1Schristos You should have received a copy of the GNU General Public License
15*63eb84d1Schristos along with this program; if not, write to the Free Software
16*63eb84d1Schristos Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17*63eb84d1Schristos
18*63eb84d1Schristos
19*63eb84d1Schristos Derived from GNU diff 2.7, analyze.c et al.
20*63eb84d1Schristos
21*63eb84d1Schristos The basic idea is to consider two strings as similar if, when
22*63eb84d1Schristos transforming the first string into the second string through a
23*63eb84d1Schristos sequence of edits (inserts and deletes of one character each),
24*63eb84d1Schristos this sequence is short - or equivalently, if the ordered list
25*63eb84d1Schristos of characters that are untouched by these edits is long. For a
26*63eb84d1Schristos good introduction to the subject, read about the "Levenshtein
27*63eb84d1Schristos distance" in Wikipedia.
28*63eb84d1Schristos
29*63eb84d1Schristos The basic algorithm is described in:
30*63eb84d1Schristos "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
31*63eb84d1Schristos Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
32*63eb84d1Schristos see especially section 4.2, which describes the variation used below.
33*63eb84d1Schristos
34*63eb84d1Schristos The basic algorithm was independently discovered as described in:
35*63eb84d1Schristos "Algorithms for Approximate String Matching", E. Ukkonen,
36*63eb84d1Schristos Information and Control Vol. 64, 1985, pp. 100-118.
37*63eb84d1Schristos
38*63eb84d1Schristos Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
39*63eb84d1Schristos heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
40*63eb84d1Schristos at the price of producing suboptimal output for large inputs with
41*63eb84d1Schristos many differences.
42*63eb84d1Schristos
43*63eb84d1Schristos Modified to work on strings rather than files
44*63eb84d1Schristos by Peter Miller <pmiller@agso.gov.au>, October 1995 */
45*63eb84d1Schristos
46*63eb84d1Schristos #include <config.h>
47*63eb84d1Schristos
48*63eb84d1Schristos /* Specification. */
49*63eb84d1Schristos #include "fstrcmp.h"
50*63eb84d1Schristos
51*63eb84d1Schristos #include <string.h>
52*63eb84d1Schristos #include <stdio.h>
53*63eb84d1Schristos #include <stdlib.h>
54*63eb84d1Schristos #include <limits.h>
55*63eb84d1Schristos
56*63eb84d1Schristos #include "lock.h"
57*63eb84d1Schristos #include "tls.h"
58*63eb84d1Schristos #include "xalloc.h"
59*63eb84d1Schristos
60*63eb84d1Schristos #ifndef uintptr_t
61*63eb84d1Schristos # define uintptr_t unsigned long
62*63eb84d1Schristos #endif
63*63eb84d1Schristos
64*63eb84d1Schristos
65*63eb84d1Schristos /*
66*63eb84d1Schristos * Context of comparison operation.
67*63eb84d1Schristos */
68*63eb84d1Schristos struct context
69*63eb84d1Schristos {
70*63eb84d1Schristos /*
71*63eb84d1Schristos * Data on one input string being compared.
72*63eb84d1Schristos */
73*63eb84d1Schristos struct string_data
74*63eb84d1Schristos {
75*63eb84d1Schristos /* The string to be compared. */
76*63eb84d1Schristos const char *data;
77*63eb84d1Schristos
78*63eb84d1Schristos /* The length of the string to be compared. */
79*63eb84d1Schristos int data_length;
80*63eb84d1Schristos
81*63eb84d1Schristos /* The number of characters inserted or deleted. */
82*63eb84d1Schristos int edit_count;
83*63eb84d1Schristos }
84*63eb84d1Schristos string[2];
85*63eb84d1Schristos
86*63eb84d1Schristos #ifdef MINUS_H_FLAG
87*63eb84d1Schristos
88*63eb84d1Schristos /* This corresponds to the diff -H flag. With this heuristic, for
89*63eb84d1Schristos strings with a constant small density of changes, the algorithm is
90*63eb84d1Schristos linear in the strings size. This is unlikely in typical uses of
91*63eb84d1Schristos fstrcmp, and so is usually compiled out. Besides, there is no
92*63eb84d1Schristos interface to set it true. */
93*63eb84d1Schristos int heuristic;
94*63eb84d1Schristos
95*63eb84d1Schristos #endif
96*63eb84d1Schristos
97*63eb84d1Schristos /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
98*63eb84d1Schristos point furthest along the given diagonal in the forward search of the
99*63eb84d1Schristos edit matrix. */
100*63eb84d1Schristos int *fdiag;
101*63eb84d1Schristos
102*63eb84d1Schristos /* Vector, indexed by diagonal, containing the X coordinate of the point
103*63eb84d1Schristos furthest along the given diagonal in the backward search of the edit
104*63eb84d1Schristos matrix. */
105*63eb84d1Schristos int *bdiag;
106*63eb84d1Schristos
107*63eb84d1Schristos /* Edit scripts longer than this are too expensive to compute. */
108*63eb84d1Schristos int too_expensive;
109*63eb84d1Schristos
110*63eb84d1Schristos /* Snakes bigger than this are considered `big'. */
111*63eb84d1Schristos #define SNAKE_LIMIT 20
112*63eb84d1Schristos };
113*63eb84d1Schristos
114*63eb84d1Schristos struct partition
115*63eb84d1Schristos {
116*63eb84d1Schristos /* Midpoints of this partition. */
117*63eb84d1Schristos int xmid, ymid;
118*63eb84d1Schristos
119*63eb84d1Schristos /* Nonzero if low half will be analyzed minimally. */
120*63eb84d1Schristos int lo_minimal;
121*63eb84d1Schristos
122*63eb84d1Schristos /* Likewise for high half. */
123*63eb84d1Schristos int hi_minimal;
124*63eb84d1Schristos };
125*63eb84d1Schristos
126*63eb84d1Schristos
127*63eb84d1Schristos /* NAME
128*63eb84d1Schristos diag - find diagonal path
129*63eb84d1Schristos
130*63eb84d1Schristos SYNOPSIS
131*63eb84d1Schristos int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
132*63eb84d1Schristos struct partition *part, struct context *ctxt);
133*63eb84d1Schristos
134*63eb84d1Schristos DESCRIPTION
135*63eb84d1Schristos Find the midpoint of the shortest edit script for a specified
136*63eb84d1Schristos portion of the two strings.
137*63eb84d1Schristos
138*63eb84d1Schristos Scan from the beginnings of the strings, and simultaneously from
139*63eb84d1Schristos the ends, doing a breadth-first search through the space of
140*63eb84d1Schristos edit-sequence. When the two searches meet, we have found the
141*63eb84d1Schristos midpoint of the shortest edit sequence.
142*63eb84d1Schristos
143*63eb84d1Schristos If MINIMAL is nonzero, find the minimal edit script regardless
144*63eb84d1Schristos of expense. Otherwise, if the search is too expensive, use
145*63eb84d1Schristos heuristics to stop the search and report a suboptimal answer.
146*63eb84d1Schristos
147*63eb84d1Schristos RETURNS
148*63eb84d1Schristos Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
149*63eb84d1Schristos number XMID - YMID equals the number of inserted characters
150*63eb84d1Schristos minus the number of deleted characters (counting only characters
151*63eb84d1Schristos before the midpoint). Return the approximate edit cost; this is
152*63eb84d1Schristos the total number of characters inserted or deleted (counting
153*63eb84d1Schristos only characters before the midpoint), unless a heuristic is used
154*63eb84d1Schristos to terminate the search prematurely.
155*63eb84d1Schristos
156*63eb84d1Schristos Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
157*63eb84d1Schristos for the left half of the partition is known; similarly for
158*63eb84d1Schristos PART->RIGHT_MINIMAL.
159*63eb84d1Schristos
160*63eb84d1Schristos CAVEAT
161*63eb84d1Schristos This function assumes that the first characters of the specified
162*63eb84d1Schristos portions of the two strings do not match, and likewise that the
163*63eb84d1Schristos last characters do not match. The caller must trim matching
164*63eb84d1Schristos characters from the beginning and end of the portions it is
165*63eb84d1Schristos going to specify.
166*63eb84d1Schristos
167*63eb84d1Schristos If we return the "wrong" partitions, the worst this can do is
168*63eb84d1Schristos cause suboptimal diff output. It cannot cause incorrect diff
169*63eb84d1Schristos output. */
170*63eb84d1Schristos
171*63eb84d1Schristos static int
diag(int xoff,int xlim,int yoff,int ylim,int minimal,struct partition * part,struct context * ctxt)172*63eb84d1Schristos diag (int xoff, int xlim, int yoff, int ylim, int minimal,
173*63eb84d1Schristos struct partition *part, struct context *ctxt)
174*63eb84d1Schristos {
175*63eb84d1Schristos int *const fd = ctxt->fdiag; /* Give the compiler a chance. */
176*63eb84d1Schristos int *const bd = ctxt->bdiag; /* Additional help for the compiler. */
177*63eb84d1Schristos const char *const xv = ctxt->string[0].data; /* Still more help for the compiler. */
178*63eb84d1Schristos const char *const yv = ctxt->string[1].data; /* And more and more . . . */
179*63eb84d1Schristos const int dmin = xoff - ylim; /* Minimum valid diagonal. */
180*63eb84d1Schristos const int dmax = xlim - yoff; /* Maximum valid diagonal. */
181*63eb84d1Schristos const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
182*63eb84d1Schristos const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
183*63eb84d1Schristos int fmin = fmid;
184*63eb84d1Schristos int fmax = fmid; /* Limits of top-down search. */
185*63eb84d1Schristos int bmin = bmid;
186*63eb84d1Schristos int bmax = bmid; /* Limits of bottom-up search. */
187*63eb84d1Schristos int c; /* Cost. */
188*63eb84d1Schristos int odd = (fmid - bmid) & 1;
189*63eb84d1Schristos
190*63eb84d1Schristos /*
191*63eb84d1Schristos * True if southeast corner is on an odd diagonal with respect
192*63eb84d1Schristos * to the northwest.
193*63eb84d1Schristos */
194*63eb84d1Schristos fd[fmid] = xoff;
195*63eb84d1Schristos bd[bmid] = xlim;
196*63eb84d1Schristos for (c = 1;; ++c)
197*63eb84d1Schristos {
198*63eb84d1Schristos int d; /* Active diagonal. */
199*63eb84d1Schristos int big_snake;
200*63eb84d1Schristos
201*63eb84d1Schristos big_snake = 0;
202*63eb84d1Schristos /* Extend the top-down search by an edit step in each diagonal. */
203*63eb84d1Schristos if (fmin > dmin)
204*63eb84d1Schristos fd[--fmin - 1] = -1;
205*63eb84d1Schristos else
206*63eb84d1Schristos ++fmin;
207*63eb84d1Schristos if (fmax < dmax)
208*63eb84d1Schristos fd[++fmax + 1] = -1;
209*63eb84d1Schristos else
210*63eb84d1Schristos --fmax;
211*63eb84d1Schristos for (d = fmax; d >= fmin; d -= 2)
212*63eb84d1Schristos {
213*63eb84d1Schristos int x;
214*63eb84d1Schristos int y;
215*63eb84d1Schristos int oldx;
216*63eb84d1Schristos int tlo;
217*63eb84d1Schristos int thi;
218*63eb84d1Schristos
219*63eb84d1Schristos tlo = fd[d - 1],
220*63eb84d1Schristos thi = fd[d + 1];
221*63eb84d1Schristos
222*63eb84d1Schristos if (tlo >= thi)
223*63eb84d1Schristos x = tlo + 1;
224*63eb84d1Schristos else
225*63eb84d1Schristos x = thi;
226*63eb84d1Schristos oldx = x;
227*63eb84d1Schristos y = x - d;
228*63eb84d1Schristos while (x < xlim && y < ylim && xv[x] == yv[y])
229*63eb84d1Schristos {
230*63eb84d1Schristos ++x;
231*63eb84d1Schristos ++y;
232*63eb84d1Schristos }
233*63eb84d1Schristos if (x - oldx > SNAKE_LIMIT)
234*63eb84d1Schristos big_snake = 1;
235*63eb84d1Schristos fd[d] = x;
236*63eb84d1Schristos if (odd && bmin <= d && d <= bmax && bd[d] <= x)
237*63eb84d1Schristos {
238*63eb84d1Schristos part->xmid = x;
239*63eb84d1Schristos part->ymid = y;
240*63eb84d1Schristos part->lo_minimal = part->hi_minimal = 1;
241*63eb84d1Schristos return 2 * c - 1;
242*63eb84d1Schristos }
243*63eb84d1Schristos }
244*63eb84d1Schristos /* Similarly extend the bottom-up search. */
245*63eb84d1Schristos if (bmin > dmin)
246*63eb84d1Schristos bd[--bmin - 1] = INT_MAX;
247*63eb84d1Schristos else
248*63eb84d1Schristos ++bmin;
249*63eb84d1Schristos if (bmax < dmax)
250*63eb84d1Schristos bd[++bmax + 1] = INT_MAX;
251*63eb84d1Schristos else
252*63eb84d1Schristos --bmax;
253*63eb84d1Schristos for (d = bmax; d >= bmin; d -= 2)
254*63eb84d1Schristos {
255*63eb84d1Schristos int x;
256*63eb84d1Schristos int y;
257*63eb84d1Schristos int oldx;
258*63eb84d1Schristos int tlo;
259*63eb84d1Schristos int thi;
260*63eb84d1Schristos
261*63eb84d1Schristos tlo = bd[d - 1],
262*63eb84d1Schristos thi = bd[d + 1];
263*63eb84d1Schristos if (tlo < thi)
264*63eb84d1Schristos x = tlo;
265*63eb84d1Schristos else
266*63eb84d1Schristos x = thi - 1;
267*63eb84d1Schristos oldx = x;
268*63eb84d1Schristos y = x - d;
269*63eb84d1Schristos while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
270*63eb84d1Schristos {
271*63eb84d1Schristos --x;
272*63eb84d1Schristos --y;
273*63eb84d1Schristos }
274*63eb84d1Schristos if (oldx - x > SNAKE_LIMIT)
275*63eb84d1Schristos big_snake = 1;
276*63eb84d1Schristos bd[d] = x;
277*63eb84d1Schristos if (!odd && fmin <= d && d <= fmax && x <= fd[d])
278*63eb84d1Schristos {
279*63eb84d1Schristos part->xmid = x;
280*63eb84d1Schristos part->ymid = y;
281*63eb84d1Schristos part->lo_minimal = part->hi_minimal = 1;
282*63eb84d1Schristos return 2 * c;
283*63eb84d1Schristos }
284*63eb84d1Schristos }
285*63eb84d1Schristos
286*63eb84d1Schristos if (minimal)
287*63eb84d1Schristos continue;
288*63eb84d1Schristos
289*63eb84d1Schristos #ifdef MINUS_H_FLAG
290*63eb84d1Schristos /* Heuristic: check occasionally for a diagonal that has made lots
291*63eb84d1Schristos of progress compared with the edit distance. If we have any
292*63eb84d1Schristos such, find the one that has made the most progress and return
293*63eb84d1Schristos it as if it had succeeded.
294*63eb84d1Schristos
295*63eb84d1Schristos With this heuristic, for strings with a constant small density
296*63eb84d1Schristos of changes, the algorithm is linear in the strings size. */
297*63eb84d1Schristos if (c > 200 && big_snake && ctxt->heuristic)
298*63eb84d1Schristos {
299*63eb84d1Schristos int best;
300*63eb84d1Schristos
301*63eb84d1Schristos best = 0;
302*63eb84d1Schristos for (d = fmax; d >= fmin; d -= 2)
303*63eb84d1Schristos {
304*63eb84d1Schristos int dd;
305*63eb84d1Schristos int x;
306*63eb84d1Schristos int y;
307*63eb84d1Schristos int v;
308*63eb84d1Schristos
309*63eb84d1Schristos dd = d - fmid;
310*63eb84d1Schristos x = fd[d];
311*63eb84d1Schristos y = x - d;
312*63eb84d1Schristos v = (x - xoff) * 2 - dd;
313*63eb84d1Schristos
314*63eb84d1Schristos if (v > 12 * (c + (dd < 0 ? -dd : dd)))
315*63eb84d1Schristos {
316*63eb84d1Schristos if
317*63eb84d1Schristos (
318*63eb84d1Schristos v > best
319*63eb84d1Schristos &&
320*63eb84d1Schristos xoff + SNAKE_LIMIT <= x
321*63eb84d1Schristos &&
322*63eb84d1Schristos x < xlim
323*63eb84d1Schristos &&
324*63eb84d1Schristos yoff + SNAKE_LIMIT <= y
325*63eb84d1Schristos &&
326*63eb84d1Schristos y < ylim
327*63eb84d1Schristos )
328*63eb84d1Schristos {
329*63eb84d1Schristos /* We have a good enough best diagonal; now insist
330*63eb84d1Schristos that it end with a significant snake. */
331*63eb84d1Schristos int k;
332*63eb84d1Schristos
333*63eb84d1Schristos for (k = 1; xv[x - k] == yv[y - k]; k++)
334*63eb84d1Schristos {
335*63eb84d1Schristos if (k == SNAKE_LIMIT)
336*63eb84d1Schristos {
337*63eb84d1Schristos best = v;
338*63eb84d1Schristos part->xmid = x;
339*63eb84d1Schristos part->ymid = y;
340*63eb84d1Schristos break;
341*63eb84d1Schristos }
342*63eb84d1Schristos }
343*63eb84d1Schristos }
344*63eb84d1Schristos }
345*63eb84d1Schristos }
346*63eb84d1Schristos if (best > 0)
347*63eb84d1Schristos {
348*63eb84d1Schristos part->lo_minimal = 1;
349*63eb84d1Schristos part->hi_minimal = 0;
350*63eb84d1Schristos return 2 * c - 1;
351*63eb84d1Schristos }
352*63eb84d1Schristos best = 0;
353*63eb84d1Schristos for (d = bmax; d >= bmin; d -= 2)
354*63eb84d1Schristos {
355*63eb84d1Schristos int dd;
356*63eb84d1Schristos int x;
357*63eb84d1Schristos int y;
358*63eb84d1Schristos int v;
359*63eb84d1Schristos
360*63eb84d1Schristos dd = d - bmid;
361*63eb84d1Schristos x = bd[d];
362*63eb84d1Schristos y = x - d;
363*63eb84d1Schristos v = (xlim - x) * 2 + dd;
364*63eb84d1Schristos
365*63eb84d1Schristos if (v > 12 * (c + (dd < 0 ? -dd : dd)))
366*63eb84d1Schristos {
367*63eb84d1Schristos if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
368*63eb84d1Schristos yoff < y && y <= ylim - SNAKE_LIMIT)
369*63eb84d1Schristos {
370*63eb84d1Schristos /* We have a good enough best diagonal; now insist
371*63eb84d1Schristos that it end with a significant snake. */
372*63eb84d1Schristos int k;
373*63eb84d1Schristos
374*63eb84d1Schristos for (k = 0; xv[x + k] == yv[y + k]; k++)
375*63eb84d1Schristos {
376*63eb84d1Schristos if (k == SNAKE_LIMIT - 1)
377*63eb84d1Schristos {
378*63eb84d1Schristos best = v;
379*63eb84d1Schristos part->xmid = x;
380*63eb84d1Schristos part->ymid = y;
381*63eb84d1Schristos break;
382*63eb84d1Schristos }
383*63eb84d1Schristos }
384*63eb84d1Schristos }
385*63eb84d1Schristos }
386*63eb84d1Schristos }
387*63eb84d1Schristos if (best > 0)
388*63eb84d1Schristos {
389*63eb84d1Schristos part->lo_minimal = 0;
390*63eb84d1Schristos part->hi_minimal = 1;
391*63eb84d1Schristos return 2 * c - 1;
392*63eb84d1Schristos }
393*63eb84d1Schristos }
394*63eb84d1Schristos #endif /* MINUS_H_FLAG */
395*63eb84d1Schristos
396*63eb84d1Schristos /* Heuristic: if we've gone well beyond the call of duty, give up
397*63eb84d1Schristos and report halfway between our best results so far. */
398*63eb84d1Schristos if (c >= ctxt->too_expensive)
399*63eb84d1Schristos {
400*63eb84d1Schristos int fxybest;
401*63eb84d1Schristos int fxbest;
402*63eb84d1Schristos int bxybest;
403*63eb84d1Schristos int bxbest;
404*63eb84d1Schristos
405*63eb84d1Schristos /* Pacify `gcc -Wall'. */
406*63eb84d1Schristos fxbest = 0;
407*63eb84d1Schristos bxbest = 0;
408*63eb84d1Schristos
409*63eb84d1Schristos /* Find forward diagonal that maximizes X + Y. */
410*63eb84d1Schristos fxybest = -1;
411*63eb84d1Schristos for (d = fmax; d >= fmin; d -= 2)
412*63eb84d1Schristos {
413*63eb84d1Schristos int x;
414*63eb84d1Schristos int y;
415*63eb84d1Schristos
416*63eb84d1Schristos x = fd[d] < xlim ? fd[d] : xlim;
417*63eb84d1Schristos y = x - d;
418*63eb84d1Schristos
419*63eb84d1Schristos if (ylim < y)
420*63eb84d1Schristos {
421*63eb84d1Schristos x = ylim + d;
422*63eb84d1Schristos y = ylim;
423*63eb84d1Schristos }
424*63eb84d1Schristos if (fxybest < x + y)
425*63eb84d1Schristos {
426*63eb84d1Schristos fxybest = x + y;
427*63eb84d1Schristos fxbest = x;
428*63eb84d1Schristos }
429*63eb84d1Schristos }
430*63eb84d1Schristos /* Find backward diagonal that minimizes X + Y. */
431*63eb84d1Schristos bxybest = INT_MAX;
432*63eb84d1Schristos for (d = bmax; d >= bmin; d -= 2)
433*63eb84d1Schristos {
434*63eb84d1Schristos int x;
435*63eb84d1Schristos int y;
436*63eb84d1Schristos
437*63eb84d1Schristos x = xoff > bd[d] ? xoff : bd[d];
438*63eb84d1Schristos y = x - d;
439*63eb84d1Schristos
440*63eb84d1Schristos if (y < yoff)
441*63eb84d1Schristos {
442*63eb84d1Schristos x = yoff + d;
443*63eb84d1Schristos y = yoff;
444*63eb84d1Schristos }
445*63eb84d1Schristos if (x + y < bxybest)
446*63eb84d1Schristos {
447*63eb84d1Schristos bxybest = x + y;
448*63eb84d1Schristos bxbest = x;
449*63eb84d1Schristos }
450*63eb84d1Schristos }
451*63eb84d1Schristos /* Use the better of the two diagonals. */
452*63eb84d1Schristos if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
453*63eb84d1Schristos {
454*63eb84d1Schristos part->xmid = fxbest;
455*63eb84d1Schristos part->ymid = fxybest - fxbest;
456*63eb84d1Schristos part->lo_minimal = 1;
457*63eb84d1Schristos part->hi_minimal = 0;
458*63eb84d1Schristos }
459*63eb84d1Schristos else
460*63eb84d1Schristos {
461*63eb84d1Schristos part->xmid = bxbest;
462*63eb84d1Schristos part->ymid = bxybest - bxbest;
463*63eb84d1Schristos part->lo_minimal = 0;
464*63eb84d1Schristos part->hi_minimal = 1;
465*63eb84d1Schristos }
466*63eb84d1Schristos return 2 * c - 1;
467*63eb84d1Schristos }
468*63eb84d1Schristos }
469*63eb84d1Schristos }
470*63eb84d1Schristos
471*63eb84d1Schristos
472*63eb84d1Schristos /* NAME
473*63eb84d1Schristos compareseq - find edit sequence
474*63eb84d1Schristos
475*63eb84d1Schristos SYNOPSIS
476*63eb84d1Schristos void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal,
477*63eb84d1Schristos struct context *ctxt);
478*63eb84d1Schristos
479*63eb84d1Schristos DESCRIPTION
480*63eb84d1Schristos Compare in detail contiguous subsequences of the two strings
481*63eb84d1Schristos which are known, as a whole, to match each other.
482*63eb84d1Schristos
483*63eb84d1Schristos The subsequence of string 0 is [XOFF, XLIM) and likewise for
484*63eb84d1Schristos string 1.
485*63eb84d1Schristos
486*63eb84d1Schristos Note that XLIM, YLIM are exclusive bounds. All character
487*63eb84d1Schristos numbers are origin-0.
488*63eb84d1Schristos
489*63eb84d1Schristos If MINIMAL is nonzero, find a minimal difference no matter how
490*63eb84d1Schristos expensive it is. */
491*63eb84d1Schristos
492*63eb84d1Schristos static void
compareseq(int xoff,int xlim,int yoff,int ylim,int minimal,struct context * ctxt)493*63eb84d1Schristos compareseq (int xoff, int xlim, int yoff, int ylim, int minimal,
494*63eb84d1Schristos struct context *ctxt)
495*63eb84d1Schristos {
496*63eb84d1Schristos const char *const xv = ctxt->string[0].data; /* Help the compiler. */
497*63eb84d1Schristos const char *const yv = ctxt->string[1].data;
498*63eb84d1Schristos
499*63eb84d1Schristos /* Slide down the bottom initial diagonal. */
500*63eb84d1Schristos while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
501*63eb84d1Schristos {
502*63eb84d1Schristos ++xoff;
503*63eb84d1Schristos ++yoff;
504*63eb84d1Schristos }
505*63eb84d1Schristos
506*63eb84d1Schristos /* Slide up the top initial diagonal. */
507*63eb84d1Schristos while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
508*63eb84d1Schristos {
509*63eb84d1Schristos --xlim;
510*63eb84d1Schristos --ylim;
511*63eb84d1Schristos }
512*63eb84d1Schristos
513*63eb84d1Schristos /* Handle simple cases. */
514*63eb84d1Schristos if (xoff == xlim)
515*63eb84d1Schristos {
516*63eb84d1Schristos while (yoff < ylim)
517*63eb84d1Schristos {
518*63eb84d1Schristos ctxt->string[1].edit_count++;
519*63eb84d1Schristos ++yoff;
520*63eb84d1Schristos }
521*63eb84d1Schristos }
522*63eb84d1Schristos else if (yoff == ylim)
523*63eb84d1Schristos {
524*63eb84d1Schristos while (xoff < xlim)
525*63eb84d1Schristos {
526*63eb84d1Schristos ctxt->string[0].edit_count++;
527*63eb84d1Schristos ++xoff;
528*63eb84d1Schristos }
529*63eb84d1Schristos }
530*63eb84d1Schristos else
531*63eb84d1Schristos {
532*63eb84d1Schristos int c;
533*63eb84d1Schristos struct partition part;
534*63eb84d1Schristos
535*63eb84d1Schristos /* Find a point of correspondence in the middle of the strings. */
536*63eb84d1Schristos c = diag (xoff, xlim, yoff, ylim, minimal, &part, ctxt);
537*63eb84d1Schristos if (c == 1)
538*63eb84d1Schristos {
539*63eb84d1Schristos #if 0
540*63eb84d1Schristos /* This should be impossible, because it implies that one of
541*63eb84d1Schristos the two subsequences is empty, and that case was handled
542*63eb84d1Schristos above without calling `diag'. Let's verify that this is
543*63eb84d1Schristos true. */
544*63eb84d1Schristos abort ();
545*63eb84d1Schristos #else
546*63eb84d1Schristos /* The two subsequences differ by a single insert or delete;
547*63eb84d1Schristos record it and we are done. */
548*63eb84d1Schristos if (part.xmid - part.ymid < xoff - yoff)
549*63eb84d1Schristos ctxt->string[1].edit_count++;
550*63eb84d1Schristos else
551*63eb84d1Schristos ctxt->string[0].edit_count++;
552*63eb84d1Schristos #endif
553*63eb84d1Schristos }
554*63eb84d1Schristos else
555*63eb84d1Schristos {
556*63eb84d1Schristos /* Use the partitions to split this problem into subproblems. */
557*63eb84d1Schristos compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt);
558*63eb84d1Schristos compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt);
559*63eb84d1Schristos }
560*63eb84d1Schristos }
561*63eb84d1Schristos }
562*63eb84d1Schristos
563*63eb84d1Schristos
564*63eb84d1Schristos /* Because fstrcmp is typically called multiple times, attempt to minimize
565*63eb84d1Schristos the number of memory allocations performed. Thus, let a call reuse the
566*63eb84d1Schristos memory already allocated by the previous call, if it is sufficient.
567*63eb84d1Schristos To make it multithread-safe, without need for a lock that protects the
568*63eb84d1Schristos already allocated memory, store the allocated memory per thread. Free
569*63eb84d1Schristos it only when the thread exits. */
570*63eb84d1Schristos
571*63eb84d1Schristos static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */
572*63eb84d1Schristos static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */
573*63eb84d1Schristos
574*63eb84d1Schristos static void
keys_init(void)575*63eb84d1Schristos keys_init (void)
576*63eb84d1Schristos {
577*63eb84d1Schristos gl_tls_key_init (buffer_key, free);
578*63eb84d1Schristos gl_tls_key_init (bufmax_key, NULL);
579*63eb84d1Schristos /* The per-thread initial values are NULL and 0, respectively. */
580*63eb84d1Schristos }
581*63eb84d1Schristos
582*63eb84d1Schristos /* Ensure that keys_init is called once only. */
gl_once_define(static,keys_init_once)583*63eb84d1Schristos gl_once_define(static, keys_init_once)
584*63eb84d1Schristos
585*63eb84d1Schristos
586*63eb84d1Schristos /* NAME
587*63eb84d1Schristos fstrcmp - fuzzy string compare
588*63eb84d1Schristos
589*63eb84d1Schristos SYNOPSIS
590*63eb84d1Schristos double fstrcmp(const char *, const char *);
591*63eb84d1Schristos
592*63eb84d1Schristos DESCRIPTION
593*63eb84d1Schristos The fstrcmp function may be used to compare two string for
594*63eb84d1Schristos similarity. It is very useful in reducing "cascade" or
595*63eb84d1Schristos "secondary" errors in compilers or other situations where
596*63eb84d1Schristos symbol tables occur.
597*63eb84d1Schristos
598*63eb84d1Schristos RETURNS
599*63eb84d1Schristos double; 0 if the strings are entirly dissimilar, 1 if the
600*63eb84d1Schristos strings are identical, and a number in between if they are
601*63eb84d1Schristos similar. */
602*63eb84d1Schristos
603*63eb84d1Schristos double
604*63eb84d1Schristos fstrcmp (const char *string1, const char *string2)
605*63eb84d1Schristos {
606*63eb84d1Schristos struct context ctxt;
607*63eb84d1Schristos int i;
608*63eb84d1Schristos
609*63eb84d1Schristos size_t fdiag_len;
610*63eb84d1Schristos int *buffer;
611*63eb84d1Schristos size_t bufmax;
612*63eb84d1Schristos
613*63eb84d1Schristos /* set the info for each string. */
614*63eb84d1Schristos ctxt.string[0].data = string1;
615*63eb84d1Schristos ctxt.string[0].data_length = strlen (string1);
616*63eb84d1Schristos ctxt.string[1].data = string2;
617*63eb84d1Schristos ctxt.string[1].data_length = strlen (string2);
618*63eb84d1Schristos
619*63eb84d1Schristos /* short-circuit obvious comparisons */
620*63eb84d1Schristos if (ctxt.string[0].data_length == 0 && ctxt.string[1].data_length == 0)
621*63eb84d1Schristos return 1.0;
622*63eb84d1Schristos if (ctxt.string[0].data_length == 0 || ctxt.string[1].data_length == 0)
623*63eb84d1Schristos return 0.0;
624*63eb84d1Schristos
625*63eb84d1Schristos /* Set TOO_EXPENSIVE to be approximate square root of input size,
626*63eb84d1Schristos bounded below by 256. */
627*63eb84d1Schristos ctxt.too_expensive = 1;
628*63eb84d1Schristos for (i = ctxt.string[0].data_length + ctxt.string[1].data_length;
629*63eb84d1Schristos i != 0;
630*63eb84d1Schristos i >>= 2)
631*63eb84d1Schristos ctxt.too_expensive <<= 1;
632*63eb84d1Schristos if (ctxt.too_expensive < 256)
633*63eb84d1Schristos ctxt.too_expensive = 256;
634*63eb84d1Schristos
635*63eb84d1Schristos /* Allocate memory for fdiag and bdiag from a thread-local pool. */
636*63eb84d1Schristos fdiag_len = ctxt.string[0].data_length + ctxt.string[1].data_length + 3;
637*63eb84d1Schristos gl_once (keys_init_once, keys_init);
638*63eb84d1Schristos buffer = (int *) gl_tls_get (buffer_key);
639*63eb84d1Schristos bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key);
640*63eb84d1Schristos if (fdiag_len > bufmax)
641*63eb84d1Schristos {
642*63eb84d1Schristos /* Need more memory. */
643*63eb84d1Schristos bufmax = 2 * bufmax;
644*63eb84d1Schristos if (fdiag_len > bufmax)
645*63eb84d1Schristos bufmax = fdiag_len;
646*63eb84d1Schristos /* Calling xrealloc would be a waste: buffer's contents does not need
647*63eb84d1Schristos to be preserved. */
648*63eb84d1Schristos if (buffer != NULL)
649*63eb84d1Schristos free (buffer);
650*63eb84d1Schristos buffer = (int *) xmalloc (bufmax * (2 * sizeof (int)));
651*63eb84d1Schristos gl_tls_set (buffer_key, buffer);
652*63eb84d1Schristos gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
653*63eb84d1Schristos }
654*63eb84d1Schristos ctxt.fdiag = buffer + ctxt.string[1].data_length + 1;
655*63eb84d1Schristos ctxt.bdiag = ctxt.fdiag + fdiag_len;
656*63eb84d1Schristos
657*63eb84d1Schristos /* Now do the main comparison algorithm */
658*63eb84d1Schristos ctxt.string[0].edit_count = 0;
659*63eb84d1Schristos ctxt.string[1].edit_count = 0;
660*63eb84d1Schristos compareseq (0, ctxt.string[0].data_length, 0, ctxt.string[1].data_length, 0,
661*63eb84d1Schristos &ctxt);
662*63eb84d1Schristos
663*63eb84d1Schristos /* The result is
664*63eb84d1Schristos ((number of chars in common) / (average length of the strings)).
665*63eb84d1Schristos This is admittedly biased towards finding that the strings are
666*63eb84d1Schristos similar, however it does produce meaningful results. */
667*63eb84d1Schristos return ((double) (ctxt.string[0].data_length + ctxt.string[1].data_length
668*63eb84d1Schristos - ctxt.string[1].edit_count - ctxt.string[0].edit_count)
669*63eb84d1Schristos / (ctxt.string[0].data_length + ctxt.string[1].data_length));
670*63eb84d1Schristos }
671