1*63eb84d1Schristos /* Functions to make fuzzy comparisons between strings
2*63eb84d1Schristos    Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc.
3*63eb84d1Schristos 
4*63eb84d1Schristos    This program is free software; you can redistribute it and/or modify
5*63eb84d1Schristos    it under the terms of the GNU General Public License as published by
6*63eb84d1Schristos    the Free Software Foundation; either version 2 of the License, or (at
7*63eb84d1Schristos    your option) any later version.
8*63eb84d1Schristos 
9*63eb84d1Schristos    This program is distributed in the hope that it will be useful, but
10*63eb84d1Schristos    WITHOUT ANY WARRANTY; without even the implied warranty of
11*63eb84d1Schristos    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12*63eb84d1Schristos    General Public License for more details.
13*63eb84d1Schristos 
14*63eb84d1Schristos    You should have received a copy of the GNU General Public License
15*63eb84d1Schristos    along with this program; if not, write to the Free Software
16*63eb84d1Schristos    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17*63eb84d1Schristos 
18*63eb84d1Schristos 
19*63eb84d1Schristos    Derived from GNU diff 2.7, analyze.c et al.
20*63eb84d1Schristos 
21*63eb84d1Schristos    The basic idea is to consider two strings as similar if, when
22*63eb84d1Schristos    transforming the first string into the second string through a
23*63eb84d1Schristos    sequence of edits (inserts and deletes of one character each),
24*63eb84d1Schristos    this sequence is short - or equivalently, if the ordered list
25*63eb84d1Schristos    of characters that are untouched by these edits is long.  For a
26*63eb84d1Schristos    good introduction to the subject, read about the "Levenshtein
27*63eb84d1Schristos    distance" in Wikipedia.
28*63eb84d1Schristos 
29*63eb84d1Schristos    The basic algorithm is described in:
30*63eb84d1Schristos    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
31*63eb84d1Schristos    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
32*63eb84d1Schristos    see especially section 4.2, which describes the variation used below.
33*63eb84d1Schristos 
34*63eb84d1Schristos    The basic algorithm was independently discovered as described in:
35*63eb84d1Schristos    "Algorithms for Approximate String Matching", E. Ukkonen,
36*63eb84d1Schristos    Information and Control Vol. 64, 1985, pp. 100-118.
37*63eb84d1Schristos 
38*63eb84d1Schristos    Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
39*63eb84d1Schristos    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
40*63eb84d1Schristos    at the price of producing suboptimal output for large inputs with
41*63eb84d1Schristos    many differences.
42*63eb84d1Schristos 
43*63eb84d1Schristos    Modified to work on strings rather than files
44*63eb84d1Schristos    by Peter Miller <pmiller@agso.gov.au>, October 1995 */
45*63eb84d1Schristos 
46*63eb84d1Schristos #include <config.h>
47*63eb84d1Schristos 
48*63eb84d1Schristos /* Specification.  */
49*63eb84d1Schristos #include "fstrcmp.h"
50*63eb84d1Schristos 
51*63eb84d1Schristos #include <string.h>
52*63eb84d1Schristos #include <stdio.h>
53*63eb84d1Schristos #include <stdlib.h>
54*63eb84d1Schristos #include <limits.h>
55*63eb84d1Schristos 
56*63eb84d1Schristos #include "lock.h"
57*63eb84d1Schristos #include "tls.h"
58*63eb84d1Schristos #include "xalloc.h"
59*63eb84d1Schristos 
60*63eb84d1Schristos #ifndef uintptr_t
61*63eb84d1Schristos # define uintptr_t unsigned long
62*63eb84d1Schristos #endif
63*63eb84d1Schristos 
64*63eb84d1Schristos 
65*63eb84d1Schristos /*
66*63eb84d1Schristos  * Context of comparison operation.
67*63eb84d1Schristos  */
68*63eb84d1Schristos struct context
69*63eb84d1Schristos {
70*63eb84d1Schristos   /*
71*63eb84d1Schristos    * Data on one input string being compared.
72*63eb84d1Schristos    */
73*63eb84d1Schristos   struct string_data
74*63eb84d1Schristos   {
75*63eb84d1Schristos     /* The string to be compared. */
76*63eb84d1Schristos     const char *data;
77*63eb84d1Schristos 
78*63eb84d1Schristos     /* The length of the string to be compared. */
79*63eb84d1Schristos     int data_length;
80*63eb84d1Schristos 
81*63eb84d1Schristos     /* The number of characters inserted or deleted. */
82*63eb84d1Schristos     int edit_count;
83*63eb84d1Schristos   }
84*63eb84d1Schristos   string[2];
85*63eb84d1Schristos 
86*63eb84d1Schristos   #ifdef MINUS_H_FLAG
87*63eb84d1Schristos 
88*63eb84d1Schristos   /* This corresponds to the diff -H flag.  With this heuristic, for
89*63eb84d1Schristos      strings with a constant small density of changes, the algorithm is
90*63eb84d1Schristos      linear in the strings size.  This is unlikely in typical uses of
91*63eb84d1Schristos      fstrcmp, and so is usually compiled out.  Besides, there is no
92*63eb84d1Schristos      interface to set it true.  */
93*63eb84d1Schristos   int heuristic;
94*63eb84d1Schristos 
95*63eb84d1Schristos   #endif
96*63eb84d1Schristos 
97*63eb84d1Schristos   /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
98*63eb84d1Schristos      point furthest along the given diagonal in the forward search of the
99*63eb84d1Schristos      edit matrix.  */
100*63eb84d1Schristos   int *fdiag;
101*63eb84d1Schristos 
102*63eb84d1Schristos   /* Vector, indexed by diagonal, containing the X coordinate of the point
103*63eb84d1Schristos      furthest along the given diagonal in the backward search of the edit
104*63eb84d1Schristos      matrix.  */
105*63eb84d1Schristos   int *bdiag;
106*63eb84d1Schristos 
107*63eb84d1Schristos   /* Edit scripts longer than this are too expensive to compute.  */
108*63eb84d1Schristos   int too_expensive;
109*63eb84d1Schristos 
110*63eb84d1Schristos   /* Snakes bigger than this are considered `big'.  */
111*63eb84d1Schristos   #define SNAKE_LIMIT	20
112*63eb84d1Schristos };
113*63eb84d1Schristos 
114*63eb84d1Schristos struct partition
115*63eb84d1Schristos {
116*63eb84d1Schristos   /* Midpoints of this partition.  */
117*63eb84d1Schristos   int xmid, ymid;
118*63eb84d1Schristos 
119*63eb84d1Schristos   /* Nonzero if low half will be analyzed minimally.  */
120*63eb84d1Schristos   int lo_minimal;
121*63eb84d1Schristos 
122*63eb84d1Schristos   /* Likewise for high half.  */
123*63eb84d1Schristos   int hi_minimal;
124*63eb84d1Schristos };
125*63eb84d1Schristos 
126*63eb84d1Schristos 
127*63eb84d1Schristos /* NAME
128*63eb84d1Schristos 	diag - find diagonal path
129*63eb84d1Schristos 
130*63eb84d1Schristos    SYNOPSIS
131*63eb84d1Schristos 	int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
132*63eb84d1Schristos 		 struct partition *part, struct context *ctxt);
133*63eb84d1Schristos 
134*63eb84d1Schristos    DESCRIPTION
135*63eb84d1Schristos 	Find the midpoint of the shortest edit script for a specified
136*63eb84d1Schristos 	portion of the two strings.
137*63eb84d1Schristos 
138*63eb84d1Schristos 	Scan from the beginnings of the strings, and simultaneously from
139*63eb84d1Schristos 	the ends, doing a breadth-first search through the space of
140*63eb84d1Schristos 	edit-sequence.  When the two searches meet, we have found the
141*63eb84d1Schristos 	midpoint of the shortest edit sequence.
142*63eb84d1Schristos 
143*63eb84d1Schristos 	If MINIMAL is nonzero, find the minimal edit script regardless
144*63eb84d1Schristos 	of expense.  Otherwise, if the search is too expensive, use
145*63eb84d1Schristos 	heuristics to stop the search and report a suboptimal answer.
146*63eb84d1Schristos 
147*63eb84d1Schristos    RETURNS
148*63eb84d1Schristos 	Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
149*63eb84d1Schristos 	number XMID - YMID equals the number of inserted characters
150*63eb84d1Schristos 	minus the number of deleted characters (counting only characters
151*63eb84d1Schristos 	before the midpoint).  Return the approximate edit cost; this is
152*63eb84d1Schristos 	the total number of characters inserted or deleted (counting
153*63eb84d1Schristos 	only characters before the midpoint), unless a heuristic is used
154*63eb84d1Schristos 	to terminate the search prematurely.
155*63eb84d1Schristos 
156*63eb84d1Schristos 	Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
157*63eb84d1Schristos 	for the left half of the partition is known; similarly for
158*63eb84d1Schristos 	PART->RIGHT_MINIMAL.
159*63eb84d1Schristos 
160*63eb84d1Schristos    CAVEAT
161*63eb84d1Schristos 	This function assumes that the first characters of the specified
162*63eb84d1Schristos 	portions of the two strings do not match, and likewise that the
163*63eb84d1Schristos 	last characters do not match.  The caller must trim matching
164*63eb84d1Schristos 	characters from the beginning and end of the portions it is
165*63eb84d1Schristos 	going to specify.
166*63eb84d1Schristos 
167*63eb84d1Schristos 	If we return the "wrong" partitions, the worst this can do is
168*63eb84d1Schristos 	cause suboptimal diff output.  It cannot cause incorrect diff
169*63eb84d1Schristos 	output.  */
170*63eb84d1Schristos 
171*63eb84d1Schristos static int
diag(int xoff,int xlim,int yoff,int ylim,int minimal,struct partition * part,struct context * ctxt)172*63eb84d1Schristos diag (int xoff, int xlim, int yoff, int ylim, int minimal,
173*63eb84d1Schristos       struct partition *part, struct context *ctxt)
174*63eb84d1Schristos {
175*63eb84d1Schristos   int *const fd = ctxt->fdiag;	/* Give the compiler a chance. */
176*63eb84d1Schristos   int *const bd = ctxt->bdiag;	/* Additional help for the compiler. */
177*63eb84d1Schristos   const char *const xv = ctxt->string[0].data;	/* Still more help for the compiler. */
178*63eb84d1Schristos   const char *const yv = ctxt->string[1].data;	/* And more and more . . . */
179*63eb84d1Schristos   const int dmin = xoff - ylim;	/* Minimum valid diagonal. */
180*63eb84d1Schristos   const int dmax = xlim - yoff;	/* Maximum valid diagonal. */
181*63eb84d1Schristos   const int fmid = xoff - yoff;	/* Center diagonal of top-down search. */
182*63eb84d1Schristos   const int bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
183*63eb84d1Schristos   int fmin = fmid;
184*63eb84d1Schristos   int fmax = fmid;		/* Limits of top-down search. */
185*63eb84d1Schristos   int bmin = bmid;
186*63eb84d1Schristos   int bmax = bmid;		/* Limits of bottom-up search. */
187*63eb84d1Schristos   int c;			/* Cost. */
188*63eb84d1Schristos   int odd = (fmid - bmid) & 1;
189*63eb84d1Schristos 
190*63eb84d1Schristos   /*
191*63eb84d1Schristos    * True if southeast corner is on an odd diagonal with respect
192*63eb84d1Schristos    * to the northwest.
193*63eb84d1Schristos    */
194*63eb84d1Schristos   fd[fmid] = xoff;
195*63eb84d1Schristos   bd[bmid] = xlim;
196*63eb84d1Schristos   for (c = 1;; ++c)
197*63eb84d1Schristos     {
198*63eb84d1Schristos       int d;			/* Active diagonal. */
199*63eb84d1Schristos       int big_snake;
200*63eb84d1Schristos 
201*63eb84d1Schristos       big_snake = 0;
202*63eb84d1Schristos       /* Extend the top-down search by an edit step in each diagonal. */
203*63eb84d1Schristos       if (fmin > dmin)
204*63eb84d1Schristos 	fd[--fmin - 1] = -1;
205*63eb84d1Schristos       else
206*63eb84d1Schristos 	++fmin;
207*63eb84d1Schristos       if (fmax < dmax)
208*63eb84d1Schristos 	fd[++fmax + 1] = -1;
209*63eb84d1Schristos       else
210*63eb84d1Schristos 	--fmax;
211*63eb84d1Schristos       for (d = fmax; d >= fmin; d -= 2)
212*63eb84d1Schristos 	{
213*63eb84d1Schristos 	  int x;
214*63eb84d1Schristos 	  int y;
215*63eb84d1Schristos 	  int oldx;
216*63eb84d1Schristos 	  int tlo;
217*63eb84d1Schristos 	  int thi;
218*63eb84d1Schristos 
219*63eb84d1Schristos 	  tlo = fd[d - 1],
220*63eb84d1Schristos 	    thi = fd[d + 1];
221*63eb84d1Schristos 
222*63eb84d1Schristos 	  if (tlo >= thi)
223*63eb84d1Schristos 	    x = tlo + 1;
224*63eb84d1Schristos 	  else
225*63eb84d1Schristos 	    x = thi;
226*63eb84d1Schristos 	  oldx = x;
227*63eb84d1Schristos 	  y = x - d;
228*63eb84d1Schristos 	  while (x < xlim && y < ylim && xv[x] == yv[y])
229*63eb84d1Schristos 	    {
230*63eb84d1Schristos 	      ++x;
231*63eb84d1Schristos 	      ++y;
232*63eb84d1Schristos 	    }
233*63eb84d1Schristos 	  if (x - oldx > SNAKE_LIMIT)
234*63eb84d1Schristos 	    big_snake = 1;
235*63eb84d1Schristos 	  fd[d] = x;
236*63eb84d1Schristos 	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
237*63eb84d1Schristos 	    {
238*63eb84d1Schristos 	      part->xmid = x;
239*63eb84d1Schristos 	      part->ymid = y;
240*63eb84d1Schristos 	      part->lo_minimal = part->hi_minimal = 1;
241*63eb84d1Schristos 	      return 2 * c - 1;
242*63eb84d1Schristos 	    }
243*63eb84d1Schristos 	}
244*63eb84d1Schristos       /* Similarly extend the bottom-up search.  */
245*63eb84d1Schristos       if (bmin > dmin)
246*63eb84d1Schristos 	bd[--bmin - 1] = INT_MAX;
247*63eb84d1Schristos       else
248*63eb84d1Schristos 	++bmin;
249*63eb84d1Schristos       if (bmax < dmax)
250*63eb84d1Schristos 	bd[++bmax + 1] = INT_MAX;
251*63eb84d1Schristos       else
252*63eb84d1Schristos 	--bmax;
253*63eb84d1Schristos       for (d = bmax; d >= bmin; d -= 2)
254*63eb84d1Schristos 	{
255*63eb84d1Schristos 	  int x;
256*63eb84d1Schristos 	  int y;
257*63eb84d1Schristos 	  int oldx;
258*63eb84d1Schristos 	  int tlo;
259*63eb84d1Schristos 	  int thi;
260*63eb84d1Schristos 
261*63eb84d1Schristos 	  tlo = bd[d - 1],
262*63eb84d1Schristos 	    thi = bd[d + 1];
263*63eb84d1Schristos 	  if (tlo < thi)
264*63eb84d1Schristos 	    x = tlo;
265*63eb84d1Schristos 	  else
266*63eb84d1Schristos 	    x = thi - 1;
267*63eb84d1Schristos 	  oldx = x;
268*63eb84d1Schristos 	  y = x - d;
269*63eb84d1Schristos 	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
270*63eb84d1Schristos 	    {
271*63eb84d1Schristos 	      --x;
272*63eb84d1Schristos 	      --y;
273*63eb84d1Schristos 	    }
274*63eb84d1Schristos 	  if (oldx - x > SNAKE_LIMIT)
275*63eb84d1Schristos 	    big_snake = 1;
276*63eb84d1Schristos 	  bd[d] = x;
277*63eb84d1Schristos 	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
278*63eb84d1Schristos 	    {
279*63eb84d1Schristos 	      part->xmid = x;
280*63eb84d1Schristos 	      part->ymid = y;
281*63eb84d1Schristos 	      part->lo_minimal = part->hi_minimal = 1;
282*63eb84d1Schristos 	      return 2 * c;
283*63eb84d1Schristos 	    }
284*63eb84d1Schristos 	}
285*63eb84d1Schristos 
286*63eb84d1Schristos       if (minimal)
287*63eb84d1Schristos 	continue;
288*63eb84d1Schristos 
289*63eb84d1Schristos #ifdef MINUS_H_FLAG
290*63eb84d1Schristos       /* Heuristic: check occasionally for a diagonal that has made lots
291*63eb84d1Schristos          of progress compared with the edit distance.  If we have any
292*63eb84d1Schristos          such, find the one that has made the most progress and return
293*63eb84d1Schristos          it as if it had succeeded.
294*63eb84d1Schristos 
295*63eb84d1Schristos          With this heuristic, for strings with a constant small density
296*63eb84d1Schristos          of changes, the algorithm is linear in the strings size.  */
297*63eb84d1Schristos       if (c > 200 && big_snake && ctxt->heuristic)
298*63eb84d1Schristos 	{
299*63eb84d1Schristos 	  int best;
300*63eb84d1Schristos 
301*63eb84d1Schristos 	  best = 0;
302*63eb84d1Schristos 	  for (d = fmax; d >= fmin; d -= 2)
303*63eb84d1Schristos 	    {
304*63eb84d1Schristos 	      int dd;
305*63eb84d1Schristos 	      int x;
306*63eb84d1Schristos 	      int y;
307*63eb84d1Schristos 	      int v;
308*63eb84d1Schristos 
309*63eb84d1Schristos 	      dd = d - fmid;
310*63eb84d1Schristos 	      x = fd[d];
311*63eb84d1Schristos 	      y = x - d;
312*63eb84d1Schristos 	      v = (x - xoff) * 2 - dd;
313*63eb84d1Schristos 
314*63eb84d1Schristos 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
315*63eb84d1Schristos 		{
316*63eb84d1Schristos 		  if
317*63eb84d1Schristos 		    (
318*63eb84d1Schristos 		      v > best
319*63eb84d1Schristos 		      &&
320*63eb84d1Schristos 		      xoff + SNAKE_LIMIT <= x
321*63eb84d1Schristos 		      &&
322*63eb84d1Schristos 		      x < xlim
323*63eb84d1Schristos 		      &&
324*63eb84d1Schristos 		      yoff + SNAKE_LIMIT <= y
325*63eb84d1Schristos 		      &&
326*63eb84d1Schristos 		      y < ylim
327*63eb84d1Schristos 		    )
328*63eb84d1Schristos 		    {
329*63eb84d1Schristos 		      /* We have a good enough best diagonal; now insist
330*63eb84d1Schristos 			 that it end with a significant snake.  */
331*63eb84d1Schristos 		      int k;
332*63eb84d1Schristos 
333*63eb84d1Schristos 		      for (k = 1; xv[x - k] == yv[y - k]; k++)
334*63eb84d1Schristos 			{
335*63eb84d1Schristos 			  if (k == SNAKE_LIMIT)
336*63eb84d1Schristos 			    {
337*63eb84d1Schristos 			      best = v;
338*63eb84d1Schristos 			      part->xmid = x;
339*63eb84d1Schristos 			      part->ymid = y;
340*63eb84d1Schristos 			      break;
341*63eb84d1Schristos 			    }
342*63eb84d1Schristos 			}
343*63eb84d1Schristos 		    }
344*63eb84d1Schristos 		}
345*63eb84d1Schristos 	    }
346*63eb84d1Schristos 	  if (best > 0)
347*63eb84d1Schristos 	    {
348*63eb84d1Schristos 	      part->lo_minimal = 1;
349*63eb84d1Schristos 	      part->hi_minimal = 0;
350*63eb84d1Schristos 	      return 2 * c - 1;
351*63eb84d1Schristos 	    }
352*63eb84d1Schristos 	  best = 0;
353*63eb84d1Schristos 	  for (d = bmax; d >= bmin; d -= 2)
354*63eb84d1Schristos 	    {
355*63eb84d1Schristos 	      int dd;
356*63eb84d1Schristos 	      int x;
357*63eb84d1Schristos 	      int y;
358*63eb84d1Schristos 	      int v;
359*63eb84d1Schristos 
360*63eb84d1Schristos 	      dd = d - bmid;
361*63eb84d1Schristos 	      x = bd[d];
362*63eb84d1Schristos 	      y = x - d;
363*63eb84d1Schristos 	      v = (xlim - x) * 2 + dd;
364*63eb84d1Schristos 
365*63eb84d1Schristos 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
366*63eb84d1Schristos 		{
367*63eb84d1Schristos 		  if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
368*63eb84d1Schristos 		      yoff < y && y <= ylim - SNAKE_LIMIT)
369*63eb84d1Schristos 		    {
370*63eb84d1Schristos 		      /* We have a good enough best diagonal; now insist
371*63eb84d1Schristos 			 that it end with a significant snake.  */
372*63eb84d1Schristos 		      int k;
373*63eb84d1Schristos 
374*63eb84d1Schristos 		      for (k = 0; xv[x + k] == yv[y + k]; k++)
375*63eb84d1Schristos 			{
376*63eb84d1Schristos 			  if (k == SNAKE_LIMIT - 1)
377*63eb84d1Schristos 			    {
378*63eb84d1Schristos 			      best = v;
379*63eb84d1Schristos 			      part->xmid = x;
380*63eb84d1Schristos 			      part->ymid = y;
381*63eb84d1Schristos 			      break;
382*63eb84d1Schristos 			    }
383*63eb84d1Schristos 			}
384*63eb84d1Schristos 		    }
385*63eb84d1Schristos 		}
386*63eb84d1Schristos 	    }
387*63eb84d1Schristos 	  if (best > 0)
388*63eb84d1Schristos 	    {
389*63eb84d1Schristos 	      part->lo_minimal = 0;
390*63eb84d1Schristos 	      part->hi_minimal = 1;
391*63eb84d1Schristos 	      return 2 * c - 1;
392*63eb84d1Schristos 	    }
393*63eb84d1Schristos 	}
394*63eb84d1Schristos #endif /* MINUS_H_FLAG */
395*63eb84d1Schristos 
396*63eb84d1Schristos       /* Heuristic: if we've gone well beyond the call of duty, give up
397*63eb84d1Schristos 	 and report halfway between our best results so far.  */
398*63eb84d1Schristos       if (c >= ctxt->too_expensive)
399*63eb84d1Schristos 	{
400*63eb84d1Schristos 	  int fxybest;
401*63eb84d1Schristos 	  int fxbest;
402*63eb84d1Schristos 	  int bxybest;
403*63eb84d1Schristos 	  int bxbest;
404*63eb84d1Schristos 
405*63eb84d1Schristos 	  /* Pacify `gcc -Wall'. */
406*63eb84d1Schristos 	  fxbest = 0;
407*63eb84d1Schristos 	  bxbest = 0;
408*63eb84d1Schristos 
409*63eb84d1Schristos 	  /* Find forward diagonal that maximizes X + Y.  */
410*63eb84d1Schristos 	  fxybest = -1;
411*63eb84d1Schristos 	  for (d = fmax; d >= fmin; d -= 2)
412*63eb84d1Schristos 	    {
413*63eb84d1Schristos 	      int x;
414*63eb84d1Schristos 	      int y;
415*63eb84d1Schristos 
416*63eb84d1Schristos 	      x = fd[d] < xlim ? fd[d] : xlim;
417*63eb84d1Schristos 	      y = x - d;
418*63eb84d1Schristos 
419*63eb84d1Schristos 	      if (ylim < y)
420*63eb84d1Schristos 		{
421*63eb84d1Schristos 		  x = ylim + d;
422*63eb84d1Schristos 		  y = ylim;
423*63eb84d1Schristos 		}
424*63eb84d1Schristos 	      if (fxybest < x + y)
425*63eb84d1Schristos 		{
426*63eb84d1Schristos 		  fxybest = x + y;
427*63eb84d1Schristos 		  fxbest = x;
428*63eb84d1Schristos 		}
429*63eb84d1Schristos 	    }
430*63eb84d1Schristos 	  /* Find backward diagonal that minimizes X + Y.  */
431*63eb84d1Schristos 	  bxybest = INT_MAX;
432*63eb84d1Schristos 	  for (d = bmax; d >= bmin; d -= 2)
433*63eb84d1Schristos 	    {
434*63eb84d1Schristos 	      int x;
435*63eb84d1Schristos 	      int y;
436*63eb84d1Schristos 
437*63eb84d1Schristos 	      x = xoff > bd[d] ? xoff : bd[d];
438*63eb84d1Schristos 	      y = x - d;
439*63eb84d1Schristos 
440*63eb84d1Schristos 	      if (y < yoff)
441*63eb84d1Schristos 		{
442*63eb84d1Schristos 		  x = yoff + d;
443*63eb84d1Schristos 		  y = yoff;
444*63eb84d1Schristos 		}
445*63eb84d1Schristos 	      if (x + y < bxybest)
446*63eb84d1Schristos 		{
447*63eb84d1Schristos 		  bxybest = x + y;
448*63eb84d1Schristos 		  bxbest = x;
449*63eb84d1Schristos 		}
450*63eb84d1Schristos 	    }
451*63eb84d1Schristos 	  /* Use the better of the two diagonals.  */
452*63eb84d1Schristos 	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
453*63eb84d1Schristos 	    {
454*63eb84d1Schristos 	      part->xmid = fxbest;
455*63eb84d1Schristos 	      part->ymid = fxybest - fxbest;
456*63eb84d1Schristos 	      part->lo_minimal = 1;
457*63eb84d1Schristos 	      part->hi_minimal = 0;
458*63eb84d1Schristos 	    }
459*63eb84d1Schristos 	  else
460*63eb84d1Schristos 	    {
461*63eb84d1Schristos 	      part->xmid = bxbest;
462*63eb84d1Schristos 	      part->ymid = bxybest - bxbest;
463*63eb84d1Schristos 	      part->lo_minimal = 0;
464*63eb84d1Schristos 	      part->hi_minimal = 1;
465*63eb84d1Schristos 	    }
466*63eb84d1Schristos 	  return 2 * c - 1;
467*63eb84d1Schristos 	}
468*63eb84d1Schristos     }
469*63eb84d1Schristos }
470*63eb84d1Schristos 
471*63eb84d1Schristos 
472*63eb84d1Schristos /* NAME
473*63eb84d1Schristos 	compareseq - find edit sequence
474*63eb84d1Schristos 
475*63eb84d1Schristos    SYNOPSIS
476*63eb84d1Schristos 	void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal,
477*63eb84d1Schristos 			struct context *ctxt);
478*63eb84d1Schristos 
479*63eb84d1Schristos    DESCRIPTION
480*63eb84d1Schristos 	Compare in detail contiguous subsequences of the two strings
481*63eb84d1Schristos 	which are known, as a whole, to match each other.
482*63eb84d1Schristos 
483*63eb84d1Schristos 	The subsequence of string 0 is [XOFF, XLIM) and likewise for
484*63eb84d1Schristos 	string 1.
485*63eb84d1Schristos 
486*63eb84d1Schristos 	Note that XLIM, YLIM are exclusive bounds.  All character
487*63eb84d1Schristos 	numbers are origin-0.
488*63eb84d1Schristos 
489*63eb84d1Schristos 	If MINIMAL is nonzero, find a minimal difference no matter how
490*63eb84d1Schristos 	expensive it is.  */
491*63eb84d1Schristos 
492*63eb84d1Schristos static void
compareseq(int xoff,int xlim,int yoff,int ylim,int minimal,struct context * ctxt)493*63eb84d1Schristos compareseq (int xoff, int xlim, int yoff, int ylim, int minimal,
494*63eb84d1Schristos 	    struct context *ctxt)
495*63eb84d1Schristos {
496*63eb84d1Schristos   const char *const xv = ctxt->string[0].data;	/* Help the compiler.  */
497*63eb84d1Schristos   const char *const yv = ctxt->string[1].data;
498*63eb84d1Schristos 
499*63eb84d1Schristos   /* Slide down the bottom initial diagonal. */
500*63eb84d1Schristos   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
501*63eb84d1Schristos     {
502*63eb84d1Schristos       ++xoff;
503*63eb84d1Schristos       ++yoff;
504*63eb84d1Schristos     }
505*63eb84d1Schristos 
506*63eb84d1Schristos   /* Slide up the top initial diagonal. */
507*63eb84d1Schristos   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
508*63eb84d1Schristos     {
509*63eb84d1Schristos       --xlim;
510*63eb84d1Schristos       --ylim;
511*63eb84d1Schristos     }
512*63eb84d1Schristos 
513*63eb84d1Schristos   /* Handle simple cases. */
514*63eb84d1Schristos   if (xoff == xlim)
515*63eb84d1Schristos     {
516*63eb84d1Schristos       while (yoff < ylim)
517*63eb84d1Schristos 	{
518*63eb84d1Schristos 	  ctxt->string[1].edit_count++;
519*63eb84d1Schristos 	  ++yoff;
520*63eb84d1Schristos 	}
521*63eb84d1Schristos     }
522*63eb84d1Schristos   else if (yoff == ylim)
523*63eb84d1Schristos     {
524*63eb84d1Schristos       while (xoff < xlim)
525*63eb84d1Schristos 	{
526*63eb84d1Schristos 	  ctxt->string[0].edit_count++;
527*63eb84d1Schristos 	  ++xoff;
528*63eb84d1Schristos 	}
529*63eb84d1Schristos     }
530*63eb84d1Schristos   else
531*63eb84d1Schristos     {
532*63eb84d1Schristos       int c;
533*63eb84d1Schristos       struct partition part;
534*63eb84d1Schristos 
535*63eb84d1Schristos       /* Find a point of correspondence in the middle of the strings.  */
536*63eb84d1Schristos       c = diag (xoff, xlim, yoff, ylim, minimal, &part, ctxt);
537*63eb84d1Schristos       if (c == 1)
538*63eb84d1Schristos 	{
539*63eb84d1Schristos #if 0
540*63eb84d1Schristos 	  /* This should be impossible, because it implies that one of
541*63eb84d1Schristos 	     the two subsequences is empty, and that case was handled
542*63eb84d1Schristos 	     above without calling `diag'.  Let's verify that this is
543*63eb84d1Schristos 	     true.  */
544*63eb84d1Schristos 	  abort ();
545*63eb84d1Schristos #else
546*63eb84d1Schristos 	  /* The two subsequences differ by a single insert or delete;
547*63eb84d1Schristos 	     record it and we are done.  */
548*63eb84d1Schristos 	  if (part.xmid - part.ymid < xoff - yoff)
549*63eb84d1Schristos 	    ctxt->string[1].edit_count++;
550*63eb84d1Schristos 	  else
551*63eb84d1Schristos 	    ctxt->string[0].edit_count++;
552*63eb84d1Schristos #endif
553*63eb84d1Schristos 	}
554*63eb84d1Schristos       else
555*63eb84d1Schristos 	{
556*63eb84d1Schristos 	  /* Use the partitions to split this problem into subproblems.  */
557*63eb84d1Schristos 	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt);
558*63eb84d1Schristos 	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt);
559*63eb84d1Schristos 	}
560*63eb84d1Schristos     }
561*63eb84d1Schristos }
562*63eb84d1Schristos 
563*63eb84d1Schristos 
564*63eb84d1Schristos /* Because fstrcmp is typically called multiple times, attempt to minimize
565*63eb84d1Schristos    the number of memory allocations performed.  Thus, let a call reuse the
566*63eb84d1Schristos    memory already allocated by the previous call, if it is sufficient.
567*63eb84d1Schristos    To make it multithread-safe, without need for a lock that protects the
568*63eb84d1Schristos    already allocated memory, store the allocated memory per thread.  Free
569*63eb84d1Schristos    it only when the thread exits.  */
570*63eb84d1Schristos 
571*63eb84d1Schristos static gl_tls_key_t buffer_key;	/* TLS key for a 'int *' */
572*63eb84d1Schristos static gl_tls_key_t bufmax_key;	/* TLS key for a 'size_t' */
573*63eb84d1Schristos 
574*63eb84d1Schristos static void
keys_init(void)575*63eb84d1Schristos keys_init (void)
576*63eb84d1Schristos {
577*63eb84d1Schristos   gl_tls_key_init (buffer_key, free);
578*63eb84d1Schristos   gl_tls_key_init (bufmax_key, NULL);
579*63eb84d1Schristos   /* The per-thread initial values are NULL and 0, respectively.  */
580*63eb84d1Schristos }
581*63eb84d1Schristos 
582*63eb84d1Schristos /* Ensure that keys_init is called once only.  */
gl_once_define(static,keys_init_once)583*63eb84d1Schristos gl_once_define(static, keys_init_once)
584*63eb84d1Schristos 
585*63eb84d1Schristos 
586*63eb84d1Schristos /* NAME
587*63eb84d1Schristos 	fstrcmp - fuzzy string compare
588*63eb84d1Schristos 
589*63eb84d1Schristos    SYNOPSIS
590*63eb84d1Schristos 	double fstrcmp(const char *, const char *);
591*63eb84d1Schristos 
592*63eb84d1Schristos    DESCRIPTION
593*63eb84d1Schristos 	The fstrcmp function may be used to compare two string for
594*63eb84d1Schristos 	similarity.  It is very useful in reducing "cascade" or
595*63eb84d1Schristos 	"secondary" errors in compilers or other situations where
596*63eb84d1Schristos 	symbol tables occur.
597*63eb84d1Schristos 
598*63eb84d1Schristos    RETURNS
599*63eb84d1Schristos 	double; 0 if the strings are entirly dissimilar, 1 if the
600*63eb84d1Schristos 	strings are identical, and a number in between if they are
601*63eb84d1Schristos 	similar.  */
602*63eb84d1Schristos 
603*63eb84d1Schristos double
604*63eb84d1Schristos fstrcmp (const char *string1, const char *string2)
605*63eb84d1Schristos {
606*63eb84d1Schristos   struct context ctxt;
607*63eb84d1Schristos   int i;
608*63eb84d1Schristos 
609*63eb84d1Schristos   size_t fdiag_len;
610*63eb84d1Schristos   int *buffer;
611*63eb84d1Schristos   size_t bufmax;
612*63eb84d1Schristos 
613*63eb84d1Schristos   /* set the info for each string.  */
614*63eb84d1Schristos   ctxt.string[0].data = string1;
615*63eb84d1Schristos   ctxt.string[0].data_length = strlen (string1);
616*63eb84d1Schristos   ctxt.string[1].data = string2;
617*63eb84d1Schristos   ctxt.string[1].data_length = strlen (string2);
618*63eb84d1Schristos 
619*63eb84d1Schristos   /* short-circuit obvious comparisons */
620*63eb84d1Schristos   if (ctxt.string[0].data_length == 0 && ctxt.string[1].data_length == 0)
621*63eb84d1Schristos     return 1.0;
622*63eb84d1Schristos   if (ctxt.string[0].data_length == 0 || ctxt.string[1].data_length == 0)
623*63eb84d1Schristos     return 0.0;
624*63eb84d1Schristos 
625*63eb84d1Schristos   /* Set TOO_EXPENSIVE to be approximate square root of input size,
626*63eb84d1Schristos      bounded below by 256.  */
627*63eb84d1Schristos   ctxt.too_expensive = 1;
628*63eb84d1Schristos   for (i = ctxt.string[0].data_length + ctxt.string[1].data_length;
629*63eb84d1Schristos        i != 0;
630*63eb84d1Schristos        i >>= 2)
631*63eb84d1Schristos     ctxt.too_expensive <<= 1;
632*63eb84d1Schristos   if (ctxt.too_expensive < 256)
633*63eb84d1Schristos     ctxt.too_expensive = 256;
634*63eb84d1Schristos 
635*63eb84d1Schristos   /* Allocate memory for fdiag and bdiag from a thread-local pool.  */
636*63eb84d1Schristos   fdiag_len = ctxt.string[0].data_length + ctxt.string[1].data_length + 3;
637*63eb84d1Schristos   gl_once (keys_init_once, keys_init);
638*63eb84d1Schristos   buffer = (int *) gl_tls_get (buffer_key);
639*63eb84d1Schristos   bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key);
640*63eb84d1Schristos   if (fdiag_len > bufmax)
641*63eb84d1Schristos     {
642*63eb84d1Schristos       /* Need more memory.  */
643*63eb84d1Schristos       bufmax = 2 * bufmax;
644*63eb84d1Schristos       if (fdiag_len > bufmax)
645*63eb84d1Schristos 	bufmax = fdiag_len;
646*63eb84d1Schristos       /* Calling xrealloc would be a waste: buffer's contents does not need
647*63eb84d1Schristos 	 to be preserved.  */
648*63eb84d1Schristos       if (buffer != NULL)
649*63eb84d1Schristos 	free (buffer);
650*63eb84d1Schristos       buffer = (int *) xmalloc (bufmax * (2 * sizeof (int)));
651*63eb84d1Schristos       gl_tls_set (buffer_key, buffer);
652*63eb84d1Schristos       gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
653*63eb84d1Schristos     }
654*63eb84d1Schristos   ctxt.fdiag = buffer + ctxt.string[1].data_length + 1;
655*63eb84d1Schristos   ctxt.bdiag = ctxt.fdiag + fdiag_len;
656*63eb84d1Schristos 
657*63eb84d1Schristos   /* Now do the main comparison algorithm */
658*63eb84d1Schristos   ctxt.string[0].edit_count = 0;
659*63eb84d1Schristos   ctxt.string[1].edit_count = 0;
660*63eb84d1Schristos   compareseq (0, ctxt.string[0].data_length, 0, ctxt.string[1].data_length, 0,
661*63eb84d1Schristos 	      &ctxt);
662*63eb84d1Schristos 
663*63eb84d1Schristos   /* The result is
664*63eb84d1Schristos 	((number of chars in common) / (average length of the strings)).
665*63eb84d1Schristos      This is admittedly biased towards finding that the strings are
666*63eb84d1Schristos      similar, however it does produce meaningful results.  */
667*63eb84d1Schristos   return ((double) (ctxt.string[0].data_length + ctxt.string[1].data_length
668*63eb84d1Schristos 		    - ctxt.string[1].edit_count - ctxt.string[0].edit_count)
669*63eb84d1Schristos 	  / (ctxt.string[0].data_length + ctxt.string[1].data_length));
670*63eb84d1Schristos }
671