1 /* Find near-matches for strings.
2    Copyright (C) 2015-2016 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "tree.h"
25 #include "spellcheck.h"
26 
27 /* The Levenshtein distance is an "edit-distance": the minimal
28    number of one-character insertions, removals or substitutions
29    that are needed to change one string into another.
30 
31    This implementation uses the Wagner-Fischer algorithm.  */
32 
33 edit_distance_t
levenshtein_distance(const char * s,int len_s,const char * t,int len_t)34 levenshtein_distance (const char *s, int len_s,
35 		      const char *t, int len_t)
36 {
37   const bool debug = false;
38 
39   if (debug)
40     {
41       printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
42       printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
43     }
44 
45   if (len_s == 0)
46     return len_t;
47   if (len_t == 0)
48     return len_s;
49 
50   /* We effectively build a matrix where each (i, j) contains the
51      Levenshtein distance between the prefix strings s[0:j]
52      and t[0:i].
53      Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
54      we simply keep track of the last row, v0 and a new row, v1,
55      which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
56      in favor of two (len_s + 1) allocations.  These could potentially be
57      statically-allocated if we impose a maximum length on the
58      strings of interest.  */
59   edit_distance_t *v0 = new edit_distance_t[len_s + 1];
60   edit_distance_t *v1 = new edit_distance_t[len_s + 1];
61 
62   /* The first row is for the case of an empty target string, which
63      we can reach by deleting every character in the source string.  */
64   for (int i = 0; i < len_s + 1; i++)
65     v0[i] = i;
66 
67   /* Build successive rows.  */
68   for (int i = 0; i < len_t; i++)
69     {
70       if (debug)
71 	{
72 	  printf ("i:%i v0 = ", i);
73 	  for (int j = 0; j < len_s + 1; j++)
74 	    printf ("%i ", v0[j]);
75 	  printf ("\n");
76 	}
77 
78       /* The initial column is for the case of an empty source string; we
79 	 can reach prefixes of the target string of length i
80 	 by inserting i characters.  */
81       v1[0] = i + 1;
82 
83       /* Build the rest of the row by considering neighbors to
84 	 the north, west and northwest.  */
85       for (int j = 0; j < len_s; j++)
86 	{
87 	  edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
88 	  edit_distance_t deletion     = v1[j] + 1;
89 	  edit_distance_t insertion    = v0[j + 1] + 1;
90 	  edit_distance_t substitution = v0[j] + cost;
91 	  edit_distance_t cheapest = MIN (deletion, insertion);
92 	  cheapest = MIN (cheapest, substitution);
93 	  v1[j + 1] = cheapest;
94 	}
95 
96       /* Prepare to move on to next row.  */
97       for (int j = 0; j < len_s + 1; j++)
98 	v0[j] = v1[j];
99     }
100 
101   if (debug)
102     {
103       printf ("final v1 = ");
104       for (int j = 0; j < len_s + 1; j++)
105 	printf ("%i ", v1[j]);
106       printf ("\n");
107     }
108 
109   edit_distance_t result = v1[len_s];
110   delete[] v0;
111   delete[] v1;
112   return result;
113 }
114 
115 /* Calculate Levenshtein distance between two nil-terminated strings.  */
116 
117 edit_distance_t
levenshtein_distance(const char * s,const char * t)118 levenshtein_distance (const char *s, const char *t)
119 {
120   return levenshtein_distance (s, strlen (s), t, strlen (t));
121 }
122 
123 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
124    an autovec of non-NULL strings, determine which element within
125    CANDIDATES has the lowest edit distance to TARGET.  If there are
126    multiple elements with the same minimal distance, the first in the
127    vector wins.
128 
129    If more than half of the letters were misspelled, the suggestion is
130    likely to be meaningless, so return NULL for this case.  */
131 
132 const char *
find_closest_string(const char * target,const auto_vec<const char * > * candidates)133 find_closest_string (const char *target,
134 		     const auto_vec<const char *> *candidates)
135 {
136   gcc_assert (target);
137   gcc_assert (candidates);
138 
139   int i;
140   const char *candidate;
141   const char *best_candidate = NULL;
142   edit_distance_t best_distance = MAX_EDIT_DISTANCE;
143   size_t len_target = strlen (target);
144   FOR_EACH_VEC_ELT (*candidates, i, candidate)
145     {
146       gcc_assert (candidate);
147       edit_distance_t dist
148 	= levenshtein_distance (target, len_target,
149 				candidate, strlen (candidate));
150       if (dist < best_distance)
151 	{
152 	  best_distance = dist;
153 	  best_candidate = candidate;
154 	}
155     }
156 
157   /* If more than half of the letters were misspelled, the suggestion is
158      likely to be meaningless.  */
159   if (best_candidate)
160     {
161       unsigned int cutoff = MAX (len_target, strlen (best_candidate)) / 2;
162       if (best_distance > cutoff)
163 	return NULL;
164     }
165 
166   return best_candidate;
167 }
168