1 /* Find near-matches for strings.
2    Copyright (C) 2015-2019 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "tree.h"
25 #include "spellcheck.h"
26 #include "selftest.h"
27 
28 /* Get the edit distance between the two strings: the minimal
29    number of edits that are needed to change one string into another,
30    where edits can be one-character insertions, removals, or substitutions,
31    or transpositions of two adjacent characters (counting as one "edit").
32 
33    This implementation uses the Wagner-Fischer algorithm for the
34    Damerau-Levenshtein distance; specifically, the "optimal string alignment
35    distance" or "restricted edit distance" variant.  */
36 
37 edit_distance_t
get_edit_distance(const char * s,int len_s,const char * t,int len_t)38 get_edit_distance (const char *s, int len_s,
39 		   const char *t, int len_t)
40 {
41   const bool debug = false;
42 
43   if (debug)
44     {
45       printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
46       printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
47     }
48 
49   if (len_s == 0)
50     return len_t;
51   if (len_t == 0)
52     return len_s;
53 
54   /* We effectively build a matrix where each (i, j) contains the
55      distance between the prefix strings s[0:j] and t[0:i].
56      Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
57      we simply keep track of the last two rows, v_one_ago and v_two_ago,
58      and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
59      allocation and memory accesses in favor of three (len_s + 1)
60      allocations.  These could potentially be
61      statically-allocated if we impose a maximum length on the
62      strings of interest.  */
63   edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
64   edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
65   edit_distance_t *v_next = new edit_distance_t[len_s + 1];
66 
67   /* The first row is for the case of an empty target string, which
68      we can reach by deleting every character in the source string.  */
69   for (int i = 0; i < len_s + 1; i++)
70     v_one_ago[i] = i;
71 
72   /* Build successive rows.  */
73   for (int i = 0; i < len_t; i++)
74     {
75       if (debug)
76 	{
77 	  printf ("i:%i v_one_ago = ", i);
78 	  for (int j = 0; j < len_s + 1; j++)
79 	    printf ("%i ", v_one_ago[j]);
80 	  printf ("\n");
81 	}
82 
83       /* The initial column is for the case of an empty source string; we
84 	 can reach prefixes of the target string of length i
85 	 by inserting i characters.  */
86       v_next[0] = i + 1;
87 
88       /* Build the rest of the row by considering neighbors to
89 	 the north, west and northwest.  */
90       for (int j = 0; j < len_s; j++)
91 	{
92 	  edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
93 	  edit_distance_t deletion     = v_next[j] + 1;
94 	  edit_distance_t insertion    = v_one_ago[j + 1] + 1;
95 	  edit_distance_t substitution = v_one_ago[j] + cost;
96 	  edit_distance_t cheapest = MIN (deletion, insertion);
97 	  cheapest = MIN (cheapest, substitution);
98 	  if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
99 	    {
100 	      edit_distance_t transposition = v_two_ago[j - 1] + 1;
101 	      cheapest = MIN (cheapest, transposition);
102 	    }
103 	  v_next[j + 1] = cheapest;
104 	}
105 
106       /* Prepare to move on to next row.  */
107       for (int j = 0; j < len_s + 1; j++)
108 	{
109 	  v_two_ago[j] = v_one_ago[j];
110 	  v_one_ago[j] = v_next[j];
111 	}
112     }
113 
114   if (debug)
115     {
116       printf ("final v_next = ");
117       for (int j = 0; j < len_s + 1; j++)
118 	printf ("%i ", v_next[j]);
119       printf ("\n");
120     }
121 
122   edit_distance_t result = v_next[len_s];
123   delete[] v_two_ago;
124   delete[] v_one_ago;
125   delete[] v_next;
126   return result;
127 }
128 
129 /* Get the edit distance between two nil-terminated strings.  */
130 
131 edit_distance_t
get_edit_distance(const char * s,const char * t)132 get_edit_distance (const char *s, const char *t)
133 {
134   return get_edit_distance (s, strlen (s), t, strlen (t));
135 }
136 
137 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
138    an autovec of non-NULL strings, determine which element within
139    CANDIDATES has the lowest edit distance to TARGET.  If there are
140    multiple elements with the same minimal distance, the first in the
141    vector wins.
142 
143    If more than half of the letters were misspelled, the suggestion is
144    likely to be meaningless, so return NULL for this case.  */
145 
146 const char *
find_closest_string(const char * target,const auto_vec<const char * > * candidates)147 find_closest_string (const char *target,
148 		     const auto_vec<const char *> *candidates)
149 {
150   gcc_assert (target);
151   gcc_assert (candidates);
152 
153   int i;
154   const char *candidate;
155   best_match<const char *, const char *> bm (target);
156   FOR_EACH_VEC_ELT (*candidates, i, candidate)
157     {
158       gcc_assert (candidate);
159       bm.consider (candidate);
160     }
161 
162   return bm.get_best_meaningful_candidate ();
163 }
164 
165 /* Generate the maximum edit distance for which we consider a suggestion
166    to be meaningful, given a goal of length GOAL_LEN and a candidate of
167    length CANDIDATE_LEN.
168 
169    This is a third of the the length of the candidate or of the goal,
170    whichever is bigger.  */
171 
172 edit_distance_t
get_edit_distance_cutoff(size_t goal_len,size_t candidate_len)173 get_edit_distance_cutoff (size_t goal_len, size_t candidate_len)
174 {
175   size_t max_length = MAX (goal_len, candidate_len);
176   size_t min_length = MIN (goal_len, candidate_len);
177 
178   gcc_assert (max_length >= min_length);
179 
180   /* Special case: don't offer suggestions for a pair of
181      length == 1 strings (or empty strings).  */
182   if (max_length <= 1)
183     return 0;
184 
185   /* If the lengths are close, then round down.  */
186   if (max_length - min_length <= 1)
187     /* ...but allow an edit distance of at least 1.  */
188     return MAX (max_length / 3, 1);
189 
190   /* Otherwise, round up (thus giving a little extra leeway to some cases
191      involving insertions/deletions).  */
192   return (max_length + 2) / 3;
193 }
194 
195 #if CHECKING_P
196 
197 namespace selftest {
198 
199 /* Selftests.  */
200 
201 /* Verify that get_edit_distance (A, B) equals the expected value.  */
202 
203 static void
test_get_edit_distance_one_way(const char * a,const char * b,edit_distance_t expected)204 test_get_edit_distance_one_way (const char *a, const char *b,
205 				edit_distance_t expected)
206 {
207   edit_distance_t actual = get_edit_distance (a, b);
208   ASSERT_EQ (actual, expected);
209 }
210 
211 /* Verify that both
212      get_edit_distance (A, B)
213    and
214      get_edit_distance (B, A)
215    equal the expected value, to ensure that the function is symmetric.  */
216 
217 static void
test_get_edit_distance_both_ways(const char * a,const char * b,edit_distance_t expected)218 test_get_edit_distance_both_ways (const char *a, const char *b,
219 			     edit_distance_t expected)
220 {
221   test_get_edit_distance_one_way (a, b, expected);
222   test_get_edit_distance_one_way (b, a, expected);
223 }
224 
225 /* Verify get_edit_distance for a variety of pairs of pre-canned
226    inputs, comparing against known-good values.  */
227 
228 static void
test_edit_distances()229 test_edit_distances ()
230 {
231   test_get_edit_distance_both_ways ("", "nonempty", strlen ("nonempty"));
232   test_get_edit_distance_both_ways ("saturday", "sunday", 3);
233   test_get_edit_distance_both_ways ("foo", "m_foo", 2);
234   test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 3);
235   test_get_edit_distance_both_ways
236     ("the quick brown fox jumps over the lazy dog", "dog", 40);
237   test_get_edit_distance_both_ways
238     ("the quick brown fox jumps over the lazy dog",
239      "the quick brown dog jumps over the lazy fox",
240      4);
241   test_get_edit_distance_both_ways
242     ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
243      "All your base are belong to us",
244      44);
245   test_get_edit_distance_both_ways ("foo", "FOO", 3);
246   test_get_edit_distance_both_ways ("fee", "deed", 2);
247   test_get_edit_distance_both_ways ("coorzd1", "coordx1", 2);
248   test_get_edit_distance_both_ways ("assert", "sqrt", 3);
249   test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", 3);
250   test_get_edit_distance_both_ways ("time", "nice", 2);
251   test_get_edit_distance_both_ways ("bar", "carg", 2);
252   test_get_edit_distance_both_ways ("gtk_widget_show_all",
253 				    "GtkWidgetShowAll", 7);
254   test_get_edit_distance_both_ways ("m_bar", "bar", 2);
255   test_get_edit_distance_both_ways ("MACRO", "MACRAME", 3);
256   test_get_edit_distance_both_ways ("ab", "ac", 1);
257   test_get_edit_distance_both_ways ("ab", "a", 1);
258   test_get_edit_distance_both_ways ("a", "b", 1);
259   test_get_edit_distance_both_ways ("nanl", "name", 2);
260   test_get_edit_distance_both_ways ("char", "bar", 2);
261   test_get_edit_distance_both_ways ("-optimize", "fsanitize", 5);
262   test_get_edit_distance_both_ways ("__DATE__", "__i386__", 4);
263 
264   /* Examples where transposition helps.  */
265   test_get_edit_distance_both_ways ("ab", "ba", 1);
266   test_get_edit_distance_both_ways ("ba", "abc", 2);
267   test_get_edit_distance_both_ways ("coorzd1", "coordz1", 1);
268   test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz",
269 				    "bacdefghijklmnopqrstuvwxzy", 2);
270   test_get_edit_distance_both_ways ("saturday", "sundya", 4);
271   test_get_edit_distance_both_ways ("signed", "singed", 1);
272 }
273 
274 /* Subroutine of test_get_edit_distance_cutoff, for emulating the
275    spellchecking cutoff in up to GCC 8.  */
276 
277 static edit_distance_t
get_old_cutoff(size_t goal_len,size_t candidate_len)278 get_old_cutoff (size_t goal_len, size_t candidate_len)
279 {
280   return MAX (goal_len, candidate_len) / 2;
281 }
282 
283 /* Verify that the cutoff for "meaningfulness" of suggestions is at least as
284    conservative as in older GCC releases.
285 
286    This should ensure that we don't offer additional meaningless
287    suggestions (apart from those for which transposition has helped).  */
288 
289 static void
test_get_edit_distance_cutoff()290 test_get_edit_distance_cutoff ()
291 {
292   for (size_t goal_len = 0; goal_len < 30; goal_len++)
293     for (size_t candidate_len = 0; candidate_len < 30; candidate_len++)
294       ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len)
295 		   <= get_old_cutoff (goal_len, candidate_len));
296 }
297 
298 /* Assert that CANDIDATE is offered as a suggestion for TARGET.  */
299 
300 static void
assert_suggested_for(const location & loc,const char * candidate,const char * target)301 assert_suggested_for (const location &loc, const char *candidate,
302 		      const char *target)
303 {
304   auto_vec<const char *> candidates;
305   candidates.safe_push (candidate);
306   ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates));
307 }
308 
309 /* Assert that CANDIDATE is offered as a suggestion for TARGET.  */
310 
311 #define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET)			\
312   SELFTEST_BEGIN_STMT							\
313     assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET);	\
314   SELFTEST_END_STMT
315 
316 /* Assert that CANDIDATE is not offered as a suggestion for TARGET.  */
317 
318 static void
assert_not_suggested_for(const location & loc,const char * candidate,const char * target)319 assert_not_suggested_for (const location &loc, const char *candidate,
320 			  const char *target)
321 {
322   auto_vec<const char *> candidates;
323   candidates.safe_push (candidate);
324   ASSERT_EQ_AT (loc, NULL, find_closest_string (target, &candidates));
325 }
326 
327 /* Assert that CANDIDATE is not offered as a suggestion for TARGET.  */
328 
329 #define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET)			\
330   SELFTEST_BEGIN_STMT							\
331     assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET);	\
332   SELFTEST_END_STMT
333 
334 /* Verify that we offer varous suggestions that are meaningful,
335    and that we don't offer various other ones that aren't (PR c/82967).  */
336 
337 static void
test_suggestions()338 test_suggestions ()
339 {
340   /* Good suggestions.  */
341 
342   ASSERT_SUGGESTED_FOR ("m_bar", "bar");
343   // dist == 2, max_length == 5, min_length == 3
344 
345   ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
346   // dist == 3, max_length == 7, min_length == 5
347 
348   ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
349   // dist == 7, max_length == 16, min_length = 19
350 
351   ASSERT_SUGGESTED_FOR ("ab", "ac");
352   // dist == 1, max_length == min_length = 2
353 
354   ASSERT_SUGGESTED_FOR ("ab", "a");
355   // dist == 1, max_length == 2, min_length = 1
356 
357   /* Bad suggestions.  */
358 
359   ASSERT_NOT_SUGGESTED_FOR ("a", "b");
360   // dist == 1, max_length == min_length = 1
361 
362   ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
363   // dist == 3, max_length 6, min_length == 4
364 
365   ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
366   // dist == 3, max_length == min_length == 8
367 
368   ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
369   ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
370   // dist == 2, max_length == min_length == 4
371 
372   ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
373   ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
374   // dist == 2, max_length == 4, min_length == 3
375 
376   ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
377   // dist == 5, max_length == min_length == 9
378 
379   ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
380   // dist == 4, max_length == min_length == 8
381 
382   ASSERT_NOT_SUGGESTED_FOR ("start_input_device", "InputDevice");
383   // dist == 9, max_length == 18, min_length == 11
384 }
385 
386 /* Verify that find_closest_string is sane.  */
387 
388 static void
test_find_closest_string()389 test_find_closest_string ()
390 {
391   auto_vec<const char *> candidates;
392 
393   /* Verify that it can handle an empty vec.  */
394   ASSERT_EQ (NULL, find_closest_string ("", &candidates));
395 
396   /* Verify that it works sanely for non-empty vecs.  */
397   candidates.safe_push ("apple");
398   candidates.safe_push ("banana");
399   candidates.safe_push ("cherry");
400 
401   ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
402   ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
403   ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
404   ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
405 
406   /* The order of the vec can matter, but it should not matter for these
407      inputs.  */
408   candidates.truncate (0);
409   candidates.safe_push ("cherry");
410   candidates.safe_push ("banana");
411   candidates.safe_push ("apple");
412   ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
413   ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
414   ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
415   ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
416 
417   /* If the goal string somehow makes it into the candidate list, offering
418      it as a suggestion will be nonsensical.  Verify that we don't offer such
419      suggestions.  */
420   ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
421 
422   /* Example from PR 69968 where transposition helps.  */
423   candidates.truncate (0);
424   candidates.safe_push("coordx");
425   candidates.safe_push("coordy");
426   candidates.safe_push("coordz");
427   candidates.safe_push("coordx1");
428   candidates.safe_push("coordy1");
429   candidates.safe_push("coordz1");
430   ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
431 }
432 
433 /* Test data for test_metric_conditions.  */
434 
435 static const char * const test_data[] = {
436   "",
437   "foo",
438   "food",
439   "boo",
440   "1234567890123456789012345678901234567890123456789012345678901234567890"
441 };
442 
443 /* Verify that get_edit_distance appears to be a sane distance function,
444    i.e. the conditions for being a metric.  This is done directly for a
445    small set of examples, using test_data above.  This is O(N^3) in the size
446    of the array, due to the test for the triangle inequality, so we keep the
447    array small.  */
448 
449 static void
test_metric_conditions()450 test_metric_conditions ()
451 {
452   const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]);
453 
454   for (int i = 0; i < num_test_cases; i++)
455     {
456       for (int j = 0; j < num_test_cases; j++)
457 	{
458 	  edit_distance_t dist_ij
459 	    = get_edit_distance (test_data[i], test_data[j]);
460 
461 	  /* Identity of indiscernibles: d(i, j) > 0 iff i == j.  */
462 	  if (i == j)
463 	    ASSERT_EQ (dist_ij, 0);
464 	  else
465 	    ASSERT_TRUE (dist_ij > 0);
466 
467 	  /* Symmetry: d(i, j) == d(j, i).  */
468 	  edit_distance_t dist_ji
469 	    = get_edit_distance (test_data[j], test_data[i]);
470 	  ASSERT_EQ (dist_ij, dist_ji);
471 
472 	  /* Triangle inequality.  */
473 	  for (int k = 0; k < num_test_cases; k++)
474 	    {
475 	      edit_distance_t dist_ik
476 		= get_edit_distance (test_data[i], test_data[k]);
477 	      edit_distance_t dist_jk
478 		= get_edit_distance (test_data[j], test_data[k]);
479 	      ASSERT_TRUE (dist_ik <= dist_ij + dist_jk);
480 	    }
481 	}
482     }
483 }
484 
485 /* Run all of the selftests within this file.  */
486 
487 void
spellcheck_c_tests()488 spellcheck_c_tests ()
489 {
490   test_edit_distances ();
491   test_get_edit_distance_cutoff ();
492   test_suggestions ();
493   test_find_closest_string ();
494   test_metric_conditions ();
495 }
496 
497 } // namespace selftest
498 
499 #endif /* #if CHECKING_P */
500