1
2 /* Copyright (C) 2010-2019 by The D Language Foundation, All Rights Reserved
3 * http://www.digitalmars.com
4 * Distributed under the Boost Software License, Version 1.0.
5 * (See accompanying file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
6 * https://github.com/D-Programming-Language/dmd/blob/master/src/root/speller.c
7 */
8
9 #include "dsystem.h"
10 #include "speller.h"
11
12 const char idchars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
13
14 /**************************************************
15 * combine a new result from the spell checker to
16 * find the one with the closest symbol with
17 * respect to the cost defined by the search function
18 * Input/Output:
19 * p best found spelling (NULL if none found yet)
20 * cost cost of p (INT_MAX if none found yet)
21 * Input:
22 * np new found spelling (NULL if none found)
23 * ncost cost of np if non-NULL
24 * Returns:
25 * true if the cost is less or equal 0
26 * false otherwise
27 */
combineSpellerResult(void * & p,int & cost,void * np,int ncost)28 bool combineSpellerResult(void*& p, int& cost, void* np, int ncost)
29 {
30 if (np && ncost < cost)
31 {
32 p = np;
33 cost = ncost;
34 if (cost <= 0)
35 return true;
36 }
37 return false;
38 }
39
spellerY(const char * seed,size_t seedlen,fp_speller_t fp,void * fparg,const char * charset,size_t index,int * cost)40 void *spellerY(const char *seed, size_t seedlen, fp_speller_t fp, void *fparg,
41 const char *charset, size_t index, int* cost)
42 {
43 if (!seedlen)
44 return NULL;
45 assert(seed[seedlen] == 0);
46
47 char tmp[30];
48 char *buf;
49 if (seedlen <= sizeof(tmp) - 2)
50 buf = tmp;
51 else
52 {
53 buf = (char *)alloca(seedlen + 2); // leave space for extra char
54 if (!buf)
55 return NULL; // no matches
56 }
57
58 memcpy(buf, seed, index);
59 *cost = INT_MAX;
60 void* p = NULL;
61 int ncost;
62
63 /* Delete at seed[index] */
64 if (index < seedlen)
65 {
66 memcpy(buf + index, seed + index + 1, seedlen - index);
67 assert(buf[seedlen - 1] == 0);
68 void* np = (*fp)(fparg, buf, &ncost);
69 if (combineSpellerResult(p, *cost, np, ncost))
70 return p;
71 }
72
73 if (charset && *charset)
74 {
75 /* Substitutions */
76 if (index < seedlen)
77 {
78 memcpy(buf, seed, seedlen + 1);
79 for (const char *s = charset; *s; s++)
80 {
81 buf[index] = *s;
82
83 //printf("sub buf = '%s'\n", buf);
84 void* np = (*fp)(fparg, buf, &ncost);
85 if (combineSpellerResult(p, *cost, np, ncost))
86 return p;
87 }
88 assert(buf[seedlen] == 0);
89 }
90
91 /* Insertions */
92 memcpy (buf + index + 1, seed + index, seedlen + 1 - index);
93
94 for (const char *s = charset; *s; s++)
95 {
96 buf[index] = *s;
97
98 //printf("ins buf = '%s'\n", buf);
99 void* np = (*fp)(fparg, buf, &ncost);
100 if (combineSpellerResult(p, *cost, np, ncost))
101 return p;
102 }
103 assert(buf[seedlen + 1] == 0);
104 }
105
106 return p; // return "best" result
107 }
108
spellerX(const char * seed,size_t seedlen,fp_speller_t fp,void * fparg,const char * charset,int flag)109 void *spellerX(const char *seed, size_t seedlen, fp_speller_t fp, void *fparg,
110 const char *charset, int flag)
111 {
112 if (!seedlen)
113 return NULL;
114
115 char tmp[30];
116 char *buf;
117 if (seedlen <= sizeof(tmp) - 2)
118 buf = tmp;
119 else
120 {
121 buf = (char *)alloca(seedlen + 2); // leave space for extra char
122 if (!buf)
123 return NULL; // no matches
124 }
125 int cost = INT_MAX, ncost;
126 void *p = NULL, *np;
127
128 /* Deletions */
129 memcpy(buf, seed + 1, seedlen);
130 for (size_t i = 0; i < seedlen; i++)
131 {
132 //printf("del buf = '%s'\n", buf);
133 if (flag)
134 np = spellerY(buf, seedlen - 1, fp, fparg, charset, i, &ncost);
135 else
136 np = (*fp)(fparg, buf, &ncost);
137 if (combineSpellerResult(p, cost, np, ncost))
138 return p;
139
140 buf[i] = seed[i];
141 }
142
143 /* Transpositions */
144 if (!flag)
145 {
146 memcpy(buf, seed, seedlen + 1);
147 for (size_t i = 0; i + 1 < seedlen; i++)
148 {
149 // swap [i] and [i + 1]
150 buf[i] = seed[i + 1];
151 buf[i + 1] = seed[i];
152
153 //printf("tra buf = '%s'\n", buf);
154 if (combineSpellerResult(p, cost, (*fp)(fparg, buf, &ncost), ncost))
155 return p;
156
157 buf[i] = seed[i];
158 }
159 }
160
161 if (charset && *charset)
162 {
163 /* Substitutions */
164 memcpy(buf, seed, seedlen + 1);
165 for (size_t i = 0; i < seedlen; i++)
166 {
167 for (const char *s = charset; *s; s++)
168 {
169 buf[i] = *s;
170
171 //printf("sub buf = '%s'\n", buf);
172 if (flag)
173 np = spellerY(buf, seedlen, fp, fparg, charset, i + 1, &ncost);
174 else
175 np = (*fp)(fparg, buf, &ncost);
176 if (combineSpellerResult(p, cost, np, ncost))
177 return p;
178 }
179 buf[i] = seed[i];
180 }
181
182 /* Insertions */
183 memcpy(buf + 1, seed, seedlen + 1);
184 for (size_t i = 0; i <= seedlen; i++) // yes, do seedlen+1 iterations
185 {
186 for (const char *s = charset; *s; s++)
187 {
188 buf[i] = *s;
189
190 //printf("ins buf = '%s'\n", buf);
191 if (flag)
192 np = spellerY(buf, seedlen + 1, fp, fparg, charset, i + 1, &ncost);
193 else
194 np = (*fp)(fparg, buf, &ncost);
195 if (combineSpellerResult(p, cost, np, ncost))
196 return p;
197 }
198 buf[i] = seed[i]; // going past end of seed[] is ok, as we hit the 0
199 }
200 }
201
202 return p; // return "best" result
203 }
204
205 /**************************************************
206 * Looks for correct spelling.
207 * Currently only looks a 'distance' of one from the seed[].
208 * This does an exhaustive search, so can potentially be very slow.
209 * Input:
210 * seed wrongly spelled word
211 * fp search function
212 * fparg argument to search function
213 * charset character set
214 * Returns:
215 * NULL no correct spellings found
216 * void* value returned by fp() for first possible correct spelling
217 */
218
speller(const char * seed,fp_speller_t fp,void * fparg,const char * charset)219 void *speller(const char *seed, fp_speller_t fp, void *fparg, const char *charset)
220 {
221 size_t seedlen = strlen(seed);
222 size_t maxdist = seedlen < 4 ? seedlen / 2 : 2;
223 for (size_t distance = 0; distance < maxdist; distance++)
224 { void *p = spellerX(seed, seedlen, fp, fparg, charset, distance);
225 if (p)
226 return p;
227 // if (seedlen > 10)
228 // break;
229 }
230 return NULL; // didn't find it
231 }
232