1 /*------------------------------------------------------------------------- 2 * 3 * levenshtein.c 4 * Levenshtein distance implementation. 5 * 6 * Original author: Joe Conway <mail@joeconway.com> 7 * 8 * This file is included by varlena.c twice, to provide matching code for (1) 9 * Levenshtein distance with custom costings, and (2) Levenshtein distance with 10 * custom costings and a "max" value above which exact distances are not 11 * interesting. Before the inclusion, we rely on the presence of the inline 12 * function rest_of_char_same(). 13 * 14 * Written based on a description of the algorithm by Michael Gilleland found 15 * at http://www.merriampark.com/ld.htm. Also looked at levenshtein.c in the 16 * PHP 4.0.6 distribution for inspiration. Configurable penalty costs 17 * extension is introduced by Volkan YAZICI <volkan.yazici@gmail.com. 18 * 19 * Copyright (c) 2001-2018, PostgreSQL Global Development Group 20 * 21 * IDENTIFICATION 22 * src/backend/utils/adt/levenshtein.c 23 * 24 *------------------------------------------------------------------------- 25 */ 26 #define MAX_LEVENSHTEIN_STRLEN 255 27 28 /* 29 * Calculates Levenshtein distance metric between supplied strings, which are 30 * not necessarily null-terminated. 31 * 32 * source: source string, of length slen bytes. 33 * target: target string, of length tlen bytes. 34 * ins_c, del_c, sub_c: costs to charge for character insertion, deletion, 35 * and substitution respectively; (1, 1, 1) costs suffice for common 36 * cases, but your mileage may vary. 37 * max_d: if provided and >= 0, maximum distance we care about; see below. 38 * trusted: caller is trusted and need not obey MAX_LEVENSHTEIN_STRLEN. 39 * 40 * One way to compute Levenshtein distance is to incrementally construct 41 * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number 42 * of operations required to transform the first i characters of s into 43 * the first j characters of t. The last column of the final row is the 44 * answer. 45 * 46 * We use that algorithm here with some modification. In lieu of holding 47 * the entire array in memory at once, we'll just use two arrays of size 48 * m+1 for storing accumulated values. At each step one array represents 49 * the "previous" row and one is the "current" row of the notional large 50 * array. 51 * 52 * If max_d >= 0, we only need to provide an accurate answer when that answer data_source_editor_get_type(void)53 * is less than or equal to max_d. From any cell in the matrix, there is 54 * theoretical "minimum residual distance" from that cell to the last column 55 * of the final row. This minimum residual distance is zero when the 56 * untransformed portions of the strings are of equal length (because we might 57 * get lucky and find all the remaining characters matching) and is otherwise 58 * based on the minimum number of insertions or deletions needed to make them 59 * equal length. The residual distance grows as we move toward the upper 60 * right or lower left corners of the matrix. When the max_d bound is 61 * usefully tight, we can use this property to avoid computing the entirety 62 * of each row; instead, we maintain a start_column and stop_column that 63 * identify the portion of the matrix close to the diagonal which can still 64 * affect the final answer. 65 */ 66 int 67 #ifdef LEVENSHTEIN_LESS_EQUAL 68 varstr_levenshtein_less_equal(const char *source, int slen, 69 const char *target, int tlen, 70 int ins_c, int del_c, int sub_c, 71 int max_d, bool trusted) 72 #else 73 varstr_levenshtein(const char *source, int slen, 74 const char *target, int tlen, 75 int ins_c, int del_c, int sub_c, 76 bool trusted) 77 #endif 78 { 79 int m, 80 n; 81 int *prev; 82 int *curr; 83 int *s_char_len = NULL; 84 int i, 85 j; 86 const char *y; 87 88 /* 89 * For varstr_levenshtein_less_equal, we have real variables called 90 * start_column and stop_column; otherwise it's just short-hand for 0 and 91 * m. 92 */ 93 #ifdef LEVENSHTEIN_LESS_EQUAL 94 int start_column, 95 stop_column; 96 97 #undef START_COLUMN 98 #undef STOP_COLUMN 99 #define START_COLUMN start_column 100 #define STOP_COLUMN stop_column 101 #else 102 #undef START_COLUMN 103 #undef STOP_COLUMN 104 #define START_COLUMN 0 105 #define STOP_COLUMN m 106 #endif 107 108 /* Convert string lengths (in bytes) to lengths in characters */ 109 m = pg_mbstrlen_with_len(source, slen); 110 n = pg_mbstrlen_with_len(target, tlen); 111 112 /* 113 * We can transform an empty s into t with n insertions, or a non-empty t 114 * into an empty s with m deletions. 115 */ 116 if (!m) 117 return n * ins_c; 118 if (!n) 119 return m * del_c; 120 121 /* 122 * For security concerns, restrict excessive CPU+RAM usage. (This 123 * implementation uses O(m) memory and has O(mn) complexity.) If 124 * "trusted" is true, caller is responsible for not making excessive 125 * requests, typically by using a small max_d along with strings that are 126 * bounded, though not necessarily to MAX_LEVENSHTEIN_STRLEN exactly. 127 */ 128 if (!trusted && 129 (m > MAX_LEVENSHTEIN_STRLEN || 130 n > MAX_LEVENSHTEIN_STRLEN)) 131 ereport(ERROR, 132 (errcode(ERRCODE_INVALID_PARAMETER_VALUE), 133 errmsg("levenshtein argument exceeds maximum length of %d characters", 134 MAX_LEVENSHTEIN_STRLEN))); 135 136 #ifdef LEVENSHTEIN_LESS_EQUAL 137 /* Initialize start and stop columns. */ 138 start_column = 0; 139 stop_column = m + 1; 140 141 /* 142 * If max_d >= 0, determine whether the bound is impossibly tight. If so, 143 * return max_d + 1 immediately. Otherwise, determine whether it's tight 144 * enough to limit the computation we must perform. If so, figure out 145 * initial stop column. 146 */ 147 if (max_d >= 0) 148 { 149 int min_theo_d; /* Theoretical minimum distance. */ 150 int max_theo_d; /* Theoretical maximum distance. */ 151 int net_inserts = n - m; 152 153 min_theo_d = net_inserts < 0 ? 154 -net_inserts * del_c : net_inserts * ins_c; 155 if (min_theo_d > max_d) 156 return max_d + 1; 157 if (ins_c + del_c < sub_c) 158 sub_c = ins_c + del_c; 159 max_theo_d = min_theo_d + sub_c * Min(m, n); 160 if (max_d >= max_theo_d) 161 max_d = -1; 162 else if (ins_c + del_c > 0) 163 { 164 /* 165 * Figure out how much of the first row of the notional matrix we 166 * need to fill in. If the string is growing, the theoretical 167 * minimum distance already incorporates the cost of deleting the 168 * number of characters necessary to make the two strings equal in 169 * length. Each additional deletion forces another insertion, so 170 * the best-case total cost increases by ins_c + del_c. If the 171 * string is shrinking, the minimum theoretical cost assumes no 172 * excess deletions; that is, we're starting no further right than 173 * column n - m. If we do start further right, the best-case 174 * total cost increases by ins_c + del_c for each move right. 175 */ 176 int slack_d = max_d - min_theo_d; 177 int best_column = net_inserts < 0 ? -net_inserts : 0; 178 179 stop_column = best_column + (slack_d / (ins_c + del_c)) + 1; 180 if (stop_column > m) 181 stop_column = m + 1; 182 } 183 } 184 #endif 185 186 /* 187 * In order to avoid calling pg_mblen() repeatedly on each character in s, 188 * we cache all the lengths before starting the main loop -- but if all 189 * the characters in both strings are single byte, then we skip this and 190 * use a fast-path in the main loop. If only one string contains 191 * multi-byte characters, we still build the array, so that the fast-path 192 * needn't deal with the case where the array hasn't been initialized. 193 */ 194 if (m != slen || n != tlen) 195 { 196 int i; 197 const char *cp = source; 198 199 s_char_len = (int *) palloc((m + 1) * sizeof(int)); 200 for (i = 0; i < m; ++i) 201 { 202 s_char_len[i] = pg_mblen(cp); 203 cp += s_char_len[i]; 204 } 205 s_char_len[i] = 0; 206 } 207 208 /* One more cell for initialization column and row. */ 209 ++m; 210 ++n; 211 212 /* Previous and current rows of notional array. */ 213 prev = (int *) palloc(2 * m * sizeof(int)); 214 curr = prev + m; 215 216 /* 217 * To transform the first i characters of s into the first 0 characters of 218 * t, we must perform i deletions. 219 */ 220 for (i = START_COLUMN; i < STOP_COLUMN; i++) 221 prev[i] = i * del_c; 222 223 /* Loop through rows of the notional array */ 224 for (y = target, j = 1; j < n; j++) 225 { 226 int *temp; 227 const char *x = source; 228 int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1; 229 230 #ifdef LEVENSHTEIN_LESS_EQUAL 231 232 /* 233 * In the best case, values percolate down the diagonal unchanged, so 234 * we must increment stop_column unless it's already on the right end 235 * of the array. The inner loop will read prev[stop_column], so we 236 * have to initialize it even though it shouldn't affect the result. 237 */ 238 if (stop_column < m) 239 { 240 prev[stop_column] = max_d + 1; 241 ++stop_column; 242 } 243 244 /* 245 * The main loop fills in curr, but curr[0] needs a special case: to 246 * transform the first 0 characters of s into the first j characters 247 * of t, we must perform j insertions. However, if start_column > 0, 248 * this special case does not apply. 249 */ 250 if (start_column == 0) 251 { 252 curr[0] = j * ins_c; 253 i = 1; 254 } 255 else 256 i = start_column; 257 #else 258 curr[0] = j * ins_c; 259 i = 1; 260 #endif 261 262 /* 263 * This inner loop is critical to performance, so we include a 264 * fast-path to handle the (fairly common) case where no multibyte 265 * characters are in the mix. The fast-path is entitled to assume 266 * that if s_char_len is not initialized then BOTH strings contain 267 * only single-byte characters. 268 */ 269 if (s_char_len != NULL) 270 { 271 for (; i < STOP_COLUMN; i++) 272 { 273 int ins; 274 int del; 275 int sub; 276 int x_char_len = s_char_len[i - 1]; 277 278 /* 279 * Calculate costs for insertion, deletion, and substitution. 280 * 281 * When calculating cost for substitution, we compare the last 282 * character of each possibly-multibyte character first, 283 * because that's enough to rule out most mis-matches. If we 284 * get past that test, then we compare the lengths and the 285 * remaining bytes. 286 */ 287 ins = prev[i] + ins_c; 288 del = curr[i - 1] + del_c; 289 if (x[x_char_len - 1] == y[y_char_len - 1] 290 && x_char_len == y_char_len && 291 (x_char_len == 1 || rest_of_char_same(x, y, x_char_len))) 292 sub = prev[i - 1]; 293 else 294 sub = prev[i - 1] + sub_c; 295 296 /* Take the one with minimum cost. */ 297 curr[i] = Min(ins, del); 298 curr[i] = Min(curr[i], sub); 299 300 /* Point to next character. */ 301 x += x_char_len; 302 } 303 } 304 else 305 { 306 for (; i < STOP_COLUMN; i++) 307 { 308 int ins; 309 int del; 310 int sub; 311 312 /* Calculate costs for insertion, deletion, and substitution. */ 313 ins = prev[i] + ins_c; 314 del = curr[i - 1] + del_c; 315 sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c); 316 317 /* Take the one with minimum cost. */ 318 curr[i] = Min(ins, del); 319 curr[i] = Min(curr[i], sub); 320 321 /* Point to next character. */ 322 x++; 323 } 324 } 325 326 /* Swap current row with previous row. */ 327 temp = curr; 328 curr = prev; 329 prev = temp; 330 331 /* Point to next character. */ 332 y += y_char_len; 333 334 #ifdef LEVENSHTEIN_LESS_EQUAL 335 336 /* 337 * This chunk of code represents a significant performance hit if used 338 * in the case where there is no max_d bound. This is probably not 339 * because the max_d >= 0 test itself is expensive, but rather because 340 * the possibility of needing to execute this code prevents tight 341 * optimization of the loop as a whole. 342 */ 343 if (max_d >= 0) 344 { 345 /* 346 * The "zero point" is the column of the current row where the 347 * remaining portions of the strings are of equal length. There 348 * are (n - 1) characters in the target string, of which j have 349 * been transformed. There are (m - 1) characters in the source 350 * string, so we want to find the value for zp where (n - 1) - j = 351 * (m - 1) - zp. 352 */ 353 int zp = j - (n - m); 354 355 /* Check whether the stop column can slide left. */ 356 while (stop_column > 0) 357 { 358 int ii = stop_column - 1; 359 int net_inserts = ii - zp; 360 361 if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c : 362 -net_inserts * del_c) <= max_d) 363 break; 364 stop_column--; 365 } 366 367 /* Check whether the start column can slide right. */ 368 while (start_column < stop_column) 369 { 370 int net_inserts = start_column - zp; 371 372 if (prev[start_column] + 373 (net_inserts > 0 ? net_inserts * ins_c : 374 -net_inserts * del_c) <= max_d) 375 break; 376 377 /* 378 * We'll never again update these values, so we must make sure 379 * there's nothing here that could confuse any future 380 * iteration of the outer loop. 381 */ 382 prev[start_column] = max_d + 1; 383 curr[start_column] = max_d + 1; 384 if (start_column != 0) 385 source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1; 386 start_column++; 387 } 388 389 /* If they cross, we're going to exceed the bound. */ 390 if (start_column >= stop_column) 391 return max_d + 1; 392 } 393 #endif 394 } 395 396 /* 397 * Because the final value was swapped from the previous row to the 398 * current row, that's where we'll find it. 399 */ 400 return prev[m - 1]; 401 } 402