1 /*
2 ** 2007 May 6
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $
13 **
14 ** This file implements an integration between the ICU library
15 ** ("International Components for Unicode", an open-source library
16 ** for handling unicode data) and SQLite. The integration uses
17 ** ICU to provide the following to SQLite:
18 **
19 ** * An implementation of the SQL regexp() function (and hence REGEXP
20 ** operator) using the ICU uregex_XX() APIs.
21 **
22 ** * Implementations of the SQL scalar upper() and lower() functions
23 ** for case mapping.
24 **
25 ** * Integration of ICU and SQLite collation sequences.
26 **
27 ** * An implementation of the LIKE operator that uses ICU to
28 ** provide case-independent matching.
29 */
30
31 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
32
33 /* Include ICU headers */
34 #include <unicode/utypes.h>
35 #include <unicode/uregex.h>
36 #include <unicode/ustring.h>
37 #include <unicode/ucol.h>
38
39 #include <assert.h>
40
41 #ifndef SQLITE_CORE
42 #include "sqlite3ext.h"
43 SQLITE_EXTENSION_INIT1
44 #else
45 #include "sqlite3.h"
46 #endif
47
48 /*
49 ** Maximum length (in bytes) of the pattern in a LIKE or GLOB
50 ** operator.
51 */
52 #ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
53 # define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
54 #endif
55
56 /*
57 ** Version of sqlite3_free() that is always a function, never a macro.
58 */
xFree(void * p)59 static void xFree(void *p){
60 sqlite3_free(p);
61 }
62
63 /*
64 ** This lookup table is used to help decode the first byte of
65 ** a multi-byte UTF8 character. It is copied here from SQLite source
66 ** code file utf8.c.
67 */
68 static const unsigned char icuUtf8Trans1[] = {
69 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
70 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
71 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
72 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
73 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
74 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
75 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
76 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
77 };
78
79 #define SQLITE_ICU_READ_UTF8(zIn, c) \
80 c = *(zIn++); \
81 if( c>=0xc0 ){ \
82 c = icuUtf8Trans1[c-0xc0]; \
83 while( (*zIn & 0xc0)==0x80 ){ \
84 c = (c<<6) + (0x3f & *(zIn++)); \
85 } \
86 }
87
88 #define SQLITE_ICU_SKIP_UTF8(zIn) \
89 assert( *zIn ); \
90 if( *(zIn++)>=0xc0 ){ \
91 while( (*zIn & 0xc0)==0x80 ){zIn++;} \
92 }
93
94
95 /*
96 ** Compare two UTF-8 strings for equality where the first string is
97 ** a "LIKE" expression. Return true (1) if they are the same and
98 ** false (0) if they are different.
99 */
icuLikeCompare(const uint8_t * zPattern,const uint8_t * zString,const UChar32 uEsc)100 static int icuLikeCompare(
101 const uint8_t *zPattern, /* LIKE pattern */
102 const uint8_t *zString, /* The UTF-8 string to compare against */
103 const UChar32 uEsc /* The escape character */
104 ){
105 static const int MATCH_ONE = (UChar32)'_';
106 static const int MATCH_ALL = (UChar32)'%';
107
108 int prevEscape = 0; /* True if the previous character was uEsc */
109
110 while( 1 ){
111
112 /* Read (and consume) the next character from the input pattern. */
113 UChar32 uPattern;
114 SQLITE_ICU_READ_UTF8(zPattern, uPattern);
115 if( uPattern==0 ) break;
116
117 /* There are now 4 possibilities:
118 **
119 ** 1. uPattern is an unescaped match-all character "%",
120 ** 2. uPattern is an unescaped match-one character "_",
121 ** 3. uPattern is an unescaped escape character, or
122 ** 4. uPattern is to be handled as an ordinary character
123 */
124 if( !prevEscape && uPattern==MATCH_ALL ){
125 /* Case 1. */
126 uint8_t c;
127
128 /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
129 ** MATCH_ALL. For each MATCH_ONE, skip one character in the
130 ** test string.
131 */
132 while( (c=*zPattern) == MATCH_ALL || c == MATCH_ONE ){
133 if( c==MATCH_ONE ){
134 if( *zString==0 ) return 0;
135 SQLITE_ICU_SKIP_UTF8(zString);
136 }
137 zPattern++;
138 }
139
140 if( *zPattern==0 ) return 1;
141
142 while( *zString ){
143 if( icuLikeCompare(zPattern, zString, uEsc) ){
144 return 1;
145 }
146 SQLITE_ICU_SKIP_UTF8(zString);
147 }
148 return 0;
149
150 }else if( !prevEscape && uPattern==MATCH_ONE ){
151 /* Case 2. */
152 if( *zString==0 ) return 0;
153 SQLITE_ICU_SKIP_UTF8(zString);
154
155 }else if( !prevEscape && uPattern==uEsc){
156 /* Case 3. */
157 prevEscape = 1;
158
159 }else{
160 /* Case 4. */
161 UChar32 uString;
162 SQLITE_ICU_READ_UTF8(zString, uString);
163 uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT);
164 uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT);
165 if( uString!=uPattern ){
166 return 0;
167 }
168 prevEscape = 0;
169 }
170 }
171
172 return *zString==0;
173 }
174
175 /*
176 ** Implementation of the like() SQL function. This function implements
177 ** the build-in LIKE operator. The first argument to the function is the
178 ** pattern and the second argument is the string. So, the SQL statements:
179 **
180 ** A LIKE B
181 **
182 ** is implemented as like(B, A). If there is an escape character E,
183 **
184 ** A LIKE B ESCAPE E
185 **
186 ** is mapped to like(B, A, E).
187 */
icuLikeFunc(sqlite3_context * context,int argc,sqlite3_value ** argv)188 static void icuLikeFunc(
189 sqlite3_context *context,
190 int argc,
191 sqlite3_value **argv
192 ){
193 const unsigned char *zA = sqlite3_value_text(argv[0]);
194 const unsigned char *zB = sqlite3_value_text(argv[1]);
195 UChar32 uEsc = 0;
196
197 /* Limit the length of the LIKE or GLOB pattern to avoid problems
198 ** of deep recursion and N*N behavior in patternCompare().
199 */
200 if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){
201 sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
202 return;
203 }
204
205
206 if( argc==3 ){
207 /* The escape character string must consist of a single UTF-8 character.
208 ** Otherwise, return an error.
209 */
210 int nE= sqlite3_value_bytes(argv[2]);
211 const unsigned char *zE = sqlite3_value_text(argv[2]);
212 int i = 0;
213 if( zE==0 ) return;
214 U8_NEXT(zE, i, nE, uEsc);
215 if( i!=nE){
216 sqlite3_result_error(context,
217 "ESCAPE expression must be a single character", -1);
218 return;
219 }
220 }
221
222 if( zA && zB ){
223 sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc));
224 }
225 }
226
227 /*
228 ** This function is called when an ICU function called from within
229 ** the implementation of an SQL scalar function returns an error.
230 **
231 ** The scalar function context passed as the first argument is
232 ** loaded with an error message based on the following two args.
233 */
icuFunctionError(sqlite3_context * pCtx,const char * zName,UErrorCode e)234 static void icuFunctionError(
235 sqlite3_context *pCtx, /* SQLite scalar function context */
236 const char *zName, /* Name of ICU function that failed */
237 UErrorCode e /* Error code returned by ICU function */
238 ){
239 char zBuf[128];
240 sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
241 zBuf[127] = '\0';
242 sqlite3_result_error(pCtx, zBuf, -1);
243 }
244
245 /*
246 ** Function to delete compiled regexp objects. Registered as
247 ** a destructor function with sqlite3_set_auxdata().
248 */
icuRegexpDelete(void * p)249 static void icuRegexpDelete(void *p){
250 URegularExpression *pExpr = (URegularExpression *)p;
251 uregex_close(pExpr);
252 }
253
254 /*
255 ** Implementation of SQLite REGEXP operator. This scalar function takes
256 ** two arguments. The first is a regular expression pattern to compile
257 ** the second is a string to match against that pattern. If either
258 ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result
259 ** is 1 if the string matches the pattern, or 0 otherwise.
260 **
261 ** SQLite maps the regexp() function to the regexp() operator such
262 ** that the following two are equivalent:
263 **
264 ** zString REGEXP zPattern
265 ** regexp(zPattern, zString)
266 **
267 ** Uses the following ICU regexp APIs:
268 **
269 ** uregex_open()
270 ** uregex_matches()
271 ** uregex_close()
272 */
icuRegexpFunc(sqlite3_context * p,int nArg,sqlite3_value ** apArg)273 static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
274 UErrorCode status = U_ZERO_ERROR;
275 URegularExpression *pExpr;
276 UBool res;
277 const UChar *zString = sqlite3_value_text16(apArg[1]);
278
279 (void)nArg; /* Unused parameter */
280
281 /* If the left hand side of the regexp operator is NULL,
282 ** then the result is also NULL.
283 */
284 if( !zString ){
285 return;
286 }
287
288 pExpr = sqlite3_get_auxdata(p, 0);
289 if( !pExpr ){
290 const UChar *zPattern = sqlite3_value_text16(apArg[0]);
291 if( !zPattern ){
292 return;
293 }
294 pExpr = uregex_open(zPattern, -1, 0, 0, &status);
295
296 if( U_SUCCESS(status) ){
297 sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
298 }else{
299 assert(!pExpr);
300 icuFunctionError(p, "uregex_open", status);
301 return;
302 }
303 }
304
305 /* Configure the text that the regular expression operates on. */
306 uregex_setText(pExpr, zString, -1, &status);
307 if( !U_SUCCESS(status) ){
308 icuFunctionError(p, "uregex_setText", status);
309 return;
310 }
311
312 /* Attempt the match */
313 res = uregex_matches(pExpr, 0, &status);
314 if( !U_SUCCESS(status) ){
315 icuFunctionError(p, "uregex_matches", status);
316 return;
317 }
318
319 /* Set the text that the regular expression operates on to a NULL
320 ** pointer. This is not really necessary, but it is tidier than
321 ** leaving the regular expression object configured with an invalid
322 ** pointer after this function returns.
323 */
324 uregex_setText(pExpr, 0, 0, &status);
325
326 /* Return 1 or 0. */
327 sqlite3_result_int(p, res ? 1 : 0);
328 }
329
330 /*
331 ** Implementations of scalar functions for case mapping - upper() and
332 ** lower(). Function upper() converts its input to upper-case (ABC).
333 ** Function lower() converts to lower-case (abc).
334 **
335 ** ICU provides two types of case mapping, "general" case mapping and
336 ** "language specific". Refer to ICU documentation for the differences
337 ** between the two.
338 **
339 ** To utilise "general" case mapping, the upper() or lower() scalar
340 ** functions are invoked with one argument:
341 **
342 ** upper('ABC') -> 'abc'
343 ** lower('abc') -> 'ABC'
344 **
345 ** To access ICU "language specific" case mapping, upper() or lower()
346 ** should be invoked with two arguments. The second argument is the name
347 ** of the locale to use. Passing an empty string ("") or SQL NULL value
348 ** as the second argument is the same as invoking the 1 argument version
349 ** of upper() or lower().
350 **
351 ** lower('I', 'en_us') -> 'i'
352 ** lower('I', 'tr_tr') -> '\u131' (small dotless i)
353 **
354 ** http://www.icu-project.org/userguide/posix.html#case_mappings
355 */
icuCaseFunc16(sqlite3_context * p,int nArg,sqlite3_value ** apArg)356 static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){
357 const UChar *zInput; /* Pointer to input string */
358 UChar *zOutput = 0; /* Pointer to output buffer */
359 int nInput; /* Size of utf-16 input string in bytes */
360 int nOut; /* Size of output buffer in bytes */
361 int cnt;
362 int bToUpper; /* True for toupper(), false for tolower() */
363 UErrorCode status;
364 const char *zLocale = 0;
365
366 assert(nArg==1 || nArg==2);
367 bToUpper = (sqlite3_user_data(p)!=0);
368 if( nArg==2 ){
369 zLocale = (const char *)sqlite3_value_text(apArg[1]);
370 }
371
372 zInput = sqlite3_value_text16(apArg[0]);
373 if( !zInput ){
374 return;
375 }
376 nOut = nInput = sqlite3_value_bytes16(apArg[0]);
377 if( nOut==0 ){
378 sqlite3_result_text16(p, "", 0, SQLITE_STATIC);
379 return;
380 }
381
382 for(cnt=0; cnt<2; cnt++){
383 UChar *zNew = sqlite3_realloc(zOutput, nOut);
384 if( zNew==0 ){
385 sqlite3_free(zOutput);
386 sqlite3_result_error_nomem(p);
387 return;
388 }
389 zOutput = zNew;
390 status = U_ZERO_ERROR;
391 if( bToUpper ){
392 nOut = 2*u_strToUpper(zOutput,nOut/2,zInput,nInput/2,zLocale,&status);
393 }else{
394 nOut = 2*u_strToLower(zOutput,nOut/2,zInput,nInput/2,zLocale,&status);
395 }
396
397 if( U_SUCCESS(status) ){
398 sqlite3_result_text16(p, zOutput, nOut, xFree);
399 }else if( status==U_BUFFER_OVERFLOW_ERROR ){
400 assert( cnt==0 );
401 continue;
402 }else{
403 icuFunctionError(p, bToUpper ? "u_strToUpper" : "u_strToLower", status);
404 }
405 return;
406 }
407 assert( 0 ); /* Unreachable */
408 }
409
410 /*
411 ** Collation sequence destructor function. The pCtx argument points to
412 ** a UCollator structure previously allocated using ucol_open().
413 */
icuCollationDel(void * pCtx)414 static void icuCollationDel(void *pCtx){
415 UCollator *p = (UCollator *)pCtx;
416 ucol_close(p);
417 }
418
419 /*
420 ** Collation sequence comparison function. The pCtx argument points to
421 ** a UCollator structure previously allocated using ucol_open().
422 */
icuCollationColl(void * pCtx,int nLeft,const void * zLeft,int nRight,const void * zRight)423 static int icuCollationColl(
424 void *pCtx,
425 int nLeft,
426 const void *zLeft,
427 int nRight,
428 const void *zRight
429 ){
430 UCollationResult res;
431 UCollator *p = (UCollator *)pCtx;
432 res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
433 switch( res ){
434 case UCOL_LESS: return -1;
435 case UCOL_GREATER: return +1;
436 case UCOL_EQUAL: return 0;
437 }
438 assert(!"Unexpected return value from ucol_strcoll()");
439 return 0;
440 }
441
442 /*
443 ** Implementation of the scalar function icu_load_collation().
444 **
445 ** This scalar function is used to add ICU collation based collation
446 ** types to an SQLite database connection. It is intended to be called
447 ** as follows:
448 **
449 ** SELECT icu_load_collation(<locale>, <collation-name>);
450 **
451 ** Where <locale> is a string containing an ICU locale identifier (i.e.
452 ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
453 ** collation sequence to create.
454 */
icuLoadCollation(sqlite3_context * p,int nArg,sqlite3_value ** apArg)455 static void icuLoadCollation(
456 sqlite3_context *p,
457 int nArg,
458 sqlite3_value **apArg
459 ){
460 sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
461 UErrorCode status = U_ZERO_ERROR;
462 const char *zLocale; /* Locale identifier - (eg. "jp_JP") */
463 const char *zName; /* SQL Collation sequence name (eg. "japanese") */
464 UCollator *pUCollator; /* ICU library collation object */
465 int rc; /* Return code from sqlite3_create_collation_x() */
466
467 assert(nArg==2);
468 (void)nArg; /* Unused parameter */
469 zLocale = (const char *)sqlite3_value_text(apArg[0]);
470 zName = (const char *)sqlite3_value_text(apArg[1]);
471
472 if( !zLocale || !zName ){
473 return;
474 }
475
476 pUCollator = ucol_open(zLocale, &status);
477 if( !U_SUCCESS(status) ){
478 icuFunctionError(p, "ucol_open", status);
479 return;
480 }
481 assert(p);
482
483 rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator,
484 icuCollationColl, icuCollationDel
485 );
486 if( rc!=SQLITE_OK ){
487 ucol_close(pUCollator);
488 sqlite3_result_error(p, "Error registering collation function", -1);
489 }
490 }
491
492 /*
493 ** Register the ICU extension functions with database db.
494 */
sqlite3IcuInit(sqlite3 * db)495 int sqlite3IcuInit(sqlite3 *db){
496 static const struct IcuScalar {
497 const char *zName; /* Function name */
498 unsigned char nArg; /* Number of arguments */
499 unsigned short enc; /* Optimal text encoding */
500 unsigned char iContext; /* sqlite3_user_data() context */
501 void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
502 } scalars[] = {
503 {"icu_load_collation", 2, SQLITE_UTF8, 1, icuLoadCollation},
504 {"regexp", 2, SQLITE_ANY|SQLITE_DETERMINISTIC, 0, icuRegexpFunc},
505 {"lower", 1, SQLITE_UTF16|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
506 {"lower", 2, SQLITE_UTF16|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
507 {"upper", 1, SQLITE_UTF16|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
508 {"upper", 2, SQLITE_UTF16|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
509 {"lower", 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
510 {"lower", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
511 {"upper", 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
512 {"upper", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
513 {"like", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuLikeFunc},
514 {"like", 3, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuLikeFunc},
515 };
516 int rc = SQLITE_OK;
517 int i;
518
519
520 for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){
521 const struct IcuScalar *p = &scalars[i];
522 rc = sqlite3_create_function(
523 db, p->zName, p->nArg, p->enc,
524 p->iContext ? (void*)db : (void*)0,
525 p->xFunc, 0, 0
526 );
527 }
528
529 return rc;
530 }
531
532 #if !SQLITE_CORE
533 #ifdef _WIN32
534 __declspec(dllexport)
535 #endif
sqlite3_icu_init(sqlite3 * db,char ** pzErrMsg,const sqlite3_api_routines * pApi)536 int sqlite3_icu_init(
537 sqlite3 *db,
538 char **pzErrMsg,
539 const sqlite3_api_routines *pApi
540 ){
541 SQLITE_EXTENSION_INIT2(pApi)
542 return sqlite3IcuInit(db);
543 }
544 #endif
545
546 #endif
547