1 /*
2 ** 2007 May 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $
13 **
14 ** This file implements an integration between the ICU library
15 ** ("International Components for Unicode", an open-source library
16 ** for handling unicode data) and SQLite. The integration uses
17 ** ICU to provide the following to SQLite:
18 **
19 **   * An implementation of the SQL regexp() function (and hence REGEXP
20 **     operator) using the ICU uregex_XX() APIs.
21 **
22 **   * Implementations of the SQL scalar upper() and lower() functions
23 **     for case mapping.
24 **
25 **   * Integration of ICU and SQLite collation sequences.
26 **
27 **   * An implementation of the LIKE operator that uses ICU to
28 **     provide case-independent matching.
29 */
30 
31 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
32 
33 /* Include ICU headers */
34 #include <unicode/utypes.h>
35 #include <unicode/uregex.h>
36 #include <unicode/ustring.h>
37 #include <unicode/ucol.h>
38 
39 #include <assert.h>
40 
41 #ifndef SQLITE_CORE
42   #include "sqlite3ext.h"
43   SQLITE_EXTENSION_INIT1
44 #else
45   #include "sqlite3.h"
46 #endif
47 
48 /*
49 ** Maximum length (in bytes) of the pattern in a LIKE or GLOB
50 ** operator.
51 */
52 #ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
53 # define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
54 #endif
55 
56 /*
57 ** Version of sqlite3_free() that is always a function, never a macro.
58 */
xFree(void * p)59 static void xFree(void *p){
60   sqlite3_free(p);
61 }
62 
63 /*
64 ** This lookup table is used to help decode the first byte of
65 ** a multi-byte UTF8 character. It is copied here from SQLite source
66 ** code file utf8.c.
67 */
68 static const unsigned char icuUtf8Trans1[] = {
69   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
70   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
71   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
72   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
73   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
74   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
75   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
76   0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
77 };
78 
79 #define SQLITE_ICU_READ_UTF8(zIn, c)                       \
80   c = *(zIn++);                                            \
81   if( c>=0xc0 ){                                           \
82     c = icuUtf8Trans1[c-0xc0];                             \
83     while( (*zIn & 0xc0)==0x80 ){                          \
84       c = (c<<6) + (0x3f & *(zIn++));                      \
85     }                                                      \
86   }
87 
88 #define SQLITE_ICU_SKIP_UTF8(zIn)                          \
89   assert( *zIn );                                          \
90   if( *(zIn++)>=0xc0 ){                                    \
91     while( (*zIn & 0xc0)==0x80 ){zIn++;}                   \
92   }
93 
94 
95 /*
96 ** Compare two UTF-8 strings for equality where the first string is
97 ** a "LIKE" expression. Return true (1) if they are the same and
98 ** false (0) if they are different.
99 */
icuLikeCompare(const uint8_t * zPattern,const uint8_t * zString,const UChar32 uEsc)100 static int icuLikeCompare(
101   const uint8_t *zPattern,   /* LIKE pattern */
102   const uint8_t *zString,    /* The UTF-8 string to compare against */
103   const UChar32 uEsc         /* The escape character */
104 ){
105   static const int MATCH_ONE = (UChar32)'_';
106   static const int MATCH_ALL = (UChar32)'%';
107 
108   int prevEscape = 0;     /* True if the previous character was uEsc */
109 
110   while( 1 ){
111 
112     /* Read (and consume) the next character from the input pattern. */
113     UChar32 uPattern;
114     SQLITE_ICU_READ_UTF8(zPattern, uPattern);
115     if( uPattern==0 ) break;
116 
117     /* There are now 4 possibilities:
118     **
119     **     1. uPattern is an unescaped match-all character "%",
120     **     2. uPattern is an unescaped match-one character "_",
121     **     3. uPattern is an unescaped escape character, or
122     **     4. uPattern is to be handled as an ordinary character
123     */
124     if( !prevEscape && uPattern==MATCH_ALL ){
125       /* Case 1. */
126       uint8_t c;
127 
128       /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
129       ** MATCH_ALL. For each MATCH_ONE, skip one character in the
130       ** test string.
131       */
132       while( (c=*zPattern) == MATCH_ALL || c == MATCH_ONE ){
133         if( c==MATCH_ONE ){
134           if( *zString==0 ) return 0;
135           SQLITE_ICU_SKIP_UTF8(zString);
136         }
137         zPattern++;
138       }
139 
140       if( *zPattern==0 ) return 1;
141 
142       while( *zString ){
143         if( icuLikeCompare(zPattern, zString, uEsc) ){
144           return 1;
145         }
146         SQLITE_ICU_SKIP_UTF8(zString);
147       }
148       return 0;
149 
150     }else if( !prevEscape && uPattern==MATCH_ONE ){
151       /* Case 2. */
152       if( *zString==0 ) return 0;
153       SQLITE_ICU_SKIP_UTF8(zString);
154 
155     }else if( !prevEscape && uPattern==uEsc){
156       /* Case 3. */
157       prevEscape = 1;
158 
159     }else{
160       /* Case 4. */
161       UChar32 uString;
162       SQLITE_ICU_READ_UTF8(zString, uString);
163       uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT);
164       uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT);
165       if( uString!=uPattern ){
166         return 0;
167       }
168       prevEscape = 0;
169     }
170   }
171 
172   return *zString==0;
173 }
174 
175 /*
176 ** Implementation of the like() SQL function.  This function implements
177 ** the build-in LIKE operator.  The first argument to the function is the
178 ** pattern and the second argument is the string.  So, the SQL statements:
179 **
180 **       A LIKE B
181 **
182 ** is implemented as like(B, A). If there is an escape character E,
183 **
184 **       A LIKE B ESCAPE E
185 **
186 ** is mapped to like(B, A, E).
187 */
icuLikeFunc(sqlite3_context * context,int argc,sqlite3_value ** argv)188 static void icuLikeFunc(
189   sqlite3_context *context,
190   int argc,
191   sqlite3_value **argv
192 ){
193   const unsigned char *zA = sqlite3_value_text(argv[0]);
194   const unsigned char *zB = sqlite3_value_text(argv[1]);
195   UChar32 uEsc = 0;
196 
197   /* Limit the length of the LIKE or GLOB pattern to avoid problems
198   ** of deep recursion and N*N behavior in patternCompare().
199   */
200   if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){
201     sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
202     return;
203   }
204 
205 
206   if( argc==3 ){
207     /* The escape character string must consist of a single UTF-8 character.
208     ** Otherwise, return an error.
209     */
210     int nE= sqlite3_value_bytes(argv[2]);
211     const unsigned char *zE = sqlite3_value_text(argv[2]);
212     int i = 0;
213     if( zE==0 ) return;
214     U8_NEXT(zE, i, nE, uEsc);
215     if( i!=nE){
216       sqlite3_result_error(context,
217           "ESCAPE expression must be a single character", -1);
218       return;
219     }
220   }
221 
222   if( zA && zB ){
223     sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc));
224   }
225 }
226 
227 /*
228 ** This function is called when an ICU function called from within
229 ** the implementation of an SQL scalar function returns an error.
230 **
231 ** The scalar function context passed as the first argument is
232 ** loaded with an error message based on the following two args.
233 */
icuFunctionError(sqlite3_context * pCtx,const char * zName,UErrorCode e)234 static void icuFunctionError(
235   sqlite3_context *pCtx,       /* SQLite scalar function context */
236   const char *zName,           /* Name of ICU function that failed */
237   UErrorCode e                 /* Error code returned by ICU function */
238 ){
239   char zBuf[128];
240   sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
241   zBuf[127] = '\0';
242   sqlite3_result_error(pCtx, zBuf, -1);
243 }
244 
245 /*
246 ** Function to delete compiled regexp objects. Registered as
247 ** a destructor function with sqlite3_set_auxdata().
248 */
icuRegexpDelete(void * p)249 static void icuRegexpDelete(void *p){
250   URegularExpression *pExpr = (URegularExpression *)p;
251   uregex_close(pExpr);
252 }
253 
254 /*
255 ** Implementation of SQLite REGEXP operator. This scalar function takes
256 ** two arguments. The first is a regular expression pattern to compile
257 ** the second is a string to match against that pattern. If either
258 ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result
259 ** is 1 if the string matches the pattern, or 0 otherwise.
260 **
261 ** SQLite maps the regexp() function to the regexp() operator such
262 ** that the following two are equivalent:
263 **
264 **     zString REGEXP zPattern
265 **     regexp(zPattern, zString)
266 **
267 ** Uses the following ICU regexp APIs:
268 **
269 **     uregex_open()
270 **     uregex_matches()
271 **     uregex_close()
272 */
icuRegexpFunc(sqlite3_context * p,int nArg,sqlite3_value ** apArg)273 static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
274   UErrorCode status = U_ZERO_ERROR;
275   URegularExpression *pExpr;
276   UBool res;
277   const UChar *zString = sqlite3_value_text16(apArg[1]);
278 
279   (void)nArg;  /* Unused parameter */
280 
281   /* If the left hand side of the regexp operator is NULL,
282   ** then the result is also NULL.
283   */
284   if( !zString ){
285     return;
286   }
287 
288   pExpr = sqlite3_get_auxdata(p, 0);
289   if( !pExpr ){
290     const UChar *zPattern = sqlite3_value_text16(apArg[0]);
291     if( !zPattern ){
292       return;
293     }
294     pExpr = uregex_open(zPattern, -1, 0, 0, &status);
295 
296     if( U_SUCCESS(status) ){
297       sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
298     }else{
299       assert(!pExpr);
300       icuFunctionError(p, "uregex_open", status);
301       return;
302     }
303   }
304 
305   /* Configure the text that the regular expression operates on. */
306   uregex_setText(pExpr, zString, -1, &status);
307   if( !U_SUCCESS(status) ){
308     icuFunctionError(p, "uregex_setText", status);
309     return;
310   }
311 
312   /* Attempt the match */
313   res = uregex_matches(pExpr, 0, &status);
314   if( !U_SUCCESS(status) ){
315     icuFunctionError(p, "uregex_matches", status);
316     return;
317   }
318 
319   /* Set the text that the regular expression operates on to a NULL
320   ** pointer. This is not really necessary, but it is tidier than
321   ** leaving the regular expression object configured with an invalid
322   ** pointer after this function returns.
323   */
324   uregex_setText(pExpr, 0, 0, &status);
325 
326   /* Return 1 or 0. */
327   sqlite3_result_int(p, res ? 1 : 0);
328 }
329 
330 /*
331 ** Implementations of scalar functions for case mapping - upper() and
332 ** lower(). Function upper() converts its input to upper-case (ABC).
333 ** Function lower() converts to lower-case (abc).
334 **
335 ** ICU provides two types of case mapping, "general" case mapping and
336 ** "language specific". Refer to ICU documentation for the differences
337 ** between the two.
338 **
339 ** To utilise "general" case mapping, the upper() or lower() scalar
340 ** functions are invoked with one argument:
341 **
342 **     upper('ABC') -> 'abc'
343 **     lower('abc') -> 'ABC'
344 **
345 ** To access ICU "language specific" case mapping, upper() or lower()
346 ** should be invoked with two arguments. The second argument is the name
347 ** of the locale to use. Passing an empty string ("") or SQL NULL value
348 ** as the second argument is the same as invoking the 1 argument version
349 ** of upper() or lower().
350 **
351 **     lower('I', 'en_us') -> 'i'
352 **     lower('I', 'tr_tr') -> '\u131' (small dotless i)
353 **
354 ** http://www.icu-project.org/userguide/posix.html#case_mappings
355 */
icuCaseFunc16(sqlite3_context * p,int nArg,sqlite3_value ** apArg)356 static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){
357   const UChar *zInput;            /* Pointer to input string */
358   UChar *zOutput = 0;             /* Pointer to output buffer */
359   int nInput;                     /* Size of utf-16 input string in bytes */
360   int nOut;                       /* Size of output buffer in bytes */
361   int cnt;
362   int bToUpper;                   /* True for toupper(), false for tolower() */
363   UErrorCode status;
364   const char *zLocale = 0;
365 
366   assert(nArg==1 || nArg==2);
367   bToUpper = (sqlite3_user_data(p)!=0);
368   if( nArg==2 ){
369     zLocale = (const char *)sqlite3_value_text(apArg[1]);
370   }
371 
372   zInput = sqlite3_value_text16(apArg[0]);
373   if( !zInput ){
374     return;
375   }
376   nOut = nInput = sqlite3_value_bytes16(apArg[0]);
377   if( nOut==0 ){
378     sqlite3_result_text16(p, "", 0, SQLITE_STATIC);
379     return;
380   }
381 
382   for(cnt=0; cnt<2; cnt++){
383     UChar *zNew = sqlite3_realloc(zOutput, nOut);
384     if( zNew==0 ){
385       sqlite3_free(zOutput);
386       sqlite3_result_error_nomem(p);
387       return;
388     }
389     zOutput = zNew;
390     status = U_ZERO_ERROR;
391     if( bToUpper ){
392       nOut = 2*u_strToUpper(zOutput,nOut/2,zInput,nInput/2,zLocale,&status);
393     }else{
394       nOut = 2*u_strToLower(zOutput,nOut/2,zInput,nInput/2,zLocale,&status);
395     }
396 
397     if( U_SUCCESS(status) ){
398       sqlite3_result_text16(p, zOutput, nOut, xFree);
399     }else if( status==U_BUFFER_OVERFLOW_ERROR ){
400       assert( cnt==0 );
401       continue;
402     }else{
403       icuFunctionError(p, bToUpper ? "u_strToUpper" : "u_strToLower", status);
404     }
405     return;
406   }
407   assert( 0 );     /* Unreachable */
408 }
409 
410 /*
411 ** Collation sequence destructor function. The pCtx argument points to
412 ** a UCollator structure previously allocated using ucol_open().
413 */
icuCollationDel(void * pCtx)414 static void icuCollationDel(void *pCtx){
415   UCollator *p = (UCollator *)pCtx;
416   ucol_close(p);
417 }
418 
419 /*
420 ** Collation sequence comparison function. The pCtx argument points to
421 ** a UCollator structure previously allocated using ucol_open().
422 */
icuCollationColl(void * pCtx,int nLeft,const void * zLeft,int nRight,const void * zRight)423 static int icuCollationColl(
424   void *pCtx,
425   int nLeft,
426   const void *zLeft,
427   int nRight,
428   const void *zRight
429 ){
430   UCollationResult res;
431   UCollator *p = (UCollator *)pCtx;
432   res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
433   switch( res ){
434     case UCOL_LESS:    return -1;
435     case UCOL_GREATER: return +1;
436     case UCOL_EQUAL:   return 0;
437   }
438   assert(!"Unexpected return value from ucol_strcoll()");
439   return 0;
440 }
441 
442 /*
443 ** Implementation of the scalar function icu_load_collation().
444 **
445 ** This scalar function is used to add ICU collation based collation
446 ** types to an SQLite database connection. It is intended to be called
447 ** as follows:
448 **
449 **     SELECT icu_load_collation(<locale>, <collation-name>);
450 **
451 ** Where <locale> is a string containing an ICU locale identifier (i.e.
452 ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
453 ** collation sequence to create.
454 */
icuLoadCollation(sqlite3_context * p,int nArg,sqlite3_value ** apArg)455 static void icuLoadCollation(
456   sqlite3_context *p,
457   int nArg,
458   sqlite3_value **apArg
459 ){
460   sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
461   UErrorCode status = U_ZERO_ERROR;
462   const char *zLocale;      /* Locale identifier - (eg. "jp_JP") */
463   const char *zName;        /* SQL Collation sequence name (eg. "japanese") */
464   UCollator *pUCollator;    /* ICU library collation object */
465   int rc;                   /* Return code from sqlite3_create_collation_x() */
466 
467   assert(nArg==2);
468   (void)nArg; /* Unused parameter */
469   zLocale = (const char *)sqlite3_value_text(apArg[0]);
470   zName = (const char *)sqlite3_value_text(apArg[1]);
471 
472   if( !zLocale || !zName ){
473     return;
474   }
475 
476   pUCollator = ucol_open(zLocale, &status);
477   if( !U_SUCCESS(status) ){
478     icuFunctionError(p, "ucol_open", status);
479     return;
480   }
481   assert(p);
482 
483   rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator,
484       icuCollationColl, icuCollationDel
485   );
486   if( rc!=SQLITE_OK ){
487     ucol_close(pUCollator);
488     sqlite3_result_error(p, "Error registering collation function", -1);
489   }
490 }
491 
492 /*
493 ** Register the ICU extension functions with database db.
494 */
sqlite3IcuInit(sqlite3 * db)495 int sqlite3IcuInit(sqlite3 *db){
496   static const struct IcuScalar {
497     const char *zName;                        /* Function name */
498     unsigned char nArg;                       /* Number of arguments */
499     unsigned short enc;                       /* Optimal text encoding */
500     unsigned char iContext;                   /* sqlite3_user_data() context */
501     void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
502   } scalars[] = {
503     {"icu_load_collation",  2, SQLITE_UTF8,                1, icuLoadCollation},
504     {"regexp", 2, SQLITE_ANY|SQLITE_DETERMINISTIC,         0, icuRegexpFunc},
505     {"lower",  1, SQLITE_UTF16|SQLITE_DETERMINISTIC,       0, icuCaseFunc16},
506     {"lower",  2, SQLITE_UTF16|SQLITE_DETERMINISTIC,       0, icuCaseFunc16},
507     {"upper",  1, SQLITE_UTF16|SQLITE_DETERMINISTIC,       1, icuCaseFunc16},
508     {"upper",  2, SQLITE_UTF16|SQLITE_DETERMINISTIC,       1, icuCaseFunc16},
509     {"lower",  1, SQLITE_UTF8|SQLITE_DETERMINISTIC,        0, icuCaseFunc16},
510     {"lower",  2, SQLITE_UTF8|SQLITE_DETERMINISTIC,        0, icuCaseFunc16},
511     {"upper",  1, SQLITE_UTF8|SQLITE_DETERMINISTIC,        1, icuCaseFunc16},
512     {"upper",  2, SQLITE_UTF8|SQLITE_DETERMINISTIC,        1, icuCaseFunc16},
513     {"like",   2, SQLITE_UTF8|SQLITE_DETERMINISTIC,        0, icuLikeFunc},
514     {"like",   3, SQLITE_UTF8|SQLITE_DETERMINISTIC,        0, icuLikeFunc},
515   };
516   int rc = SQLITE_OK;
517   int i;
518 
519 
520   for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){
521     const struct IcuScalar *p = &scalars[i];
522     rc = sqlite3_create_function(
523         db, p->zName, p->nArg, p->enc,
524         p->iContext ? (void*)db : (void*)0,
525         p->xFunc, 0, 0
526     );
527   }
528 
529   return rc;
530 }
531 
532 #if !SQLITE_CORE
533 #ifdef _WIN32
534 __declspec(dllexport)
535 #endif
sqlite3_icu_init(sqlite3 * db,char ** pzErrMsg,const sqlite3_api_routines * pApi)536 int sqlite3_icu_init(
537   sqlite3 *db,
538   char **pzErrMsg,
539   const sqlite3_api_routines *pApi
540 ){
541   SQLITE_EXTENSION_INIT2(pApi)
542   return sqlite3IcuInit(db);
543 }
544 #endif
545 
546 #endif
547