1 /*
2 ** 2018-01-08
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 **
13 ** This file contains code to implement the sqlite3_normalize() function.
14 **
15 **    char *sqlite3_normalize(const char *zSql);
16 **
17 ** This function takes an SQL string as input and returns a "normalized"
18 ** version of that string in memory obtained from sqlite3_malloc64().  The
19 ** caller is responsible for ensuring that the returned memory is freed.
20 **
21 ** If a memory allocation error occurs, this routine returns NULL.
22 **
23 ** The normalization consists of the following transformations:
24 **
25 **   (1)  Convert every literal (string, blob literal, numeric constant,
26 **        or "NULL" constant) into a ?
27 **
28 **   (2)  Remove all superfluous whitespace, including comments.  Change
29 **        all required whitespace to a single space character.
30 **
31 **   (3)  Lowercase all ASCII characters.
32 **
33 **   (4)  If an IN or NOT IN operator is followed by a list of 1 or more
34 **        values, convert that list into "(?,?,?)".
35 **
36 ** The purpose of normalization is two-fold:
37 **
38 **   (1)  Sanitize queries by removing potentially private or sensitive
39 **        information contained in literals.
40 **
41 **   (2)  Identify structurally identical queries by comparing their
42 **        normalized forms.
43 **
44 ** Command-Line Utility
45 ** --------------------
46 **
47 ** This file also contains code for a command-line utility that converts
48 ** SQL queries in text files into their normalized forms.  To build the
49 ** command-line program, compile this file with -DSQLITE_NORMALIZE_CLI
50 ** and link it against the SQLite library.
51 */
52 #include <sqlite3.h>
53 #include <string.h>
54 
55 /*
56 ** Implementation note:
57 **
58 ** Much of the tokenizer logic is copied out of the tokenize.c source file
59 ** of SQLite.  That logic could be simplified for this particular application,
60 ** but that would impose a risk of introducing subtle errors.  It is best to
61 ** keep the code as close to the original as possible.
62 **
63 ** The tokenize code is in sync with the SQLite core as of 2018-01-08.
64 ** Any future changes to the core tokenizer might require corresponding
65 ** adjustments to the tokenizer logic in this module.
66 */
67 
68 
69 /* Character classes for tokenizing
70 **
71 ** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
72 ** using a lookup table, whereas a switch() directly on c uses a binary search.
73 ** The lookup table is much faster.  To maximize speed, and to ensure that
74 ** a lookup table is used, all of the classes need to be small integers and
75 ** all of them need to be used within the switch.
76 */
77 #define CC_X          0    /* The letter 'x', or start of BLOB literal */
78 #define CC_KYWD       1    /* Alphabetics or '_'.  Usable in a keyword */
79 #define CC_ID         2    /* unicode characters usable in IDs */
80 #define CC_DIGIT      3    /* Digits */
81 #define CC_DOLLAR     4    /* '$' */
82 #define CC_VARALPHA   5    /* '@', '#', ':'.  Alphabetic SQL variables */
83 #define CC_VARNUM     6    /* '?'.  Numeric SQL variables */
84 #define CC_SPACE      7    /* Space characters */
85 #define CC_QUOTE      8    /* '"', '\'', or '`'.  String literals, quoted ids */
86 #define CC_QUOTE2     9    /* '['.   [...] style quoted ids */
87 #define CC_PIPE      10    /* '|'.   Bitwise OR or concatenate */
88 #define CC_MINUS     11    /* '-'.  Minus or SQL-style comment */
89 #define CC_LT        12    /* '<'.  Part of < or <= or <> */
90 #define CC_GT        13    /* '>'.  Part of > or >= */
91 #define CC_EQ        14    /* '='.  Part of = or == */
92 #define CC_BANG      15    /* '!'.  Part of != */
93 #define CC_SLASH     16    /* '/'.  / or c-style comment */
94 #define CC_LP        17    /* '(' */
95 #define CC_RP        18    /* ')' */
96 #define CC_SEMI      19    /* ';' */
97 #define CC_PLUS      20    /* '+' */
98 #define CC_STAR      21    /* '*' */
99 #define CC_PERCENT   22    /* '%' */
100 #define CC_COMMA     23    /* ',' */
101 #define CC_AND       24    /* '&' */
102 #define CC_TILDA     25    /* '~' */
103 #define CC_DOT       26    /* '.' */
104 #define CC_ILLEGAL   27    /* Illegal character */
105 
106 static const unsigned char aiClass[] = {
107 /*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
108 /* 0x */   27, 27, 27, 27, 27, 27, 27, 27, 27,  7,  7, 27,  7,  7, 27, 27,
109 /* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
110 /* 2x */    7, 15,  8,  5,  4, 22, 24,  8, 17, 18, 21, 20, 23, 11, 26, 16,
111 /* 3x */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5, 19, 12, 14, 13,  6,
112 /* 4x */    5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
113 /* 5x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  9, 27, 27, 27,  1,
114 /* 6x */    8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
115 /* 7x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1, 27, 10, 27, 25, 27,
116 /* 8x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
117 /* 9x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
118 /* Ax */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
119 /* Bx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
120 /* Cx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
121 /* Dx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
122 /* Ex */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
123 /* Fx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
124 };
125 
126 /* An array to map all upper-case characters into their corresponding
127 ** lower-case character.
128 **
129 ** SQLite only considers US-ASCII (or EBCDIC) characters.  We do not
130 ** handle case conversions for the UTF character set since the tables
131 ** involved are nearly as big or bigger than SQLite itself.
132 */
133 static const unsigned char sqlite3UpperToLower[] = {
134       0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
135      18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
136      36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
137      54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
138     104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
139     122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,
140     108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
141     126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
142     144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
143     162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
144     180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
145     198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
146     216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
147     234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
148     252,253,254,255
149 };
150 
151 /*
152 ** The following 256 byte lookup table is used to support SQLites built-in
153 ** equivalents to the following standard library functions:
154 **
155 **   isspace()                        0x01
156 **   isalpha()                        0x02
157 **   isdigit()                        0x04
158 **   isalnum()                        0x06
159 **   isxdigit()                       0x08
160 **   toupper()                        0x20
161 **   SQLite identifier character      0x40
162 **   Quote character                  0x80
163 **
164 ** Bit 0x20 is set if the mapped character requires translation to upper
165 ** case. i.e. if the character is a lower-case ASCII character.
166 ** If x is a lower-case ASCII character, then its upper-case equivalent
167 ** is (x - 0x20). Therefore toupper() can be implemented as:
168 **
169 **   (x & ~(map[x]&0x20))
170 **
171 ** The equivalent of tolower() is implemented using the sqlite3UpperToLower[]
172 ** array. tolower() is used more often than toupper() by SQLite.
173 **
174 ** Bit 0x40 is set if the character is non-alphanumeric and can be used in an
175 ** SQLite identifier.  Identifiers are alphanumerics, "_", "$", and any
176 ** non-ASCII UTF character. Hence the test for whether or not a character is
177 ** part of an identifier is 0x46.
178 */
179 static const unsigned char sqlite3CtypeMap[256] = {
180   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 00..07    ........ */
181   0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,  /* 08..0f    ........ */
182   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 10..17    ........ */
183   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 18..1f    ........ */
184   0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80,  /* 20..27     !"#$%&' */
185   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 28..2f    ()*+,-./ */
186   0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,  /* 30..37    01234567 */
187   0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 38..3f    89:;<=>? */
188 
189   0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02,  /* 40..47    @ABCDEFG */
190   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 48..4f    HIJKLMNO */
191   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 50..57    PQRSTUVW */
192   0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40,  /* 58..5f    XYZ[\]^_ */
193   0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22,  /* 60..67    `abcdefg */
194   0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 68..6f    hijklmno */
195   0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 70..77    pqrstuvw */
196   0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 78..7f    xyz{|}~. */
197 
198   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 80..87    ........ */
199   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 88..8f    ........ */
200   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 90..97    ........ */
201   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 98..9f    ........ */
202   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a0..a7    ........ */
203   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a8..af    ........ */
204   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b0..b7    ........ */
205   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b8..bf    ........ */
206 
207   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c0..c7    ........ */
208   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c8..cf    ........ */
209   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d0..d7    ........ */
210   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d8..df    ........ */
211   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e0..e7    ........ */
212   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e8..ef    ........ */
213   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* f0..f7    ........ */
214   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40   /* f8..ff    ........ */
215 };
216 #define sqlite3Toupper(x)   ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20))
217 #define sqlite3Isspace(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
218 #define sqlite3Isalnum(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
219 #define sqlite3Isalpha(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
220 #define sqlite3Isdigit(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
221 #define sqlite3Isxdigit(x)  (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
222 #define sqlite3Tolower(x)   (sqlite3UpperToLower[(unsigned char)(x)])
223 #define sqlite3Isquote(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x80)
224 
225 
226 /*
227 ** If X is a character that can be used in an identifier then
228 ** IdChar(X) will be true.  Otherwise it is false.
229 **
230 ** For ASCII, any character with the high-order bit set is
231 ** allowed in an identifier.  For 7-bit characters,
232 ** sqlite3IsIdChar[X] must be 1.
233 **
234 ** For EBCDIC, the rules are more complex but have the same
235 ** end result.
236 **
237 ** Ticket #1066.  the SQL standard does not allow '$' in the
238 ** middle of identifiers.  But many SQL implementations do.
239 ** SQLite will allow '$' in identifiers for compatibility.
240 ** But the feature is undocumented.
241 */
242 #define IdChar(C)  ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
243 
244 /*
245 ** Ignore testcase() macros
246 */
247 #define testcase(X)
248 
249 /*
250 ** Token values
251 */
252 #define TK_SPACE    0
253 #define TK_NAME     1
254 #define TK_LITERAL  2
255 #define TK_PUNCT    3
256 #define TK_ERROR    4
257 
258 #define TK_MINUS    TK_PUNCT
259 #define TK_LP       TK_PUNCT
260 #define TK_RP       TK_PUNCT
261 #define TK_SEMI     TK_PUNCT
262 #define TK_PLUS     TK_PUNCT
263 #define TK_STAR     TK_PUNCT
264 #define TK_SLASH    TK_PUNCT
265 #define TK_REM      TK_PUNCT
266 #define TK_EQ       TK_PUNCT
267 #define TK_LE       TK_PUNCT
268 #define TK_NE       TK_PUNCT
269 #define TK_LSHIFT   TK_PUNCT
270 #define TK_LT       TK_PUNCT
271 #define TK_GE       TK_PUNCT
272 #define TK_RSHIFT   TK_PUNCT
273 #define TK_GT       TK_PUNCT
274 #define TK_GE       TK_PUNCT
275 #define TK_BITOR    TK_PUNCT
276 #define TK_CONCAT   TK_PUNCT
277 #define TK_COMMA    TK_PUNCT
278 #define TK_BITAND   TK_PUNCT
279 #define TK_BITNOT   TK_PUNCT
280 #define TK_STRING   TK_LITERAL
281 #define TK_ID       TK_NAME
282 #define TK_ILLEGAL  TK_ERROR
283 #define TK_DOT      TK_PUNCT
284 #define TK_INTEGER  TK_LITERAL
285 #define TK_FLOAT    TK_LITERAL
286 #define TK_VARIABLE TK_LITERAL
287 #define TK_BLOB     TK_LITERAL
288 
289 /* Disable nuisence warnings about case fall-through */
290 #if !defined(deliberate_fall_through) && defined(__GCC__) && __GCC__>=7
291 # define deliberate_fall_through __attribute__((fallthrough));
292 #else
293 # define deliberate_fall_through
294 #endif
295 
296 /*
297 ** Return the length (in bytes) of the token that begins at z[0].
298 ** Store the token type in *tokenType before returning.
299 */
sqlite3GetToken(const unsigned char * z,int * tokenType)300 static int sqlite3GetToken(const unsigned char *z, int *tokenType){
301   int i, c;
302   switch( aiClass[*z] ){  /* Switch on the character-class of the first byte
303                           ** of the token. See the comment on the CC_ defines
304                           ** above. */
305     case CC_SPACE: {
306       for(i=1; sqlite3Isspace(z[i]); i++){}
307       *tokenType = TK_SPACE;
308       return i;
309     }
310     case CC_MINUS: {
311       if( z[1]=='-' ){
312         for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
313         *tokenType = TK_SPACE;
314         return i;
315       }
316       *tokenType = TK_MINUS;
317       return 1;
318     }
319     case CC_LP: {
320       *tokenType = TK_LP;
321       return 1;
322     }
323     case CC_RP: {
324       *tokenType = TK_RP;
325       return 1;
326     }
327     case CC_SEMI: {
328       *tokenType = TK_SEMI;
329       return 1;
330     }
331     case CC_PLUS: {
332       *tokenType = TK_PLUS;
333       return 1;
334     }
335     case CC_STAR: {
336       *tokenType = TK_STAR;
337       return 1;
338     }
339     case CC_SLASH: {
340       if( z[1]!='*' || z[2]==0 ){
341         *tokenType = TK_SLASH;
342         return 1;
343       }
344       for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
345       if( c ) i++;
346       *tokenType = TK_SPACE;
347       return i;
348     }
349     case CC_PERCENT: {
350       *tokenType = TK_REM;
351       return 1;
352     }
353     case CC_EQ: {
354       *tokenType = TK_EQ;
355       return 1 + (z[1]=='=');
356     }
357     case CC_LT: {
358       if( (c=z[1])=='=' ){
359         *tokenType = TK_LE;
360         return 2;
361       }else if( c=='>' ){
362         *tokenType = TK_NE;
363         return 2;
364       }else if( c=='<' ){
365         *tokenType = TK_LSHIFT;
366         return 2;
367       }else{
368         *tokenType = TK_LT;
369         return 1;
370       }
371     }
372     case CC_GT: {
373       if( (c=z[1])=='=' ){
374         *tokenType = TK_GE;
375         return 2;
376       }else if( c=='>' ){
377         *tokenType = TK_RSHIFT;
378         return 2;
379       }else{
380         *tokenType = TK_GT;
381         return 1;
382       }
383     }
384     case CC_BANG: {
385       if( z[1]!='=' ){
386         *tokenType = TK_ILLEGAL;
387         return 1;
388       }else{
389         *tokenType = TK_NE;
390         return 2;
391       }
392     }
393     case CC_PIPE: {
394       if( z[1]!='|' ){
395         *tokenType = TK_BITOR;
396         return 1;
397       }else{
398         *tokenType = TK_CONCAT;
399         return 2;
400       }
401     }
402     case CC_COMMA: {
403       *tokenType = TK_COMMA;
404       return 1;
405     }
406     case CC_AND: {
407       *tokenType = TK_BITAND;
408       return 1;
409     }
410     case CC_TILDA: {
411       *tokenType = TK_BITNOT;
412       return 1;
413     }
414     case CC_QUOTE: {
415       int delim = z[0];
416       testcase( delim=='`' );
417       testcase( delim=='\'' );
418       testcase( delim=='"' );
419       for(i=1; (c=z[i])!=0; i++){
420         if( c==delim ){
421           if( z[i+1]==delim ){
422             i++;
423           }else{
424             break;
425           }
426         }
427       }
428       if( c=='\'' ){
429         *tokenType = TK_STRING;
430         return i+1;
431       }else if( c!=0 ){
432         *tokenType = TK_ID;
433         return i+1;
434       }else{
435         *tokenType = TK_ILLEGAL;
436         return i;
437       }
438     }
439     case CC_DOT: {
440       if( !sqlite3Isdigit(z[1]) ){
441         *tokenType = TK_DOT;
442         return 1;
443       }
444       /* If the next character is a digit, this is a floating point
445       ** number that begins with ".".  Fall thru into the next case */
446       /* no break */ deliberate_fall_through
447     }
448     case CC_DIGIT: {
449       *tokenType = TK_INTEGER;
450       if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){
451         for(i=3; sqlite3Isxdigit(z[i]); i++){}
452         return i;
453       }
454       for(i=0; sqlite3Isdigit(z[i]); i++){}
455       if( z[i]=='.' ){
456         i++;
457         while( sqlite3Isdigit(z[i]) ){ i++; }
458         *tokenType = TK_FLOAT;
459       }
460       if( (z[i]=='e' || z[i]=='E') &&
461            ( sqlite3Isdigit(z[i+1])
462             || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2]))
463            )
464       ){
465         i += 2;
466         while( sqlite3Isdigit(z[i]) ){ i++; }
467         *tokenType = TK_FLOAT;
468       }
469       while( IdChar(z[i]) ){
470         *tokenType = TK_ILLEGAL;
471         i++;
472       }
473       return i;
474     }
475     case CC_QUOTE2: {
476       for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
477       *tokenType = c==']' ? TK_ID : TK_ILLEGAL;
478       return i;
479     }
480     case CC_VARNUM: {
481       *tokenType = TK_VARIABLE;
482       for(i=1; sqlite3Isdigit(z[i]); i++){}
483       return i;
484     }
485     case CC_DOLLAR:
486     case CC_VARALPHA: {
487       int n = 0;
488       testcase( z[0]=='$' );  testcase( z[0]=='@' );
489       testcase( z[0]==':' );  testcase( z[0]=='#' );
490       *tokenType = TK_VARIABLE;
491       for(i=1; (c=z[i])!=0; i++){
492         if( IdChar(c) ){
493           n++;
494         }else if( c=='(' && n>0 ){
495           do{
496             i++;
497           }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' );
498           if( c==')' ){
499             i++;
500           }else{
501             *tokenType = TK_ILLEGAL;
502           }
503           break;
504         }else if( c==':' && z[i+1]==':' ){
505           i++;
506         }else{
507           break;
508         }
509       }
510       if( n==0 ) *tokenType = TK_ILLEGAL;
511       return i;
512     }
513     case CC_KYWD: {
514       for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
515       if( IdChar(z[i]) ){
516         /* This token started out using characters that can appear in keywords,
517         ** but z[i] is a character not allowed within keywords, so this must
518         ** be an identifier instead */
519         i++;
520         break;
521       }
522       *tokenType = TK_ID;
523       return i;
524     }
525     case CC_X: {
526       testcase( z[0]=='x' ); testcase( z[0]=='X' );
527       if( z[1]=='\'' ){
528         *tokenType = TK_BLOB;
529         for(i=2; sqlite3Isxdigit(z[i]); i++){}
530         if( z[i]!='\'' || i%2 ){
531           *tokenType = TK_ILLEGAL;
532           while( z[i] && z[i]!='\'' ){ i++; }
533         }
534         if( z[i] ) i++;
535         return i;
536       }
537       /* If it is not a BLOB literal, then it must be an ID, since no
538       ** SQL keywords start with the letter 'x'.  Fall through */
539       /* no break */ deliberate_fall_through
540     }
541     case CC_ID: {
542       i = 1;
543       break;
544     }
545     default: {
546       *tokenType = TK_ILLEGAL;
547       return 1;
548     }
549   }
550   while( IdChar(z[i]) ){ i++; }
551   *tokenType = TK_ID;
552   return i;
553 }
554 
sqlite3_normalize(const char * zSql)555 char *sqlite3_normalize(const char *zSql){
556   char *z;              /* The output string */
557   sqlite3_int64 nZ;     /* Size of the output string in bytes */
558   sqlite3_int64 nSql;   /* Size of the input string in bytes */
559   int i;                /* Next character to read from zSql[] */
560   int j;                /* Next slot to fill in on z[] */
561   int tokenType;        /* Type of the next token */
562   int n;                /* Size of the next token */
563   int k;                /* Loop counter */
564 
565   nSql = strlen(zSql);
566   nZ = nSql;
567   z = sqlite3_malloc64( nZ+2 );
568   if( z==0 ) return 0;
569   for(i=j=0; zSql[i]; i += n){
570     n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType);
571     switch( tokenType ){
572       case TK_SPACE: {
573         break;
574       }
575       case TK_ERROR: {
576         sqlite3_free(z);
577         return 0;
578       }
579       case TK_LITERAL: {
580         z[j++] = '?';
581         break;
582       }
583       case TK_PUNCT:
584       case TK_NAME: {
585         if( n==4 && sqlite3_strnicmp(zSql+i,"NULL",4)==0 ){
586           if( (j>=3 && strncmp(z+j-2,"is",2)==0 && !IdChar(z[j-3]))
587            || (j>=4 && strncmp(z+j-3,"not",3)==0 && !IdChar(z[j-4]))
588           ){
589             /* NULL is a keyword in this case, not a literal value */
590           }else{
591             /* Here the NULL is a literal value */
592             z[j++] = '?';
593             break;
594           }
595         }
596         if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' ';
597         for(k=0; k<n; k++){
598           z[j++] = sqlite3Tolower(zSql[i+k]);
599         }
600         break;
601       }
602     }
603   }
604   while( j>0 && z[j-1]==' ' ){ j--; }
605   if( j>0 && z[j-1]!=';' ){ z[j++] = ';'; }
606   z[j] = 0;
607 
608   /* Make a second pass converting "in(...)" where the "..." is not a
609   ** SELECT statement into "in(?,?,?)" */
610   for(i=0; i<j; i=n){
611     char *zIn = strstr(z+i, "in(");
612     int nParen;
613     if( zIn==0 ) break;
614     n = (int)(zIn-z)+3;  /* Index of first char past "in(" */
615     if( n && IdChar(zIn[-1]) ) continue;
616     if( strncmp(zIn, "in(select",9)==0 && !IdChar(zIn[9]) ) continue;
617     if( strncmp(zIn, "in(with",7)==0 && !IdChar(zIn[7]) ) continue;
618     for(nParen=1, k=0; z[n+k]; k++){
619       if( z[n+k]=='(' ) nParen++;
620       if( z[n+k]==')' ){
621         nParen--;
622         if( nParen==0 ) break;
623       }
624     }
625     /* k is the number of bytes in the "..." within "in(...)" */
626     if( k<5 ){
627       z = sqlite3_realloc64(z, j+(5-k)+1);
628       if( z==0 ) return 0;
629       memmove(z+n+5, z+n+k, j-(n+k));
630     }else if( k>5 ){
631       memmove(z+n+5, z+n+k, j-(n+k));
632     }
633     j = j-k+5;
634     z[j] = 0;
635     memcpy(z+n, "?,?,?", 5);
636   }
637   return z;
638 }
639 
640 /*
641 ** For testing purposes, or to build a stand-alone SQL normalizer program,
642 ** compile this one source file with the -DSQLITE_NORMALIZE_CLI and link
643 ** it against any SQLite library.  The resulting command-line program will
644 ** run sqlite3_normalize() over the text of all files named on the command-
645 ** line and show the result on standard output.
646 */
647 #ifdef SQLITE_NORMALIZE_CLI
648 #include <stdio.h>
649 #include <stdlib.h>
650 
651 /*
652 ** Break zIn up into separate SQL statements and run sqlite3_normalize()
653 ** on each one.  Print the result of each run.
654 */
normalizeFile(char * zIn)655 static void normalizeFile(char *zIn){
656   int i;
657   if( zIn==0 ) return;
658   for(i=0; zIn[i]; i++){
659     char cSaved;
660     if( zIn[i]!=';' ) continue;
661     cSaved = zIn[i+1];
662     zIn[i+1] = 0;
663     if( sqlite3_complete(zIn) ){
664       char *zOut = sqlite3_normalize(zIn);
665       if( zOut ){
666         printf("%s\n", zOut);
667         sqlite3_free(zOut);
668       }else{
669         fprintf(stderr, "ERROR: %s\n", zIn);
670       }
671       zIn[i+1] = cSaved;
672       zIn += i+1;
673       i = -1;
674     }else{
675       zIn[i+1] = cSaved;
676     }
677   }
678 }
679 
680 /*
681 ** The main routine for "sql_normalize".  Read files named on the
682 ** command-line and run the text of each through sqlite3_normalize().
683 */
main(int argc,char ** argv)684 int main(int argc, char **argv){
685   int i;
686   FILE *in;
687   char *zBuf = 0;
688   sqlite3_int64 sz, got;
689 
690   for(i=1; i<argc; i++){
691     in = fopen(argv[i], "rb");
692     if( in==0 ){
693       fprintf(stderr, "cannot open \"%s\"\n", argv[i]);
694       continue;
695     }
696     fseek(in, 0, SEEK_END);
697     sz = ftell(in);
698     rewind(in);
699     zBuf = sqlite3_realloc64(zBuf, sz+1);
700     if( zBuf==0 ){
701       fprintf(stderr, "failed to malloc for %lld bytes\n", sz);
702       exit(1);
703     }
704     got = fread(zBuf, 1, sz, in);
705     fclose(in);
706     if( got!=sz ){
707       fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n",
708               got, sz, argv[i]);
709     }else{
710       zBuf[got] = 0;
711       normalizeFile(zBuf);
712     }
713   }
714   sqlite3_free(zBuf);
715 }
716 #endif /* SQLITE_NORMALIZE_CLI */
717